mosesdecoder/misc/GenerateSequences.cpp

515 lines
10 KiB
C++
Raw Normal View History

2013-08-27 16:22:01 +04:00
#include <fstream>
#include <iostream>
#include <vector>
#include <sstream>
#include <map>
#include <set>
#include <cstdlib>
2013-09-22 18:47:54 +04:00
2013-08-27 16:22:01 +04:00
using namespace std;
// Converts the leading integer found in `s` to an int using stream extraction.
// Leading whitespace is skipped and any text after the number is ignored.
// Returns 0 when `s` holds no parsable integer (for example an empty string).
int stringToInteger(std::string s)
{
  std::istringstream buffer(s);
  // Start from a defined value: older standard libraries leave the target
  // untouched when extraction fails, which used to return garbage.
  int some_int = 0;
  buffer >> some_int;
  return some_int;
}
// Appends every line of the file named `fileName` to `input`, one element
// per line and without the line terminator. Existing elements are kept.
// If the file cannot be opened, a message is written to standard output and
// the process terminates with exit status 1.
void loadInput(const char * fileName, std::vector<std::string> & input)
{
  std::ifstream reader(fileName);

  if (!reader.is_open()) {
    std::cout << "Unable to read " << fileName << std::endl;
    std::exit(1);
  }

  for (std::string current; std::getline(reader, current); ) {
    input.push_back(current);
  }

  reader.close();
}
2013-09-22 18:47:54 +04:00
// Splits `s` into whitespace-separated tokens and stores them, in order, in
// `currInput`. Anything `currInput` held before the call is discarded.
void getWords(std::string s, std::vector<std::string> & currInput)
{
  currInput.clear();

  std::istringstream tokens(s);
  std::string word;
  while (tokens >> word) {
    currInput.push_back(word);
  }
}
// Builds the translation operation string for the cept at `index`.
//
// The words of `currF` selected by the cept's position list are joined with
// "^_^". If the joined string is a key of `singletons` the result is
// "_TRANS_SLF_ "; otherwise it is "_TRANS_<cept text>_TO_<joined words> ".
// Both forms end with a single space.
std::string getTranslation(int index, std::vector < std::pair <std::string , std::vector <int> > > & gCepts , std::vector <std::string> & currF , std::map <std::string,int> & singletons)
{
  const std::vector<int> & positions = gCepts[index].second;

  std::string joined;
  for (std::vector<int>::size_type n = 0; n < positions.size(); ++n) {
    if (n > 0) {
      joined += "^_^";
    }
    joined += currF[positions[n]];
  }

  if (singletons.count(joined) != 0) {
    return "_TRANS_SLF_ ";
  }

  return "_TRANS_" + gCepts[index].first + "_TO_" + joined + " ";
}
// Picks the open gap that position `j1` should jump back to.
//
// `gap` maps a position to its state; only entries whose state is exactly
// "Unfilled" are considered, and they are examined from the highest position
// down to the lowest.
//
// Returns `j1` itself when there is an unfilled gap at `j1`, otherwise the
// largest unfilled position below `j1`, otherwise -1 (also for an empty map).
//
// `gp` receives the rank of the returned gap among the unfilled entries,
// counted from the highest position (1 = rightmost unfilled gap), or 0 when
// nothing is returned.
int closestGap(const std::map <int,std::string> & gap, int j1, int & gp)
{
  gp = 0;
  int openGaps = 0;

  // Reverse iteration is well defined on an empty map, unlike decrementing
  // end(). Because keys are visited in descending order, the first unfilled
  // key that is <= j1 is the nearest candidate, so no distance bookkeeping
  // (and no arbitrary maximum distance) is needed.
  for (std::map<int,std::string>::const_reverse_iterator it = gap.rbegin(); it != gap.rend(); ++it) {
    if (it->second != "Unfilled") {
      continue;
    }

    ++openGaps;

    if (it->first <= j1) {
      gp = openGaps;
      return it->first;
    }
  }

  return -1;
}
void generateStory(vector <pair <string , vector <int> > > & gCepts, set <int> & targetNullWords, vector<string> & currF, map <string,int> & singletons)
{
2013-08-28 14:06:27 +04:00
int fl = 0;
int i = 0; // Current English string position
int j = 0; // Current French Position
int N = gCepts.size(); // Total number of English words
int k = 0; // Number of already generate French words
int E = 0; // Position after most rightward French word generate so far
int j1 = 0; // Next french translation;
int Li =0; // Links of word i
int Lj=0; // Links of word j
map <int,int > generated;
map <int,string> gap;
map <int,int> :: iterator iter;
int gp=0;
//vector <string> iterator :: iterF;
while (targetNullWords.find(j) != targetNullWords.end()) {
cout<<"_INS_"<<currF[j]<<" ";
generated[j]=-1; // This word is generated -1 means unlinked ...
j=j+1;
}
while (i < gCepts.size() && gCepts[i].second.size() == 0) {
cout<<"_DEL_"<<gCepts[i].first<<" ";
i=i+1;
}
E=j; // Update the position of most rightward French word
while (i<N) {
//cout<<"I am sending to the link "<<i<<" with 0 "<<endl;
//j1 = getLink(i,0,Li,k);
Li = gCepts[i].second.size();
j1 = gCepts[i].second[k];
//cout<<"i = "<<i<<" j1 = "<<j1<<" j = "<<j<<" E = "<<E<<endl;
if(j<j1) { // reordering needed ...
iter = generated.find(j);
if( iter == generated.end()) { // fj is not generated ...
cout<<"_INS_GAP_ ";
gap[j] = "Unfilled";
}
if (j==E) {
j=j1;
} else {
cout<<"_JMP_FWD_ ";
j=E;
}
}
if(j1<j) {
iter = generated.find(j);
if(j<E && iter == generated.end()) { // fj is not generated ...
cout<<"_INS_GAP_ ";
gap[j]="Unfilled";
}
j=closestGap(gap,j1,gp);
//cout<<j<<endl;
cout<<"_JMP_BCK_"<<gp<<" ";
if(j==j1)
gap[j]="Filled";
}
if(j<j1) {
cout<<"_INS_GAP_ ";
gap[j] = "Unfilled";
j=j1;
}
if(k==0) {
cout<<getTranslation(i, gCepts,currF,singletons);
} else {
cout<<"_CONT_CEPT_ ";
}
generated[j]=i;
j=j+1;
k=k+1;
while(targetNullWords.find(j) != targetNullWords.end()) { // fj is unlinked word ...
//cout<<"Came here"<<j<<k<<endl;
cout<<"_INS_"<<currF[j]<<" ";
generated[j]=-1; // This word is generated -1 means unlinked ...
j=j+1;
}
if(E<j)
E=j;
//cout<<" Li "<<Li<<endl;
if(k==Li) {
i=i+1;
k=0;
while(i < gCepts.size() && gCepts[i].second.size() == 0) { // ei is unliked word ...
cout<<"_DEL_"<<gCepts[i].first<<" ";
i=i+1;
}
}
}
cout<<endl;
2013-08-27 16:22:01 +04:00
}
// Helper for ceptsInGenerativeStoryFormat: performs one pass that first
// advances `engIndex` past positions recorded in `engDone`, then appends
// every consecutive position found in `sourceNullWords` to `gCepts` as a
// word with an empty link list, advancing `engIndex` past each of them.
static void emitUnlinkedSourceRun(int & engIndex, const std::set <int> & engDone, std::set <int> & sourceNullWords, std::vector <std::string> & currE, std::vector < std::pair < std::string , std::vector <int> > > & gCepts)
{
  while (engDone.count(engIndex) != 0) {
    ++engIndex;
  }

  while (sourceNullWords.count(engIndex) != 0) {
    gCepts.push_back(std::make_pair(currE[engIndex], std::vector<int>()));
    ++engIndex;
  }
}

// Rebuilds `gCepts` from `ceptsInPhrase`.
//
// Each element of `ceptsInPhrase` is a pair whose first set is copied, in
// ascending order, into the link list of the resulting entry and whose second
// set selects the words of `currE` that are joined with "^_^" to form the
// entry's text. Positions listed in `sourceNullWords` are emitted as separate
// entries with an empty link list, before the first cept whose smallest
// `currE` position lies beyond them, or at the end.
void ceptsInGenerativeStoryFormat(std::vector < std::pair < std::set <int> , std::set <int> > > & ceptsInPhrase , std::vector < std::pair < std::string , std::vector <int> > > & gCepts , std::set <int> & sourceNullWords, std::vector <std::string> & currE)
{
  gCepts.clear();

  int engIndex = 0;        // next currE position not yet accounted for
  std::set<int> engDone;   // cept positions consumed out of sequence

  for (std::vector<std::pair<std::set<int>, std::set<int> > >::size_type c = 0; c < ceptsInPhrase.size(); ++c) {
    const std::set<int> & linkSide = ceptsInPhrase[c].first;
    const std::set<int> & wordSide = ceptsInPhrase[c].second;

    // Emit the unlinked words that precede this cept's first word.
    while (engIndex < *wordSide.begin()) {
      emitUnlinkedSourceRun(engIndex, engDone, sourceNullWords, currE, gCepts);
    }

    std::set<int>::const_iterator word = wordSide.begin();
    int runEnd = *word;    // last position of the contiguous run starting at the first word
    std::string text = currE[runEnd];
    engIndex = runEnd + 1;

    for (++word; word != wordSide.end(); ++word) {
      text += "^_^";

      if (*word - 1 == runEnd) {
        // Still contiguous: the running index simply moves along.
        ++runEnd;
        ++engIndex;
      } else {
        // Word lies beyond a hole; remember it so it is skipped later.
        engDone.insert(*word);
      }

      text += currE[*word];
    }

    gCepts.push_back(std::make_pair(text, std::vector<int>(linkSide.begin(), linkSide.end())));
  }

  // Unlinked words after the last cept.
  while (engIndex < currE.size()) {
    emitUnlinkedSourceRun(engIndex, engDone, sourceNullWords, currE, gCepts);
  }
}
// Debug helper: writes one line per cept to standard output in the form
// "<cept text> <---> <word> <word> ...", where the words are the entries of
// `currF` at the cept's linked positions, each followed by a space.
void printCepts(std::vector < std::pair < std::string , std::vector <int> > > & gCepts , std::vector <std::string> & currF)
{
  typedef std::vector<std::pair<std::string, std::vector<int> > > CeptList;

  for (CeptList::const_iterator cept = gCepts.begin(); cept != gCepts.end(); ++cept) {
    std::cout << cept->first << " <---> ";

    const std::vector<int> & links = cept->second;
    for (std::vector<int>::const_iterator link = links.begin(); link != links.end(); ++link) {
      std::cout << currF[*link] << " ";
    }

    std::cout << std::endl;
  }
}
// Grows `eSide` and `fSide` until they are closed under the two link maps:
// every value listed in `tS` for a member of `eSide` is added to `fSide`,
// and every value listed in `sT` for a member of `fSide` is added to `eSide`.
// The two steps repeat until a round adds nothing new to `eSide`.
//
// Lookups use operator[], so a member that is missing from the map it is
// looked up in gets an empty entry inserted there.
void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT)
{
  std::set<int>::size_type sizeBefore;

  do {
    sizeBefore = eSide.size();

    for (std::set<int>::const_iterator e = eSide.begin(); e != eSide.end(); ++e) {
      const std::vector<int> & linked = tS[*e];
      fSide.insert(linked.begin(), linked.end());
    }

    for (std::set<int>::const_iterator f = fSide.begin(); f != fSide.end(); ++f) {
      const std::vector<int> & linked = sT[*f];
      eSide.insert(linked.begin(), linked.end());
    }
  } while (eSide.size() > sizeBefore);
}
void constructCepts(vector < pair < set <int> , set <int> > > & ceptsInPhrase, set <int> & sourceNullWords, set <int> & targetNullWords, vector <string> & alignment, int eSize, int fSize)
{
2013-08-28 14:06:27 +04:00
ceptsInPhrase.clear();
sourceNullWords.clear();
targetNullWords.clear();
vector <int> align;
std::map <int , vector <int> > sT;
std::map <int , vector <int> > tS;
std::set <int> eSide;
std::set <int> fSide;
std::set <int> :: iterator iter;
std :: map <int , vector <int> > :: iterator iter2;
std :: pair < set <int> , set <int> > cept;
int src;
int tgt;
ceptsInPhrase.clear();
for (int j=0; j<alignment.size(); j+=2) {
align.push_back(stringToInteger(alignment[j+1]));
align.push_back(stringToInteger(alignment[j]));
}
for (int i = 0; i < align.size(); i+=2) {
src = align[i];
tgt = align[i+1];
tS[tgt].push_back(src);
sT[src].push_back(tgt);
}
for (int i = 0; i< fSize; i++) {
if (sT.find(i) == sT.end()) {
targetNullWords.insert(i);
}
}
for (int i = 0; i< eSize; i++) {
if (tS.find(i) == tS.end()) {
sourceNullWords.insert(i);
}
}
while (tS.size() != 0 && sT.size() != 0) {
iter2 = tS.begin();
eSide.clear();
fSide.clear();
eSide.insert (iter2->first);
getMeCepts(eSide, fSide, tS , sT);
for (iter = eSide.begin(); iter != eSide.end(); iter++) {
iter2 = tS.find(*iter);
tS.erase(iter2);
}
for (iter = fSide.begin(); iter != fSide.end(); iter++) {
iter2 = sT.find(*iter);
sT.erase(iter2);
}
cept = make_pair (fSide , eSide);
ceptsInPhrase.push_back(cept);
}
2013-08-27 16:22:01 +04:00
}
// Entry point.
//
// Arguments: argv[1], argv[2] and argv[3] name three line-parallel files
// that are loaded into `e`, `f` and `a` (the third holding the alignment
// tokens of each sentence pair); argv[4] names a file with one singleton
// entry per line. For every line of the alignment file one line of
// operations is written to standard output by generateStory.
//
// Returns 1 (after a message on standard error) when too few arguments are
// given or when either text file has fewer lines than the alignment file.
int main(int argc, char * argv[])
{
  if (argc < 5) {
    std::cerr << "Usage: " << (argc > 0 ? argv[0] : "GenerateSequences")
              << " <e-file> <f-file> <alignment-file> <singletons-file>" << std::endl;
    return 1;
  }

  std::vector <std::string> e;
  std::vector <std::string> f;
  std::vector <std::string> a;
  std::vector <std::string> singletons;
  std::map <std::string,int> sTons;
  std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
  std::vector < std::pair < std::string , std::vector <int> > > gCepts;
  std::set <int> sourceNullWords;
  std::set <int> targetNullWords;
  std::vector <std::string> currE;
  std::vector <std::string> currF;
  std::vector <std::string> currA;

  loadInput(argv[4], singletons);

  // Map each singleton line to its line index (a repeated line keeps the last index).
  for (std::vector<std::string>::size_type i = 0; i < singletons.size(); i++) {
    sTons[singletons[i]] = static_cast<int>(i);
  }

  loadInput(argv[1], e);
  loadInput(argv[2], f);
  loadInput(argv[3], a);

  // The loop below indexes e and f by alignment line number.
  if (e.size() < a.size() || f.size() < a.size()) {
    std::cerr << "Input files have fewer lines than the alignment file" << std::endl;
    return 1;
  }

  for (std::vector<std::string>::size_type i = 0; i < a.size(); i++) {
    getWords(e[i], currE);
    getWords(f[i], currF);
    getWords(a[i], currA);

    constructCepts(ceptsInPhrase, sourceNullWords, targetNullWords, currA, currE.size(), currF.size());
    ceptsInGenerativeStoryFormat(ceptsInPhrase, gCepts, sourceNullWords, currE);
    // printCepts(gCepts, currF);
    generateStory(gCepts, targetNullWords, currF, sTons);
  }

  return 0;
}