2010-04-14 19:50:17 +04:00
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2009 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
2011-03-08 18:45:03 +03:00
|
|
|
#include <string.h>
|
2010-04-14 19:50:17 +04:00
|
|
|
#include <fstream>
|
|
|
|
#include <vector>
|
|
|
|
#include <string>
|
|
|
|
#include <iostream>
|
|
|
|
#include <cstdlib>
|
2010-11-23 17:15:54 +03:00
|
|
|
#include "InputFileStream.h"
|
2012-05-11 21:28:59 +04:00
|
|
|
#include "OutputFileStream.h"
|
2010-04-14 19:50:17 +04:00
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
2015-02-04 12:18:09 +03:00
|
|
|
// Forward declaration only; the definition is not part of this file's
// visible code. main() calls it on the counts field of each input line and
// reads the first two returned tokens as numbers.
std::vector<std::string> tokenize( const char [] );
|
|
|
|
|
2014-06-08 19:23:14 +04:00
|
|
|
/**
 * Breaks a phrase-table line into its fields.
 *
 * Fields are delimited by the five-character sequence " ||| ". Two
 * delimiters may share a single space (" ||| ||| "); the field between
 * them is then reported as an empty string. The text after the last
 * delimiter (possibly empty) is always returned as the final field, so
 * the result is never empty.
 *
 * @param line NUL-terminated input line.
 * @return the fields in order of appearance.
 */
std::vector<std::string> splitLine(const char *line)
{
  static const char kSeparator[] = " ||| ";
  const size_t kSeparatorLen = sizeof(kSeparator) - 1;

  std::vector<std::string> fields;
  const char *fieldBegin = line;  // first character of the pending field
  const char *scanFrom = line;    // where the next separator search starts

  while (const char *hit = strstr(scanFrom, kSeparator)) {
    // A separator that reuses the previous separator's trailing space
    // starts before the pending field does: that field is empty.
    if (fieldBegin > hit) {
      fieldBegin = hit;
    }
    fields.push_back(std::string(fieldBegin,
                                 static_cast<size_t>(hit - fieldBegin)));

    fieldBegin = hit + kSeparatorLen;
    // Resume on the separator's trailing space so it can also serve as
    // the leading space of the next separator.
    scanFrom = hit + kSeparatorLen - 1;
  }

  // Whatever is left up to the terminating NUL forms the last field.
  fields.push_back(std::string(fieldBegin));
  return fields;
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
bool getLine( istream &fileP, vector< string > &item )
|
2010-04-14 19:50:17 +04:00
|
|
|
{
|
2011-02-24 16:57:11 +03:00
|
|
|
if (fileP.eof())
|
|
|
|
return false;
|
2015-01-14 14:07:42 +03:00
|
|
|
|
2014-06-08 19:23:14 +04:00
|
|
|
string line;
|
|
|
|
if (getline(fileP, line)) {
|
|
|
|
item = splitLine(line.c_str());
|
2015-02-04 12:18:09 +03:00
|
|
|
return true;
|
2015-01-14 14:07:42 +03:00
|
|
|
} else {
|
2014-06-08 19:23:14 +04:00
|
|
|
return false;
|
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int main(int argc, char* argv[])
|
2010-04-14 19:50:17 +04:00
|
|
|
{
|
|
|
|
cerr << "Starting..." << endl;
|
2010-11-23 17:15:54 +03:00
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
char* &fileNameDirect = argv[1];
|
|
|
|
Moses::InputFileStream fileDirect(fileNameDirect);
|
|
|
|
|
|
|
|
|
|
|
|
//fileDirect.open(fileNameDirect);
|
|
|
|
if (fileDirect.fail()) {
|
2010-04-14 19:50:17 +04:00
|
|
|
cerr << "ERROR: could not open extract file " << fileNameDirect << endl;
|
|
|
|
exit(1);
|
|
|
|
}
|
2011-02-24 16:57:11 +03:00
|
|
|
istream &fileDirectP = fileDirect;
|
2010-04-14 19:50:17 +04:00
|
|
|
|
|
|
|
char* &fileNameConsolidated = argv[2];
|
2011-03-07 05:44:34 +03:00
|
|
|
ostream *fileConsolidated;
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
|
|
if (strcmp(fileNameConsolidated, "-") == 0) {
|
|
|
|
fileConsolidated = &cout;
|
|
|
|
} else {
|
2012-05-11 21:28:59 +04:00
|
|
|
Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
|
2013-05-29 21:16:15 +04:00
|
|
|
bool success = outputFile->Open(fileNameConsolidated);
|
|
|
|
if (!success) {
|
|
|
|
cerr << "ERROR: could not open file phrase table file "
|
|
|
|
<< fileNameConsolidated << endl;
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
fileConsolidated = outputFile;
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
int i=0;
|
2010-04-14 19:50:17 +04:00
|
|
|
while(true) {
|
2011-02-24 16:57:11 +03:00
|
|
|
i++;
|
2010-04-14 19:50:17 +04:00
|
|
|
if (i%1000 == 0) cerr << "." << flush;
|
|
|
|
if (i%10000 == 0) cerr << ":" << flush;
|
|
|
|
if (i%100000 == 0) cerr << "!" << flush;
|
2011-02-24 16:57:11 +03:00
|
|
|
|
|
|
|
vector< string > itemDirect;
|
|
|
|
if (! getLine(fileDirectP, itemDirect ))
|
|
|
|
break;
|
|
|
|
|
2015-02-04 12:18:09 +03:00
|
|
|
vector< string > count = tokenize( itemDirect[4].c_str() );
|
|
|
|
float countEF = atof(count[0].c_str());
|
|
|
|
float countF = atof(count[1].c_str());
|
|
|
|
float prob = countF/countEF;
|
|
|
|
|
|
|
|
(*fileConsolidated) << itemDirect[0] << " ||| " // source
|
|
|
|
<< itemDirect[1] << " ||| " // target
|
|
|
|
<< prob << " ||| " // prob
|
|
|
|
<< itemDirect[2] << "||| " // alignment
|
|
|
|
<< itemDirect[4] << " " << countEF // counts
|
|
|
|
<< " ||| " << endl;
|
2011-02-24 16:57:11 +03:00
|
|
|
}
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
fileConsolidated->flush();
|
|
|
|
if (fileConsolidated != &cout) {
|
|
|
|
delete fileConsolidated;
|
|
|
|
}
|
|
|
|
|
2010-04-14 19:50:17 +04:00
|
|
|
cerr << "Finished" << endl;
|
|
|
|
}
|
|
|
|
|