mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-11-14 10:47:21 +03:00
246 lines
7.3 KiB
C++
246 lines
7.3 KiB
C++
/***********************************************************************
|
|
Moses - factored phrase-based language decoder
|
|
Copyright (C) 2009 University of Edinburgh
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with this library; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
***********************************************************************/
|
|
|
|
#include <cstdio>
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <vector>
|
|
#include <string>
|
|
#include <sstream>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
|
|
#include "tables-core.h"
|
|
#include "SafeGetline.h"
|
|
#include "InputFileStream.h"
|
|
|
|
// Capacity of the shared line buffer below (also passed to SAFE_GETLINE in getLine()).
#define LINE_MAX_LENGTH 10000

using namespace std;

// Command line switches; defaults here, changed by main() while parsing argv.
bool hierarchicalFlag = false; // set by --Hierarchical
bool onlyDirectFlag = false;   // set by --OnlyDirect
bool phraseCountFlag = true;   // cleared by --NoPhraseCount
bool logProbFlag = false;      // set by --LogProb

// Buffer for the input line currently being processed:
// written by getLine(), read by splitLine().
char line[LINE_MAX_LENGTH];

// Functions defined further down in this file.
vector< string > splitLine();
bool getLine( istream &fileP, vector< string > &item );
string reverseAlignment(const string &alignments);
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated );
|
|
|
|
/**
 * Split `str` into tokens and append them to `output`.
 *
 * Any character contained in `delimiters` separates tokens. Runs of
 * delimiters (including leading and trailing ones) never produce empty
 * tokens. Existing content of `output` is kept; new tokens are appended.
 */
inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  typedef std::string::size_type Position;

  // Start of the first token: the first character that is not a delimiter.
  Position tokenBegin = str.find_first_not_of(delimiters, 0);

  while (tokenBegin != std::string::npos) {
    // The token runs up to the next delimiter, or to the end of the string
    // (tokenEnd == npos, in which case substr() clamps the length).
    const Position tokenEnd = str.find_first_of(delimiters, tokenBegin);
    output.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));

    // Jump over the delimiters that follow; yields npos once nothing is left.
    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
  }
}
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
cerr << "Consolidate v2.0 written by Philipp Koehn\n"
|
|
<< "consolidating direct and indirect rule tables\n";
|
|
|
|
if (argc < 4) {
|
|
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect]\n";
|
|
exit(1);
|
|
}
|
|
char* &fileNameDirect = argv[1];
|
|
char* &fileNameIndirect = argv[2];
|
|
char* &fileNameConsolidated = argv[3];
|
|
|
|
for(int i=4; i<argc; i++) {
|
|
if (strcmp(argv[i],"--Hierarchical") == 0) {
|
|
hierarchicalFlag = true;
|
|
cerr << "processing hierarchical rules\n";
|
|
} else if (strcmp(argv[i],"--OnlyDirect") == 0) {
|
|
onlyDirectFlag = true;
|
|
cerr << "only including direct translation scores p(e|f)\n";
|
|
} else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
|
|
phraseCountFlag = false;
|
|
cerr << "not including the phrase count feature\n";
|
|
} else if (strcmp(argv[i],"--LogProb") == 0) {
|
|
logProbFlag = true;
|
|
cerr << "using log-probabilities\n";
|
|
} else {
|
|
cerr << "ERROR: unknown option " << argv[i] << endl;
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated );
|
|
}
|
|
|
|
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated )
|
|
{
|
|
// open input files
|
|
Moses::InputFileStream fileDirect(fileNameDirect);
|
|
Moses::InputFileStream fileIndirect(fileNameIndirect);
|
|
|
|
if (fileDirect.fail()) {
|
|
cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
|
|
exit(1);
|
|
}
|
|
istream &fileDirectP = fileDirect;
|
|
|
|
if (fileIndirect.fail()) {
|
|
cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
|
|
exit(1);
|
|
}
|
|
istream &fileIndirectP = fileIndirect;
|
|
|
|
// open output file: consolidated phrase table
|
|
ofstream fileConsolidated;
|
|
fileConsolidated.open(fileNameConsolidated);
|
|
if (fileConsolidated.fail()) {
|
|
cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
|
|
exit(1);
|
|
}
|
|
|
|
// loop through all extracted phrase translations
|
|
int i=0;
|
|
while(true) {
|
|
i++;
|
|
if (i%100000 == 0) cerr << "." << flush;
|
|
|
|
vector< string > itemDirect, itemIndirect;
|
|
if (! getLine(fileIndirectP,itemIndirect) ||
|
|
! getLine(fileDirectP, itemDirect ))
|
|
break;
|
|
|
|
// direct: target source alignment probabilities
|
|
// indirect: source target probabilities
|
|
|
|
// consistency checks
|
|
if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
|
|
cerr << "ERROR: target phrase does not match in line " << i << ": '"
|
|
<< itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
|
|
exit(1);
|
|
}
|
|
|
|
if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
|
|
cerr << "ERROR: source phrase does not match in line " << i << ": '"
|
|
<< itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
|
|
exit(1);
|
|
}
|
|
|
|
// output hierarchical phrase pair (with separated labels)
|
|
fileConsolidated << itemDirect[1] << " ||| " << itemDirect[0];
|
|
|
|
// probs
|
|
fileConsolidated << " ||| ";
|
|
if (!onlyDirectFlag) {
|
|
fileConsolidated << itemDirect[2]; // prob indirect
|
|
}
|
|
fileConsolidated << " " << itemIndirect[2]; // prob direct
|
|
if (phraseCountFlag) {
|
|
fileConsolidated << " " << (logProbFlag ? 1 : 2.718); // phrase count feature
|
|
}
|
|
|
|
// alignment
|
|
fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
|
|
|
|
// counts, for debugging
|
|
vector<string> directCounts = tokenize(itemDirect[4].c_str());
|
|
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
|
|
fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
|
|
// output rule count if present in either file
|
|
if (indirectCounts.size() > 1) {
|
|
fileConsolidated << " " << indirectCounts[1];
|
|
} else if (directCounts.size() > 1) {
|
|
fileConsolidated << " " << directCounts[1];
|
|
}
|
|
|
|
fileConsolidated << endl;
|
|
}
|
|
fileDirect.Close();
|
|
fileIndirect.Close();
|
|
fileConsolidated.close();
|
|
}
|
|
|
|
bool getLine( istream &fileP, vector< string > &item )
|
|
{
|
|
if (fileP.eof())
|
|
return false;
|
|
|
|
SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
|
if (fileP.eof())
|
|
return false;
|
|
|
|
item = splitLine();
|
|
|
|
return true;
|
|
}
|
|
|
|
vector< string > splitLine()
|
|
{
|
|
vector< string > item;
|
|
bool betweenWords = true;
|
|
int start=0;
|
|
int i=0;
|
|
for(; line[i] != '\0'; i++) {
|
|
if (line[i] == ' ' &&
|
|
line[i+1] == '|' &&
|
|
line[i+2] == '|' &&
|
|
line[i+3] == '|' &&
|
|
line[i+4] == ' ') {
|
|
if (start > i) start = i; // empty item
|
|
item.push_back( string( line+start, i-start ) );
|
|
start = i+5;
|
|
i += 3;
|
|
}
|
|
}
|
|
item.push_back( string( line+start, i-start ) );
|
|
|
|
return item;
|
|
}
|
|
|
|
string reverseAlignment(const string &alignments)
|
|
{
|
|
stringstream ret("");
|
|
|
|
vector<string> alignToks = tokenize(alignments.c_str());
|
|
|
|
for (size_t i = 0; i < alignToks.size(); ++i)
|
|
{
|
|
string &alignPair = alignToks[i];
|
|
vector<string> alignPoints;
|
|
Tokenize(alignPoints, alignPair, "-");
|
|
assert(alignPoints.size() == 2);
|
|
|
|
ret << alignPoints[1] << "-" << alignPoints[0] << " ";
|
|
}
|
|
|
|
return ret.str();
|
|
}
|
|
|
|
|