mosesdecoder/contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp
2015-10-02 20:11:35 +01:00

223 lines
6.8 KiB
C++

/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <vector>
#include <string>
#include <sstream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include <boost/program_options.hpp>
#include "util/file_stream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
namespace po = boost::program_options;
int main(int argc, char **argv)
{
util::FileStream err(2);
std::string filenameNBestListIn, filenameFeatureNamesOut, filenameIgnoreFeatureNames;
size_t maxNBestSize;
try {
po::options_description descr("Usage");
descr.add_options()
("help,h", "produce help message")
("n-best-list,n", po::value<std::string>(&filenameNBestListIn)->required(),
"input n-best list file")
("write-feature-names-file,f", po::value<std::string>(&filenameFeatureNamesOut)->required(),
"output file for mapping between feature names and indices")
("ignore-features-file,i", po::value<std::string>(&filenameIgnoreFeatureNames)->required(),
"input file containing list of feature names to be ignored")
("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
"limit of n-best list entries to be considered")
;
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, descr), vm);
if (vm.count("help")) {
std::ostringstream os;
os << descr;
std::cout << os.str() << '\n';
exit(0);
}
po::notify(vm);
} catch(std::exception& e) {
err << "Error: " << e.what() << '\n';
err.flush();
exit(1);
}
util::FilePiece ifsNBest(filenameNBestListIn.c_str());
util::FilePiece ifsIgnoreFeatureNames(filenameIgnoreFeatureNames.c_str());
util::scoped_fd fdFeatureNames(util::CreateOrThrow(filenameFeatureNamesOut.c_str()));
util::FileStream ofsFeatureNames(fdFeatureNames.get());
util::FileStream ofsNBest(1);
boost::unordered_set<std::string> ignoreFeatureNames;
StringPiece line;
while ( ifsIgnoreFeatureNames.ReadLineOrEOF(line) )
{
if ( !line.empty() ) {
util::TokenIter<util::AnyCharacter> item(line, " \t=");
if ( item != item.end() )
{
ignoreFeatureNames.insert(item->as_string());
}
err << "ignoring " << *item << '\n';
}
}
size_t maxFeatureNamesIdx = 0;
boost::unordered_map<std::string, size_t> featureNames;
size_t sentenceIndex = 0;
size_t nBestSizeCount = 0;
size_t globalIndex = 0;
while ( ifsNBest.ReadLineOrEOF(line) )
{
util::TokenIter<util::MultiCharacter> item(line, " ||| ");
if ( item == item.end() )
{
err << "Error: flawed content in " << filenameNBestListIn << '\n';
exit(1);
}
size_t sentenceIndexCurrent = atol( item->as_string().c_str() );
if ( sentenceIndex != sentenceIndexCurrent )
{
nBestSizeCount = 0;
sentenceIndex = sentenceIndexCurrent;
}
if ( nBestSizeCount < maxNBestSize )
{
// process n-best list entry
StringPiece scores;
StringPiece decoderScore;
for (size_t nItem=1; nItem<=3; ++nItem)
{
if ( ++item == item.end() ) {
err << "Error: flawed content in " << filenameNBestListIn << '\n';
exit(1);
}
if (nItem == 2) {
scores = *item;
}
if (nItem == 3) {
decoderScore = *item;
}
}
ofsNBest << sentenceIndex << ' '
<< decoderScore;
util::TokenIter<util::SingleCharacter> token(scores, ' ');
std::string featureNameCurrent("ERROR");
std::string featureNameCurrentBase("ERROR");
bool ignore = false;
int scoreComponentIndex = 0;
while ( token != token.end() )
{
if ( token->ends_with("=") )
{
scoreComponentIndex = 0;
featureNameCurrent = token->substr(0,token->size()-1).as_string();
size_t idx = featureNameCurrent.find_first_of('_');
if ( idx == StringPiece::npos ) {
featureNameCurrentBase = featureNameCurrent;
} else {
featureNameCurrentBase = featureNameCurrent.substr(0,idx+1);
}
ignore = false;
if ( ignoreFeatureNames.find(featureNameCurrentBase) != ignoreFeatureNames.end() )
{
ignore = true;
} else {
if ( (featureNameCurrent.compare(featureNameCurrentBase)) &&
(ignoreFeatureNames.find(featureNameCurrent) != ignoreFeatureNames.end()) )
{
ignore = true;
}
}
}
else
{
if ( !ignore )
{
float featureValueCurrent = atof( token->as_string().c_str() );;
if ( scoreComponentIndex > 0 )
{
std::ostringstream oss;
oss << scoreComponentIndex;
featureNameCurrent.append("+");
}
if ( featureValueCurrent != 0 )
{
boost::unordered_map<std::string, size_t>::iterator featureName = featureNames.find(featureNameCurrent);
if ( featureName == featureNames.end() )
{
std::pair< boost::unordered_map<std::string, size_t>::iterator, bool> inserted =
featureNames.insert( std::make_pair(featureNameCurrent, maxFeatureNamesIdx) );
++maxFeatureNamesIdx;
featureName = inserted.first;
}
ofsNBest << ' ' << featureName->second // feature name index
<< ' ' << *token; // feature value
}
++scoreComponentIndex;
}
}
++token;
}
ofsNBest << '\n';
++nBestSizeCount;
}
++globalIndex;
}
ofsFeatureNames << maxFeatureNamesIdx << '\n';
for (boost::unordered_map<std::string, size_t>::const_iterator featureNamesIt=featureNames.begin();
featureNamesIt!=featureNames.end(); ++featureNamesIt)
{
ofsFeatureNames << featureNamesIt->second << ' ' << featureNamesIt->first << '\n';
}
}