mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-02 17:09:36 +03:00
1311 lines
50 KiB
C++
1311 lines
50 KiB
C++
/***********************************************************************
|
|
Moses - factored phrase-based language decoder
|
|
Copyright (C) 2009 University of Edinburgh
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with this library; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
***********************************************************************/
|
|
|
|
#include <sstream>
|
|
#include <assert.h>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <map>
|
|
#include <set>
|
|
#include <vector>
|
|
#include <algorithm>
|
|
#include <boost/algorithm/string/predicate.hpp>
|
|
#include <boost/unordered_map.hpp>
|
|
|
|
#include "ScoreFeature.h"
|
|
#include "tables-core.h"
|
|
#include "ExtractionPhrasePair.h"
|
|
#include "score.h"
|
|
#include "InputFileStream.h"
|
|
#include "OutputFileStream.h"
|
|
|
|
#include "moses/Util.h"
|
|
|
|
using namespace boost::algorithm;
|
|
using namespace MosesTraining;
|
|
|
|
namespace MosesTraining
|
|
{
|
|
LexicalTable lexTable;
|
|
bool inverseFlag = false;
|
|
bool hierarchicalFlag = false;
|
|
bool pcfgFlag = false;
|
|
bool phraseOrientationFlag = false;
|
|
bool treeFragmentsFlag = false;
|
|
bool partsOfSpeechFlag = false;
|
|
bool sourceSyntaxLabelsFlag = false;
|
|
bool sourceSyntaxLabelCountsLHSFlag = false;
|
|
bool targetSyntacticPreferencesFlag = false;
|
|
bool unpairedExtractFormatFlag = false;
|
|
bool conditionOnTargetLhsFlag = false;
|
|
bool wordAlignmentFlag = true;
|
|
bool goodTuringFlag = false;
|
|
bool kneserNeyFlag = false;
|
|
bool logProbFlag = false;
|
|
int negLogProb = 1;
|
|
#define COC_MAX 10
|
|
bool lexFlag = true;
|
|
bool unalignedFlag = false;
|
|
bool unalignedFWFlag = false;
|
|
bool crossedNonTerm = false;
|
|
bool spanLength = false;
|
|
bool ruleLength = false;
|
|
bool nonTermContext = false;
|
|
bool nonTermContextTarget = false;
|
|
bool targetConstituentBoundariesFlag = false;
|
|
|
|
int countOfCounts[COC_MAX+1];
|
|
int totalDistinct = 0;
|
|
float minCount = 0;
|
|
float minCountHierarchical = 0;
|
|
bool phraseOrientationPriorsFlag = false;
|
|
|
|
boost::unordered_map<std::string,float> sourceLHSCounts;
|
|
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
|
|
std::set<std::string> sourceLabelSet;
|
|
std::map<std::string,size_t> sourceLabels;
|
|
std::vector<std::string> sourceLabelsByIndex;
|
|
|
|
std::set<std::string> partsOfSpeechSet;
|
|
|
|
boost::unordered_map<std::string,float> targetSyntacticPreferencesLHSCounts;
|
|
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts;
|
|
std::set<std::string> targetSyntacticPreferencesLabelSet;
|
|
std::map<std::string,size_t> targetSyntacticPreferencesLabels;
|
|
std::vector<std::string> targetSyntacticPreferencesLabelsByIndex;
|
|
|
|
std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dleft dright
|
|
std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dleft dright
|
|
|
|
Vocabulary vcbT;
|
|
Vocabulary vcbS;
|
|
|
|
} // namespace
|
|
|
|
|
|
void processLine( std::string line,
|
|
int lineID, bool includeSentenceIdFlag, int &sentenceId,
|
|
PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
|
|
std::string &additionalPropertiesString,
|
|
float &count, float &pcfgSum );
|
|
void writeCountOfCounts( const std::string &fileNameCountOfCounts );
|
|
void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
|
|
const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
|
|
const std::string &fileNameLeftHandSideSourceLabelCounts,
|
|
const std::string &fileNameLeftHandSideTargetSourceLabelCounts );
|
|
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName );
|
|
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
|
|
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
|
|
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
|
|
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
|
|
double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
|
|
std::set<std::string> functionWordList;
|
|
void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L);
|
|
void loadFunctionWords( const std::string &fileNameFunctionWords );
|
|
double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
|
|
int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
|
|
void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
|
|
void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, std::ostream &out );
|
|
void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment );
|
|
size_t NumNonTerminal(const PHRASE *phraseSource);
|
|
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
std::cerr << "Score v2.1 -- "
|
|
<< "scoring methods for extracted rules" << std::endl;
|
|
|
|
ScoreFeatureManager featureManager;
|
|
if (argc < 4) {
|
|
std::cerr <<
|
|
"syntax: score extract lex phrase-table "
|
|
"[--Inverse] "
|
|
"[--Hierarchical] "
|
|
"[--LogProb] "
|
|
"[--NegLogProb] "
|
|
"[--NoLex] "
|
|
"[--GoodTuring] "
|
|
"[--KneserNey] "
|
|
"[--NoWordAlignment] "
|
|
"[--UnalignedPenalty] "
|
|
"[--UnalignedFunctionWordPenalty function-word-file] "
|
|
"[--MinCountHierarchical count] "
|
|
"[--PartsOfSpeech] "
|
|
"[--PCFG] "
|
|
"[--TreeFragments] "
|
|
"[--SourceLabels] "
|
|
"[--SourceLabelCountsLHS] "
|
|
"[--TargetSyntacticPreferences] "
|
|
"[--UnpairedExtractFormat] "
|
|
"[--ConditionOnTargetLHS] "
|
|
"[--CrossedNonTerm]"
|
|
<< std::endl;
|
|
std::cerr << featureManager.usage() << std::endl;
|
|
exit(1);
|
|
}
|
|
std::string fileNameExtract = argv[1];
|
|
std::string fileNameLex = argv[2];
|
|
std::string fileNamePhraseTable = argv[3];
|
|
std::string fileNameSourceLabelSet;
|
|
std::string fileNamePartsOfSpeechSet;
|
|
std::string fileNameCountOfCounts;
|
|
std::string fileNameFunctionWords;
|
|
std::string fileNameLeftHandSideSourceLabelCounts;
|
|
std::string fileNameLeftHandSideTargetSourceLabelCounts;
|
|
std::string fileNameTargetSyntacticPreferencesLabelSet;
|
|
std::string fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts;
|
|
std::string fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts;
|
|
std::string fileNamePhraseOrientationPriors;
|
|
// All unknown args are passed to feature manager.
|
|
std::vector<std::string> featureArgs;
|
|
|
|
for(int i=4; i<argc; i++) {
|
|
if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
|
|
inverseFlag = true;
|
|
std::cerr << "using inverse mode" << std::endl;
|
|
} else if (strcmp(argv[i],"--Hierarchical") == 0) {
|
|
hierarchicalFlag = true;
|
|
std::cerr << "processing hierarchical rules" << std::endl;
|
|
} else if (strcmp(argv[i],"--PCFG") == 0) {
|
|
pcfgFlag = true;
|
|
std::cerr << "including PCFG scores" << std::endl;
|
|
} else if (strcmp(argv[i],"--PhraseOrientation") == 0) {
|
|
phraseOrientationFlag = true;
|
|
std::cerr << "including phrase orientation information" << std::endl;
|
|
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
|
|
treeFragmentsFlag = true;
|
|
std::cerr << "including tree fragment information from syntactic parse" << std::endl;
|
|
} else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
|
|
partsOfSpeechFlag = true;
|
|
std::cerr << "including parts-of-speech information from syntactic parse" << std::endl;
|
|
fileNamePartsOfSpeechSet = std::string(fileNamePhraseTable) + ".partsOfSpeech";
|
|
std::cerr << "writing parts-of-speech set to file " << fileNamePartsOfSpeechSet << std::endl;
|
|
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
|
|
sourceSyntaxLabelsFlag = true;
|
|
std::cerr << "including source label information" << std::endl;
|
|
fileNameSourceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.src";
|
|
std::cerr << "writing source syntax label set to file " << fileNameSourceLabelSet << std::endl;
|
|
} else if (strcmp(argv[i],"--SourceLabelCountsLHS") == 0) {
|
|
sourceSyntaxLabelCountsLHSFlag = true;
|
|
fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs";
|
|
fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs";
|
|
std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
|
|
} else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
|
|
targetSyntacticPreferencesFlag = true;
|
|
std::cerr << "including target syntactic preferences information" << std::endl;
|
|
fileNameTargetSyntacticPreferencesLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
|
|
std::cerr << "writing target syntactic preferences label set to file " << fileNameTargetSyntacticPreferencesLabelSet << std::endl;
|
|
fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
|
|
fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
|
|
std::cerr << "counting left-hand side target syntactic preferences labels and writing them to files "
|
|
<< fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts
|
|
<< " and "
|
|
<< fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts
|
|
<< std::endl;
|
|
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
|
|
unpairedExtractFormatFlag = true;
|
|
std::cerr << "processing unpaired extract format" << std::endl;
|
|
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
|
|
conditionOnTargetLhsFlag = true;
|
|
std::cerr << "processing unpaired extract format" << std::endl;
|
|
} else if (strcmp(argv[i],"--NoWordAlignment") == 0) {
|
|
wordAlignmentFlag = false;
|
|
std::cerr << "omitting word alignment" << std::endl;
|
|
} else if (strcmp(argv[i],"--NoLex") == 0) {
|
|
lexFlag = false;
|
|
std::cerr << "not computing lexical translation score" << std::endl;
|
|
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
|
|
goodTuringFlag = true;
|
|
fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
|
|
std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
|
|
} else if (strcmp(argv[i],"--KneserNey") == 0) {
|
|
kneserNeyFlag = true;
|
|
fileNameCountOfCounts = std::string(fileNamePhraseTable) + ".coc";
|
|
std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
|
|
} else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
|
|
unalignedFlag = true;
|
|
std::cerr << "using unaligned word penalty" << std::endl;
|
|
} else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
|
|
unalignedFWFlag = true;
|
|
if (i+1==argc) {
|
|
std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
|
|
exit(1);
|
|
}
|
|
fileNameFunctionWords = argv[++i];
|
|
std::cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << std::endl;
|
|
} else if (strcmp(argv[i],"--LogProb") == 0) {
|
|
logProbFlag = true;
|
|
std::cerr << "using log-probabilities" << std::endl;
|
|
} else if (strcmp(argv[i],"--NegLogProb") == 0) {
|
|
logProbFlag = true;
|
|
negLogProb = -1;
|
|
std::cerr << "using negative log-probabilities" << std::endl;
|
|
} else if (strcmp(argv[i],"--MinCount") == 0) {
|
|
minCount = std::atof( argv[++i] );
|
|
std::cerr << "dropping all phrase pairs occurring less than " << minCount << " times" << std::endl;
|
|
minCount -= 0.00001; // account for rounding
|
|
} else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
|
|
minCountHierarchical = std::atof( argv[++i] );
|
|
std::cerr << "dropping all hierarchical phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl;
|
|
minCountHierarchical -= 0.00001; // account for rounding
|
|
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
|
|
crossedNonTerm = true;
|
|
std::cerr << "crossed non-term reordering feature" << std::endl;
|
|
} else if (strcmp(argv[i],"--PhraseOrientationPriors") == 0) {
|
|
phraseOrientationPriorsFlag = true;
|
|
if (i+1==argc) {
|
|
std::cerr << "ERROR: specify priors file for phrase orientation!" << std::endl;
|
|
exit(1);
|
|
}
|
|
fileNamePhraseOrientationPriors = argv[++i];
|
|
std::cerr << "smoothing phrase orientation with priors from " << fileNamePhraseOrientationPriors << std::endl;
|
|
} else if (strcmp(argv[i],"--SpanLength") == 0) {
|
|
spanLength = true;
|
|
std::cerr << "span length feature" << std::endl;
|
|
} else if (strcmp(argv[i],"--RuleLength") == 0) {
|
|
ruleLength = true;
|
|
std::cerr << "rule length feature" << std::endl;
|
|
} else if (strcmp(argv[i],"--NonTermContext") == 0) {
|
|
nonTermContext = true;
|
|
std::cerr << "non-term context" << std::endl;
|
|
} else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
|
|
nonTermContextTarget = true;
|
|
std::cerr << "non-term context (target)" << std::endl;
|
|
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
|
|
targetConstituentBoundariesFlag = true;
|
|
std::cerr << "including target constituent boundaries information" << std::endl;
|
|
} else {
|
|
featureArgs.push_back(argv[i]);
|
|
++i;
|
|
for (; i < argc && strncmp(argv[i], "--", 2); ++i) {
|
|
featureArgs.push_back(argv[i]);
|
|
}
|
|
if (i != argc) --i; //roll back, since we found another -- argument
|
|
}
|
|
}
|
|
|
|
MaybeLog maybeLogProb(logProbFlag, negLogProb);
|
|
|
|
// configure extra features
|
|
if (!inverseFlag) {
|
|
featureManager.configure(featureArgs);
|
|
}
|
|
|
|
// lexical translation table
|
|
if (lexFlag) {
|
|
lexTable.load( fileNameLex );
|
|
}
|
|
|
|
// function word list
|
|
if (unalignedFWFlag) {
|
|
loadFunctionWords( fileNameFunctionWords );
|
|
}
|
|
|
|
// compute count of counts for Good Turing discounting
|
|
if (goodTuringFlag || kneserNeyFlag) {
|
|
for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
|
|
}
|
|
|
|
if (phraseOrientationPriorsFlag) {
|
|
loadOrientationPriors(fileNamePhraseOrientationPriors,orientationClassPriorsL2R,orientationClassPriorsR2L);
|
|
}
|
|
|
|
// sorted phrase extraction file
|
|
Moses::InputFileStream extractFile(fileNameExtract);
|
|
|
|
if (extractFile.fail()) {
|
|
std::cerr << "ERROR: could not open extract file " << fileNameExtract << std::endl;
|
|
exit(1);
|
|
}
|
|
|
|
// output file: phrase translation table
|
|
std::ostream *phraseTableFile;
|
|
|
|
if (fileNamePhraseTable == "-") {
|
|
phraseTableFile = &std::cout;
|
|
} else {
|
|
Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
|
|
bool success = outputFile->Open(fileNamePhraseTable);
|
|
if (!success) {
|
|
std::cerr << "ERROR: could not open file phrase table file "
|
|
<< fileNamePhraseTable << std::endl;
|
|
exit(1);
|
|
}
|
|
phraseTableFile = outputFile;
|
|
}
|
|
|
|
// loop through all extracted phrase translations
|
|
std::string line, lastLine;
|
|
ExtractionPhrasePair *phrasePair = NULL;
|
|
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
|
|
std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
|
|
|
|
int tmpSentenceId;
|
|
PHRASE *tmpPhraseSource, *tmpPhraseTarget;
|
|
ALIGNMENT *tmpTargetToSourceAlignment;
|
|
std::string tmpAdditionalPropertiesString;
|
|
float tmpCount=0.0f, tmpPcfgSum=0.0f;
|
|
|
|
int i=0;
|
|
if ( getline(extractFile, line) ) {
|
|
++i;
|
|
tmpPhraseSource = new PHRASE();
|
|
tmpPhraseTarget = new PHRASE();
|
|
tmpTargetToSourceAlignment = new ALIGNMENT();
|
|
processLine( std::string(line),
|
|
i, featureManager.includeSentenceId(), tmpSentenceId,
|
|
tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
|
|
tmpAdditionalPropertiesString,
|
|
tmpCount, tmpPcfgSum);
|
|
phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
|
|
tmpTargetToSourceAlignment,
|
|
tmpCount, tmpPcfgSum );
|
|
phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
|
|
featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
|
|
phrasePairsWithSameSource.push_back( phrasePair );
|
|
if ( hierarchicalFlag ) {
|
|
phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
|
|
}
|
|
lastLine = line;
|
|
}
|
|
|
|
while ( getline(extractFile, line) ) {
|
|
|
|
// Print progress dots to stderr.
|
|
if ( ++i % 100000 == 0 ) {
|
|
std::cerr << "." << std::flush;
|
|
}
|
|
|
|
// identical to last line? just add count
|
|
if (line == lastLine) {
|
|
phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
|
|
continue;
|
|
} else {
|
|
lastLine = line;
|
|
}
|
|
|
|
tmpPhraseSource = new PHRASE();
|
|
tmpPhraseTarget = new PHRASE();
|
|
tmpTargetToSourceAlignment = new ALIGNMENT();
|
|
tmpAdditionalPropertiesString.clear();
|
|
processLine( std::string(line),
|
|
i, featureManager.includeSentenceId(), tmpSentenceId,
|
|
tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
|
|
tmpAdditionalPropertiesString,
|
|
tmpCount, tmpPcfgSum);
|
|
|
|
bool matchesPrevious = false;
|
|
bool sourceMatch = true;
|
|
bool targetMatch = true;
|
|
bool alignmentMatch = true; // be careful with these,
|
|
// ExtractionPhrasePair::Matches() checks them in order and does not continue with the others
|
|
// once the first of them has been found to have to be set to false
|
|
|
|
if ( hierarchicalFlag ) {
|
|
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin();
|
|
iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) {
|
|
if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
|
|
sourceMatch, targetMatch, alignmentMatch ) ) {
|
|
matchesPrevious = true;
|
|
phrasePair = (*iter);
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
if ( phrasePair->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
|
|
sourceMatch, targetMatch, alignmentMatch ) ) {
|
|
matchesPrevious = true;
|
|
}
|
|
}
|
|
|
|
if ( matchesPrevious ) {
|
|
delete tmpPhraseSource;
|
|
delete tmpPhraseTarget;
|
|
if ( !phrasePair->Add( tmpTargetToSourceAlignment,
|
|
tmpCount, tmpPcfgSum ) ) {
|
|
delete tmpTargetToSourceAlignment;
|
|
}
|
|
phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
|
|
featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
|
|
} else {
|
|
|
|
if ( !phrasePairsWithSameSource.empty() &&
|
|
!sourceMatch ) {
|
|
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
|
|
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
|
iter!=phrasePairsWithSameSource.end(); ++iter) {
|
|
delete *iter;
|
|
}
|
|
phrasePairsWithSameSource.clear();
|
|
if ( hierarchicalFlag ) {
|
|
phrasePairsWithSameSourceAndTarget.clear();
|
|
}
|
|
}
|
|
|
|
if ( hierarchicalFlag ) {
|
|
if ( !phrasePairsWithSameSourceAndTarget.empty() &&
|
|
!targetMatch ) {
|
|
phrasePairsWithSameSourceAndTarget.clear();
|
|
}
|
|
}
|
|
|
|
phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
|
|
tmpTargetToSourceAlignment,
|
|
tmpCount, tmpPcfgSum );
|
|
phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
|
|
featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
|
|
phrasePairsWithSameSource.push_back(phrasePair);
|
|
|
|
if ( hierarchicalFlag ) {
|
|
phrasePairsWithSameSourceAndTarget.push_back(phrasePair);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
// We've been printing progress dots to stderr. End the line.
|
|
std::cerr << std::endl;
|
|
|
|
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
|
|
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
|
iter!=phrasePairsWithSameSource.end(); ++iter) {
|
|
delete *iter;
|
|
}
|
|
phrasePairsWithSameSource.clear();
|
|
|
|
|
|
phraseTableFile->flush();
|
|
if (phraseTableFile != &std::cout) {
|
|
delete phraseTableFile;
|
|
}
|
|
|
|
// output count of count statistics
|
|
if (goodTuringFlag || kneserNeyFlag) {
|
|
writeCountOfCounts( fileNameCountOfCounts );
|
|
}
|
|
|
|
// source syntax labels
|
|
if (sourceSyntaxLabelsFlag && !inverseFlag) {
|
|
writeLabelSet( sourceLabelSet, fileNameSourceLabelSet );
|
|
}
|
|
if (sourceSyntaxLabelsFlag && sourceSyntaxLabelCountsLHSFlag && !inverseFlag) {
|
|
writeLeftHandSideLabelCounts( sourceLHSCounts,
|
|
targetLHSAndSourceLHSJointCounts,
|
|
fileNameLeftHandSideSourceLabelCounts,
|
|
fileNameLeftHandSideTargetSourceLabelCounts );
|
|
}
|
|
|
|
// parts-of-speech
|
|
if (partsOfSpeechFlag && !inverseFlag) {
|
|
writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet );
|
|
}
|
|
|
|
// target syntactic preferences labels
|
|
if (targetSyntacticPreferencesFlag && !inverseFlag) {
|
|
writeLabelSet( targetSyntacticPreferencesLabelSet, fileNameTargetSyntacticPreferencesLabelSet );
|
|
writeLeftHandSideLabelCounts( targetSyntacticPreferencesLHSCounts,
|
|
ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
|
|
fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts,
|
|
fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts );
|
|
}
|
|
}
|
|
|
|
|
|
void processLine( std::string line,
|
|
int lineID, bool includeSentenceIdFlag, int &sentenceId,
|
|
PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
|
|
std::string &additionalPropertiesString,
|
|
float &count, float &pcfgSum )
|
|
{
|
|
size_t foundAdditionalProperties = line.rfind("|||");
|
|
foundAdditionalProperties = line.find("{{",foundAdditionalProperties);
|
|
if (foundAdditionalProperties != std::string::npos) {
|
|
additionalPropertiesString = line.substr(foundAdditionalProperties);
|
|
line = line.substr(0,foundAdditionalProperties);
|
|
} else {
|
|
additionalPropertiesString.clear();
|
|
}
|
|
|
|
phraseSource->clear();
|
|
phraseTarget->clear();
|
|
targetToSourceAlignment->clear();
|
|
|
|
std::vector<std::string> token;
|
|
Moses::Tokenize( token, line );
|
|
int item = 1;
|
|
for ( size_t j=0; j<token.size(); ++j ) {
|
|
if (token[j] == "|||") {
|
|
++item;
|
|
} else if (item == 1) { // source phrase
|
|
phraseSource->push_back( vcbS.storeIfNew( token[j] ) );
|
|
} else if (item == 2) { // target phrase
|
|
phraseTarget->push_back( vcbT.storeIfNew( token[j] ) );
|
|
} else if (item == 3) { // alignment
|
|
int s,t;
|
|
sscanf(token[j].c_str(), "%d-%d", &s, &t);
|
|
if ((size_t)t >= phraseTarget->size() || (size_t)s >= phraseSource->size()) {
|
|
std::cerr << "WARNING: phrase pair " << lineID
|
|
<< " has alignment point (" << s << ", " << t << ")"
|
|
<< " out of bounds (" << phraseSource->size() << ", " << phraseTarget->size() << ")"
|
|
<< std::endl;
|
|
} else {
|
|
// first alignment point? -> initialize
|
|
if ( targetToSourceAlignment->size() == 0 ) {
|
|
size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
|
|
targetToSourceAlignment->resize(numberOfTargetSymbols);
|
|
}
|
|
// add alignment point
|
|
targetToSourceAlignment->at(t).insert(s);
|
|
}
|
|
} else if (includeSentenceIdFlag && item == 4) { // optional sentence id
|
|
sscanf(token[j].c_str(), "%d", &sentenceId);
|
|
} else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
|
|
sscanf(token[j].c_str(), "%f", &count);
|
|
} else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score
|
|
float pcfgScore = std::atof( token[j].c_str() );
|
|
pcfgSum = pcfgScore * count;
|
|
}
|
|
}
|
|
|
|
if ( targetToSourceAlignment->size() == 0 ) {
|
|
size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size());
|
|
targetToSourceAlignment->resize(numberOfTargetSymbols);
|
|
}
|
|
|
|
if (item + (includeSentenceIdFlag?-1:0) == 3) {
|
|
count = 1.0;
|
|
}
|
|
if (item < 3 || item > (includeSentenceIdFlag?7:6)) {
|
|
std::cerr << "ERROR: faulty line " << lineID << ": " << line << std::endl;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
void writeCountOfCounts( const std::string &fileNameCountOfCounts )
|
|
{
|
|
// open file
|
|
Moses::OutputFileStream countOfCountsFile;
|
|
bool success = countOfCountsFile.Open(fileNameCountOfCounts);
|
|
if (!success) {
|
|
std::cerr << "ERROR: could not open count-of-counts file "
|
|
<< fileNameCountOfCounts << std::endl;
|
|
return;
|
|
}
|
|
|
|
// Kneser-Ney needs the total number of phrase pairs
|
|
countOfCountsFile << totalDistinct << std::endl;
|
|
|
|
// write out counts
|
|
for(int i=1; i<=COC_MAX; i++) {
|
|
countOfCountsFile << countOfCounts[ i ] << std::endl;
|
|
}
|
|
countOfCountsFile.Close();
|
|
}
|
|
|
|
|
|
// Writes left-hand side label statistics to two files.
//
// countsLabelLHS       label -> count; written to the first file as
//                      "label count", one entry per line
// jointCountsLabelLHS  outer label -> (inner label -> count); written to the
//                      second file as "outer inner count", one entry per line
// fileNameLeftHandSideSourceLabelCounts        name of the first file
// fileNameLeftHandSideTargetSourceLabelCounts  name of the second file
//
// The tables that are passed in are the ones written. Earlier versions
// ignored both table parameters and always dumped the global source-label
// tables, so the target syntactic preferences files received source-label
// counts.
//
// If a file cannot be opened, an error is printed to stderr and the function
// returns; when the first file fails, the second one is not written either.
void writeLeftHandSideLabelCounts( const boost::unordered_map<std::string,float> &countsLabelLHS,
                                   const boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsLabelLHS,
                                   const std::string &fileNameLeftHandSideSourceLabelCounts,
                                   const std::string &fileNameLeftHandSideTargetSourceLabelCounts )
{
  typedef boost::unordered_map<std::string,float> LabelCounts;
  typedef boost::unordered_map<std::string, LabelCounts* > JointLabelCounts;

  // open file
  Moses::OutputFileStream leftHandSideSourceLabelCounts;
  bool success = leftHandSideSourceLabelCounts.Open(fileNameLeftHandSideSourceLabelCounts);
  if (!success) {
    std::cerr << "ERROR: could not open left-hand side label counts file "
              << fileNameLeftHandSideSourceLabelCounts << std::endl;
    return;
  }

  // write left-hand side counts
  for (LabelCounts::const_iterator iter=countsLabelLHS.begin();
       iter!=countsLabelLHS.end(); ++iter) {
    leftHandSideSourceLabelCounts << iter->first << " " << iter->second << std::endl;
  }

  leftHandSideSourceLabelCounts.Close();

  // open file
  Moses::OutputFileStream leftHandSideTargetSourceLabelCounts;
  success = leftHandSideTargetSourceLabelCounts.Open(fileNameLeftHandSideTargetSourceLabelCounts);
  if (!success) {
    std::cerr << "ERROR: could not open left-hand side label joint counts file "
              << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
    return;
  }

  // write left-hand side joint counts
  for (JointLabelCounts::const_iterator iter=jointCountsLabelLHS.begin();
       iter!=jointCountsLabelLHS.end(); ++iter) {
    const LabelCounts *inner = iter->second;
    if (inner == NULL) {
      continue; // nothing recorded for this label
    }
    for (LabelCounts::const_iterator iter2=inner->begin();
         iter2!=inner->end(); ++iter2) {
      leftHandSideTargetSourceLabelCounts << iter->first << " "<< iter2->first << " " << iter2->second << std::endl;
    }
  }

  leftHandSideTargetSourceLabelCounts.Close();
}
|
|
|
|
|
|
void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fileName )
|
|
{
|
|
// open file
|
|
Moses::OutputFileStream out;
|
|
bool success = out.Open(fileName);
|
|
if (!success) {
|
|
std::cerr << "ERROR: could not open file "
|
|
<< fileName << " for writing" << std::endl;
|
|
return;
|
|
}
|
|
|
|
for (std::set<std::string>::const_iterator iter=labelSet.begin();
|
|
iter!=labelSet.end(); ++iter) {
|
|
out << *iter << std::endl;
|
|
}
|
|
|
|
out.Close();
|
|
}
|
|
|
|
|
|
// Scores and writes a group of phrase pairs that share the same source
// phrase. First the counts of all pairs are summed up; then each pair is
// passed to outputPhrasePair() together with that sum and the number of
// pairs in the group. Does nothing for an empty group.
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile,
                         const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
{
  if (phrasePairsWithSameSource.empty()) {
    return;
  }

  const size_t numPairs = phrasePairsWithSameSource.size();

  // total count over all pairs of the group
  float sourceTotal = 0;
  for ( size_t k = 0; k < numPairs; ++k ) {
    sourceTotal += phrasePairsWithSameSource[k]->GetCount();
  }

  // output the distinct phrase pairs, one at a time
  for ( size_t k = 0; k < numPairs; ++k ) {
    outputPhrasePair( *phrasePairsWithSameSource[k], sourceTotal, numPairs,
                      phraseTableFile, featureManager, maybeLogProb );
  }
}
|
|
|
|
void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
|
float totalCount, int distinctCount,
|
|
std::ostream &phraseTableFile,
|
|
const ScoreFeatureManager& featureManager,
|
|
const MaybeLog& maybeLogProb )
|
|
{
|
|
assert(phrasePair.IsValid());
|
|
|
|
const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
|
|
float count = phrasePair.GetCount();
|
|
|
|
std::map< std::string, float > domainCount;
|
|
|
|
// collect count of count statistics
|
|
if (goodTuringFlag || kneserNeyFlag) {
|
|
totalDistinct++;
|
|
int countInt = count + 0.99999;
|
|
if ((countInt <= COC_MAX) &&
|
|
(countInt > 0))
|
|
countOfCounts[ countInt ]++;
|
|
}
|
|
|
|
// output phrases
|
|
const PHRASE *phraseSource = phrasePair.GetSource();
|
|
const PHRASE *phraseTarget = phrasePair.GetTarget();
|
|
|
|
// do not output if count below threshold
|
|
if (count < minCount) {
|
|
return;
|
|
}
|
|
|
|
// do not output if hierarchical and count below threshold
|
|
if (hierarchicalFlag && count < minCountHierarchical) {
|
|
for(size_t j=0; j<phraseSource->size()-1; ++j) {
|
|
if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
|
|
return;
|
|
}
|
|
}
|
|
|
|
// compute PCFG score
|
|
float pcfgScore = 0;
|
|
if (pcfgFlag && !inverseFlag) {
|
|
pcfgScore = phrasePair.GetPcfgScore() / count;
|
|
}
|
|
|
|
// source phrase (unless inverse)
|
|
if (!inverseFlag) {
|
|
printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
|
|
phraseTableFile << " ||| ";
|
|
}
|
|
|
|
// target phrase
|
|
printTargetPhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
|
|
phraseTableFile << " ||| ";
|
|
|
|
// source phrase (if inverse)
|
|
if (inverseFlag) {
|
|
printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile);
|
|
phraseTableFile << " ||| ";
|
|
}
|
|
|
|
// alignment
|
|
if ( hierarchicalFlag ) {
|
|
// always output alignment if hiero style
|
|
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
|
|
std::vector<std::string> alignment;
|
|
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
|
|
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
|
|
if ( bestAlignmentT2S->at(j).size() != 1 ) {
|
|
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
|
|
phraseTableFile.flush();
|
|
assert(bestAlignmentT2S->at(j).size() == 1);
|
|
}
|
|
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
|
|
//phraseTableFile << sourcePos << "-" << j << " ";
|
|
std::stringstream point;
|
|
point << sourcePos << "-" << j;
|
|
alignment.push_back(point.str());
|
|
} else {
|
|
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
|
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
|
size_t sourcePos = *setIter;
|
|
std::stringstream point;
|
|
point << sourcePos << "-" << j;
|
|
alignment.push_back(point.str());
|
|
}
|
|
}
|
|
}
|
|
// now print all alignments, sorted by source index
|
|
sort(alignment.begin(), alignment.end());
|
|
for (size_t i = 0; i < alignment.size(); ++i) {
|
|
phraseTableFile << alignment[i] << " ";
|
|
}
|
|
} else if ( !inverseFlag && wordAlignmentFlag) {
|
|
// alignment info in pb model
|
|
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
|
|
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
|
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
|
size_t sourcePos = *setIter;
|
|
phraseTableFile << sourcePos << "-" << j << " ";
|
|
}
|
|
}
|
|
}
|
|
|
|
phraseTableFile << " ||| ";
|
|
|
|
// lexical translation probability
|
|
if (lexFlag) {
|
|
double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
|
|
phraseTableFile << maybeLogProb( lexScore );
|
|
}
|
|
|
|
// unaligned word penalty
|
|
if (unalignedFlag) {
|
|
double penalty = computeUnalignedPenalty( bestAlignmentT2S );
|
|
phraseTableFile << " " << maybeLogProb( penalty );
|
|
}
|
|
|
|
// unaligned function word penalty
|
|
if (unalignedFWFlag) {
|
|
double penalty = computeUnalignedFWPenalty( phraseTarget, bestAlignmentT2S );
|
|
phraseTableFile << " " << maybeLogProb( penalty );
|
|
}
|
|
|
|
if (crossedNonTerm && !inverseFlag) {
|
|
phraseTableFile << " " << calcCrossedNonTerm( phraseTarget, bestAlignmentT2S );
|
|
}
|
|
|
|
// target-side PCFG score
|
|
if (pcfgFlag && !inverseFlag) {
|
|
phraseTableFile << " " << maybeLogProb( pcfgScore );
|
|
}
|
|
|
|
// extra features
|
|
ScoreFeatureContext context(phrasePair, maybeLogProb);
|
|
std::vector<float> extraDense;
|
|
std::map<std::string,float> extraSparse;
|
|
featureManager.addFeatures(context, extraDense, extraSparse);
|
|
for (size_t i = 0; i < extraDense.size(); ++i) {
|
|
phraseTableFile << " " << extraDense[i];
|
|
}
|
|
|
|
for (std::map<std::string,float>::const_iterator i = extraSparse.begin();
|
|
i != extraSparse.end(); ++i) {
|
|
phraseTableFile << " " << i->first << " " << i->second;
|
|
}
|
|
|
|
// counts
|
|
phraseTableFile << " ||| " << totalCount << " " << count;
|
|
if (kneserNeyFlag)
|
|
phraseTableFile << " " << distinctCount;
|
|
|
|
phraseTableFile << " |||";
|
|
|
|
// tree fragments
|
|
if (treeFragmentsFlag && !inverseFlag) {
|
|
const std::string *bestTreeFragment = phrasePair.FindBestPropertyValue("Tree");
|
|
if (bestTreeFragment) {
|
|
phraseTableFile << " {{Tree " << *bestTreeFragment << "}}";
|
|
}
|
|
}
|
|
|
|
// parts-of-speech
|
|
if (partsOfSpeechFlag && !inverseFlag) {
|
|
phrasePair.UpdateVocabularyFromValueTokens("POS", partsOfSpeechSet);
|
|
const std::string *bestPartOfSpeech = phrasePair.FindBestPropertyValue("POS");
|
|
if (bestPartOfSpeech) {
|
|
phraseTableFile << " {{POS " << *bestPartOfSpeech << "}}";
|
|
}
|
|
}
|
|
|
|
// syntax labels
|
|
if ((sourceSyntaxLabelsFlag || targetSyntacticPreferencesFlag) && !inverseFlag) {
|
|
unsigned nNTs = 1;
|
|
for(size_t j=0; j<phraseSource->size()-1; ++j) {
|
|
if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
|
|
++nNTs;
|
|
}
|
|
// source syntax labels
|
|
if (sourceSyntaxLabelsFlag) {
|
|
std::string sourceLabelCounts;
|
|
sourceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("SourceLabels",
|
|
sourceLabelSet,
|
|
sourceLHSCounts,
|
|
targetLHSAndSourceLHSJointCounts,
|
|
vcbT);
|
|
if ( !sourceLabelCounts.empty() ) {
|
|
phraseTableFile << " {{SourceLabels "
|
|
<< phraseSource->size() // for convenience: number of symbols in this rule (incl. left hand side NT)
|
|
<< " "
|
|
<< count // rule count
|
|
<< sourceLabelCounts
|
|
<< "}}";
|
|
}
|
|
}
|
|
// target syntactic preferences labels
|
|
if (targetSyntacticPreferencesFlag) {
|
|
std::string targetSyntacticPreferencesLabelCounts;
|
|
targetSyntacticPreferencesLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
|
|
targetSyntacticPreferencesLabelSet,
|
|
targetSyntacticPreferencesLHSCounts,
|
|
ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
|
|
vcbT);
|
|
if (!targetSyntacticPreferencesLabelCounts.empty()) {
|
|
phraseTableFile << " {{TargetPreferences "
|
|
<< nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
|
|
<< " "
|
|
<< count // rule count
|
|
<< targetSyntacticPreferencesLabelCounts
|
|
<< "}}";
|
|
}
|
|
}
|
|
}
|
|
|
|
// phrase orientation
|
|
if (phraseOrientationFlag && !inverseFlag) {
|
|
phraseTableFile << " {{Orientation ";
|
|
phrasePair.CollectAllPhraseOrientations("Orientation",orientationClassPriorsL2R,orientationClassPriorsR2L,0.5,phraseTableFile);
|
|
phraseTableFile << "}}";
|
|
}
|
|
|
|
if (spanLength && !inverseFlag) {
|
|
std::string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
|
|
if (!propValue.empty()) {
|
|
phraseTableFile << " {{SpanLength " << propValue << "}}";
|
|
}
|
|
}
|
|
|
|
if (ruleLength && !inverseFlag) {
|
|
std::string propValue = phrasePair.CollectAllPropertyValues("RuleLength");
|
|
if (!propValue.empty()) {
|
|
phraseTableFile << " {{RuleLength " << propValue << "}}";
|
|
}
|
|
}
|
|
|
|
if (nonTermContext && !inverseFlag) {
|
|
std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContext");
|
|
if (!propValue.empty() && propValue.size() < 50000) {
|
|
size_t nNTs = NumNonTerminal(phraseSource);
|
|
phraseTableFile << " {{NonTermContext " << nNTs << " " << propValue << "}}";
|
|
}
|
|
}
|
|
|
|
if (nonTermContextTarget && !inverseFlag) {
|
|
std::string propValue = phrasePair.CollectAllPropertyValues("NonTermContextTarget");
|
|
if (!propValue.empty() && propValue.size() < 50000) {
|
|
size_t nNTs = NumNonTerminal(phraseSource);
|
|
phraseTableFile << " {{NonTermContextTarget " << nNTs << " " << propValue << "}}";
|
|
}
|
|
}
|
|
|
|
// target constituent boundaries
|
|
if (targetConstituentBoundariesFlag && !inverseFlag) {
|
|
const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft");
|
|
if (!targetConstituentBoundariesLeftValues.empty()) {
|
|
phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}";
|
|
}
|
|
const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent");
|
|
if (!targetConstituentBoundariesRightAdjacentValues.empty()) {
|
|
phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}";
|
|
}
|
|
}
|
|
|
|
phraseTableFile << std::endl;
|
|
}
|
|
|
|
size_t NumNonTerminal(const PHRASE *phraseSource)
|
|
{
|
|
size_t nNTs = 0;
|
|
for(size_t j=0; j<phraseSource->size()-1; ++j) {
|
|
if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
|
|
++nNTs;
|
|
}
|
|
return nNTs;
|
|
}
|
|
|
|
void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
|
|
std::vector<float> &orientationClassPriorsL2R,
|
|
std::vector<float> &orientationClassPriorsR2L)
|
|
{
|
|
assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
|
|
|
|
std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors;
|
|
Moses::InputFileStream inFile(fileNamePhraseOrientationPriors);
|
|
if (inFile.fail()) {
|
|
std::cerr << " - ERROR: could not open file" << std::endl;
|
|
exit(1);
|
|
}
|
|
|
|
std::string line;
|
|
size_t linesRead = 0;
|
|
float l2rSum = 0;
|
|
float r2lSum = 0;
|
|
while (getline(inFile, line)) {
|
|
std::istringstream tokenizer(line);
|
|
std::string key;
|
|
tokenizer >> key;
|
|
|
|
bool l2rFlag = false;
|
|
bool r2lFlag = false;
|
|
if (starts_with(key, "L2R_")) {
|
|
l2rFlag = true;
|
|
}
|
|
if (starts_with(key, "R2L_")) {
|
|
r2lFlag = true;
|
|
}
|
|
if (!l2rFlag && !r2lFlag) {
|
|
std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
|
|
}
|
|
key.erase(0,4);
|
|
|
|
int orientationClassId = -1;
|
|
if (!key.compare("mono")) {
|
|
orientationClassId = 0;
|
|
}
|
|
if (!key.compare("swap")) {
|
|
orientationClassId = 1;
|
|
}
|
|
if (!key.compare("dleft")) {
|
|
orientationClassId = 2;
|
|
}
|
|
if (!key.compare("dright")) {
|
|
orientationClassId = 3;
|
|
}
|
|
if (orientationClassId == -1) {
|
|
std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
|
|
}
|
|
|
|
float count;
|
|
tokenizer >> count;
|
|
|
|
if (l2rFlag) {
|
|
orientationClassPriorsL2R[orientationClassId] += count;
|
|
l2rSum += count;
|
|
}
|
|
if (r2lFlag) {
|
|
orientationClassPriorsR2L[orientationClassId] += count;
|
|
r2lSum += count;
|
|
}
|
|
|
|
++linesRead;
|
|
}
|
|
|
|
// normalization: return prior probabilities, not counts
|
|
if (l2rSum != 0) {
|
|
for (std::vector<float>::iterator orientationClassPriorsL2RIt = orientationClassPriorsL2R.begin();
|
|
orientationClassPriorsL2RIt != orientationClassPriorsL2R.end(); ++orientationClassPriorsL2RIt) {
|
|
*orientationClassPriorsL2RIt /= l2rSum;
|
|
}
|
|
}
|
|
if (r2lSum != 0) {
|
|
for (std::vector<float>::iterator orientationClassPriorsR2LIt = orientationClassPriorsR2L.begin();
|
|
orientationClassPriorsR2LIt != orientationClassPriorsR2L.end(); ++orientationClassPriorsR2LIt) {
|
|
*orientationClassPriorsR2LIt /= r2lSum;
|
|
}
|
|
}
|
|
|
|
std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl;
|
|
inFile.Close();
|
|
}
|
|
|
|
|
|
|
|
bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource )
|
|
{
|
|
for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) {
|
|
if (currTarget == targetPos) {
|
|
// skip
|
|
} else {
|
|
const std::set<size_t> &sourceSet = alignmentTargetToSource->at(currTarget);
|
|
for (std::set<size_t>::const_iterator iter = sourceSet.begin();
|
|
iter != sourceSet.end(); ++iter) {
|
|
size_t currSource = *iter;
|
|
|
|
if ((currTarget < targetPos && currSource > sourcePos)
|
|
|| (currTarget > targetPos && currSource < sourcePos)
|
|
) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
|
|
{
|
|
assert(phraseTarget->size() >= alignmentTargetToSource->size() );
|
|
|
|
for (size_t targetPos = 0; targetPos < alignmentTargetToSource->size(); ++targetPos) {
|
|
|
|
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(targetPos) ))) {
|
|
const std::set<size_t> &alignmentPoints = alignmentTargetToSource->at(targetPos);
|
|
assert( alignmentPoints.size() == 1 );
|
|
size_t sourcePos = *alignmentPoints.begin();
|
|
bool ret = calcCrossedNonTerm(targetPos, sourcePos, alignmentTargetToSource);
|
|
if (ret)
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource )
|
|
{
|
|
// unaligned word counter
|
|
double unaligned = 1.0;
|
|
// only checking target words - source words are caught when computing inverse
|
|
for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
|
|
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
|
if (srcIndices.empty()) {
|
|
unaligned *= 2.718;
|
|
}
|
|
}
|
|
return unaligned;
|
|
}
|
|
|
|
|
|
double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
|
|
{
|
|
// unaligned word counter
|
|
double unaligned = 1.0;
|
|
// only checking target words - source words are caught when computing inverse
|
|
for(size_t ti=0; ti<alignmentTargetToSource->size(); ++ti) {
|
|
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
|
if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) {
|
|
unaligned *= 2.718;
|
|
}
|
|
}
|
|
return unaligned;
|
|
}
|
|
|
|
void loadFunctionWords( const std::string &fileName )
|
|
{
|
|
std::cerr << "Loading function word list from " << fileName;
|
|
Moses::InputFileStream inFile(fileName);
|
|
if (inFile.fail()) {
|
|
std::cerr << " - ERROR: could not open file" << std::endl;
|
|
exit(1);
|
|
}
|
|
|
|
std::string line;
|
|
while(getline(inFile, line)) {
|
|
std::vector<std::string> token;
|
|
Moses::Tokenize( token, line );
|
|
if (token.size() > 0)
|
|
functionWordList.insert( token[0] );
|
|
}
|
|
|
|
std::cerr << " - read " << functionWordList.size() << " function words" << std::endl;
|
|
inFile.Close();
|
|
}
|
|
|
|
|
|
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource )
|
|
{
|
|
// lexical translation probability
|
|
double lexScore = 1.0;
|
|
int null = vcbS.getWordID("NULL");
|
|
// all target words have to be explained
|
|
for(size_t ti=0; ti<alignmentTargetToSource->size(); ti++) {
|
|
const std::set< size_t > & srcIndices = alignmentTargetToSource->at(ti);
|
|
if (srcIndices.empty()) {
|
|
// explain unaligned word by NULL
|
|
lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) );
|
|
} else {
|
|
// go through all the aligned words to compute average
|
|
double thisWordScore = 0;
|
|
for (std::set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
|
|
thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) );
|
|
}
|
|
lexScore *= thisWordScore / (double)srcIndices.size();
|
|
}
|
|
}
|
|
return lexScore;
|
|
}
|
|
|
|
|
|
void LexicalTable::load( const std::string &fileName )
|
|
{
|
|
std::cerr << "Loading lexical translation table from " << fileName;
|
|
Moses::InputFileStream inFile(fileName);
|
|
if (inFile.fail()) {
|
|
std::cerr << " - ERROR: could not open file" << std::endl;
|
|
exit(1);
|
|
}
|
|
|
|
std::string line;
|
|
int i=0;
|
|
while(getline(inFile, line)) {
|
|
i++;
|
|
if (i%100000 == 0) std::cerr << "." << std::flush;
|
|
|
|
std::vector<std::string> token;
|
|
Moses::Tokenize( token, line );
|
|
if (token.size() != 3) {
|
|
std::cerr << "line " << i << " in " << fileName
|
|
<< " has wrong number of tokens, skipping:" << std::endl
|
|
<< token.size() << " " << token[0] << " " << line << std::endl;
|
|
continue;
|
|
}
|
|
|
|
double prob = std::atof( token[2].c_str() );
|
|
WORD_ID wordT = vcbT.storeIfNew( token[0] );
|
|
WORD_ID wordS = vcbS.storeIfNew( token[1] );
|
|
ltable[ wordS ][ wordT ] = prob;
|
|
}
|
|
std::cerr << std::endl;
|
|
}
|
|
|
|
|
|
void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
|
|
const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
|
|
{
|
|
// get corresponding target non-terminal and output pair
|
|
ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT();
|
|
invertAlignment(phraseSource, phraseTarget, targetToSourceAlignment, sourceToTargetAlignment);
|
|
// output source symbols, except root, in rule table format
|
|
for (std::size_t i = 0; i < phraseSource->size()-1; ++i) {
|
|
const std::string &word = vcbS.getWord(phraseSource->at(i));
|
|
if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
|
|
out << word << " ";
|
|
continue;
|
|
}
|
|
const std::set<std::size_t> &alignmentPoints = sourceToTargetAlignment->at(i);
|
|
assert(alignmentPoints.size() == 1);
|
|
size_t j = *(alignmentPoints.begin());
|
|
if (inverseFlag) {
|
|
out << vcbT.getWord(phraseTarget->at(j)) << word << " ";
|
|
} else {
|
|
out << word << vcbT.getWord(phraseTarget->at(j)) << " ";
|
|
}
|
|
}
|
|
// output source root symbol
|
|
if (conditionOnTargetLhsFlag && !inverseFlag) {
|
|
out << "[X]";
|
|
} else {
|
|
out << vcbS.getWord(phraseSource->back());
|
|
}
|
|
delete sourceToTargetAlignment;
|
|
}
|
|
|
|
|
|
void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
|
|
const ALIGNMENT *targetToSourceAlignment, std::ostream &out)
|
|
{
|
|
// output target symbols, except root, in rule table format
|
|
for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) {
|
|
const std::string &word = vcbT.getWord(phraseTarget->at(i));
|
|
if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
|
|
out << word << " ";
|
|
continue;
|
|
}
|
|
// get corresponding source non-terminal and output pair
|
|
std::set<std::size_t> alignmentPoints = targetToSourceAlignment->at(i);
|
|
assert(alignmentPoints.size() == 1);
|
|
int j = *(alignmentPoints.begin());
|
|
if (inverseFlag) {
|
|
out << word << vcbS.getWord(phraseSource->at(j)) << " ";
|
|
} else {
|
|
out << vcbS.getWord(phraseSource->at(j)) << word << " ";
|
|
}
|
|
}
|
|
// output target root symbol
|
|
if (conditionOnTargetLhsFlag) {
|
|
if (inverseFlag) {
|
|
out << "[X]";
|
|
} else {
|
|
out << vcbS.getWord(phraseSource->back());
|
|
}
|
|
} else {
|
|
out << vcbT.getWord(phraseTarget->back());
|
|
}
|
|
}
|
|
|
|
|
|
void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget,
|
|
const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment)
|
|
{
|
|
// typedef std::vector< std::set<size_t> > ALIGNMENT;
|
|
|
|
outSourceToTargetAlignment->clear();
|
|
size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size());
|
|
outSourceToTargetAlignment->resize(numberOfSourceSymbols);
|
|
// add alignment point
|
|
for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) {
|
|
for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin();
|
|
setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) {
|
|
size_t sourcePosition = *setIter;
|
|
outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition);
|
|
}
|
|
}
|
|
}
|
|
|