2011-09-07 20:42:46 +04:00
/***********************************************************************
  Moses - factored phrase-based language decoder
  Copyright (C) 2009 University of Edinburgh

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
***********************************************************************/
# include <sstream>
# include <assert.h>
2015-07-24 22:43:29 +03:00
# include <cstdlib>
2011-09-07 20:42:46 +04:00
# include <cstring>
2014-01-29 22:37:42 +04:00
# include <map>
2011-09-07 20:42:46 +04:00
# include <set>
2014-01-29 22:37:42 +04:00
# include <vector>
2012-04-30 08:30:29 +04:00
# include <algorithm>
2015-02-05 19:23:47 +03:00
# include <boost/algorithm/string/predicate.hpp>
2014-06-11 22:27:18 +04:00
# include <boost/unordered_map.hpp>
2011-09-07 20:42:46 +04:00
2012-11-03 03:30:51 +04:00
# include "ScoreFeature.h"
2011-09-07 20:42:46 +04:00
# include "tables-core.h"
2014-01-29 22:37:42 +04:00
# include "ExtractionPhrasePair.h"
2011-09-07 20:42:46 +04:00
# include "score.h"
# include "InputFileStream.h"
2012-05-23 19:38:31 +04:00
# include "OutputFileStream.h"
2011-09-07 20:42:46 +04:00
2015-03-09 21:49:32 +03:00
# include "moses/Util.h"
2015-02-05 19:23:47 +03:00
using namespace boost : : algorithm ;
2012-06-30 18:43:47 +04:00
using namespace MosesTraining ;
2011-09-07 20:42:46 +04:00
2012-06-30 18:43:47 +04:00
namespace MosesTraining
2011-09-07 20:42:46 +04:00
{
LexicalTable lexTable ;
bool inverseFlag = false ;
bool hierarchicalFlag = false ;
2012-05-25 20:29:47 +04:00
bool pcfgFlag = false ;
2014-07-28 21:27:12 +04:00
bool phraseOrientationFlag = false ;
2013-09-13 19:10:21 +04:00
bool treeFragmentsFlag = false ;
2015-03-05 00:40:56 +03:00
bool partsOfSpeechFlag = false ;
2014-06-11 22:27:18 +04:00
bool sourceSyntaxLabelsFlag = false ;
bool sourceSyntaxLabelCountsLHSFlag = false ;
2016-01-10 02:02:31 +03:00
bool targetSyntacticPreferencesFlag = false ;
2012-05-27 15:43:16 +04:00
bool unpairedExtractFormatFlag = false ;
bool conditionOnTargetLhsFlag = false ;
2012-11-15 16:35:43 +04:00
bool wordAlignmentFlag = true ;
2011-09-07 20:42:46 +04:00
bool goodTuringFlag = false ;
bool kneserNeyFlag = false ;
bool logProbFlag = false ;
int negLogProb = 1 ;
2012-11-03 03:30:51 +04:00
# define COC_MAX 10
2011-09-07 20:42:46 +04:00
bool lexFlag = true ;
bool unalignedFlag = false ;
bool unalignedFWFlag = false ;
2012-08-25 03:47:57 +04:00
bool crossedNonTerm = false ;
2014-06-12 16:26:01 +04:00
bool spanLength = false ;
2015-03-10 18:28:45 +03:00
bool ruleLength = false ;
2014-06-13 20:04:41 +04:00
bool nonTermContext = false ;
2015-03-10 18:28:45 +03:00
bool nonTermContextTarget = false ;
2016-02-12 20:46:57 +03:00
bool targetConstituentBoundariesFlag = false ;
2014-06-13 20:04:41 +04:00
2011-09-07 20:42:46 +04:00
int countOfCounts [ COC_MAX + 1 ] ;
int totalDistinct = 0 ;
2015-07-24 21:42:15 +03:00
float minCount = 0 ;
2011-09-07 20:42:46 +04:00
float minCountHierarchical = 0 ;
2014-07-28 21:27:12 +04:00
bool phraseOrientationPriorsFlag = false ;
2014-01-29 22:37:42 +04:00
2014-06-11 22:27:18 +04:00
boost : : unordered_map < std : : string , float > sourceLHSCounts ;
boost : : unordered_map < std : : string , boost : : unordered_map < std : : string , float > * > targetLHSAndSourceLHSJointCounts ;
2014-01-29 22:37:42 +04:00
std : : set < std : : string > sourceLabelSet ;
2015-01-14 14:07:42 +03:00
std : : map < std : : string , size_t > sourceLabels ;
2014-01-29 22:37:42 +04:00
std : : vector < std : : string > sourceLabelsByIndex ;
2011-09-07 20:42:46 +04:00
2015-03-05 00:40:56 +03:00
std : : set < std : : string > partsOfSpeechSet ;
2016-01-10 02:02:31 +03:00
boost : : unordered_map < std : : string , float > targetSyntacticPreferencesLHSCounts ;
boost : : unordered_map < std : : string , boost : : unordered_map < std : : string , float > * > ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts ;
std : : set < std : : string > targetSyntacticPreferencesLabelSet ;
std : : map < std : : string , size_t > targetSyntacticPreferencesLabels ;
std : : vector < std : : string > targetSyntacticPreferencesLabelsByIndex ;
2014-06-11 22:27:18 +04:00
2015-01-21 00:41:41 +03:00
std : : vector < float > orientationClassPriorsL2R ( 4 , 0 ) ; // mono swap dleft dright
std : : vector < float > orientationClassPriorsR2L ( 4 , 0 ) ; // mono swap dleft dright
2014-07-28 21:27:12 +04:00
2012-06-30 18:43:47 +04:00
Vocabulary vcbT ;
Vocabulary vcbS ;
2013-05-29 21:16:15 +04:00
2012-06-30 18:43:47 +04:00
} // namespace
2014-01-29 22:37:42 +04:00
void processLine ( std : : string line ,
2015-01-14 14:07:42 +03:00
int lineID , bool includeSentenceIdFlag , int & sentenceId ,
2014-01-29 22:37:42 +04:00
PHRASE * phraseSource , PHRASE * phraseTarget , ALIGNMENT * targetToSourceAlignment ,
std : : string & additionalPropertiesString ,
float & count , float & pcfgSum ) ;
void writeCountOfCounts ( const std : : string & fileNameCountOfCounts ) ;
2014-06-11 22:27:18 +04:00
void writeLeftHandSideLabelCounts ( const boost : : unordered_map < std : : string , float > & countsLabelLHS ,
const boost : : unordered_map < std : : string , boost : : unordered_map < std : : string , float > * > & jointCountsLabelLHS ,
2015-01-14 14:07:42 +03:00
const std : : string & fileNameLeftHandSideSourceLabelCounts ,
2014-06-11 22:27:18 +04:00
const std : : string & fileNameLeftHandSideTargetSourceLabelCounts ) ;
void writeLabelSet ( const std : : set < std : : string > & labelSet , const std : : string & fileName ) ;
2015-08-29 06:48:09 +03:00
void processPhrasePairs ( std : : vector < ExtractionPhrasePair * > & phrasePairsWithSameSource , std : : ostream & phraseTableFile ,
2014-01-29 22:37:42 +04:00
const ScoreFeatureManager & featureManager , const MaybeLog & maybeLogProb ) ;
2015-03-09 21:49:32 +03:00
void outputPhrasePair ( const ExtractionPhrasePair & phrasePair , float , int , std : : ostream & phraseTableFile , const ScoreFeatureManager & featureManager , const MaybeLog & maybeLog ) ;
2014-01-29 22:37:42 +04:00
double computeLexicalTranslation ( const PHRASE * phraseSource , const PHRASE * phraseTarget , const ALIGNMENT * alignmentTargetToSource ) ;
double computeUnalignedPenalty ( const ALIGNMENT * alignmentTargetToSource ) ;
2015-03-09 21:49:32 +03:00
std : : set < std : : string > functionWordList ;
2014-07-28 21:27:12 +04:00
void loadOrientationPriors ( const std : : string & fileNamePhraseOrientationPriors , std : : vector < float > & orientationClassPriorsL2R , std : : vector < float > & orientationClassPriorsR2L ) ;
2015-03-09 21:49:32 +03:00
void loadFunctionWords ( const std : : string & fileNameFunctionWords ) ;
2014-01-29 22:37:42 +04:00
double computeUnalignedFWPenalty ( const PHRASE * phraseTarget , const ALIGNMENT * alignmentTargetToSource ) ;
int calcCrossedNonTerm ( const PHRASE * phraseTarget , const ALIGNMENT * alignmentTargetToSource ) ;
2015-03-09 21:49:32 +03:00
void printSourcePhrase ( const PHRASE * phraseSource , const PHRASE * phraseTarget , const ALIGNMENT * targetToSourceAlignment , std : : ostream & out ) ;
void printTargetPhrase ( const PHRASE * phraseSource , const PHRASE * phraseTarget , const ALIGNMENT * targetToSourceAlignment , std : : ostream & out ) ;
2014-01-29 22:37:42 +04:00
void invertAlignment ( const PHRASE * phraseSource , const PHRASE * phraseTarget , const ALIGNMENT * inTargetToSourceAlignment , ALIGNMENT * outSourceToTargetAlignment ) ;
2015-03-10 18:28:45 +03:00
size_t NumNonTerminal ( const PHRASE * phraseSource ) ;
2014-01-29 22:37:42 +04:00
2012-06-30 18:43:47 +04:00
2011-09-07 20:42:46 +04:00
int main ( int argc , char * argv [ ] )
{
2015-01-14 14:07:42 +03:00
std : : cerr < < " Score v2.1 -- "
2014-01-29 22:37:42 +04:00
< < " scoring methods for extracted rules " < < std : : endl ;
2011-09-07 20:42:46 +04:00
2012-11-03 03:30:51 +04:00
ScoreFeatureManager featureManager ;
2011-09-07 20:42:46 +04:00
if ( argc < 4 ) {
2015-06-02 12:02:39 +03:00
std : : cerr < <
2015-07-24 21:42:15 +03:00
" syntax: score extract lex phrase-table "
" [--Inverse] "
" [--Hierarchical] "
" [--LogProb] "
" [--NegLogProb] "
" [--NoLex] "
" [--GoodTuring] "
" [--KneserNey] "
" [--NoWordAlignment] "
" [--UnalignedPenalty] "
2015-06-04 15:41:46 +03:00
" [--UnalignedFunctionWordPenalty function-word-file] "
2015-07-24 21:42:15 +03:00
" [--MinCountHierarchical count] "
" [--PartsOfSpeech] "
" [--PCFG] "
" [--TreeFragments] "
" [--SourceLabels] "
" [--SourceLabelCountsLHS] "
2016-01-10 02:02:31 +03:00
" [--TargetSyntacticPreferences] "
2015-07-24 21:42:15 +03:00
" [--UnpairedExtractFormat] "
" [--ConditionOnTargetLHS] "
2015-07-25 02:00:40 +03:00
" [--CrossedNonTerm] "
2015-07-24 21:42:15 +03:00
< < std : : endl ;
2014-01-29 22:37:42 +04:00
std : : cerr < < featureManager . usage ( ) < < std : : endl ;
2011-09-07 20:42:46 +04:00
exit ( 1 ) ;
}
2014-01-29 22:37:42 +04:00
std : : string fileNameExtract = argv [ 1 ] ;
std : : string fileNameLex = argv [ 2 ] ;
std : : string fileNamePhraseTable = argv [ 3 ] ;
2014-06-11 22:27:18 +04:00
std : : string fileNameSourceLabelSet ;
2015-03-05 00:40:56 +03:00
std : : string fileNamePartsOfSpeechSet ;
2014-01-29 22:37:42 +04:00
std : : string fileNameCountOfCounts ;
std : : string fileNameFunctionWords ;
2014-06-11 22:27:18 +04:00
std : : string fileNameLeftHandSideSourceLabelCounts ;
std : : string fileNameLeftHandSideTargetSourceLabelCounts ;
2016-01-10 02:02:31 +03:00
std : : string fileNameTargetSyntacticPreferencesLabelSet ;
std : : string fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts ;
std : : string fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts ;
2014-07-28 21:27:12 +04:00
std : : string fileNamePhraseOrientationPriors ;
2015-06-02 12:02:39 +03:00
// All unknown args are passed to feature manager.
std : : vector < std : : string > featureArgs ;
2011-09-07 20:42:46 +04:00
for ( int i = 4 ; i < argc ; i + + ) {
if ( strcmp ( argv [ i ] , " inverse " ) = = 0 | | strcmp ( argv [ i ] , " --Inverse " ) = = 0 ) {
inverseFlag = true ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " using inverse mode " < < std : : endl ;
2011-09-07 20:42:46 +04:00
} else if ( strcmp ( argv [ i ] , " --Hierarchical " ) = = 0 ) {
hierarchicalFlag = true ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " processing hierarchical rules " < < std : : endl ;
2012-05-25 20:29:47 +04:00
} else if ( strcmp ( argv [ i ] , " --PCFG " ) = = 0 ) {
pcfgFlag = true ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " including PCFG scores " < < std : : endl ;
2014-07-28 21:27:12 +04:00
} else if ( strcmp ( argv [ i ] , " --PhraseOrientation " ) = = 0 ) {
phraseOrientationFlag = true ;
std : : cerr < < " including phrase orientation information " < < std : : endl ;
2013-09-13 19:10:21 +04:00
} else if ( strcmp ( argv [ i ] , " --TreeFragments " ) = = 0 ) {
treeFragmentsFlag = true ;
2014-07-28 21:27:12 +04:00
std : : cerr < < " including tree fragment information from syntactic parse " < < std : : endl ;
2015-03-05 00:40:56 +03:00
} else if ( strcmp ( argv [ i ] , " --PartsOfSpeech " ) = = 0 ) {
partsOfSpeechFlag = true ;
std : : cerr < < " including parts-of-speech information from syntactic parse " < < std : : endl ;
fileNamePartsOfSpeechSet = std : : string ( fileNamePhraseTable ) + " .partsOfSpeech " ;
std : : cerr < < " writing parts-of-speech set to file " < < fileNamePartsOfSpeechSet < < std : : endl ;
2014-06-11 22:27:18 +04:00
} else if ( strcmp ( argv [ i ] , " --SourceLabels " ) = = 0 ) {
sourceSyntaxLabelsFlag = true ;
std : : cerr < < " including source label information " < < std : : endl ;
fileNameSourceLabelSet = std : : string ( fileNamePhraseTable ) + " .syntaxLabels.src " ;
std : : cerr < < " writing source syntax label set to file " < < fileNameSourceLabelSet < < std : : endl ;
} else if ( strcmp ( argv [ i ] , " --SourceLabelCountsLHS " ) = = 0 ) {
sourceSyntaxLabelCountsLHSFlag = true ;
fileNameLeftHandSideSourceLabelCounts = std : : string ( fileNamePhraseTable ) + " .src.lhs " ;
fileNameLeftHandSideTargetSourceLabelCounts = std : : string ( fileNamePhraseTable ) + " .tgt-src.lhs " ;
std : : cerr < < " counting left-hand side source labels and writing them to files " < < fileNameLeftHandSideSourceLabelCounts < < " and " < < fileNameLeftHandSideTargetSourceLabelCounts < < std : : endl ;
2016-01-10 02:02:31 +03:00
} else if ( strcmp ( argv [ i ] , " --TargetSyntacticPreferences " ) = = 0 ) {
targetSyntacticPreferencesFlag = true ;
std : : cerr < < " including target syntactic preferences information " < < std : : endl ;
fileNameTargetSyntacticPreferencesLabelSet = std : : string ( fileNamePhraseTable ) + " .syntaxLabels.tgtpref " ;
std : : cerr < < " writing target syntactic preferences label set to file " < < fileNameTargetSyntacticPreferencesLabelSet < < std : : endl ;
fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts = std : : string ( fileNamePhraseTable ) + " .tgtpref.lhs " ;
fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts = std : : string ( fileNamePhraseTable ) + " .tgt-tgtpref.lhs " ;
2016-01-10 03:00:35 +03:00
std : : cerr < < " counting left-hand side target syntactic preferences labels and writing them to files "
< < fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts
< < " and "
< < fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts
2016-01-10 02:02:31 +03:00
< < std : : endl ;
2012-05-27 15:43:16 +04:00
} else if ( strcmp ( argv [ i ] , " --UnpairedExtractFormat " ) = = 0 ) {
unpairedExtractFormatFlag = true ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " processing unpaired extract format " < < std : : endl ;
2012-05-27 15:43:16 +04:00
} else if ( strcmp ( argv [ i ] , " --ConditionOnTargetLHS " ) = = 0 ) {
conditionOnTargetLhsFlag = true ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " processing unpaired extract format " < < std : : endl ;
2012-11-15 16:35:43 +04:00
} else if ( strcmp ( argv [ i ] , " --NoWordAlignment " ) = = 0 ) {
wordAlignmentFlag = false ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " omitting word alignment " < < std : : endl ;
2011-09-07 20:42:46 +04:00
} else if ( strcmp ( argv [ i ] , " --NoLex " ) = = 0 ) {
lexFlag = false ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " not computing lexical translation score " < < std : : endl ;
2011-09-07 20:42:46 +04:00
} else if ( strcmp ( argv [ i ] , " --GoodTuring " ) = = 0 ) {
goodTuringFlag = true ;
2014-01-29 22:37:42 +04:00
fileNameCountOfCounts = std : : string ( fileNamePhraseTable ) + " .coc " ;
std : : cerr < < " adjusting phrase translation probabilities with Good Turing discounting " < < std : : endl ;
2011-09-07 20:42:46 +04:00
} else if ( strcmp ( argv [ i ] , " --KneserNey " ) = = 0 ) {
kneserNeyFlag = true ;
2014-01-29 22:37:42 +04:00
fileNameCountOfCounts = std : : string ( fileNamePhraseTable ) + " .coc " ;
std : : cerr < < " adjusting phrase translation probabilities with Kneser Ney discounting " < < std : : endl ;
2011-09-07 20:42:46 +04:00
} else if ( strcmp ( argv [ i ] , " --UnalignedPenalty " ) = = 0 ) {
unalignedFlag = true ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " using unaligned word penalty " < < std : : endl ;
2011-09-07 20:42:46 +04:00
} else if ( strcmp ( argv [ i ] , " --UnalignedFunctionWordPenalty " ) = = 0 ) {
unalignedFWFlag = true ;
2013-05-29 21:16:15 +04:00
if ( i + 1 = = argc ) {
2015-01-14 14:07:42 +03:00
std : : cerr < < " ERROR: specify function words file for unaligned function word penalty! " < < std : : endl ;
2011-09-07 20:42:46 +04:00
exit ( 1 ) ;
}
fileNameFunctionWords = argv [ + + i ] ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " using unaligned function word penalty with function words from " < < fileNameFunctionWords < < std : : endl ;
2012-11-03 03:30:51 +04:00
} else if ( strcmp ( argv [ i ] , " --LogProb " ) = = 0 ) {
2011-09-07 20:42:46 +04:00
logProbFlag = true ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " using log-probabilities " < < std : : endl ;
2011-09-07 20:42:46 +04:00
} else if ( strcmp ( argv [ i ] , " --NegLogProb " ) = = 0 ) {
logProbFlag = true ;
negLogProb = - 1 ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " using negative log-probabilities " < < std : : endl ;
2015-07-24 21:42:15 +03:00
} else if ( strcmp ( argv [ i ] , " --MinCount " ) = = 0 ) {
2015-07-24 22:43:29 +03:00
minCount = std : : atof ( argv [ + + i ] ) ;
2015-07-24 21:42:15 +03:00
std : : cerr < < " dropping all phrase pairs occurring less than " < < minCount < < " times " < < std : : endl ;
minCount - = 0.00001 ; // account for rounding
2011-09-07 20:42:46 +04:00
} else if ( strcmp ( argv [ i ] , " --MinCountHierarchical " ) = = 0 ) {
2015-07-24 22:43:29 +03:00
minCountHierarchical = std : : atof ( argv [ + + i ] ) ;
2015-07-24 21:42:15 +03:00
std : : cerr < < " dropping all hierarchical phrase pairs occurring less than " < < minCountHierarchical < < " times " < < std : : endl ;
2011-09-07 20:42:46 +04:00
minCountHierarchical - = 0.00001 ; // account for rounding
2012-08-25 03:47:57 +04:00
} else if ( strcmp ( argv [ i ] , " --CrossedNonTerm " ) = = 0 ) {
crossedNonTerm = true ;
2014-01-29 22:37:42 +04:00
std : : cerr < < " crossed non-term reordering feature " < < std : : endl ;
2014-07-28 21:27:12 +04:00
} else if ( strcmp ( argv [ i ] , " --PhraseOrientationPriors " ) = = 0 ) {
phraseOrientationPriorsFlag = true ;
if ( i + 1 = = argc ) {
2015-01-14 14:07:42 +03:00
std : : cerr < < " ERROR: specify priors file for phrase orientation! " < < std : : endl ;
2014-07-28 21:27:12 +04:00
exit ( 1 ) ;
}
fileNamePhraseOrientationPriors = argv [ + + i ] ;
std : : cerr < < " smoothing phrase orientation with priors from " < < fileNamePhraseOrientationPriors < < std : : endl ;
2014-06-12 16:26:01 +04:00
} else if ( strcmp ( argv [ i ] , " --SpanLength " ) = = 0 ) {
spanLength = true ;
std : : cerr < < " span length feature " < < std : : endl ;
2015-03-10 18:28:45 +03:00
} else if ( strcmp ( argv [ i ] , " --RuleLength " ) = = 0 ) {
ruleLength = true ;
std : : cerr < < " rule length feature " < < std : : endl ;
2014-06-13 20:04:41 +04:00
} else if ( strcmp ( argv [ i ] , " --NonTermContext " ) = = 0 ) {
nonTermContext = true ;
std : : cerr < < " non-term context " < < std : : endl ;
2015-03-10 18:28:45 +03:00
} else if ( strcmp ( argv [ i ] , " --NonTermContextTarget " ) = = 0 ) {
nonTermContextTarget = true ;
std : : cerr < < " non-term context (target) " < < std : : endl ;
2016-02-12 20:46:57 +03:00
} else if ( strcmp ( argv [ i ] , " --TargetConstituentBoundaries " ) = = 0 ) {
targetConstituentBoundariesFlag = true ;
std : : cerr < < " including target constituent boundaries information " < < std : : endl ;
2011-09-07 20:42:46 +04:00
} else {
2012-11-03 03:30:51 +04:00
featureArgs . push_back ( argv [ i ] ) ;
2012-11-15 16:35:43 +04:00
+ + i ;
2012-11-03 03:30:51 +04:00
for ( ; i < argc & & strncmp ( argv [ i ] , " -- " , 2 ) ; + + i ) {
featureArgs . push_back ( argv [ i ] ) ;
}
2012-11-15 16:35:43 +04:00
if ( i ! = argc ) - - i ; //roll back, since we found another -- argument
2011-09-07 20:42:46 +04:00
}
}
2012-11-03 03:30:51 +04:00
MaybeLog maybeLogProb ( logProbFlag , negLogProb ) ;
2014-01-29 22:37:42 +04:00
// configure extra features
if ( ! inverseFlag ) {
featureManager . configure ( featureArgs ) ;
}
2012-11-03 03:30:51 +04:00
2011-09-07 20:42:46 +04:00
// lexical translation table
2014-01-29 22:37:42 +04:00
if ( lexFlag ) {
2011-09-07 20:42:46 +04:00
lexTable . load ( fileNameLex ) ;
2014-01-29 22:37:42 +04:00
}
2011-09-07 20:42:46 +04:00
// function word list
2014-01-29 22:37:42 +04:00
if ( unalignedFWFlag ) {
2011-09-07 20:42:46 +04:00
loadFunctionWords ( fileNameFunctionWords ) ;
2014-01-29 22:37:42 +04:00
}
2011-09-07 20:42:46 +04:00
// compute count of counts for Good Turing discounting
if ( goodTuringFlag | | kneserNeyFlag ) {
for ( int i = 1 ; i < = COC_MAX ; i + + ) countOfCounts [ i ] = 0 ;
}
2014-07-28 21:27:12 +04:00
if ( phraseOrientationPriorsFlag ) {
loadOrientationPriors ( fileNamePhraseOrientationPriors , orientationClassPriorsL2R , orientationClassPriorsR2L ) ;
}
2011-09-07 20:42:46 +04:00
// sorted phrase extraction file
Moses : : InputFileStream extractFile ( fileNameExtract ) ;
if ( extractFile . fail ( ) ) {
2014-01-29 22:37:42 +04:00
std : : cerr < < " ERROR: could not open extract file " < < fileNameExtract < < std : : endl ;
2011-09-07 20:42:46 +04:00
exit ( 1 ) ;
}
// output file: phrase translation table
2015-03-09 21:49:32 +03:00
std : : ostream * phraseTableFile ;
2013-05-29 21:16:15 +04:00
if ( fileNamePhraseTable = = " - " ) {
2014-01-29 22:37:42 +04:00
phraseTableFile = & std : : cout ;
2013-05-29 21:16:15 +04:00
} else {
Moses : : OutputFileStream * outputFile = new Moses : : OutputFileStream ( ) ;
bool success = outputFile - > Open ( fileNamePhraseTable ) ;
if ( ! success ) {
2015-01-14 14:07:42 +03:00
std : : cerr < < " ERROR: could not open file phrase table file "
< < fileNamePhraseTable < < std : : endl ;
2013-05-29 21:16:15 +04:00
exit ( 1 ) ;
}
phraseTableFile = outputFile ;
}
2011-09-07 20:42:46 +04:00
// loop through all extracted phrase translations
2015-03-09 21:49:32 +03:00
std : : string line , lastLine ;
2014-01-29 22:37:42 +04:00
ExtractionPhrasePair * phrasePair = NULL ;
2015-08-29 06:48:09 +03:00
std : : vector < ExtractionPhrasePair * > phrasePairsWithSameSource ;
std : : vector < ExtractionPhrasePair * > phrasePairsWithSameSourceAndTarget ; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible
2011-09-07 20:42:46 +04:00
2014-01-29 22:37:42 +04:00
int tmpSentenceId ;
PHRASE * tmpPhraseSource , * tmpPhraseTarget ;
ALIGNMENT * tmpTargetToSourceAlignment ;
std : : string tmpAdditionalPropertiesString ;
float tmpCount = 0.0f , tmpPcfgSum = 0.0f ;
int i = 0 ;
2015-03-09 21:49:32 +03:00
if ( getline ( extractFile , line ) ) {
2014-01-29 22:37:42 +04:00
+ + i ;
tmpPhraseSource = new PHRASE ( ) ;
tmpPhraseTarget = new PHRASE ( ) ;
tmpTargetToSourceAlignment = new ALIGNMENT ( ) ;
2015-01-14 14:07:42 +03:00
processLine ( std : : string ( line ) ,
2014-01-29 22:37:42 +04:00
i , featureManager . includeSentenceId ( ) , tmpSentenceId ,
2015-01-14 14:07:42 +03:00
tmpPhraseSource , tmpPhraseTarget , tmpTargetToSourceAlignment ,
2014-01-29 22:37:42 +04:00
tmpAdditionalPropertiesString ,
tmpCount , tmpPcfgSum ) ;
2015-01-14 14:07:42 +03:00
phrasePair = new ExtractionPhrasePair ( tmpPhraseSource , tmpPhraseTarget ,
2014-01-29 22:37:42 +04:00
tmpTargetToSourceAlignment ,
tmpCount , tmpPcfgSum ) ;
phrasePair - > AddProperties ( tmpAdditionalPropertiesString , tmpCount ) ;
featureManager . addPropertiesToPhrasePair ( * phrasePair , tmpCount , tmpSentenceId ) ;
phrasePairsWithSameSource . push_back ( phrasePair ) ;
if ( hierarchicalFlag ) {
phrasePairsWithSameSourceAndTarget . push_back ( phrasePair ) ;
2011-09-07 20:42:46 +04:00
}
2014-06-08 17:06:33 +04:00
lastLine = line ;
2014-01-29 22:37:42 +04:00
}
2015-03-09 21:49:32 +03:00
while ( getline ( extractFile , line ) ) {
2014-01-29 22:37:42 +04:00
2015-07-16 11:51:16 +03:00
// Print progress dots to stderr.
2014-01-29 22:37:42 +04:00
if ( + + i % 100000 = = 0 ) {
std : : cerr < < " . " < < std : : flush ;
}
2011-09-07 20:42:46 +04:00
2014-01-29 22:37:42 +04:00
// identical to last line? just add count
2014-06-08 17:06:33 +04:00
if ( line = = lastLine ) {
2014-01-29 22:37:42 +04:00
phrasePair - > IncrementPrevious ( tmpCount , tmpPcfgSum ) ;
2011-09-07 20:42:46 +04:00
continue ;
2014-01-29 22:37:42 +04:00
} else {
2014-06-08 17:06:33 +04:00
lastLine = line ;
2011-09-07 20:42:46 +04:00
}
2013-05-29 21:16:15 +04:00
2014-01-29 22:37:42 +04:00
tmpPhraseSource = new PHRASE ( ) ;
tmpPhraseTarget = new PHRASE ( ) ;
tmpTargetToSourceAlignment = new ALIGNMENT ( ) ;
tmpAdditionalPropertiesString . clear ( ) ;
2015-01-14 14:07:42 +03:00
processLine ( std : : string ( line ) ,
2014-01-29 22:37:42 +04:00
i , featureManager . includeSentenceId ( ) , tmpSentenceId ,
2015-01-14 14:07:42 +03:00
tmpPhraseSource , tmpPhraseTarget , tmpTargetToSourceAlignment ,
2014-01-29 22:37:42 +04:00
tmpAdditionalPropertiesString ,
2015-01-14 14:07:42 +03:00
tmpCount , tmpPcfgSum ) ;
2014-01-29 22:37:42 +04:00
bool matchesPrevious = false ;
2015-01-14 14:07:42 +03:00
bool sourceMatch = true ;
bool targetMatch = true ;
bool alignmentMatch = true ; // be careful with these,
2014-01-29 22:37:42 +04:00
// ExtractionPhrasePair::Matches() checks them in order and does not continue with the others
// once the first of them has been found to have to be set to false
if ( hierarchicalFlag ) {
2015-08-29 06:48:09 +03:00
for ( std : : vector < ExtractionPhrasePair * > : : const_iterator iter = phrasePairsWithSameSourceAndTarget . begin ( ) ;
2014-01-29 22:37:42 +04:00
iter ! = phrasePairsWithSameSourceAndTarget . end ( ) ; + + iter ) {
if ( ( * iter ) - > Matches ( tmpPhraseSource , tmpPhraseTarget , tmpTargetToSourceAlignment ,
sourceMatch , targetMatch , alignmentMatch ) ) {
matchesPrevious = true ;
phrasePair = ( * iter ) ;
break ;
}
}
} else {
if ( phrasePair - > Matches ( tmpPhraseSource , tmpPhraseTarget , tmpTargetToSourceAlignment ,
sourceMatch , targetMatch , alignmentMatch ) ) {
matchesPrevious = true ;
}
}
2013-05-29 21:16:15 +04:00
2014-01-29 22:37:42 +04:00
if ( matchesPrevious ) {
delete tmpPhraseSource ;
delete tmpPhraseTarget ;
if ( ! phrasePair - > Add ( tmpTargetToSourceAlignment ,
tmpCount , tmpPcfgSum ) ) {
delete tmpTargetToSourceAlignment ;
}
phrasePair - > AddProperties ( tmpAdditionalPropertiesString , tmpCount ) ;
featureManager . addPropertiesToPhrasePair ( * phrasePair , tmpCount , tmpSentenceId ) ;
2013-05-29 21:16:15 +04:00
} else {
2014-01-29 22:37:42 +04:00
if ( ! phrasePairsWithSameSource . empty ( ) & &
! sourceMatch ) {
processPhrasePairs ( phrasePairsWithSameSource , * phraseTableFile , featureManager , maybeLogProb ) ;
2015-08-29 06:48:09 +03:00
for ( std : : vector < ExtractionPhrasePair * > : : const_iterator iter = phrasePairsWithSameSource . begin ( ) ;
2014-01-29 22:37:42 +04:00
iter ! = phrasePairsWithSameSource . end ( ) ; + + iter ) {
delete * iter ;
}
phrasePairsWithSameSource . clear ( ) ;
if ( hierarchicalFlag ) {
phrasePairsWithSameSourceAndTarget . clear ( ) ;
}
}
if ( hierarchicalFlag ) {
if ( ! phrasePairsWithSameSourceAndTarget . empty ( ) & &
! targetMatch ) {
phrasePairsWithSameSourceAndTarget . clear ( ) ;
}
}
2015-01-14 14:07:42 +03:00
phrasePair = new ExtractionPhrasePair ( tmpPhraseSource , tmpPhraseTarget ,
tmpTargetToSourceAlignment ,
2014-01-29 22:37:42 +04:00
tmpCount , tmpPcfgSum ) ;
phrasePair - > AddProperties ( tmpAdditionalPropertiesString , tmpCount ) ;
featureManager . addPropertiesToPhrasePair ( * phrasePair , tmpCount , tmpSentenceId ) ;
phrasePairsWithSameSource . push_back ( phrasePair ) ;
if ( hierarchicalFlag ) {
phrasePairsWithSameSourceAndTarget . push_back ( phrasePair ) ;
}
2012-08-24 03:54:05 +04:00
}
2011-09-07 20:42:46 +04:00
}
2014-01-29 22:37:42 +04:00
2015-07-16 11:51:16 +03:00
// We've been printing progress dots to stderr. End the line.
std : : cerr < < std : : endl ;
2014-01-29 22:37:42 +04:00
processPhrasePairs ( phrasePairsWithSameSource , * phraseTableFile , featureManager , maybeLogProb ) ;
2015-08-29 06:48:09 +03:00
for ( std : : vector < ExtractionPhrasePair * > : : const_iterator iter = phrasePairsWithSameSource . begin ( ) ;
2014-01-29 22:37:42 +04:00
iter ! = phrasePairsWithSameSource . end ( ) ; + + iter ) {
delete * iter ;
}
phrasePairsWithSameSource . clear ( ) ;
2013-05-29 21:16:15 +04:00
phraseTableFile - > flush ( ) ;
2014-01-29 22:37:42 +04:00
if ( phraseTableFile ! = & std : : cout ) {
2013-05-29 21:16:15 +04:00
delete phraseTableFile ;
}
2011-09-07 20:42:46 +04:00
// output count of count statistics
if ( goodTuringFlag | | kneserNeyFlag ) {
writeCountOfCounts ( fileNameCountOfCounts ) ;
}
2014-06-11 22:27:18 +04:00
// source syntax labels
2015-03-05 00:40:56 +03:00
if ( sourceSyntaxLabelsFlag & & ! inverseFlag ) {
2014-06-11 22:27:18 +04:00
writeLabelSet ( sourceLabelSet , fileNameSourceLabelSet ) ;
}
if ( sourceSyntaxLabelsFlag & & sourceSyntaxLabelCountsLHSFlag & & ! inverseFlag ) {
writeLeftHandSideLabelCounts ( sourceLHSCounts ,
targetLHSAndSourceLHSJointCounts ,
2015-01-14 14:07:42 +03:00
fileNameLeftHandSideSourceLabelCounts ,
2014-06-11 22:27:18 +04:00
fileNameLeftHandSideTargetSourceLabelCounts ) ;
}
2015-03-05 00:40:56 +03:00
// parts-of-speech
if ( partsOfSpeechFlag & & ! inverseFlag ) {
writeLabelSet ( partsOfSpeechSet , fileNamePartsOfSpeechSet ) ;
}
2016-01-10 02:02:31 +03:00
// target syntactic preferences labels
if ( targetSyntacticPreferencesFlag & & ! inverseFlag ) {
writeLabelSet ( targetSyntacticPreferencesLabelSet , fileNameTargetSyntacticPreferencesLabelSet ) ;
writeLeftHandSideLabelCounts ( targetSyntacticPreferencesLHSCounts ,
ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts ,
fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts ,
fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts ) ;
2014-06-11 22:27:18 +04:00
}
2011-09-07 20:42:46 +04:00
}
2014-01-29 22:37:42 +04:00
void processLine ( std : : string line ,
2015-01-14 14:07:42 +03:00
int lineID , bool includeSentenceIdFlag , int & sentenceId ,
2014-01-29 22:37:42 +04:00
PHRASE * phraseSource , PHRASE * phraseTarget , ALIGNMENT * targetToSourceAlignment ,
std : : string & additionalPropertiesString ,
float & count , float & pcfgSum )
{
2014-12-04 18:54:05 +03:00
size_t foundAdditionalProperties = line . rfind ( " ||| " ) ;
foundAdditionalProperties = line . find ( " {{ " , foundAdditionalProperties ) ;
2014-01-29 22:37:42 +04:00
if ( foundAdditionalProperties ! = std : : string : : npos ) {
additionalPropertiesString = line . substr ( foundAdditionalProperties ) ;
line = line . substr ( 0 , foundAdditionalProperties ) ;
} else {
additionalPropertiesString . clear ( ) ;
}
phraseSource - > clear ( ) ;
phraseTarget - > clear ( ) ;
targetToSourceAlignment - > clear ( ) ;
2015-03-09 21:49:32 +03:00
std : : vector < std : : string > token ;
Moses : : Tokenize ( token , line ) ;
2014-01-29 22:37:42 +04:00
int item = 1 ;
for ( size_t j = 0 ; j < token . size ( ) ; + + j ) {
if ( token [ j ] = = " ||| " ) {
+ + item ;
} else if ( item = = 1 ) { // source phrase
phraseSource - > push_back ( vcbS . storeIfNew ( token [ j ] ) ) ;
} else if ( item = = 2 ) { // target phrase
phraseTarget - > push_back ( vcbT . storeIfNew ( token [ j ] ) ) ;
} else if ( item = = 3 ) { // alignment
int s , t ;
sscanf ( token [ j ] . c_str ( ) , " %d-%d " , & s , & t ) ;
if ( ( size_t ) t > = phraseTarget - > size ( ) | | ( size_t ) s > = phraseSource - > size ( ) ) {
std : : cerr < < " WARNING: phrase pair " < < lineID
< < " has alignment point ( " < < s < < " , " < < t < < " ) "
< < " out of bounds ( " < < phraseSource - > size ( ) < < " , " < < phraseTarget - > size ( ) < < " ) "
< < std : : endl ;
} else {
// first alignment point? -> initialize
if ( targetToSourceAlignment - > size ( ) = = 0 ) {
size_t numberOfTargetSymbols = ( hierarchicalFlag ? phraseTarget - > size ( ) - 1 : phraseTarget - > size ( ) ) ;
targetToSourceAlignment - > resize ( numberOfTargetSymbols ) ;
}
// add alignment point
targetToSourceAlignment - > at ( t ) . insert ( s ) ;
}
} else if ( includeSentenceIdFlag & & item = = 4 ) { // optional sentence id
sscanf ( token [ j ] . c_str ( ) , " %d " , & sentenceId ) ;
} else if ( item + ( includeSentenceIdFlag ? - 1 : 0 ) = = 4 ) { // count
sscanf ( token [ j ] . c_str ( ) , " %f " , & count ) ;
} else if ( item + ( includeSentenceIdFlag ? - 1 : 0 ) = = 5 ) { // target syntax PCFG score
2015-07-24 22:43:29 +03:00
float pcfgScore = std : : atof ( token [ j ] . c_str ( ) ) ;
2014-01-29 22:37:42 +04:00
pcfgSum = pcfgScore * count ;
}
}
if ( targetToSourceAlignment - > size ( ) = = 0 ) {
size_t numberOfTargetSymbols = ( hierarchicalFlag ? phraseTarget - > size ( ) - 1 : phraseTarget - > size ( ) ) ;
targetToSourceAlignment - > resize ( numberOfTargetSymbols ) ;
}
if ( item + ( includeSentenceIdFlag ? - 1 : 0 ) = = 3 ) {
count = 1.0 ;
}
2015-01-21 00:36:55 +03:00
if ( item < 3 | | item > ( includeSentenceIdFlag ? 7 : 6 ) ) {
2015-03-09 21:49:32 +03:00
std : : cerr < < " ERROR: faulty line " < < lineID < < " : " < < line < < std : : endl ;
2014-01-29 22:37:42 +04:00
}
}
2015-03-09 21:49:32 +03:00
void writeCountOfCounts ( const std : : string & fileNameCountOfCounts )
2011-09-07 20:42:46 +04:00
{
// open file
2013-05-29 21:16:15 +04:00
Moses : : OutputFileStream countOfCountsFile ;
2015-03-09 21:49:32 +03:00
bool success = countOfCountsFile . Open ( fileNameCountOfCounts ) ;
2013-05-29 21:16:15 +04:00
if ( ! success ) {
2014-01-29 22:37:42 +04:00
std : : cerr < < " ERROR: could not open count-of-counts file "
< < fileNameCountOfCounts < < std : : endl ;
2011-09-07 20:42:46 +04:00
return ;
2013-05-29 21:16:15 +04:00
}
2011-09-07 20:42:46 +04:00
// Kneser-Ney needs the total number of phrase pairs
2014-01-29 22:37:42 +04:00
countOfCountsFile < < totalDistinct < < std : : endl ;
2011-09-07 20:42:46 +04:00
// write out counts
for ( int i = 1 ; i < = COC_MAX ; i + + ) {
2014-01-29 22:37:42 +04:00
countOfCountsFile < < countOfCounts [ i ] < < std : : endl ;
2011-09-07 20:42:46 +04:00
}
2013-05-29 21:16:15 +04:00
countOfCountsFile . Close ( ) ;
2011-09-07 20:42:46 +04:00
}
2014-06-11 22:27:18 +04:00
void writeLeftHandSideLabelCounts ( const boost : : unordered_map < std : : string , float > & countsLabelLHS ,
const boost : : unordered_map < std : : string , boost : : unordered_map < std : : string , float > * > & jointCountsLabelLHS ,
const std : : string & fileNameLeftHandSideSourceLabelCounts ,
const std : : string & fileNameLeftHandSideTargetSourceLabelCounts )
{
// open file
Moses : : OutputFileStream leftHandSideSourceLabelCounts ;
2015-03-09 21:49:32 +03:00
bool success = leftHandSideSourceLabelCounts . Open ( fileNameLeftHandSideSourceLabelCounts ) ;
2014-06-11 22:27:18 +04:00
if ( ! success ) {
std : : cerr < < " ERROR: could not open left-hand side label counts file "
< < fileNameLeftHandSideSourceLabelCounts < < std : : endl ;
return ;
}
// write source left-hand side counts
for ( boost : : unordered_map < std : : string , float > : : const_iterator iter = sourceLHSCounts . begin ( ) ;
iter ! = sourceLHSCounts . end ( ) ; + + iter ) {
leftHandSideSourceLabelCounts < < iter - > first < < " " < < iter - > second < < std : : endl ;
}
leftHandSideSourceLabelCounts . Close ( ) ;
// open file
Moses : : OutputFileStream leftHandSideTargetSourceLabelCounts ;
2015-03-09 21:49:32 +03:00
success = leftHandSideTargetSourceLabelCounts . Open ( fileNameLeftHandSideTargetSourceLabelCounts ) ;
2014-06-11 22:27:18 +04:00
if ( ! success ) {
std : : cerr < < " ERROR: could not open left-hand side label joint counts file "
< < fileNameLeftHandSideTargetSourceLabelCounts < < std : : endl ;
return ;
}
// write source left-hand side / target left-hand side joint counts
for ( boost : : unordered_map < std : : string , boost : : unordered_map < std : : string , float > * > : : const_iterator iter = targetLHSAndSourceLHSJointCounts . begin ( ) ;
iter ! = targetLHSAndSourceLHSJointCounts . end ( ) ; + + iter ) {
for ( boost : : unordered_map < std : : string , float > : : const_iterator iter2 = ( iter - > second ) - > begin ( ) ;
iter2 ! = ( iter - > second ) - > end ( ) ; + + iter2 ) {
leftHandSideTargetSourceLabelCounts < < iter - > first < < " " < < iter2 - > first < < " " < < iter2 - > second < < std : : endl ;
}
}
leftHandSideTargetSourceLabelCounts . Close ( ) ;
}
void writeLabelSet ( const std : : set < std : : string > & labelSet , const std : : string & fileName )
{
// open file
Moses : : OutputFileStream out ;
2015-03-09 21:49:32 +03:00
bool success = out . Open ( fileName ) ;
2014-06-11 22:27:18 +04:00
if ( ! success ) {
2015-03-05 00:40:56 +03:00
std : : cerr < < " ERROR: could not open file "
< < fileName < < " for writing " < < std : : endl ;
2014-06-11 22:27:18 +04:00
return ;
}
for ( std : : set < std : : string > : : const_iterator iter = labelSet . begin ( ) ;
iter ! = labelSet . end ( ) ; + + iter ) {
out < < * iter < < std : : endl ;
}
out . Close ( ) ;
}
2015-08-29 06:48:09 +03:00
void processPhrasePairs ( std : : vector < ExtractionPhrasePair * > & phrasePairsWithSameSource , std : : ostream & phraseTableFile ,
2014-01-29 22:37:42 +04:00
const ScoreFeatureManager & featureManager , const MaybeLog & maybeLogProb )
{
if ( phrasePairsWithSameSource . size ( ) = = 0 ) {
return ;
}
2013-05-29 21:16:15 +04:00
2011-09-07 20:42:46 +04:00
float totalSource = 0 ;
2014-01-29 22:37:42 +04:00
//std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
2013-05-29 21:16:15 +04:00
2011-09-07 20:42:46 +04:00
// loop through phrase pairs
2015-08-29 06:48:09 +03:00
for ( std : : vector < ExtractionPhrasePair * > : : const_iterator iter = phrasePairsWithSameSource . begin ( ) ;
2014-01-29 22:37:42 +04:00
iter ! = phrasePairsWithSameSource . end ( ) ; + + iter ) {
2011-09-07 20:42:46 +04:00
// add to total count
2014-01-29 22:37:42 +04:00
totalSource + = ( * iter ) - > GetCount ( ) ;
2011-09-07 20:42:46 +04:00
}
// output the distinct phrase pairs, one at a time
2015-08-29 06:48:09 +03:00
for ( std : : vector < ExtractionPhrasePair * > : : const_iterator iter = phrasePairsWithSameSource . begin ( ) ;
2014-01-29 22:37:42 +04:00
iter ! = phrasePairsWithSameSource . end ( ) ; + + iter ) {
// add to total count
outputPhrasePair ( * * iter , totalSource , phrasePairsWithSameSource . size ( ) , phraseTableFile , featureManager , maybeLogProb ) ;
2012-08-25 03:47:57 +04:00
}
}
2015-01-14 14:07:42 +03:00
void outputPhrasePair ( const ExtractionPhrasePair & phrasePair ,
float totalCount , int distinctCount ,
2015-03-09 21:49:32 +03:00
std : : ostream & phraseTableFile ,
2014-01-29 22:37:42 +04:00
const ScoreFeatureManager & featureManager ,
2013-05-29 21:16:15 +04:00
const MaybeLog & maybeLogProb )
2011-09-07 20:42:46 +04:00
{
2014-02-06 23:46:32 +04:00
assert ( phrasePair . IsValid ( ) ) ;
2011-09-07 20:42:46 +04:00
2014-01-29 22:37:42 +04:00
const ALIGNMENT * bestAlignmentT2S = phrasePair . FindBestAlignmentTargetToSource ( ) ;
float count = phrasePair . GetCount ( ) ;
2011-09-07 20:42:46 +04:00
2015-03-09 21:49:32 +03:00
std : : map < std : : string , float > domainCount ;
2012-08-19 02:47:05 +04:00
2011-09-07 20:42:46 +04:00
// collect count of count statistics
if ( goodTuringFlag | | kneserNeyFlag ) {
totalDistinct + + ;
int countInt = count + 0.99999 ;
2015-03-30 19:42:55 +03:00
if ( ( countInt < = COC_MAX ) & &
( countInt > 0 ) )
2011-09-07 20:42:46 +04:00
countOfCounts [ countInt ] + + ;
}
// output phrases
2014-01-29 22:37:42 +04:00
const PHRASE * phraseSource = phrasePair . GetSource ( ) ;
const PHRASE * phraseTarget = phrasePair . GetTarget ( ) ;
2011-09-07 20:42:46 +04:00
2015-07-24 21:42:15 +03:00
// do not output if count below threshold
if ( count < minCount ) {
return ;
}
2011-09-07 20:42:46 +04:00
// do not output if hierarchical and count below threshold
if ( hierarchicalFlag & & count < minCountHierarchical ) {
2014-01-29 22:37:42 +04:00
for ( size_t j = 0 ; j < phraseSource - > size ( ) - 1 ; + + j ) {
if ( isNonTerminal ( vcbS . getWord ( phraseSource - > at ( j ) ) ) )
2011-09-07 20:42:46 +04:00
return ;
}
}
2015-07-24 21:42:15 +03:00
// compute PCFG score
float pcfgScore = 0 ;
if ( pcfgFlag & & ! inverseFlag ) {
pcfgScore = phrasePair . GetPcfgScore ( ) / count ;
}
2011-09-07 20:42:46 +04:00
// source phrase (unless inverse)
2014-01-29 22:37:42 +04:00
if ( ! inverseFlag ) {
printSourcePhrase ( phraseSource , phraseTarget , bestAlignmentT2S , phraseTableFile ) ;
2012-05-27 15:43:16 +04:00
phraseTableFile < < " ||| " ;
2011-09-07 20:42:46 +04:00
}
// target phrase
2014-01-29 22:37:42 +04:00
printTargetPhrase ( phraseSource , phraseTarget , bestAlignmentT2S , phraseTableFile ) ;
2012-05-27 15:43:16 +04:00
phraseTableFile < < " ||| " ;
2011-09-07 20:42:46 +04:00
// source phrase (if inverse)
if ( inverseFlag ) {
2014-01-29 22:37:42 +04:00
printSourcePhrase ( phraseSource , phraseTarget , bestAlignmentT2S , phraseTableFile ) ;
2012-05-27 15:43:16 +04:00
phraseTableFile < < " ||| " ;
2011-09-07 20:42:46 +04:00
}
2014-03-13 22:30:24 +04:00
// alignment
if ( hierarchicalFlag ) {
2015-01-14 14:07:42 +03:00
// always output alignment if hiero style
assert ( phraseTarget - > size ( ) = = bestAlignmentT2S - > size ( ) + 1 ) ;
std : : vector < std : : string > alignment ;
for ( size_t j = 0 ; j < phraseTarget - > size ( ) - 1 ; + + j ) {
if ( isNonTerminal ( vcbT . getWord ( phraseTarget - > at ( j ) ) ) ) {
if ( bestAlignmentT2S - > at ( j ) . size ( ) ! = 1 ) {
std : : cerr < < " Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx]). " < < std : : endl ;
phraseTableFile . flush ( ) ;
assert ( bestAlignmentT2S - > at ( j ) . size ( ) = = 1 ) ;
}
size_t sourcePos = * ( bestAlignmentT2S - > at ( j ) . begin ( ) ) ;
//phraseTableFile << sourcePos << "-" << j << " ";
std : : stringstream point ;
point < < sourcePos < < " - " < < j ;
alignment . push_back ( point . str ( ) ) ;
} else {
for ( std : : set < size_t > : : iterator setIter = ( bestAlignmentT2S - > at ( j ) ) . begin ( ) ;
setIter ! = ( bestAlignmentT2S - > at ( j ) ) . end ( ) ; + + setIter ) {
size_t sourcePos = * setIter ;
2014-03-13 22:30:24 +04:00
std : : stringstream point ;
point < < sourcePos < < " - " < < j ;
alignment . push_back ( point . str ( ) ) ;
}
}
2015-01-14 14:07:42 +03:00
}
// now print all alignments, sorted by source index
sort ( alignment . begin ( ) , alignment . end ( ) ) ;
for ( size_t i = 0 ; i < alignment . size ( ) ; + + i ) {
phraseTableFile < < alignment [ i ] < < " " ;
}
2014-03-13 22:30:24 +04:00
} else if ( ! inverseFlag & & wordAlignmentFlag ) {
2015-01-14 14:07:42 +03:00
// alignment info in pb model
for ( size_t j = 0 ; j < bestAlignmentT2S - > size ( ) ; + + j ) {
for ( std : : set < size_t > : : iterator setIter = ( bestAlignmentT2S - > at ( j ) ) . begin ( ) ;
setIter ! = ( bestAlignmentT2S - > at ( j ) ) . end ( ) ; + + setIter ) {
size_t sourcePos = * setIter ;
phraseTableFile < < sourcePos < < " - " < < j < < " " ;
2014-03-13 22:30:24 +04:00
}
2015-01-14 14:07:42 +03:00
}
2014-03-13 22:30:24 +04:00
}
phraseTableFile < < " ||| " ;
2011-09-07 20:42:46 +04:00
// lexical translation probability
if ( lexFlag ) {
2014-01-29 22:37:42 +04:00
double lexScore = computeLexicalTranslation ( phraseSource , phraseTarget , bestAlignmentT2S ) ;
phraseTableFile < < maybeLogProb ( lexScore ) ;
2011-09-07 20:42:46 +04:00
}
// unaligned word penalty
if ( unalignedFlag ) {
2014-01-29 22:37:42 +04:00
double penalty = computeUnalignedPenalty ( bestAlignmentT2S ) ;
phraseTableFile < < " " < < maybeLogProb ( penalty ) ;
2011-09-07 20:42:46 +04:00
}
// unaligned function word penalty
if ( unalignedFWFlag ) {
2014-01-29 22:37:42 +04:00
double penalty = computeUnalignedFWPenalty ( phraseTarget , bestAlignmentT2S ) ;
phraseTableFile < < " " < < maybeLogProb ( penalty ) ;
2012-08-25 03:47:57 +04:00
}
2013-05-29 21:16:15 +04:00
2012-08-25 03:47:57 +04:00
if ( crossedNonTerm & & ! inverseFlag ) {
2014-01-29 22:37:42 +04:00
phraseTableFile < < " " < < calcCrossedNonTerm ( phraseTarget , bestAlignmentT2S ) ;
2012-08-24 03:54:05 +04:00
}
2013-05-29 21:16:15 +04:00
2012-05-25 20:29:47 +04:00
// target-side PCFG score
if ( pcfgFlag & & ! inverseFlag ) {
2014-01-29 22:37:42 +04:00
phraseTableFile < < " " < < maybeLogProb ( pcfgScore ) ;
2012-08-19 02:47:05 +04:00
}
2012-11-03 03:30:51 +04:00
// extra features
2014-01-29 22:37:42 +04:00
ScoreFeatureContext context ( phrasePair , maybeLogProb ) ;
std : : vector < float > extraDense ;
2015-03-09 21:49:32 +03:00
std : : map < std : : string , float > extraSparse ;
2012-11-03 03:30:51 +04:00
featureManager . addFeatures ( context , extraDense , extraSparse ) ;
for ( size_t i = 0 ; i < extraDense . size ( ) ; + + i ) {
phraseTableFile < < " " < < extraDense [ i ] ;
}
2015-03-09 21:49:32 +03:00
for ( std : : map < std : : string , float > : : const_iterator i = extraSparse . begin ( ) ;
2013-05-29 21:16:15 +04:00
i ! = extraSparse . end ( ) ; + + i ) {
2012-11-03 03:30:51 +04:00
phraseTableFile < < " " < < i - > first < < " " < < i - > second ;
2012-05-25 20:29:47 +04:00
}
2013-09-29 21:58:20 +04:00
2011-09-21 15:04:48 +04:00
// counts
2011-09-07 20:42:46 +04:00
phraseTableFile < < " ||| " < < totalCount < < " " < < count ;
2013-05-29 21:16:15 +04:00
if ( kneserNeyFlag )
2011-09-07 20:42:46 +04:00
phraseTableFile < < " " < < distinctCount ;
2013-05-29 21:16:15 +04:00
2014-06-03 20:10:09 +04:00
phraseTableFile < < " ||| " ;
2013-09-13 19:10:21 +04:00
// tree fragments
if ( treeFragmentsFlag & & ! inverseFlag ) {
2014-01-29 22:37:42 +04:00
const std : : string * bestTreeFragment = phrasePair . FindBestPropertyValue ( " Tree " ) ;
if ( bestTreeFragment ) {
phraseTableFile < < " {{Tree " < < * bestTreeFragment < < " }} " ;
}
2013-09-11 18:46:37 +04:00
}
2015-03-05 00:40:56 +03:00
// parts-of-speech
if ( partsOfSpeechFlag & & ! inverseFlag ) {
phrasePair . UpdateVocabularyFromValueTokens ( " POS " , partsOfSpeechSet ) ;
const std : : string * bestPartOfSpeech = phrasePair . FindBestPropertyValue ( " POS " ) ;
if ( bestPartOfSpeech ) {
phraseTableFile < < " {{POS " < < * bestPartOfSpeech < < " }} " ;
}
}
2014-06-11 22:27:18 +04:00
// syntax labels
2016-01-10 02:02:31 +03:00
if ( ( sourceSyntaxLabelsFlag | | targetSyntacticPreferencesFlag ) & & ! inverseFlag ) {
2014-06-11 22:27:18 +04:00
unsigned nNTs = 1 ;
for ( size_t j = 0 ; j < phraseSource - > size ( ) - 1 ; + + j ) {
if ( isNonTerminal ( vcbS . getWord ( phraseSource - > at ( j ) ) ) )
+ + nNTs ;
}
// source syntax labels
if ( sourceSyntaxLabelsFlag ) {
std : : string sourceLabelCounts ;
sourceLabelCounts = phrasePair . CollectAllLabelsSeparateLHSAndRHS ( " SourceLabels " ,
2015-01-14 14:07:42 +03:00
sourceLabelSet ,
sourceLHSCounts ,
targetLHSAndSourceLHSJointCounts ,
vcbT ) ;
2014-06-11 22:27:18 +04:00
if ( ! sourceLabelCounts . empty ( ) ) {
phraseTableFile < < " {{SourceLabels "
2015-01-23 21:41:18 +03:00
< < phraseSource - > size ( ) // for convenience: number of symbols in this rule (incl. left hand side NT)
2014-06-11 22:27:18 +04:00
< < " "
< < count // rule count
< < sourceLabelCounts
< < " }} " ;
}
}
2016-01-10 02:02:31 +03:00
// target syntactic preferences labels
if ( targetSyntacticPreferencesFlag ) {
std : : string targetSyntacticPreferencesLabelCounts ;
targetSyntacticPreferencesLabelCounts = phrasePair . CollectAllLabelsSeparateLHSAndRHS ( " TargetPreferences " ,
2016-01-10 03:00:35 +03:00
targetSyntacticPreferencesLabelSet ,
targetSyntacticPreferencesLHSCounts ,
ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts ,
vcbT ) ;
2016-01-10 02:02:31 +03:00
if ( ! targetSyntacticPreferencesLabelCounts . empty ( ) ) {
2014-06-11 22:27:18 +04:00
phraseTableFile < < " {{TargetPreferences "
< < nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
< < " "
< < count // rule count
2016-01-10 02:02:31 +03:00
< < targetSyntacticPreferencesLabelCounts
2014-06-11 22:27:18 +04:00
< < " }} " ;
}
}
}
2014-07-28 21:27:12 +04:00
// phrase orientation
if ( phraseOrientationFlag & & ! inverseFlag ) {
phraseTableFile < < " {{Orientation " ;
phrasePair . CollectAllPhraseOrientations ( " Orientation " , orientationClassPriorsL2R , orientationClassPriorsR2L , 0.5 , phraseTableFile ) ;
phraseTableFile < < " }} " ;
}
2014-06-12 16:26:01 +04:00
if ( spanLength & & ! inverseFlag ) {
2015-03-09 21:49:32 +03:00
std : : string propValue = phrasePair . CollectAllPropertyValues ( " SpanLength " ) ;
2015-01-14 14:07:42 +03:00
if ( ! propValue . empty ( ) ) {
phraseTableFile < < " {{SpanLength " < < propValue < < " }} " ;
}
2014-06-12 16:26:01 +04:00
}
2015-03-10 18:28:45 +03:00
if ( ruleLength & & ! inverseFlag ) {
std : : string propValue = phrasePair . CollectAllPropertyValues ( " RuleLength " ) ;
2015-01-14 14:07:42 +03:00
if ( ! propValue . empty ( ) ) {
2015-03-10 18:28:45 +03:00
phraseTableFile < < " {{RuleLength " < < propValue < < " }} " ;
}
}
if ( nonTermContext & & ! inverseFlag ) {
2015-05-02 13:45:24 +03:00
std : : string propValue = phrasePair . CollectAllPropertyValues ( " NonTermContext " ) ;
2015-03-10 18:28:45 +03:00
if ( ! propValue . empty ( ) & & propValue . size ( ) < 50000 ) {
size_t nNTs = NumNonTerminal ( phraseSource ) ;
phraseTableFile < < " {{NonTermContext " < < nNTs < < " " < < propValue < < " }} " ;
}
}
if ( nonTermContextTarget & & ! inverseFlag ) {
2015-05-02 13:45:24 +03:00
std : : string propValue = phrasePair . CollectAllPropertyValues ( " NonTermContextTarget " ) ;
2015-03-10 18:28:45 +03:00
if ( ! propValue . empty ( ) & & propValue . size ( ) < 50000 ) {
size_t nNTs = NumNonTerminal ( phraseSource ) ;
phraseTableFile < < " {{NonTermContextTarget " < < nNTs < < " " < < propValue < < " }} " ;
2015-01-14 14:07:42 +03:00
}
2014-06-13 20:04:41 +04:00
}
2016-02-12 20:46:57 +03:00
// target constituent boundaries
if ( targetConstituentBoundariesFlag & & ! inverseFlag ) {
const std : : string targetConstituentBoundariesLeftValues = phrasePair . CollectAllPropertyValues ( " TargetConstituentBoundariesLeft " ) ;
if ( ! targetConstituentBoundariesLeftValues . empty ( ) ) {
phraseTableFile < < " {{TargetConstituentBoundariesLeft " < < targetConstituentBoundariesLeftValues < < " }} " ;
}
const std : : string targetConstituentBoundariesRightAdjacentValues = phrasePair . CollectAllPropertyValues ( " TargetConstituentBoundariesRightAdjacent " ) ;
if ( ! targetConstituentBoundariesRightAdjacentValues . empty ( ) ) {
phraseTableFile < < " {{TargetConstituentBoundariesRightAdjacent " < < targetConstituentBoundariesRightAdjacentValues < < " }} " ;
}
}
2014-01-29 22:37:42 +04:00
phraseTableFile < < std : : endl ;
}
2015-03-10 18:28:45 +03:00
size_t NumNonTerminal ( const PHRASE * phraseSource )
{
size_t nNTs = 0 ;
for ( size_t j = 0 ; j < phraseSource - > size ( ) - 1 ; + + j ) {
if ( isNonTerminal ( vcbS . getWord ( phraseSource - > at ( j ) ) ) )
+ + nNTs ;
}
return nNTs ;
}
2014-01-29 22:37:42 +04:00
2015-01-14 14:07:42 +03:00
void loadOrientationPriors ( const std : : string & fileNamePhraseOrientationPriors ,
std : : vector < float > & orientationClassPriorsL2R ,
2014-07-28 21:27:12 +04:00
std : : vector < float > & orientationClassPriorsR2L )
{
2015-01-21 00:41:41 +03:00
assert ( orientationClassPriorsL2R . size ( ) = = 4 & & orientationClassPriorsR2L . size ( ) = = 4 ) ; // mono swap dleft dright
2015-01-14 14:07:42 +03:00
2014-07-28 21:27:12 +04:00
std : : cerr < < " Loading phrase orientation priors from " < < fileNamePhraseOrientationPriors ;
2015-03-09 21:49:32 +03:00
Moses : : InputFileStream inFile ( fileNamePhraseOrientationPriors ) ;
2014-07-28 21:27:12 +04:00
if ( inFile . fail ( ) ) {
std : : cerr < < " - ERROR: could not open file " < < std : : endl ;
exit ( 1 ) ;
}
std : : string line ;
size_t linesRead = 0 ;
float l2rSum = 0 ;
float r2lSum = 0 ;
while ( getline ( inFile , line ) ) {
2015-03-09 21:49:32 +03:00
std : : istringstream tokenizer ( line ) ;
2014-07-28 21:27:12 +04:00
std : : string key ;
tokenizer > > key ;
bool l2rFlag = false ;
bool r2lFlag = false ;
2015-02-05 19:23:47 +03:00
if ( starts_with ( key , " L2R_ " ) ) {
2014-07-28 21:27:12 +04:00
l2rFlag = true ;
}
2015-02-05 19:23:47 +03:00
if ( starts_with ( key , " R2L_ " ) ) {
2014-07-28 21:27:12 +04:00
r2lFlag = true ;
}
if ( ! l2rFlag & & ! r2lFlag ) {
2015-01-14 14:07:42 +03:00
std : : cerr < < " - ERROR: malformed line in orientation priors file " < < std : : endl ;
2014-07-28 21:27:12 +04:00
}
key . erase ( 0 , 4 ) ;
int orientationClassId = - 1 ;
if ( ! key . compare ( " mono " ) ) {
orientationClassId = 0 ;
}
if ( ! key . compare ( " swap " ) ) {
orientationClassId = 1 ;
}
2015-01-21 00:41:41 +03:00
if ( ! key . compare ( " dleft " ) ) {
2014-07-28 21:27:12 +04:00
orientationClassId = 2 ;
}
2015-01-21 00:41:41 +03:00
if ( ! key . compare ( " dright " ) ) {
2014-07-28 21:27:12 +04:00
orientationClassId = 3 ;
}
if ( orientationClassId = = - 1 ) {
2015-01-14 14:07:42 +03:00
std : : cerr < < " - ERROR: malformed line in orientation priors file " < < std : : endl ;
2014-07-28 21:27:12 +04:00
}
float count ;
tokenizer > > count ;
if ( l2rFlag ) {
orientationClassPriorsL2R [ orientationClassId ] + = count ;
l2rSum + = count ;
}
if ( r2lFlag ) {
orientationClassPriorsR2L [ orientationClassId ] + = count ;
r2lSum + = count ;
}
+ + linesRead ;
}
// normalization: return prior probabilities, not counts
if ( l2rSum ! = 0 ) {
for ( std : : vector < float > : : iterator orientationClassPriorsL2RIt = orientationClassPriorsL2R . begin ( ) ;
orientationClassPriorsL2RIt ! = orientationClassPriorsL2R . end ( ) ; + + orientationClassPriorsL2RIt ) {
* orientationClassPriorsL2RIt / = l2rSum ;
}
}
if ( r2lSum ! = 0 ) {
for ( std : : vector < float > : : iterator orientationClassPriorsR2LIt = orientationClassPriorsR2L . begin ( ) ;
orientationClassPriorsR2LIt ! = orientationClassPriorsR2L . end ( ) ; + + orientationClassPriorsR2LIt ) {
* orientationClassPriorsR2LIt / = r2lSum ;
}
}
std : : cerr < < " - read " < < linesRead < < " lines from orientation priors file " < < std : : endl ;
2015-03-09 21:49:32 +03:00
inFile . Close ( ) ;
2014-07-28 21:27:12 +04:00
}
2014-01-29 22:37:42 +04:00
bool calcCrossedNonTerm ( size_t targetPos , size_t sourcePos , const ALIGNMENT * alignmentTargetToSource )
{
for ( size_t currTarget = 0 ; currTarget < alignmentTargetToSource - > size ( ) ; + + currTarget ) {
if ( currTarget = = targetPos ) {
// skip
} else {
const std : : set < size_t > & sourceSet = alignmentTargetToSource - > at ( currTarget ) ;
2015-01-14 14:07:42 +03:00
for ( std : : set < size_t > : : const_iterator iter = sourceSet . begin ( ) ;
2014-01-29 22:37:42 +04:00
iter ! = sourceSet . end ( ) ; + + iter ) {
size_t currSource = * iter ;
if ( ( currTarget < targetPos & & currSource > sourcePos )
| | ( currTarget > targetPos & & currSource < sourcePos )
) {
return true ;
}
}
2013-09-11 18:46:37 +04:00
2014-01-29 22:37:42 +04:00
}
}
return false ;
2011-09-07 20:42:46 +04:00
}
2014-01-29 22:37:42 +04:00
int calcCrossedNonTerm ( const PHRASE * phraseTarget , const ALIGNMENT * alignmentTargetToSource )
{
assert ( phraseTarget - > size ( ) > = alignmentTargetToSource - > size ( ) ) ;
for ( size_t targetPos = 0 ; targetPos < alignmentTargetToSource - > size ( ) ; + + targetPos ) {
if ( isNonTerminal ( vcbT . getWord ( phraseTarget - > at ( targetPos ) ) ) ) {
const std : : set < size_t > & alignmentPoints = alignmentTargetToSource - > at ( targetPos ) ;
assert ( alignmentPoints . size ( ) = = 1 ) ;
size_t sourcePos = * alignmentPoints . begin ( ) ;
bool ret = calcCrossedNonTerm ( targetPos , sourcePos , alignmentTargetToSource ) ;
if ( ret )
return 1 ;
}
}
return 0 ;
}
double computeUnalignedPenalty ( const ALIGNMENT * alignmentTargetToSource )
2011-09-07 20:42:46 +04:00
{
// unaligned word counter
double unaligned = 1.0 ;
// only checking target words - source words are caught when computing inverse
2014-01-29 22:37:42 +04:00
for ( size_t ti = 0 ; ti < alignmentTargetToSource - > size ( ) ; + + ti ) {
2015-03-09 21:49:32 +03:00
const std : : set < size_t > & srcIndices = alignmentTargetToSource - > at ( ti ) ;
2011-09-07 20:42:46 +04:00
if ( srcIndices . empty ( ) ) {
unaligned * = 2.718 ;
}
}
return unaligned ;
}
2014-01-29 22:37:42 +04:00
double computeUnalignedFWPenalty ( const PHRASE * phraseTarget , const ALIGNMENT * alignmentTargetToSource )
2011-09-07 20:42:46 +04:00
{
// unaligned word counter
double unaligned = 1.0 ;
// only checking target words - source words are caught when computing inverse
2014-01-29 22:37:42 +04:00
for ( size_t ti = 0 ; ti < alignmentTargetToSource - > size ( ) ; + + ti ) {
2015-03-09 21:49:32 +03:00
const std : : set < size_t > & srcIndices = alignmentTargetToSource - > at ( ti ) ;
2014-01-29 22:37:42 +04:00
if ( srcIndices . empty ( ) & & functionWordList . find ( vcbT . getWord ( phraseTarget - > at ( ti ) ) ) ! = functionWordList . end ( ) ) {
2011-09-07 20:42:46 +04:00
unaligned * = 2.718 ;
}
}
return unaligned ;
}
2015-03-09 21:49:32 +03:00
void loadFunctionWords ( const std : : string & fileName )
2011-09-07 20:42:46 +04:00
{
2014-01-29 22:37:42 +04:00
std : : cerr < < " Loading function word list from " < < fileName ;
2015-03-09 21:49:32 +03:00
Moses : : InputFileStream inFile ( fileName ) ;
2011-09-07 20:42:46 +04:00
if ( inFile . fail ( ) ) {
2014-01-29 22:37:42 +04:00
std : : cerr < < " - ERROR: could not open file " < < std : : endl ;
2011-09-07 20:42:46 +04:00
exit ( 1 ) ;
}
2015-03-09 21:49:32 +03:00
std : : string line ;
while ( getline ( inFile , line ) ) {
std : : vector < std : : string > token ;
Moses : : Tokenize ( token , line ) ;
2011-09-07 20:42:46 +04:00
if ( token . size ( ) > 0 )
functionWordList . insert ( token [ 0 ] ) ;
}
2014-01-29 22:37:42 +04:00
std : : cerr < < " - read " < < functionWordList . size ( ) < < " function words " < < std : : endl ;
2015-03-09 21:49:32 +03:00
inFile . Close ( ) ;
2011-09-07 20:42:46 +04:00
}
2014-01-29 22:37:42 +04:00
double computeLexicalTranslation ( const PHRASE * phraseSource , const PHRASE * phraseTarget , const ALIGNMENT * alignmentTargetToSource )
2011-09-07 20:42:46 +04:00
{
// lexical translation probability
double lexScore = 1.0 ;
int null = vcbS . getWordID ( " NULL " ) ;
// all target words have to be explained
2014-01-29 22:37:42 +04:00
for ( size_t ti = 0 ; ti < alignmentTargetToSource - > size ( ) ; ti + + ) {
2015-03-09 21:49:32 +03:00
const std : : set < size_t > & srcIndices = alignmentTargetToSource - > at ( ti ) ;
2011-09-07 20:42:46 +04:00
if ( srcIndices . empty ( ) ) {
// explain unaligned word by NULL
2014-01-29 22:37:42 +04:00
lexScore * = lexTable . permissiveLookup ( null , phraseTarget - > at ( ti ) ) ;
2011-09-07 20:42:46 +04:00
} else {
// go through all the aligned words to compute average
double thisWordScore = 0 ;
2015-03-09 21:49:32 +03:00
for ( std : : set < size_t > : : const_iterator p ( srcIndices . begin ( ) ) ; p ! = srcIndices . end ( ) ; + + p ) {
2014-01-29 22:37:42 +04:00
thisWordScore + = lexTable . permissiveLookup ( phraseSource - > at ( * p ) , phraseTarget - > at ( ti ) ) ;
2011-09-07 20:42:46 +04:00
}
lexScore * = thisWordScore / ( double ) srcIndices . size ( ) ;
}
}
return lexScore ;
}
2014-01-29 22:37:42 +04:00
2015-03-09 21:49:32 +03:00
void LexicalTable : : load ( const std : : string & fileName )
2011-09-07 20:42:46 +04:00
{
2014-01-29 22:37:42 +04:00
std : : cerr < < " Loading lexical translation table from " < < fileName ;
2015-03-09 21:49:32 +03:00
Moses : : InputFileStream inFile ( fileName ) ;
2011-09-07 20:42:46 +04:00
if ( inFile . fail ( ) ) {
2014-01-29 22:37:42 +04:00
std : : cerr < < " - ERROR: could not open file " < < std : : endl ;
2011-09-07 20:42:46 +04:00
exit ( 1 ) ;
}
2015-03-09 21:49:32 +03:00
std : : string line ;
2011-09-07 20:42:46 +04:00
int i = 0 ;
2015-03-09 21:49:32 +03:00
while ( getline ( inFile , line ) ) {
2011-09-07 20:42:46 +04:00
i + + ;
2015-03-09 21:49:32 +03:00
if ( i % 100000 = = 0 ) std : : cerr < < " . " < < std : : flush ;
2011-09-07 20:42:46 +04:00
2015-03-09 21:49:32 +03:00
std : : vector < std : : string > token ;
Moses : : Tokenize ( token , line ) ;
2011-09-07 20:42:46 +04:00
if ( token . size ( ) ! = 3 ) {
2015-01-14 14:07:42 +03:00
std : : cerr < < " line " < < i < < " in " < < fileName
< < " has wrong number of tokens, skipping: " < < std : : endl
< < token . size ( ) < < " " < < token [ 0 ] < < " " < < line < < std : : endl ;
2011-09-07 20:42:46 +04:00
continue ;
}
2015-07-24 22:43:29 +03:00
double prob = std : : atof ( token [ 2 ] . c_str ( ) ) ;
2011-09-07 20:42:46 +04:00
WORD_ID wordT = vcbT . storeIfNew ( token [ 0 ] ) ;
WORD_ID wordS = vcbS . storeIfNew ( token [ 1 ] ) ;
ltable [ wordS ] [ wordT ] = prob ;
}
2014-01-29 22:37:42 +04:00
std : : cerr < < std : : endl ;
2011-09-07 20:42:46 +04:00
}
2012-01-12 18:34:52 +04:00
2014-01-29 22:37:42 +04:00
void printSourcePhrase ( const PHRASE * phraseSource , const PHRASE * phraseTarget ,
2015-03-09 21:49:32 +03:00
const ALIGNMENT * targetToSourceAlignment , std : : ostream & out )
2012-05-27 15:43:16 +04:00
{
2014-01-29 22:37:42 +04:00
// get corresponding target non-terminal and output pair
ALIGNMENT * sourceToTargetAlignment = new ALIGNMENT ( ) ;
invertAlignment ( phraseSource , phraseTarget , targetToSourceAlignment , sourceToTargetAlignment ) ;
2012-05-27 15:43:16 +04:00
// output source symbols, except root, in rule table format
2014-01-29 22:37:42 +04:00
for ( std : : size_t i = 0 ; i < phraseSource - > size ( ) - 1 ; + + i ) {
const std : : string & word = vcbS . getWord ( phraseSource - > at ( i ) ) ;
2012-05-27 15:43:16 +04:00
if ( ! unpairedExtractFormatFlag | | ! isNonTerminal ( word ) ) {
out < < word < < " " ;
continue ;
}
2014-01-29 22:37:42 +04:00
const std : : set < std : : size_t > & alignmentPoints = sourceToTargetAlignment - > at ( i ) ;
2012-05-27 15:43:16 +04:00
assert ( alignmentPoints . size ( ) = = 1 ) ;
2014-01-29 22:37:42 +04:00
size_t j = * ( alignmentPoints . begin ( ) ) ;
2012-05-27 15:43:16 +04:00
if ( inverseFlag ) {
2014-01-29 22:37:42 +04:00
out < < vcbT . getWord ( phraseTarget - > at ( j ) ) < < word < < " " ;
2012-05-27 15:43:16 +04:00
} else {
2014-01-29 22:37:42 +04:00
out < < word < < vcbT . getWord ( phraseTarget - > at ( j ) ) < < " " ;
2012-05-27 15:43:16 +04:00
}
}
// output source root symbol
if ( conditionOnTargetLhsFlag & & ! inverseFlag ) {
out < < " [X] " ;
} else {
2014-01-29 22:37:42 +04:00
out < < vcbS . getWord ( phraseSource - > back ( ) ) ;
2012-05-27 15:43:16 +04:00
}
2014-01-29 22:37:42 +04:00
delete sourceToTargetAlignment ;
2012-05-27 15:43:16 +04:00
}
2014-01-29 22:37:42 +04:00
void printTargetPhrase ( const PHRASE * phraseSource , const PHRASE * phraseTarget ,
2015-03-09 21:49:32 +03:00
const ALIGNMENT * targetToSourceAlignment , std : : ostream & out )
2012-05-27 15:43:16 +04:00
{
// output target symbols, except root, in rule table format
2014-01-29 22:37:42 +04:00
for ( std : : size_t i = 0 ; i < phraseTarget - > size ( ) - 1 ; + + i ) {
const std : : string & word = vcbT . getWord ( phraseTarget - > at ( i ) ) ;
2012-05-27 15:43:16 +04:00
if ( ! unpairedExtractFormatFlag | | ! isNonTerminal ( word ) ) {
out < < word < < " " ;
continue ;
}
// get corresponding source non-terminal and output pair
2014-01-29 22:37:42 +04:00
std : : set < std : : size_t > alignmentPoints = targetToSourceAlignment - > at ( i ) ;
2012-05-27 15:43:16 +04:00
assert ( alignmentPoints . size ( ) = = 1 ) ;
int j = * ( alignmentPoints . begin ( ) ) ;
if ( inverseFlag ) {
2014-01-29 22:37:42 +04:00
out < < word < < vcbS . getWord ( phraseSource - > at ( j ) ) < < " " ;
2012-05-27 15:43:16 +04:00
} else {
2014-01-29 22:37:42 +04:00
out < < vcbS . getWord ( phraseSource - > at ( j ) ) < < word < < " " ;
2012-05-27 15:43:16 +04:00
}
}
// output target root symbol
if ( conditionOnTargetLhsFlag ) {
if ( inverseFlag ) {
out < < " [X] " ;
} else {
2014-01-29 22:37:42 +04:00
out < < vcbS . getWord ( phraseSource - > back ( ) ) ;
2012-05-27 15:43:16 +04:00
}
} else {
2014-01-29 22:37:42 +04:00
out < < vcbT . getWord ( phraseTarget - > back ( ) ) ;
2012-05-27 15:43:16 +04:00
}
}
2013-05-29 21:16:15 +04:00
2014-01-29 22:37:42 +04:00
void invertAlignment ( const PHRASE * phraseSource , const PHRASE * phraseTarget ,
2015-01-14 14:07:42 +03:00
const ALIGNMENT * inTargetToSourceAlignment , ALIGNMENT * outSourceToTargetAlignment )
{
// typedef std::vector< std::set<size_t> > ALIGNMENT;
2013-05-29 21:16:15 +04:00
2014-01-29 22:37:42 +04:00
outSourceToTargetAlignment - > clear ( ) ;
size_t numberOfSourceSymbols = ( hierarchicalFlag ? phraseSource - > size ( ) - 1 : phraseSource - > size ( ) ) ;
outSourceToTargetAlignment - > resize ( numberOfSourceSymbols ) ;
// add alignment point
for ( size_t targetPosition = 0 ; targetPosition < inTargetToSourceAlignment - > size ( ) ; + + targetPosition ) {
2015-01-14 14:07:42 +03:00
for ( std : : set < size_t > : : iterator setIter = ( inTargetToSourceAlignment - > at ( targetPosition ) ) . begin ( ) ;
2014-01-29 22:37:42 +04:00
setIter ! = ( inTargetToSourceAlignment - > at ( targetPosition ) ) . end ( ) ; + + setIter ) {
size_t sourcePosition = * setIter ;
outSourceToTargetAlignment - > at ( sourcePosition ) . insert ( targetPosition ) ;
}
}
2012-01-12 18:34:52 +04:00
}
2014-06-11 22:27:18 +04:00