This commit is contained in:
Hieu Hoang 2013-07-09 12:41:08 +01:00
commit 49ede97304
21 changed files with 154 additions and 105 deletions

View File

@ -65,7 +65,7 @@ int main(int argc, char **argv)
sourcePhrase.CreateFromString(Input, input, line, "||dummy_string||", NULL);
TargetPhraseVectorPtr decodedPhraseColl
= pdc.GetTargetPhraseCollectionRaw(sourcePhrase);
= pdc.GetTargetPhraseCollectionRaw(sourcePhrase);
if(decodedPhraseColl != NULL) {
if(reportCounts)

View File

@ -20,24 +20,6 @@ void OpSequenceModel :: readLanguageModel(const char *lmFile)
{
string unkOp = "_TRANS_SLF_";
/*
// Code for SRILM
vector <int> numbers;
int nonWordFlag = 0;
ptrOp = new Api;
ptrOp -> read_lm(lmFile,lmOrder);
numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
*/
// Code to load KenLM
OSM = new Model(m_lmPath.c_str());
State startState = OSM->NullContextState();
State endState;
@ -48,36 +30,6 @@ void OpSequenceModel :: readLanguageModel(const char *lmFile)
void OpSequenceModel::Load()
{
/*
// load future cost
//vector <string> input;
ifstream sr (m_featurePath.c_str());
char* tmp;
CHECK(sr.is_open());
vector<FactorType> factorOrder;
factorOrder.push_back(0);
string line;
while (std::getline(sr, line))
{
std::vector<std::string> tokens;
tokens = TokenizeMultiCharSeparator(line, "|||");
CHECK(tokens.size() == 3);
Phrase source, target;
source.CreateFromString(Input, factorOrder, tokens[0], "|", NULL);
target.CreateFromString(Output, factorOrder, tokens[1], "|", NULL);
ParallelPhrase pp(source, target);
Scores scores = Tokenize<float>(tokens[2], " ");
m_futureCost[pp] = scores;
// m_coll[pp] = scores;
}
*/
readLanguageModel(m_lmPath.c_str());
}
@ -284,9 +236,8 @@ std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const
void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
{
if (key == "feature-path") {
m_featurePath = value;
} else if (key == "path") {
if (key == "path") {
m_lmPath = value;
} else if (key == "order") {
lmOrder = Scan<int>(value);

View File

@ -60,8 +60,7 @@ protected:
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
std::set <int> targetNullWords;
std::string m_featurePath, m_lmPath;
std::string m_lmPath;
};

View File

@ -38,7 +38,7 @@ public:
}
LabelId add(const Key& k) {
std::pair<typename M::iterator,bool> p
=m.insert(std::make_pair(k,data.size()));
=m.insert(std::make_pair(k,data.size()));
if(p.second) data.push_back(k);
CHECK(static_cast<size_t>(p.first->second)<data.size());
return p.first->second;

View File

@ -948,7 +948,7 @@ const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGra
boost::mutex::scoped_lock lock(m_transOptCacheMutex);
#endif
std::map<std::pair<std::pair<size_t, std::string>, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter
= m_transOptCache.find(key);
= m_transOptCache.find(key);
if (iter == m_transOptCache.end())
return NULL;
iter->second.second = clock(); // update last used time

View File

@ -673,7 +673,7 @@ public:
return false;
}
std::map< std::string, std::set< std::string > >::const_iterator lookupIgnoreFF
= m_weightSettingIgnoreFF.find( m_currentWeightSetting );
= m_weightSettingIgnoreFF.find( m_currentWeightSetting );
if (lookupIgnoreFF == m_weightSettingIgnoreFF.end()) {
return false;
}
@ -691,7 +691,7 @@ public:
return false;
}
std::map< std::string, std::set< size_t > >::const_iterator lookupIgnoreDP
= m_weightSettingIgnoreDP.find( m_currentWeightSetting );
= m_weightSettingIgnoreDP.find( m_currentWeightSetting );
if (lookupIgnoreDP == m_weightSettingIgnoreDP.end()) {
return false;
}

View File

@ -428,7 +428,7 @@ void CompressionTaskReordering::operator()()
while(scoresNum < m_encodedScores.size()) {
std::string scores = m_encodedScores[scoresNum];
std::string compressedScores
= m_creator.CompressEncodedScores(scores);
= m_creator.CompressEncodedScores(scores);
std::string dummy;
PackedItem packedItem(scoresNum, dummy, compressedScores, 0);

View File

@ -61,7 +61,7 @@ PhraseDecoder::~PhraseDecoder()
inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
= m_sourceSymbolsMap.find(symbol);
= m_sourceSymbolsMap.find(symbol);
if(it != m_sourceSymbolsMap.end())
return it->second;
@ -200,7 +200,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &
if(m_coding == PREnc) {
std::pair<TargetPhraseVectorPtr, size_t> cachedPhraseColl
= m_decodingCache.Retrieve(sourcePhrase);
= m_decodingCache.Retrieve(sourcePhrase);
// Has been cached and is complete or does not need to be completed
if(cachedPhraseColl.first != NULL && (!topLevel || cachedPhraseColl.second == 0))
@ -255,7 +255,7 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
if(m_coding == REnc) {
for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
std::string sourceWord
= sourcePhrase.GetWord(i).GetString(*m_input, false);
= sourcePhrase.GetWord(i).GetString(*m_input, false);
unsigned idx = GetSourceSymbolId(sourceWord);
sourceWords.push_back(idx);
}

View File

@ -117,7 +117,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c
// Retrieve target phrase collection from phrase table
TargetPhraseVectorPtr decodedPhraseColl
= m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true);
= m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true);
if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));

View File

@ -426,7 +426,7 @@ void PhraseTableCreator::AddTargetSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
= m_sourceSymbolsMap.find(symbol);
= m_sourceSymbolsMap.find(symbol);
if(it != m_sourceSymbolsMap.end())
return it->second;
@ -437,7 +437,7 @@ unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
= m_targetSymbolsMap.find(symbol);
= m_targetSymbolsMap.find(symbol);
if(it != m_targetSymbolsMap.end())
return it->second;
@ -451,7 +451,7 @@ unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol)
boost::mutex::scoped_lock lock(m_mutex);
#endif
boost::unordered_map<std::string, unsigned>::iterator it
= m_targetSymbolsMap.find(symbol);
= m_targetSymbolsMap.find(symbol);
if(it != m_targetSymbolsMap.end())
return it->second;
@ -1212,7 +1212,7 @@ void CompressionTask::operator()()
while(collectionNum < m_encodedCollections.size()) {
std::string collection = m_encodedCollections[collectionNum];
std::string compressedCollection
= m_creator.CompressEncodedCollection(collection);
= m_creator.CompressEncodedCollection(collection);
std::string dummy;
PackedItem packedItem(collectionNum, dummy, compressedCollection, 0);

View File

@ -143,7 +143,7 @@ public:
return data;
else {
typename std::vector<DataType>::iterator it
= std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
= std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
if(it != m_bestVec.end())
return *it;
else

View File

@ -108,28 +108,28 @@ enum DistortionOrientationOptions {
enum PhraseTableImplementation {
Memory = 0
,Binary = 1
,OnDisk = 2
//,GlueRule = 3
//,Joshua = 4
//,MemorySourceLabel = 5
,SCFG = 6
//,BerkeleyDb = 7
,SuffixArray = 8
,Hiero = 9
,ALSuffixArray = 10
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
,DSuffixArray = 14
,Binary = 1
,OnDisk = 2
//,GlueRule = 3
//,Joshua = 4
//,MemorySourceLabel = 5
,SCFG = 6
//,BerkeleyDb = 7
,SuffixArray = 8
,Hiero = 9
,ALSuffixArray = 10
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
,DSuffixArray = 14
};
enum InputTypeEnum {
SentenceInput = 0
,ConfusionNetworkInput = 1
,WordLatticeInput = 2
,TreeInputType = 3
,WordLatticeInput2 = 4
,ConfusionNetworkInput = 1
,WordLatticeInput = 2
,TreeInputType = 3
,WordLatticeInput2 = 4
};
@ -142,7 +142,7 @@ enum XmlInputType {
enum DictionaryFind {
Best = 0
,All = 1
,All = 1
};
enum ParsingAlgorithm {
@ -152,22 +152,22 @@ enum ParsingAlgorithm {
enum SearchAlgorithm {
Normal = 0
,CubePruning = 1
,CubeGrowing = 2
,ChartDecoding= 3
,NormalBatch = 4
,ChartIncremental = 5
,CubePruning = 1
,CubeGrowing = 2
,ChartDecoding= 3
,NormalBatch = 4
,ChartIncremental = 5
};
enum SourceLabelOverlap {
SourceLabelOverlapAdd = 0
,SourceLabelOverlapReplace = 1
,SourceLabelOverlapDiscard = 2
,SourceLabelOverlapReplace = 1
,SourceLabelOverlapDiscard = 2
};
enum WordAlignmentSort {
NoSort = 0
,TargetOrder = 1
,TargetOrder = 1
};
enum FormatType {

View File

@ -137,7 +137,7 @@ void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
const SplitPoints &point = *p;
if (point.size() > 3) {
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], point[point.size()-1]-1);
= tree.GetNodes( point[0], point[point.size()-1]-1);
string topLabel = topNodes[0]->GetLabel();
for(size_t i=2; i<point.size()-1; i++) {
@ -155,7 +155,7 @@ void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
if (point.size() > 3) {
int endPoint = point[point.size()-1]-1;
const vector< SyntaxNode* >& topNodes
= tree.GetNodes( point[0], endPoint);
= tree.GetNodes( point[0], endPoint);
string topLabel = topNodes[0]->GetLabel();
for(size_t i=1; i<point.size()-2; i++) {

34
scripts/OSM/OSM-Train.sh Executable file
View File

@ -0,0 +1,34 @@
#!/bin/sh
# Train an Operation Sequence Model (OSM) from a word-aligned parallel corpus.
#
# Positional arguments:
#   $1  corpus file linked into the working directory as "e"
#       (passed to extract-singletons.perl in its "target" slot)
#   $2  corpus file linked into the working directory as "f"
#       (passed to extract-singletons.perl in its "source" slot)
#   $3  word-alignment file, fed to flipAlignment
#   $4  n-gram order handed to ngram-count
#   $5  working/output directory (created if it does not exist)
#   $6  Moses root; must contain scripts/OSM/ and bin/build_binary
#   $7  directory that contains the ngram-count binary
#
# Files left in $5: Singletons, opCorpus, operationLM, operationLM.bin.
# The temporary e, f and align files are removed at the end.

if [ "$#" -lt 7 ]; then
  echo "Usage: $0 e-corpus f-corpus alignment lm-order out-dir moses-dir srilm-dir" >&2
  exit 1
fi

# Stop at the first failing step so that a broken intermediate file is never
# turned into a (useless) language model and reported as a successful run.
set -e

echo 'Training OSM - Start'
date

# -p / -f: tolerate a working directory left behind by an earlier run.
mkdir -p "$5"

ln -sf "$1" "$5/e"
ln -sf "$2" "$5/f"

"$6/scripts/OSM/flipAlignment" "$3" > "$5/align"

echo 'Extracting Singletons'

"$6/scripts/OSM/extract-singletons.perl" "$5/e" "$5/f" "$5/align" > "$5/Singletons"

echo 'Converting Bilingual Sentence Pair into Operation Corpus'

"$6/scripts/OSM/generateSequences" "$5/e" "$5/f" "$5/align" "$5/Singletons" > "$5/opCorpus" # Generates Operation Corpus

echo 'Learning Operation Sequence Translation Model'

"$7/ngram-count" -kndiscount -order "$4" -unk -text "$5/opCorpus" -lm "$5/operationLM"

echo 'Binarizing'

"$6/bin/build_binary" "$5/operationLM" "$5/operationLM.bin"

# Remove the temporary links and the flipped alignment.
rm -f "$5/e" "$5/f" "$5/align"

echo 'Training OSM - End'
date

View File

@ -0,0 +1,46 @@
#!/usr/bin/perl
# Print, one per line and in sorted order, every word that is seen exactly
# once in the alignment links of the corpus, where that single occurrence is
# a one-to-one link between identical target and source words.
#
# Usage: extract-singletons.perl [-q] target source align
#   target, source  sentence-per-line, whitespace-tokenized corpus files
#   align           one line per sentence holding whitespace-separated index
#                   pairs; each pair is read as (target index, source index)
#   -q              do not print the progress counter on STDERR

use strict;
use Getopt::Std;

my %opts;
getopts('q', \%opts);

my $target = shift;
my $source = shift;
my $align = shift or die "
Usage: extract-singletons.perl target source align
";

# Three-argument open: the file name is never interpreted as a mode or pipe.
open(my $target_fh, '<', $target) or die "Error: unable to open target file \"$target\"!\n";
open(my $source_fh, '<', $source) or die "Error: unable to open source file \"$source\"!\n";
open(my $align_fh, '<', $align) or die "Error: unable to open alignment file \"$align\"!\n";

my %short_file_reported;

# Read one line from $fh and return its whitespace-separated tokens.
# A file that ends before the target file is treated as supplying empty
# lines, and is reported once on STDERR.
sub read_tokens {
  my ($fh, $label) = @_;
  my $line = <$fh>;
  unless (defined $line) {
    print STDERR "\nWarning: $label file has fewer lines than target file\n"
      unless $short_file_reported{$label}++;
    return ();
  }
  return split ' ', $line;
}

my %count;          # word => occurrence score; exactly 1 means "singleton"
my $sentences = 0;

while (my $target_line = <$target_fh>) {
  unless ($opts{q}) {
    print STDERR "\r$sentences" if ++$sentences % 1000 == 0;
  }

  my @T = split ' ', $target_line;
  my @S = read_tokens($source_fh, 'source');
  my @A = read_tokens($align_fh, 'alignment');

  # Number of alignment links touching each target / source position.
  my (@source_links, @target_links);
  for (my $i = 0; $i <= $#A; $i += 2) {
    $target_links[$A[$i]]++;
    $source_links[$A[$i+1]]++;
  }

  for (my $i = 0; $i <= $#A; $i += 2) {
    if ($target_links[$A[$i]] == 1 && $source_links[$A[$i+1]] == 1 &&
        $T[$A[$i]] eq $S[$A[$i+1]])
    {
      $count{$S[$A[$i+1]]}++; # Print this if it only occurs here
    }
    else {
      $count{$S[$A[$i+1]]} += 2; # Don't print this
    }
  }
}

close($target_fh);
close($source_fh);
close($align_fh);

foreach my $w (sort keys %count) {
  print "$w\n" if $count{$w} == 1;
}

BIN
scripts/OSM/flipAlignment Executable file

Binary file not shown.

BIN
scripts/OSM/generateSequences Executable file

Binary file not shown.

View File

@ -511,6 +511,13 @@ pcfg-score
default-name: model/scored-corpus
pass-unless: use-pcfg-feature
template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
build-osm
in: corpus word-alignment
out: osm-model
ignore-unless: operation-sequence-model
rerun-on-change: operation-sequence-model training-options script giza-settings
template: $moses-script-dir/OSM/OSM-Train.sh IN0.$output-extension IN0.$input-extension IN1.$alignment-symmetrization-method $operation-sequence-model-order OUT $moses-src-dir $srilm-dir
default-name: model/OSM
extract-phrases
in: corpus-mml-postfilter=OR=word-alignment scored-corpus
out: extracted-phrases
@ -579,7 +586,7 @@ build-sparse
default-name: model/sparse-features
template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
create-config
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains INTERPOLATED-LM:binlm LM:binlm
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
out: config
ignore-if: use-hiero
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini

View File

@ -2164,11 +2164,13 @@ sub get_config_tables {
sub define_training_create_config {
my ($step_id) = @_;
my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,@LM)
my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
= &get_output_and_input($step_id);
my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
$cmd .= "-osm-model $osm/operationLM.bin " if $osm;
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;

View File

@ -31,7 +31,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DECODING_GRAPH_BACKOFF,
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
@ -119,6 +119,7 @@ $_HELP = 1
'xml' => \$_XML,
'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
'config=s' => \$_CONFIG,
'osm-model=s' => \$_OSM,
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
'do-steps=s' => \$_DO_STEPS,
'memscore:s' => \$_MEMSCORE,
@ -1992,6 +1993,15 @@ sub create_ini {
}
}
# operation sequence model
if($_OSM)
{
$feature_spec .= "OpSequenceModel num-features=5 path=". $_OSM . " \n";
$weight_spec .= "OpSequenceModel0= 0.08 -0.02 0.02 -0.001 0.03\n";
}
# distance-based reordering
if (!$_HIERARCHICAL) {
$feature_spec .= "Distortion\n";

View File

@ -33,14 +33,14 @@ extern "C" {
#endif
#if defined(__STDC__)
int DeclareParams(char *, ...);
int DeclareParams(char *, ...);
#else
int DeclareParams();
int DeclareParams();
#endif
int GetParams(int *n, char ***a,char *CmdFileName),
SPrintParams(),
PrintParams();
int GetParams(int *n, char ***a,char *CmdFileName),
SPrintParams(),
PrintParams();
#ifdef __cplusplus
}