Merge branch 'nadir_osm'

This commit is contained in:
Nadir Durrani 2013-07-09 11:44:14 +01:00
commit 418abf42fa
9 changed files with 105 additions and 56 deletions

View File

@ -21,24 +21,6 @@ void OpSequenceModel :: readLanguageModel(const char *lmFile)
{
string unkOp = "_TRANS_SLF_";
/*
// Code for SRILM
vector <int> numbers;
int nonWordFlag = 0;
ptrOp = new Api;
ptrOp -> read_lm(lmFile,lmOrder);
numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
*/
// Code to load KenLM
OSM = new Model(m_lmPath.c_str());
State startState = OSM->NullContextState();
State endState;
@ -49,36 +31,6 @@ void OpSequenceModel :: readLanguageModel(const char *lmFile)
// Loads the model by handing the configured language-model path (m_lmPath)
// to readLanguageModel(). The former commented-out future-cost loader that
// read m_featurePath was dead code and has been dropped.
void OpSequenceModel::Load()
{
  readLanguageModel(m_lmPath.c_str());
}
@ -285,9 +237,8 @@ std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const
void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
{
if (key == "feature-path") {
m_featurePath = value;
} else if (key == "path") {
if (key == "path") {
m_lmPath = value;
} else if (key == "order") {
lmOrder = Scan<int>(value);

View File

@ -60,8 +60,7 @@ protected:
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
std::set <int> targetNullWords;
std::string m_featurePath, m_lmPath;
std::string m_lmPath;
};

34
scripts/OSM/OSM-Train.sh Executable file
View File

@ -0,0 +1,34 @@
#!/bin/sh
# Trains an Operation Sequence Model (OSM): converts a word-aligned parallel
# corpus into an operation corpus, estimates an n-gram model over it with
# ngram-count, and binarizes the result with build_binary.
#
# Usage: OSM-Train.sh E_CORPUS F_CORPUS ALIGNMENT ORDER OUT_DIR MOSES_DIR SRILM_DIR
#   $1  corpus file, linked into OUT_DIR as "e"
#   $2  corpus file, linked into OUT_DIR as "f"
#   $3  word-alignment file, fed through flipAlignment
#   $4  n-gram order handed to ngram-count
#   $5  output directory (created if it does not exist)
#   $6  directory that contains scripts/OSM/ and bin/build_binary
#   $7  directory that contains ngram-count

if [ "$#" -lt 7 ]; then
  echo "Usage: $0 e-corpus f-corpus alignment order out-dir moses-dir srilm-dir" >&2
  exit 1
fi

# Stop at the first failing step instead of carrying on with empty or
# missing intermediate files.
set -e

# Prints $1 as an absolute path, so that a symlink created inside another
# directory still points at the intended file.
abs_path() {
  case "$1" in
    /*) printf '%s\n' "$1" ;;
    *)  printf '%s\n' "$PWD/$1" ;;
  esac
}

order=$4
out_dir=$5
moses_dir=$6
srilm_dir=$7

echo 'Training OSM - Start'
date

mkdir -p "$out_dir"

# -f replaces links left behind by an earlier, interrupted run.
ln -sf "$(abs_path "$1")" "$out_dir/e"
ln -sf "$(abs_path "$2")" "$out_dir/f"

"$moses_dir/scripts/OSM/flipAlignment" "$3" > "$out_dir/align"

echo 'Extracting Singletons'
"$moses_dir/scripts/OSM/extract-singletons.perl" "$out_dir/e" "$out_dir/f" "$out_dir/align" > "$out_dir/Singletons"

echo 'Converting Bilingual Sentence Pair into Operation Corpus'
# Generates Operation Corpus
"$moses_dir/scripts/OSM/generateSequences" "$out_dir/e" "$out_dir/f" "$out_dir/align" "$out_dir/Singletons" > "$out_dir/opCorpus"

echo 'Learning Operation Sequence Translation Model'
"$srilm_dir/ngram-count" -kndiscount -order "$order" -unk -text "$out_dir/opCorpus" -lm "$out_dir/operationLM"

echo 'Binarizing'
"$moses_dir/bin/build_binary" "$out_dir/operationLM" "$out_dir/operationLM.bin"

# Remove the temporary links and the flipped alignment.
rm -f "$out_dir/e" "$out_dir/f" "$out_dir/align"

echo 'Training OSM - End'
date

View File

@ -0,0 +1,46 @@
#!/usr/bin/perl

# Prints, one per line and sorted, every source-side word whose only aligned
# occurrence in the corpus is a one-to-one link to an identical target-side
# word.
#
# Usage: extract-singletons.perl [-q] target source align
#   -q      suppress the progress counter written to STDERR
#   target  target-side corpus, one sentence per line
#   source  source-side corpus, one sentence per line
#   align   one line per sentence pair: whitespace-separated indices read
#           as consecutive (target index, source index) pairs

use strict;
use Getopt::Std;

my %opts;
getopts('q', \%opts);

my $target = shift;
my $source = shift;
my $align = shift or die "
Usage: extract-singletons.perl target source align
";

# Three-argument open: the file name is never interpreted as a mode or pipe.
open(my $target_fh, '<', $target) or die "Error: unable to open target file \"$target\"!\n";
open(my $source_fh, '<', $source) or die "Error: unable to open source file \"$source\"!\n";
open(my $align_fh,  '<', $align)  or die "Error: unable to open alignment file \"$align\"!\n";

my %count;      # source word -> score; exactly 1 means "singleton"
my $seen = 0;   # sentences processed (only advanced when progress is shown)

while (my $target_line = <$target_fh>) {
    unless ($opts{q}) {
        print STDERR "\r$seen" if ++$seen % 1000 == 0;
    }

    my $source_line = <$source_fh>;
    my $align_line  = <$align_fh>;

    # A source or alignment file that ends early yields empty token lists.
    my @T = split ' ', $target_line;
    my @S = defined $source_line ? split(' ', $source_line) : ();
    my @A = defined $align_line  ? split(' ', $align_line)  : ();

    # Number of links attached to each target / source position.
    my (@source_links, @target_links);
    for (my $i = 0; $i <= $#A; $i += 2) {
        $target_links[$A[$i]]++;
        $source_links[$A[$i+1]]++;
    }

    for (my $i = 0; $i <= $#A; $i += 2) {
        my ($t, $s) = ($A[$i], $A[$i+1]);
        if ($target_links[$t] == 1 && $source_links[$s] == 1 && $T[$t] eq $S[$s]) {
            $count{$S[$s]}++;       # Print this if it only occurs here
        }
        else {
            $count{$S[$s]} += 2;    # Don't print this
        }
    }
}

close($target_fh);
close($source_fh);
close($align_fh);

foreach my $w (sort keys %count) {
    print "$w\n" if $count{$w} == 1;
}

BIN
scripts/OSM/flipAlignment Executable file

Binary file not shown.

BIN
scripts/OSM/generateSequences Executable file

Binary file not shown.

View File

@ -511,6 +511,13 @@ pcfg-score
default-name: model/scored-corpus
pass-unless: use-pcfg-feature
template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
build-osm
in: corpus word-alignment
out: osm-model
ignore-unless: operation-sequence-model
rerun-on-change: operation-sequence-model training-options script giza-settings
template: $moses-script-dir/OSM/OSM-Train.sh IN0.$output-extension IN0.$input-extension IN1.$alignment-symmetrization-method $operation-sequence-model-order OUT $moses-src-dir $srilm-dir
default-name: model/OSM
extract-phrases
in: corpus-mml-postfilter=OR=word-alignment scored-corpus
out: extracted-phrases
@ -579,7 +586,7 @@ build-sparse
default-name: model/sparse-features
template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
create-config
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains INTERPOLATED-LM:binlm LM:binlm
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
out: config
ignore-if: use-hiero
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini

View File

@ -2164,11 +2164,13 @@ sub get_config_tables {
sub define_training_create_config {
my ($step_id) = @_;
my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,@LM)
my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
= &get_output_and_input($step_id);
my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
$cmd .= "-osm-model $osm/operationLM.bin " if $osm;
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;

View File

@ -31,7 +31,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DECODING_GRAPH_BACKOFF,
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
@ -119,6 +119,7 @@ $_HELP = 1
'xml' => \$_XML,
'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
'config=s' => \$_CONFIG,
'osm-model=s' => \$_OSM,
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
'do-steps=s' => \$_DO_STEPS,
'memscore:s' => \$_MEMSCORE,
@ -1992,6 +1993,15 @@ sub create_ini {
}
}
# operation sequence model
if($_OSM)
{
$feature_spec .= "OpSequenceModel num-features=5 path=". $_OSM . " \n";
$weight_spec .= "OpSequenceModel0= 0.08 -0.02 0.02 -0.001 0.03\n";
}
# distance-based reordering
if (!$_HIERARCHICAL) {
$feature_spec .= "Distortion\n";