mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
175b540509
@ -21,24 +21,6 @@ void OpSequenceModel :: readLanguageModel(const char *lmFile)
|
||||
{
|
||||
|
||||
string unkOp = "_TRANS_SLF_";
|
||||
|
||||
|
||||
/*
|
||||
|
||||
// Code for SRILM
|
||||
|
||||
vector <int> numbers;
|
||||
int nonWordFlag = 0;
|
||||
|
||||
ptrOp = new Api;
|
||||
ptrOp -> read_lm(lmFile,lmOrder);
|
||||
numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
|
||||
unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
|
||||
|
||||
*/
|
||||
|
||||
// Code to load KenLM
|
||||
|
||||
OSM = new Model(m_lmPath.c_str());
|
||||
State startState = OSM->NullContextState();
|
||||
State endState;
|
||||
@ -49,36 +31,6 @@ void OpSequenceModel :: readLanguageModel(const char *lmFile)
|
||||
void OpSequenceModel::Load()
|
||||
{
|
||||
|
||||
/*
|
||||
// load future cost
|
||||
|
||||
//vector <string> input;
|
||||
ifstream sr (m_featurePath.c_str());
|
||||
char* tmp;
|
||||
|
||||
CHECK(sr.is_open());
|
||||
|
||||
vector<FactorType> factorOrder;
|
||||
factorOrder.push_back(0);
|
||||
|
||||
string line;
|
||||
while (std::getline(sr, line))
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
tokens = TokenizeMultiCharSeparator(line, "|||");
|
||||
CHECK(tokens.size() == 3);
|
||||
|
||||
Phrase source, target;
|
||||
source.CreateFromString(Input, factorOrder, tokens[0], "|", NULL);
|
||||
target.CreateFromString(Output, factorOrder, tokens[1], "|", NULL);
|
||||
|
||||
ParallelPhrase pp(source, target);
|
||||
Scores scores = Tokenize<float>(tokens[2], " ");
|
||||
m_futureCost[pp] = scores;
|
||||
// m_coll[pp] = scores;
|
||||
}
|
||||
|
||||
*/
|
||||
readLanguageModel(m_lmPath.c_str());
|
||||
|
||||
}
|
||||
@ -285,9 +237,8 @@ std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const
|
||||
|
||||
void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (key == "feature-path") {
|
||||
m_featurePath = value;
|
||||
} else if (key == "path") {
|
||||
|
||||
if (key == "path") {
|
||||
m_lmPath = value;
|
||||
} else if (key == "order") {
|
||||
lmOrder = Scan<int>(value);
|
||||
|
@ -60,8 +60,7 @@ protected:
|
||||
|
||||
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
|
||||
std::set <int> targetNullWords;
|
||||
std::string m_featurePath, m_lmPath;
|
||||
|
||||
std::string m_lmPath;
|
||||
|
||||
|
||||
};
|
||||
|
@ -496,7 +496,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
|
||||
if (numElement == NOT_FOUND) {
|
||||
// init numElement
|
||||
numElement = tokens.size();
|
||||
CHECK(numElement >= 3);
|
||||
CHECK(numElement >= (PrintWordAlignment()?4:3));
|
||||
}
|
||||
|
||||
if (tokens.size() != numElement) {
|
||||
|
34
scripts/OSM/OSM-Train.sh
Executable file
34
scripts/OSM/OSM-Train.sh
Executable file
@ -0,0 +1,34 @@
|
||||
#!/bin/sh
#
# Train an operation sequence model (OSM).
#
# Usage:
#   OSM-Train.sh CORPUS_E CORPUS_F ALIGNMENT ORDER OUT_DIR MOSES_DIR SRILM_DIR
#
#   CORPUS_E   corpus side linked as OUT_DIR/e (passed to
#              extract-singletons.perl as its "target" argument)
#   CORPUS_F   corpus side linked as OUT_DIR/f (the "source" argument)
#   ALIGNMENT  word alignment file, fed to flipAlignment
#   ORDER      n-gram order handed to ngram-count
#   OUT_DIR    working/output directory (created if missing)
#   MOSES_DIR  directory containing scripts/OSM/* and bin/build_binary
#   SRILM_DIR  directory containing ngram-count
#
# Produces OUT_DIR/Singletons, OUT_DIR/opCorpus, OUT_DIR/operationLM and
# OUT_DIR/operationLM.bin. The temporary e, f and align files are removed
# once training has finished.

# Stop at the first failing step instead of training on partial output.
set -e

if [ "$#" -lt 7 ]; then
  echo "Usage: $0 corpus-e corpus-f alignment order out-dir moses-dir srilm-dir" >&2
  exit 1
fi

# Print an absolute version of the given path. Needed because a symlink
# target is resolved relative to the directory holding the link, so a
# relative corpus path would otherwise dangle inside OUT_DIR.
abs_path() {
  case "$1" in
    /*) printf '%s\n' "$1" ;;
    *)  printf '%s\n' "$PWD/$1" ;;
  esac
}

corpus_e=$(abs_path "$1")
corpus_f=$(abs_path "$2")
alignment=$3
order=$4
out_dir=$5
moses_dir=$6
srilm_dir=$7

echo 'Training OSM - Start'
date

# -p / -f make the script safe to re-run on an existing output directory.
mkdir -p "$out_dir"
ln -sf "$corpus_e" "$out_dir/e"
ln -sf "$corpus_f" "$out_dir/f"

"$moses_dir/scripts/OSM/flipAlignment" "$alignment" > "$out_dir/align"

echo 'Extracting Singletons'

"$moses_dir/scripts/OSM/extract-singletons.perl" "$out_dir/e" "$out_dir/f" "$out_dir/align" > "$out_dir/Singletons"

echo 'Converting Bilingual Sentence Pair into Operation Corpus'

# Generates Operation Corpus
"$moses_dir/scripts/OSM/generateSequences" "$out_dir/e" "$out_dir/f" "$out_dir/align" "$out_dir/Singletons" > "$out_dir/opCorpus"

echo 'Learning Operation Sequence Translation Model'

"$srilm_dir/ngram-count" -kndiscount -order "$order" -unk -text "$out_dir/opCorpus" -lm "$out_dir/operationLM"

echo 'Binarizing'

"$moses_dir/bin/build_binary" "$out_dir/operationLM" "$out_dir/operationLM.bin"

# Remove the intermediate links and the flipped alignment.
rm -f "$out_dir/e" "$out_dir/f" "$out_dir/align"

echo 'Training OSM - End'
date
|
||||
|
46
scripts/OSM/extract-singletons.perl
Executable file
46
scripts/OSM/extract-singletons.perl
Executable file
@ -0,0 +1,46 @@
|
||||
#!/usr/bin/perl

# extract-singletons.perl
#
# Usage: extract-singletons.perl [-q] target source align
#
# Reads three line-parallel files: tokenised target sentences, tokenised
# source sentences, and word alignments given as (target index, source index)
# pairs. Prints, in sorted order and one per line, every source word that
# takes part in exactly one alignment link in the whole input, where that
# link is one-to-one and joins it to an identical target word.
#
# Alignment lines are parsed as a flat list of non-negative integers, so both
# "t s t s ..." and "t-s t-s ..." layouts are accepted.
# NOTE(review): the exact layout emitted by flipAlignment is not visible
# here - confirm it lists the target index first.
#
# -q suppresses the progress counter written to STDERR every 1000 lines.

use strict;
use warnings;
use Getopt::Std;

my %opt;
getopts('q', \%opt);

my $target = shift;
my $source = shift;
my $align = shift or die "
Usage: extract-singletons.perl target source align

";

# Three-argument open: the file names are never interpreted as modes/pipes.
open(my $target_fh, '<', $target) or die "Error: unable to open target file \"$target\"!\n";
open(my $source_fh, '<', $source) or die "Error: unable to open source file \"$source\"!\n";
open(my $align_fh,  '<', $align)  or die "Error: unable to open alignment file \"$align\"!\n";

# Per source word: +1 for each "clean" link (one-to-one, identical words),
# +2 for any other link. A final value of exactly 1 therefore means the word
# was linked once only, and cleanly.
my %count;
my $line_no = 0;

while (my $target_line = <$target_fh>) {
  ++$line_no;
  print STDERR "\r$line_no" if !$opt{q} && $line_no % 1000 == 0;

  my $source_line = <$source_fh>;
  my $align_line  = <$align_fh>;
  unless (defined $source_line && defined $align_line) {
    # Nothing further can be aligned; keep what was gathered so far.
    warn "Warning: source or alignment file ends before target file at line $line_no; remaining lines ignored\n";
    last;
  }

  my @T = split ' ', $target_line;
  my @S = split ' ', $source_line;
  my @A = $align_line =~ /(\d+)/g;

  if (@A % 2) {
    warn "Warning: odd number of alignment indices at line $line_no; ignoring the last one\n";
    pop @A;
  }

  # Keep only links whose indices fall inside both sentences, so that an
  # undefined word can never be counted.
  my @links;
  for (my $i = 0; $i < @A; $i += 2) {
    my ($t, $s) = @A[$i, $i + 1];
    if ($t > $#T || $s > $#S) {
      warn "Warning: alignment point $t/$s out of range at line $line_no; skipped\n";
      next;
    }
    push @links, [$t, $s];
  }

  # First pass: how many links touch each target / source position.
  my (@target_links, @source_links);
  for my $link (@links) {
    $target_links[$link->[0]]++;
    $source_links[$link->[1]]++;
  }

  # Second pass: score each linked source word.
  for my $link (@links) {
    my ($t, $s) = @$link;
    if ($target_links[$t] == 1 && $source_links[$s] == 1 && $T[$t] eq $S[$s]) {
      $count{$S[$s]}++;      # Print this if it only occurs here
    }
    else {
      $count{$S[$s]} += 2;   # Don't print this
    }
  }
}

close($target_fh);
close($source_fh);
close($align_fh);

foreach my $w (sort keys %count) {
  print "$w\n" if $count{$w} == 1;
}
|
BIN
scripts/OSM/flipAlignment
Executable file
BIN
scripts/OSM/flipAlignment
Executable file
Binary file not shown.
BIN
scripts/OSM/generateSequences
Executable file
BIN
scripts/OSM/generateSequences
Executable file
Binary file not shown.
@ -511,6 +511,13 @@ pcfg-score
|
||||
default-name: model/scored-corpus
|
||||
pass-unless: use-pcfg-feature
|
||||
template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
|
||||
build-osm
|
||||
in: corpus word-alignment
|
||||
out: osm-model
|
||||
ignore-unless: operation-sequence-model
|
||||
rerun-on-change: operation-sequence-model training-options script giza-settings
|
||||
template: $moses-script-dir/OSM/OSM-Train.sh IN0.$output-extension IN0.$input-extension IN1.$alignment-symmetrization-method $operation-sequence-model-order OUT $moses-src-dir $srilm-dir
|
||||
default-name: model/OSM
|
||||
extract-phrases
|
||||
in: corpus-mml-postfilter=OR=word-alignment scored-corpus
|
||||
out: extracted-phrases
|
||||
@ -579,7 +586,7 @@ build-sparse
|
||||
default-name: model/sparse-features
|
||||
template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
|
||||
create-config
|
||||
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains INTERPOLATED-LM:binlm LM:binlm
|
||||
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
|
||||
out: config
|
||||
ignore-if: use-hiero
|
||||
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
|
||||
|
@ -2164,11 +2164,13 @@ sub get_config_tables {
|
||||
sub define_training_create_config {
|
||||
my ($step_id) = @_;
|
||||
|
||||
my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,@LM)
|
||||
my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
|
||||
= &get_output_and_input($step_id);
|
||||
|
||||
my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
|
||||
|
||||
$cmd .= "-osm-model $osm/operationLM.bin " if $osm;
|
||||
|
||||
# sparse lexical features provide additional content for config file
|
||||
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
|
||||
|
||||
|
@ -31,7 +31,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
|
||||
$_DECODING_GRAPH_BACKOFF,
|
||||
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
|
||||
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
|
||||
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG,
|
||||
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM,
|
||||
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
|
||||
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2,
|
||||
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
||||
@ -119,6 +119,7 @@ $_HELP = 1
|
||||
'xml' => \$_XML,
|
||||
'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
|
||||
'config=s' => \$_CONFIG,
|
||||
'osm-model=s' => \$_OSM,
|
||||
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
|
||||
'do-steps=s' => \$_DO_STEPS,
|
||||
'memscore:s' => \$_MEMSCORE,
|
||||
@ -1992,6 +1993,15 @@ sub create_ini {
|
||||
}
|
||||
}
|
||||
|
||||
# operation sequence model
|
||||
|
||||
if($_OSM)
|
||||
{
|
||||
|
||||
$feature_spec .= "OpSequenceModel num-features=5 path=". $_OSM . " \n";
|
||||
$weight_spec .= "OpSequenceModel0= 0.08 -0.02 0.02 -0.001 0.03\n";
|
||||
}
|
||||
|
||||
# distance-based reordering
|
||||
if (!$_HIERARCHICAL) {
|
||||
$feature_spec .= "Distortion\n";
|
||||
|
Loading…
Reference in New Issue
Block a user