Merge branch 'master' into hieu

This commit is contained in:
Hieu Hoang 2014-02-17 22:36:24 +00:00
commit 5155fa26ce
15 changed files with 391 additions and 152 deletions

View File

@ -9,8 +9,8 @@
#include "moses/FactorCollection.h"
#include "moses/InputFileStream.h"
#include "util/exception.hh"
#include "ChartState.h"
#include "util/exception.hh"
#include "moses/ChartHypothesis.h"
#include "moses/ChartManager.h"
using namespace std;
@ -58,6 +58,11 @@ public:
delete state;
}
// Replace this wrapper's DALM state with a copy of another hypothesis' state.
// Frees the previously held DALM::State first; 'from' is left untouched.
void reset(const DALMState &from){
delete state;
state = new DALM::State(*from.state);
}
virtual int Compare(const FFState& other) const{
const DALMState &o = static_cast<const DALMState &>(other);
if(state->get_count() < o.state->get_count()) return -1;
@ -74,6 +79,67 @@ public:
}
};
// Feature-function state stored on each chart hypothesis by the DALM LM.
// Holds the hypothesis' LM-order-bounded prefix (word ids plus their
// not-yet-finalized score) and the DALM state reached after its last word
// (the "right context"). Owns both heap members and releases them in the
// destructor.
class DALMChartState : public FFState
{
private:
  const ChartHypothesis &m_hypo;
  DALM::VocabId *prefixIDs;   // owned new[] array; at most (order-1) entries
  size_t prefixLength;        // number of valid entries in prefixIDs
  float prefixScore;          // LM score of the prefix (context incomplete)
  DALMState *rightContext;    // owned; LM state after the last word
  bool isLarge;               // hypothesis spans at least (order-1) words

public:
  // Takes ownership of prefixIDs (allocated with new[]) and rightContext
  // (allocated with new).
  DALMChartState(const ChartHypothesis &hypo,
                 DALM::VocabId *prefixIDs,
                 size_t prefixLength,
                 float prefixScore,
                 DALMState *rightContext,
                 bool isLarge)
    : m_hypo(hypo),
      prefixIDs(prefixIDs),
      prefixLength(prefixLength),
      prefixScore(prefixScore),
      rightContext(rightContext),
      isLarge(isLarge)
  {}

  virtual ~DALMChartState(){
    // delete on a null pointer is a no-op, so no guards are needed.
    delete [] prefixIDs;
    delete rightContext;
  }

  size_t GetPrefixLength() const{
    return prefixLength;
  }
  const DALM::VocabId *GetPrefixIDs() const{
    return prefixIDs;
  }
  float GetPrefixScore() const{
    return prefixScore;
  }
  const DALMState *GetRightContext() const{
    return rightContext;
  }
  bool LargeEnough() const{
    return isLarge;
  }

  // Order states for hypothesis recombination: equal states (return 0) may be
  // merged by the decoder.
  virtual int Compare(const FFState& other) const{
    const DALMChartState &o = static_cast<const DALMChartState &>(other);
    // prefix comparison is irrelevant for hypotheses starting with "<s>"
    if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) {
      if(prefixLength != o.prefixLength){
        return (prefixLength < o.prefixLength)?-1:1;
      }else{
        // BUG FIX: memcmp takes a byte count; the original passed the element
        // count, so only the first prefixLength BYTES of the VocabId arrays
        // were compared and distinct prefixes could wrongly compare equal.
        int ret = memcmp(prefixIDs, o.prefixIDs,
                         sizeof(DALM::VocabId) * prefixLength);
        if (ret != 0) return ret;
      }
    }
    // suffix comparison is irrelevant for hypotheses ending with "</s>"
    size_t inputSize = m_hypo.GetManager().GetSource().GetSize();
    if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) {
      int ret = o.rightContext->Compare(*rightContext);
      if (ret != 0) return ret;
    }
    return 0;
  }
};
LanguageModelDALM::LanguageModelDALM(const std::string &line)
:LanguageModel(line)
{
@ -150,60 +216,40 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float
size_t phraseSize = phrase.GetSize();
if (!phraseSize) return;
DALMState *dalm_state = new DALMState(m_nGramOrder);
size_t currPos = 0;
size_t hist_count = 0;
DALMState *dalm_state = new DALMState(m_nGramOrder);
DALM::State *state = dalm_state->get_state();
if(phrase.GetWord(0).GetFactor(m_factorType) == m_beginSentenceFactor){
m_lm->init_state(*state);
currPos++;
hist_count++;
}
while (currPos < phraseSize) {
const Word &word = phrase.GetWord(currPos);
hist_count++;
if (word.IsNonTerminal()) {
// do nothing. reset ngram. needed to score target phrases during pt loading in chart decoding
dalm_state->refresh();
state->refresh();
hist_count = 0;
} else {
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
// do nothing, don't include prob for <s> unigram
if (currPos != 0) {
UTIL_THROW2("Either your data contains <s> in a position other than the first word or your language model is missing <s>. Did you build your ARPA using IRSTLM and forget to run add-start-end.sh?");
}
m_lm->init_state(*dalm_state->get_state());
} else {
LMResult result = GetValue(word, dalm_state->get_state());
fullScore += result.score;
if (hist_count >= m_nGramOrder) ngramScore += result.score;
if (result.unknown) ++oovCount;
}
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
float score = m_lm->query(wid, *state);
fullScore += score;
if (hist_count >= m_nGramOrder) ngramScore += score;
if (wid==m_vocab->unk()) ++oovCount;
}
currPos++;
}
fullScore = TransformLMScore(fullScore);
ngramScore = TransformLMScore(ngramScore);
delete dalm_state;
}
// Query DALM for a single word given the current context state, converting
// the raw DALM score onto Moses' internal log scale and flagging OOVs.
LMResult LanguageModelDALM::GetValue(DALM::VocabId wid, DALM::State* finalState) const{
	LMResult result;
	// the unknown-word id marks an OOV
	result.unknown = (wid == m_vocab->unk());
	result.score = TransformLMScore(m_lm->query(wid, *finalState));
	return result;
}
// Convenience overload: map the word's factor to a DALM vocab id and delegate
// to the id-based query.
LMResult LanguageModelDALM::GetValue(const Word &word, DALM::State* finalState) const
{
	return GetValue(GetVocabId(word.GetFactor(m_factorType)), finalState);
}
FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const{
// In this function, we only compute the LM scores of n-grams that overlap a
// phrase boundary. Phrase-internal scores are taken directly from the
@ -222,28 +268,28 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
const std::size_t adjust_end = std::min(end, begin + m_nGramOrder - 1);
DALMState *dalm_state = new DALMState(*dalm_ps);
DALM::State *state = dalm_state->get_state();
std::size_t position = begin;
float score = 0.0;
for(; position < adjust_end; position++){
score += GetValue(hypo.GetWord(position), dalm_state->get_state()).score;
for(std::size_t position=begin; position < adjust_end; position++){
score += m_lm->query(GetVocabId(hypo.GetWord(position).GetFactor(m_factorType)), *state);
}
if (hypo.IsSourceCompleted()) {
// Score end of sentence.
std::vector<DALM::VocabId> indices(m_nGramOrder-1);
const DALM::VocabId *last = LastIDs(hypo, &indices.front());
m_lm->set_state(&indices.front(), (last-&indices.front()), *dalm_state->get_state());
m_lm->set_state(&indices.front(), (last-&indices.front()), *state);
float s = GetValue(wid_end, dalm_state->get_state()).score;
score += s;
score += m_lm->query(wid_end, *state);
} else if (adjust_end < end) {
// Get state after adding a long phrase.
std::vector<DALM::VocabId> indices(m_nGramOrder-1);
const DALM::VocabId *last = LastIDs(hypo, &indices.front());
m_lm->set_state(&indices.front(), (last-&indices.front()), *dalm_state->get_state());
m_lm->set_state(&indices.front(), (last-&indices.front()), *state);
}
score = TransformLMScore(score);
if (OOVFeatureEnabled()) {
std::vector<float> scores(2);
scores[0] = score;
@ -257,129 +303,176 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
}
FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const{
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, m_nGramOrder);
// initialize language model context state
DALMState *dalm_state = new DALMState(m_nGramOrder);
DALM::State *state = dalm_state->get_state();
size_t contextSize = m_nGramOrder-1;
DALM::VocabId *prefixIDs = new DALM::VocabId[contextSize];
size_t prefixLength = 0;
bool isLarge = false;
// initial language model scores
float prefixScore = 0.0; // not yet final for initial words (lack context)
float finalizedScore = 0.0; // finalized, has sufficient context
float prevScore = 0.0; // previous hypothesis
const TargetPhrase &targetPhrase = hypo.GetCurrTargetPhrase();
size_t hypoSize = targetPhrase.GetSize();
// get index map for underlying hypotheses
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
targetPhrase.GetAlignNonTerm().GetNonTermIndexMap();
size_t phrasePos = 0;
// beginning of sentence.
if(hypoSize > 0){
const Word &word = targetPhrase.GetWord(0);
if(!word.IsNonTerminal()){
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
if(word.GetFactor(m_factorType) == m_beginSentenceFactor){
m_lm->init_state(*state);
if (prefixLength < contextSize){
prefixIDs[prefixLength] = wid;
prefixLength++;
}else{
isLarge = true;
}
}else{
float score = m_lm->query(wid, *state);
if (prefixLength < contextSize){
prefixScore += score;
prefixIDs[prefixLength] = wid;
prefixLength++;
}else{ finalizedScore += score; }
}
}else{
// special case: rule starts with non-terminal -> copy everything
size_t nonTermIndex = nonTermIndexMap[0];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
const DALMChartState* prevState =
static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
// get prefixScore and finalizedScore
prefixScore = prevState->GetPrefixScore();
finalizedScore = -prefixScore;
prevScore = prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0];
// get language model state
dalm_state->reset(*prevState->GetRightContext());
state = dalm_state->get_state();
prefixLength = prevState->GetPrefixLength();
std::memcpy(prefixIDs, prevState->GetPrefixIDs(), sizeof(DALM::VocabId)*prefixLength);
}
phrasePos++;
}
// loop over rule
for (size_t phrasePos = 0, wordPos = 0;
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
phrasePos++) {
for (; phrasePos < hypoSize; phrasePos++) {
// consult rule for either word or non-terminal
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
const Word &word = targetPhrase.GetWord(phrasePos);
// regular word
if (!word.IsNonTerminal()) {
// beginning of sentence symbol <s>? -> just update state
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
UTIL_THROW_IF2(phrasePos != 0,
"Sentence start symbol must be at the beginning of sentence");
m_lm->init_state(*dalm_state->get_state());
}
// score a regular word added by the rule
else {
updateChartScore( &prefixScore, &finalizedScore, GetValue(word, dalm_state->get_state()).score, ++wordPos );
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
float score = m_lm->query(wid, *state);
if (prefixLength < contextSize){
prefixScore += score;
prefixIDs[prefixLength] = wid;
prefixLength++;
}else{
finalizedScore += score;
isLarge = true;
}
}
// non-terminal, add phrase from underlying hypothesis
// internal non-terminal
else {
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
const LanguageModelChartState* prevState =
static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID));
const DALMChartState* prevState =
static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
size_t subPhraseLength = prevState->GetNumTargetTerminals();
// special case: rule starts with non-terminal -> copy everything
if (phrasePos == 0) {
size_t prevPrefixLength = prevState->GetPrefixLength();
const DALM::VocabId *prevPrefixIDs = prevState->GetPrefixIDs();
// get prefixScore and finalizedScore
prefixScore = prevState->GetPrefixScore();
finalizedScore = prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] - prefixScore;
// get language model state
delete dalm_state;
dalm_state = new DALMState( *static_cast<DALMState*>(prevState->GetRightContext()) );
wordPos += subPhraseLength;
}
// internal non-terminal
else {
// score its prefix
size_t wpos = wordPos;
for(size_t prefixPos = 0;
prefixPos < m_nGramOrder-1 // up to LM order window
&& prefixPos < subPhraseLength; // up to length
prefixPos++) {
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
updateChartScore( &prefixScore, &finalizedScore, GetValue(word, dalm_state->get_state()).score, ++wpos );
for(size_t prefixPos = 0; prefixPos < prevPrefixLength; prefixPos++) {
DALM::VocabId wid = prevPrefixIDs[prefixPos];
float score = m_lm->query(wid, *state);
if (prefixLength < contextSize){
prefixScore += score;
prefixIDs[prefixLength] = wid;
prefixLength++;
} else {
finalizedScore += score;
isLarge = true;
}
}
wordPos += subPhraseLength;
// check if we are dealing with a large sub-phrase
if (subPhraseLength > m_nGramOrder - 1) {
if (prevState->LargeEnough()) {
// add its finalized language model score
finalizedScore +=
prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] // full score
- prevState->GetPrefixScore(); // - prefix score
prevScore += prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0];
finalizedScore -= prevState->GetPrefixScore();
// copy language model state
delete dalm_state;
dalm_state = new DALMState( *static_cast<DALMState*>(prevState->GetRightContext()) );
}
dalm_state->reset(*prevState->GetRightContext());
state = dalm_state->get_state();
}
}
}
// assign combined score to score breakdown
out->Assign(this, prefixScore + finalizedScore);
out->Assign(this, prevScore + TransformLMScore(prefixScore + finalizedScore));
ret->Set(prefixScore, dalm_state);
return ret;
return new DALMChartState(hypo, prefixIDs, prefixLength, prefixScore, dalm_state, isLarge);
}
bool LanguageModelDALM::IsUseable(const FactorMask &mask) const
{
bool ret = mask[m_factorType];
return ret;
return mask[m_factorType];
}
void LanguageModelDALM::CreateVocabMapping(const std::string &wordstxt)
{
InputFileStream vocabStrm(wordstxt);
std::vector< std::pair<std::size_t, DALM::VocabId> > vlist;
string line;
std::size_t max_fid = 0;
while(getline(vocabStrm, line)) {
const Factor *factor = FactorCollection::Instance().AddFactor(line);
std::size_t fid = factor->GetId();
DALM::VocabId wid = m_vocab->lookup(line.c_str());
VocabMap::value_type entry(factor, wid);
m_vocabMap.insert(entry);
vlist.push_back(std::pair<std::size_t, DALM::VocabId>(fid, wid));
if(max_fid < fid) max_fid = fid;
}
for(std::size_t i = 0; i < m_vocabMap.size(); i++){
m_vocabMap[i] = m_vocab->unk();
}
m_vocabMap.resize(max_fid+1, m_vocab->unk());
std::vector< std::pair<std::size_t, DALM::VocabId> >::iterator it = vlist.begin();
while(it != vlist.end()){
std::pair<std::size_t, DALM::VocabId> &entry = *it;
m_vocabMap[entry.first] = entry.second;
++it;
}
}
DALM::VocabId LanguageModelDALM::GetVocabId(const Factor *factor) const
{
VocabMap::left_map::const_iterator iter;
iter = m_vocabMap.left.find(factor);
if (iter != m_vocabMap.left.end()) {
return iter->second;
}
else {
// not in mapping. Must be UNK
return m_vocab->unk();
}
std::size_t fid = factor->GetId();
return (m_vocabMap.size() > fid)? m_vocabMap[fid] : m_vocab->unk();
}
void LanguageModelDALM::SetParameter(const std::string& key, const std::string& value)
@ -395,13 +488,4 @@ void LanguageModelDALM::SetParameter(const std::string& key, const std::string&
}
}
// Accumulate one word's LM score: while the word still lies inside the first
// (order-1) positions its context is incomplete, so the score goes to the
// provisional prefix total; afterwards it is final.
void LanguageModelDALM::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const
{
  float *accumulator = (wordPos < m_nGramOrder) ? prefixScore : finalizedScore;
  *accumulator += score;
}
}

View File

@ -53,17 +53,12 @@ protected:
DALM::LM *m_lm;
DALM::VocabId wid_start, wid_end;
typedef boost::bimap<const Factor *, DALM::VocabId> VocabMap;
mutable VocabMap m_vocabMap;
mutable std::vector<DALM::VocabId> m_vocabMap;
void CreateVocabMapping(const std::string &wordstxt);
DALM::VocabId GetVocabId(const Factor *factor) const;
private:
LMResult GetValue(DALM::VocabId wid, DALM::State* finalState) const;
LMResult GetValue(const Word &word, DALM::State* finalState) const;
void updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const;
// Convert last words of hypothesis into vocab ids, returning an end pointer.
DALM::VocabId *LastIDs(const Hypothesis &hypo, DALM::VocabId *indices) const {
DALM::VocabId *index = indices;

View File

@ -94,9 +94,16 @@ if $(with-nplm) {
local with-dalm = [ option.get "with-dalm" ] ;
if $(with-dalm) {
lib dalm : : <search>$(with-dalm)/lib ;
if [ path.exists $(with-dalm)/lib/libMurmurHash3.a ] {
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
obj DALM.o : DALMWrapper.cpp dalm MurmurHash3 ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
alias dalmALIAS : DALM.o dalm MurmurHash3 : : : <define>LM_DALM ;
alias dalm-libs : dalm MurmurHash3 ;
} else {
alias dalm-libs : dalm ;
}
obj DALM.o : DALMWrapper.cpp dalm-libs ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
alias dalmALIAS : DALM.o dalm-libs : : : <define>LM_DALM ;
dependencies += dalmALIAS ;
lmmacros += LM_DALM ;
}

View File

@ -58,8 +58,7 @@ const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY(
size_t hash = hash_value(src);
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);
if (iter == cache.end()) {
@ -179,7 +178,7 @@ void PhraseDictionary::ReduceCache() const
// find cutoff for last used time
priority_queue< clock_t > lastUsedTimes;
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.begin();
while( iter != cache.end() ) {
lastUsedTimes.push( iter->second.second );
@ -193,7 +192,7 @@ void PhraseDictionary::ReduceCache() const
iter = cache.begin();
while( iter != cache.end() ) {
if (iter->second.second < cutoffLastUsedTime) {
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iterRemove = iter++;
CacheColl::iterator iterRemove = iter++;
delete iterRemove->second.first;
cache.erase(iterRemove);
} else iter++;

View File

@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <stdexcept>
#include <vector>
#include <string>
#include <boost/unordered_map.hpp>
#ifdef WITH_THREADS
#include <boost/thread/tss.hpp>
@ -54,7 +55,7 @@ class ChartCellCollectionBase;
class ChartRuleLookupManager;
class ChartParser;
class CacheColl : public std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
class CacheColl : public boost::unordered_map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
{
// 1st = hash of source phrase/ address of phrase-table node
// 2nd = all translations

View File

@ -59,7 +59,7 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
CacheColl &cache = GetCache();
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);
if (iter != cache.end()) {

View File

@ -167,7 +167,7 @@ const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollection(
CacheColl &cache = GetCache();
size_t hash = (size_t) ptNode->GetFilePos();
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
CacheColl::iterator iter;
iter = cache.find(hash);

View File

@ -47,8 +47,8 @@ ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
m_count(count),
m_pcfgSum(pcfgSum)
{
assert(phraseSource.empty());
assert(phraseTarget.empty());
assert(phraseSource->empty());
assert(phraseTarget->empty());
m_count = count;
m_pcfgSum = pcfgSum;

View File

@ -506,7 +506,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
const ScoreFeatureManager& featureManager,
const MaybeLog& maybeLogProb )
{
assert(phrasePair.isValid());
assert(phrasePair.IsValid());
const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
float count = phrasePair.GetCount();

View File

@ -236,9 +236,8 @@ tokenize-tuning
factorize-tuning
in: tokenized-tuning
out: factorized-tuning
rerun-on-change: TRAINING:output-factors
default-name: lm/interpolate-tuning.factored
pass-unless: factors
pass-unless: TRAINING:output-factors
parallelizable: yes
error: can't open
error: incompatible number of words in factor

View File

@ -981,6 +981,9 @@ sub define_step {
elsif ($DO_STEP[$i] eq 'TRAINING:create-config' || $DO_STEP[$i] eq 'TRAINING:create-config-interpolated-lm') {
&define_training_create_config($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:factorize-tuning') {
&define_interpolated_lm_factorize_tuning($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:interpolate') {
&define_interpolated_lm_interpolate($i);
}
@ -1512,6 +1515,21 @@ sub define_lm_factorize {
&create_step($step_id,$cmd);
}
# EMS step: factorize the LM-interpolation tuning corpus using the
# TRAINING:output-factors setting, so interpolation tuning sees the same
# factors as the trained language models.
# NOTE(review): reads INPUT-FACTOR:temp-dir but factorizes with
# "OUTPUT-FACTOR" settings — presumably intentional (tuning text is
# target-side); confirm against factorize_one_language.
sub define_interpolated_lm_factorize_tuning {
my ($step_id) = @_;
my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
my ($output,$input) = &get_output_and_input($step_id);
my $factor = &check_backoff_and_get_array("TRAINING:output-factors");
my $dir = &check_and_get("GENERAL:working-dir");
my $temp_dir = &check_and_get("INPUT-FACTOR:temp-dir") . ".$VERSION";
# build the factorization command in a per-version temp dir
my $cmd = "mkdir -p $temp_dir\n"
. &factorize_one_language("OUTPUT-FACTOR",$input,$output,$factor,$step_id);
&create_step($step_id,$cmd);
}
sub define_splitter_train {
my ($step_id,$set) = @_;
@ -2277,6 +2295,7 @@ sub define_interpolated_lm_interpolate {
$interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
my $group = &get("INTERPOLATED-LM:group");
my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
my $cmd = "";
@ -2309,9 +2328,12 @@ sub define_interpolated_lm_interpolate {
$group_string =~ s/ $//;
$group_string .= " ";
while($group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
if ! defined($POSITION{$1});
# die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
# if ! defined($POSITION{$1});
# detect that elsewhere!
if (defined($POSITION{$1})) {
$numbered_string .= $POSITION{$1}.$2;
}
$group_string = $3;
}
chop($numbered_string);
@ -2323,7 +2345,12 @@ sub define_interpolated_lm_interpolate {
$name .= ".$$FACTOR[$factor]" if defined($FACTOR);
$name .= ".order$order";
}
$cmd .= "$interpolation_script --tuning $tuning --name $name --srilm $srilm_dir --lm $lm_list";
my $factored_tuning = $tuning;
if (&backoff_and_get("TRAINING:output-factors")) {
$factored_tuning = "$tuning.factor$factor";
$cmd .= "$scripts/training/reduce-factors.perl --corpus $tuning --reduced $factored_tuning --factor $factor\n";
}
$cmd .= "$interpolation_script --tuning $factored_tuning --name $name --srilm $srilm_dir --lm $lm_list";
$cmd .= " --group \"$numbered_string\"" if defined($group);
$cmd .= "\n";
}

View File

@ -86,15 +86,23 @@ sub split_xml {
my $i = 0;
$MARKUP[0] = "";
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
$MARKUP[$i] .= $1." ";
$line = $2;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
# '<' or '>' occurs in word, but it's not an XML tag
elsif ($line =~ /^\s*(\S+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
else {
die("ERROR: huh? $line\n");
}

View File

@ -70,15 +70,23 @@ sub split_xml {
my $i = 0;
$MARKUP[0] = "";
while($line =~ /\S/) {
# XML tag
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
$MARKUP[$i] .= $1." ";
$line = $2;
}
# non-XML text
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
# '<' or '>' occurs in word, but it's not an XML tag
elsif ($line =~ /^\s*(\S+)(.*)$/) {
$WORD[$i++] = $1;
$MARKUP[$i] = "";
$line = $2;
}
else {
die("ERROR: huh? $line\n");
}

View File

@ -1087,7 +1087,9 @@ if($___RETURN_BEST_DEV) {
if(defined $sparse_weights_file) {
$best_sparse_file = "run$bestit.sparse-weights";
}
create_config($___CONFIG_ORIG, "./moses.ini", get_featlist_from_file("run$bestit.dense"),
my $best_featlist = get_featlist_from_file("run$bestit.dense");
$best_featlist->{"untuneables"} = $featlist->{"untuneables"};
create_config($___CONFIG_ORIG, "./moses.ini", $best_featlist,
$bestit, $bestbleu, $best_sparse_file);
}
else {

View File

@ -0,0 +1,109 @@
#!/usr/bin/perl -w
# Standalone factor-reduction tool: copy a factored corpus keeping only the
# requested factor indices. Extracted from train-model.perl so other
# pipeline steps (e.g. LM interpolation tuning) can reuse it.
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
# tokens look like "surface|pos|lemma|..." — factors separated by '|'
my $___FACTOR_DELIMITER = "|";
# utilities
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
# command line: --corpus <in> --reduced-corpus <out> --factor <i,j,...>
# (Getopt::Long also accepts unambiguous abbreviations such as --reduced)
my ($CORPUS,$REDUCED,$FACTOR);
die("ERROR: wrong syntax when invoking reduce-factors")
unless &GetOptions('corpus=s' => \$CORPUS,
'reduced-corpus=s' => \$REDUCED,
'factor=s' => \$FACTOR);
&reduce_factors($CORPUS,$REDUCED,$FACTOR);
# from train-model.perl
# Write a copy of corpus $full to $reduced, keeping only the factor indices
# listed (comma-separated) in $factors. Reuses an existing output, waits on a
# ".lock" file left by a concurrent producer, and degrades to a symlink when
# all available factors are requested.
sub reduce_factors {
    my ($full,$reduced,$factors) = @_;

    my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);
    print "Reducing factors to produce $reduced @ ".`date`;
    # another process may be generating the same file — wait for it
    while(-e $reduced.".lock") {
        sleep(10);
    }
    if (-e $reduced) {
        print STDERR " $reduced in place, reusing\n";
        return;
    }
    if (-e $reduced.".gz") {
        print STDERR " $reduced.gz in place, reusing\n";
        return;
    }

    # peek at input, to check if we are asked to produce exactly the
    # available factors
    my $inh = open_or_zcat($full);
    my $firstline = <$inh>;
    die "Corpus file $full is empty" unless $firstline;
    close $inh;
    # pick first word
    $firstline =~ s/^\s*//;
    $firstline =~ s/\s.*//;
    # count factors
    my $maxfactorindex = $firstline =~ tr/|/|/;
    if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
        # create just symlink; preserving compression
        my $realfull = $full;
        if (!-e $realfull && -e $realfull.".gz") {
            $realfull .= ".gz";
            $reduced =~ s/(\.gz)?$/.gz/;
        }
        # BUG FIX: the original called safesystem(), which is defined in
        # train-model.perl but NOT in this standalone script, so this branch
        # died with "Undefined subroutine". Use a checked system() instead
        # (system returns 0 on success).
        system("ln -s '$realfull' '$reduced'") == 0
            or die "Failed to create symlink $realfull -> $reduced";
        return;
    }

    # The default is to select the needed factors
    `touch $reduced.lock`;
    *IN = open_or_zcat($full);
    open(OUT,">".$reduced) or die "ERROR: Can't write $reduced";
    my $nr = 0;
    while(<IN>) {
        $nr++;
        # progress dots/counters every 10k / 100k lines
        print STDERR "." if $nr % 10000 == 0;
        print STDERR "($nr)" if $nr % 100000 == 0;
        chomp; s/ +/ /g; s/^ //; s/ $//;
        my $first = 1;
        foreach (split) {
            my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
            # \Q causes to disable metacharacters in regex
            print OUT " " unless $first;
            $first = 0;
            my $first_factor = 1;
            foreach my $outfactor (@INCLUDE) {
                print OUT "|" unless $first_factor;
                $first_factor = 0;
                my $out = $FACTOR[$outfactor];
                die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
                print OUT $out;
            }
        }
        print OUT "\n";
    }
    print STDERR "\n";
    close(OUT);
    close(IN);
    `rm -f $reduced.lock`;
}
# Open $fn for reading, transparently decompressing gzip/bzip2 input.
# Falls back to "$fn.gz" / "$fn.bz2" when the plain file does not exist.
# Returns a readable filehandle; dies if nothing can be opened.
sub open_or_zcat {
my $fn = shift;
my $read = $fn;
$fn = $fn.".gz" if ! -e $fn && -e $fn.".gz";
$fn = $fn.".bz2" if ! -e $fn && -e $fn.".bz2";
# pipe through the appropriate decompressor when compressed
if ($fn =~ /\.bz2$/) {
$read = "$BZCAT $fn|";
} elsif ($fn =~ /\.gz$/) {
$read = "$ZCAT $fn|";
}
my $hdl;
open($hdl,$read) or die "Can't read $fn ($read)";
return $hdl;
}