Merge ../mosesdecoder into perf_moses2

This commit is contained in:
Hieu Hoang 2016-01-13 14:57:20 +00:00
commit 38f999fa3f
20 changed files with 643 additions and 416 deletions

View File

@ -1635,6 +1635,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetNgramFeature.h</locationURI>
</link>
<link>
<name>FF/TargetPreferencesFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetPreferencesFeature.cpp</locationURI>
</link>
<link>
<name>FF/TargetPreferencesFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetPreferencesFeature.h</locationURI>
</link>
<link>
<name>FF/TargetWordInsertionFeature.cpp</name>
<type>1</type>
@ -1995,6 +2005,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/SpanLengthPhraseProperty.h</locationURI>
</link>
<link>
<name>PP/TargetPreferencesPhraseProperty.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetPreferencesPhraseProperty.cpp</locationURI>
</link>
<link>
<name>PP/TargetPreferencesPhraseProperty.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetPreferencesPhraseProperty.h</locationURI>
</link>
<link>
<name>PP/TreeStructurePhraseProperty.h</name>
<type>1</type>

View File

@ -13,7 +13,7 @@ cmake_minimum_required(VERSION 2.8.8)
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
set(KENLM_MAX_ORDER 6)
set(KENLM_MAX_ORDER 6 CACHE STRING "Maximum supported ngram order")
add_definitions(-DKENLM_MAX_ORDER=${KENLM_MAX_ORDER})
@ -64,76 +64,27 @@ set(EXE_LIST
build_binary
)
# Iterate through the executable list
foreach(exe ${EXE_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${exe} ${exe}_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>)
# Link the executable against boost
target_link_libraries(${exe} ${Boost_LIBRARIES} pthread)
# Group executables together
set_target_properties(${exe} PROPERTIES FOLDER executables)
# End for loop
endforeach(exe)
# Install the executable files
install(TARGETS ${EXE_LIST} DESTINATION bin)
AddExes(EXES ${EXE_LIST}
DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
LIBRARIES ${Boost_LIBRARIES} pthread)
# Conditionally build the interpolation code
if(BUILD_INTERPOLATE)
add_subdirectory(interpolate)
endif()
if(BUILD_TESTING)
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
left_test
model_test
partial_test
)
# Iterate through the Boost tests list
foreach(test ${KENLM_BOOST_TESTS_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${test} ${test}.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>)
# Require the following compile flag
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# model_test requires an extra command line parameter
if ("${test}" STREQUAL "model_test")
set(test_params
${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa
)
else()
set(test_params
${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
)
endif()
# Specify command arguments for how to run each unit test
#
# Assuming that foo was defined via add_executable(foo ...),
# the syntax $<TARGET_FILE:foo> gives the full path to the executable.
#
add_test(NAME ${test}_test
COMMAND $<TARGET_FILE:${test}> ${test_params})
# Group unit tests together
set_target_properties(${test} PROPERTIES FOLDER "unit_tests")
# End for loop
endforeach(test)
set(KENLM_BOOST_TESTS_LIST left_test partial_test)
AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
LIBRARIES ${Boost_LIBRARIES} pthread
TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa)
# model_test requires an extra command line parameter
KenLMAddTest(TEST model_test
DEPENDS $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>
LIBRARIES ${Boost_LIBRARIES} pthread
TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa)
endif()

View File

@ -52,36 +52,16 @@ set_target_properties(lmplz PROPERTIES FOLDER executables)
if(BUILD_TESTING)
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
adjust_counts_test
corpus_count_test
)
# Iterate through the Boost tests list
foreach(test ${KENLM_BOOST_TESTS_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${test} ${test}.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_common> $<TARGET_OBJECTS:kenlm_builder> $<TARGET_OBJECTS:kenlm_util>)
# Require the following compile flag
set_target_properties(${test} PROPERTIES COMPILE_FLAGS "-DBOOST_TEST_DYN_LINK -DBOOST_PROGRAM_OPTIONS_DYN_LINK")
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# Specify command arguments for how to run each unit test
#
# Assuming that foo was defined via add_executable(foo ...),
# the syntax $<TARGET_FILE:foo> gives the full path to the executable.
#
add_test(NAME ${test}_test
COMMAND $<TARGET_FILE:${test}>)
# Group unit tests together
set_target_properties(${test} PROPERTIES FOLDER "unit_tests")
# End for loop
endforeach(test)
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
adjust_counts_test
corpus_count_test
)
AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
DEPENDS $<TARGET_OBJECTS:kenlm>
$<TARGET_OBJECTS:kenlm_common>
$<TARGET_OBJECTS:kenlm_util>
$<TARGET_OBJECTS:kenlm_builder>
LIBRARIES ${Boost_LIBRARIES} pthread)
endif()

View File

@ -269,7 +269,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
std::size_t same = full->end() - 1 - different;
// STEP 1: Output all the n-grams that changed.
for (; lower_valid >= &streams[same]; --lower_valid) {
for (; lower_valid >= streams.begin() + same; --lower_valid) {
uint64_t order_minus_1 = lower_valid - streams_begin;
if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1])
(*lower_valid)->Value().Mark();

View File

@ -42,6 +42,7 @@
#include "moses/FF/ControlRecombination.h"
#include "moses/FF/ConstrainedDecoding.h"
#include "moses/FF/SoftSourceSyntacticConstraintsFeature.h"
#include "moses/FF/TargetPreferencesFeature.h"
#include "moses/FF/CoveredReferenceFeature.h"
#include "moses/FF/TreeStructureFeature.h"
#include "moses/FF/SoftMatchingFeature.h"
@ -254,6 +255,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(CoveredReferenceFeature);
MOSES_FNAME(SourceGHKMTreeInputMatchFeature);
MOSES_FNAME(SoftSourceSyntacticConstraintsFeature);
MOSES_FNAME(TargetPreferencesFeature);
MOSES_FNAME(TreeStructureFeature);
MOSES_FNAME(SoftMatchingFeature);
MOSES_FNAME(DynamicCacheBasedLanguageModel);

View File

@ -193,7 +193,7 @@ void SoftSourceSyntacticConstraintsFeature::LoadLabelSet(std::string &filename,
if ( foundSourceLabelIndex != m_sourceLabels.end() ) {
labelSet.insert(foundSourceLabelIndex->second);
} else {
FEATUREVERBOSE(2, "Ignoring unknown source label \"" << label << "\" "
FEATUREVERBOSE(2, "Ignoring undefined source label \"" << label << "\" "
<< "from core source label set file " << filename << "."
<< std::endl);
}
@ -232,7 +232,7 @@ void SoftSourceSyntacticConstraintsFeature::LoadTargetSourceLeftHandSideJointCou
boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( sourceLabel );
UTIL_THROW_IF2(foundSourceLabelIndex == m_sourceLabels.end(), GetScoreProducerDescription()
<< ": Target/source label joint count file " << m_targetSourceLHSJointCountFile
<< " contains unknown source label \"" << sourceLabel << "\".");
<< " contains undefined source label \"" << sourceLabel << "\".");
const Factor* targetLabelFactor = factorCollection.AddFactor(targetLabel,true);

View File

@ -0,0 +1,408 @@
#include <vector>
#include <limits>
#include <boost/math/special_functions/fpclassify.hpp>
#include <assert.h>
#include "TargetPreferencesFeature.h"
#include "moses/StaticData.h"
#include "moses/InputFileStream.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/Hypothesis.h"
#include "moses/ChartHypothesis.h"
#include "moses/ChartManager.h"
#include "moses/FactorCollection.h"
#include "moses/TreeInput.h"
#include "moses/PP/TargetPreferencesPhraseProperty.h"
using namespace std;
namespace Moses
{
// Adds `cost` to the value stored for `label`. A label that has no entry yet
// gets a new entry whose value is `cost`.
void TargetPreferencesFeatureState::AddProbabilityForLHSLabel(size_t label, double cost)
{
  typedef std::map<size_t,double> LabelProbabilityMap;

  LabelProbabilityMap::iterator entry = m_probabilitiesForLHSLabels.find(label);
  if ( entry != m_probabilitiesForLHSLabels.end() ) {
    entry->second += cost;
    return;
  }
  m_probabilitiesForLHSLabels.insert(LabelProbabilityMap::value_type(label, cost));
}
// Divides every stored value by `denominator`. No check is made on the
// denominator here; the caller decides whether normalisation makes sense.
void TargetPreferencesFeatureState::NormalizeProbabilitiesForLHSLabels(double denominator)
{
  typedef std::map<size_t,double>::iterator EntryIterator;

  const EntryIterator last = m_probabilitiesForLHSLabels.end();
  EntryIterator entry = m_probabilitiesForLHSLabels.begin();
  while ( entry != last ) {
    entry->second /= denominator;
    ++entry;
  }
}
// Looks up the value stored for `label`.
// `isMatch` is set to true and the stored value returned when the label has
// an entry; otherwise `isMatch` is set to false and 0 is returned.
double TargetPreferencesFeatureState::GetProbabilityForLHSLabel(size_t label, bool &isMatch) const
{
  const std::map<size_t,double>::const_iterator hit = m_probabilitiesForLHSLabels.find(label);
  isMatch = ( hit != m_probabilitiesForLHSLabels.end() );
  return isMatch ? hit->second : 0;
}
// Hash of the state, consistent with operator==.
// When states are not distinguished every state hashes to 0. Otherwise the
// hash covers the number of labels and the label indices themselves; the
// stored probabilities are deliberately left out, exactly as operator==
// compares labels only.
size_t TargetPreferencesFeatureState::hash() const
{
  if (!m_distinguishStates) {
    return 0;
  }

  size_t seed = 0;
  boost::hash_combine(seed, m_probabilitiesForLHSLabels.size());
  for (std::map<size_t,double>::const_iterator entry = m_probabilitiesForLHSLabels.begin();
       entry != m_probabilitiesForLHSLabels.end(); ++entry) {
    boost::hash_combine(seed, entry->first);
  }
  return seed;
}
// Equality used for hypothesis recombination.
// When states are not distinguished all states compare equal. Otherwise two
// states are equal when they hold the same set of label indices; the
// probabilities attached to the labels are not compared.
// Throws (via UTIL_THROW_IF2) when `other` is not a
// TargetPreferencesFeatureState.
bool TargetPreferencesFeatureState::operator==(const FFState& other) const
{
  if (!m_distinguishStates) {
    return true;
  }
  if (this == &other) {
    return true;
  }

  const TargetPreferencesFeatureState* otherState =
    dynamic_cast<const TargetPreferencesFeatureState*>(&other);
  UTIL_THROW_IF2(otherState == NULL, "Wrong state type");

  const std::map<size_t,double> &otherProbabilities = otherState->m_probabilitiesForLHSLabels;
  if (m_probabilitiesForLHSLabels.size() != otherProbabilities.size()) {
    return false;
  }

  // Both maps are ordered by key and have the same size, so walking them in
  // lock-step compares the key sets.
  std::map<size_t,double>::const_iterator thisIt = m_probabilitiesForLHSLabels.begin();
  std::map<size_t,double>::const_iterator otherIt = otherProbabilities.begin();
  for (; thisIt != m_probabilitiesForLHSLabels.end(); ++thisIt, ++otherIt) {
    if (thisIt->first != otherIt->first) {
      return false;
    }
  }
  return true;
}
// Constructs the feature with two dense score components and reads the
// feature line parameters (see SetParameter).
//
// The special label indices start out at a sentinel value that cannot match a
// loaded label index by accident; m_GlueTopLabel receives its real value in
// LoadLabelSet().
TargetPreferencesFeature::TargetPreferencesFeature(const std::string &line)
  : StatefulFeatureFunction(2, line)
  , m_featureVariant(0)
  , m_distinguishStates(false)
  , m_noMismatches(false)
  , m_XRHSLabel(std::numeric_limits<size_t>::max())
  , m_XLHSLabel(std::numeric_limits<size_t>::max())
  , m_GlueTopLabel(std::numeric_limits<size_t>::max())
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters();
  VERBOSE(1, " Done." << std::endl);
  VERBOSE(1, " Feature variant: " << m_featureVariant << "." << std::endl);
}

TargetPreferencesFeature::~TargetPreferencesFeature()
{}
// Handles the feature-specific configuration keys:
//   label-set-file, unknown-word-labels-file, variant,
//   distinguish-states, no-mismatches.
// Any other key is forwarded to StatefulFeatureFunction::SetParameter.
void TargetPreferencesFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "label-set-file") {
    m_labelSetFile = value;
    return;
  }
  if (key == "unknown-word-labels-file") {
    m_unknownLeftHandSideFile = value;
    return;
  }
  if (key == "variant") {
    m_featureVariant = Scan<size_t>(value);
    return;
  }
  if (key == "distinguish-states") {
    m_distinguishStates = Scan<bool>(value);
    return;
  }
  if (key == "no-mismatches") {
    m_noMismatches = Scan<bool>(value);
    return;
  }
  StatefulFeatureFunction::SetParameter(key, value);
}
// Loads the resources configured through SetParameter: first the label set,
// then the left-hand side label counts for unknown words. `opts` is not used
// by this function.
void TargetPreferencesFeature::Load(AllOptions::ptr const& opts)
{
// don't change the loading order!
// LoadUnknownLeftHandSideFile() resolves its labels against m_labels, which
// LoadLabelSet() fills.
LoadLabelSet();
LoadUnknownLeftHandSideFile();
}
// Reads the label set file (one "<label> <index>" pair per line) into
// m_labels (label -> index) and m_labelsByIndex (index -> label), and records
// the index of the special label "GlueTop" in m_GlueTopLabel.
//
// Throws (via UTIL_THROW2 / UTIL_THROW_IF2) when a non-blank line cannot be
// parsed, when a label occurs more than once, or when the special label is
// missing from the file.
void TargetPreferencesFeature::LoadLabelSet()
{
  FEATUREVERBOSE(2, "Loading label set from file " << m_labelSetFile << " ...");
  InputFileStream inFile(m_labelSetFile);

  // read label set
  std::string line;
  m_labels.clear();
  m_labelsByIndex.clear();
  while (getline(inFile, line)) {
    // Tolerate blank lines instead of treating them as label entries.
    if (line.find_first_not_of(" \t\r") == std::string::npos) {
      continue;
    }

    std::istringstream tokenizer(line);
    std::string label;
    size_t index = 0;
    // Stream extraction signals failure through the stream state rather than
    // by throwing, so the result has to be tested explicitly; otherwise a
    // malformed line would leave `index` without a meaningful value.
    if (!(tokenizer >> label >> index)) {
      UTIL_THROW2(GetScoreProducerDescription()
                  << ": Error reading label set file " << m_labelSetFile << " .");
    }

    std::pair< boost::unordered_map<std::string,size_t>::iterator, bool > inserted =
      m_labels.insert( std::pair<std::string,size_t>(label,index) );
    UTIL_THROW_IF2(!inserted.second, GetScoreProducerDescription()
                   << ": Label set file " << m_labelSetFile << " should contain each label only once.");

    if (index >= m_labelsByIndex.size()) {
      m_labelsByIndex.resize(index+1);
    }
    m_labelsByIndex[index] = label;
  }
  inFile.Close();

  // Labels that must be present in every label set file.
  std::list<std::string> specialLabels;
  specialLabels.push_back("GlueTop");

  for (std::list<std::string>::const_iterator iter=specialLabels.begin();
       iter!=specialLabels.end(); ++iter) {
    boost::unordered_map<std::string,size_t>::iterator found = m_labels.find(*iter);
    UTIL_THROW_IF2(found == m_labels.end(), GetScoreProducerDescription()
                   << ": Label set file " << m_labelSetFile << " should contain an entry for the special label \"" << *iter << "\".");
    if (found->first == "GlueTop") {
      m_GlueTopLabel = found->second;
    }
  }
  FEATUREVERBOSE2(2, " Done." << std::endl);
}
// Make sure to call this method _after_ LoadLabelSet()
//
// Reads "<label> <count>" lines from the unknown-word labels file and turns
// the counts into the values stored in m_unknownLHSProbabilities: each
// accumulated count is divided by (sum of accepted counts + number of labels
// in the label set).
//
// Labels that are not part of the label set are reported and skipped, as are
// blank lines. Throws (via UTIL_THROW_IF2) when a known label is not followed
// by a readable count.
void TargetPreferencesFeature::LoadUnknownLeftHandSideFile()
{
  FEATUREVERBOSE(2, "Loading left-hand side labels for unknowns from file " << m_unknownLeftHandSideFile << std::endl);
  InputFileStream inFile(m_unknownLeftHandSideFile);

  // read left-hand side labels for unknowns
  std::string line;
  m_unknownLHSProbabilities.clear();
  double countsSum = 0.0;
  while (getline(inFile, line)) {
    istringstream tokenizer(line);
    std::string label;
    if ( !(tokenizer >> label) ) {
      continue; // blank line
    }

    boost::unordered_map<std::string,size_t>::iterator found = m_labels.find( label );
    if ( found == m_labels.end() ) {
      FEATUREVERBOSE(1, "WARNING: undefined label \"" << label << "\" in file " << m_unknownLeftHandSideFile << std::endl);
      continue;
    }

    // Extraction failure is reported through the stream state; without this
    // test a missing or malformed count would be used as if it were valid.
    double count = 0.0;
    UTIL_THROW_IF2(!(tokenizer >> count), GetScoreProducerDescription()
                   << ": Error reading count for label \"" << label << "\" in file "
                   << m_unknownLeftHandSideFile << ".");

    // Repeated labels accumulate their counts.
    m_unknownLHSProbabilities[found->second] += count;
    countsSum += count;
  }

  // compute probabilities from counts
  countsSum += (double)m_labels.size();
  for (std::map<size_t,double>::iterator iter=m_unknownLHSProbabilities.begin();
       iter!=m_unknownLHSProbabilities.end(); ++iter) {
    iter->second /= countsSum;
  }

  IFFEATUREVERBOSE(3) {
    for (std::map<size_t,double>::iterator iter=m_unknownLHSProbabilities.begin();
         iter!=m_unknownLHSProbabilities.end(); ++iter) {
      FEATUREVERBOSE(3, GetScoreProducerDescription() << "::LoadUnknownLeftHandSideFile(): " << iter->first << " " << iter->second << std::endl);
    }
  }

  inFile.Close();
}
// Chart-decoding scoring function.
//
// Reads the "TargetPreferences" phrase property of the rule applied by
// `hypo`, combines it with the label distributions stored in the states of
// the sub-hypotheses, and
//   * adds two dense scores to `accumulator`:
//       [0] log of the overall tree probability (0 when that probability is
//           0, or -infinity when no-mismatches is set and the rule is not a
//           glue rule),
//       [1] 1 when the overall tree probability is 0, else 0;
//   * returns a newly allocated state holding the normalised probabilities
//     per left-hand side label (ownership passes to the caller).
//
// Target phrases without the property are accepted only when their first
// word is an OOV; they receive the unknown-word label distribution.
//
// Throws (via UTIL_THROW_IF2) on inconsistent property data, on a missing
// property for a non-OOV phrase, and on non-normal probability values.
FFState* TargetPreferencesFeature::EvaluateWhenApplied(
  const ChartHypothesis& hypo,
  int featureID, // used to index the state in the previous hypotheses
  ScoreComponentCollection* accumulator) const
{
  // Restores the precision of std::cerr on every exit path, including
  // exceptions.
  struct CerrPrecisionGuard {
    std::streamsize saved;
    CerrPrecisionGuard() : saved(std::cerr.precision()) {}
    ~CerrPrecisionGuard() {
      std::cerr.precision(saved);
    }
  };

  // Owns the new state until it is handed to the caller, so that it is not
  // leaked when one of the consistency checks below throws.
  struct StateOwner {
    TargetPreferencesFeatureState *ptr;
    explicit StateOwner(TargetPreferencesFeatureState *p) : ptr(p) {}
    ~StateOwner() {
      delete ptr;
    }
    TargetPreferencesFeatureState *Release() {
      TargetPreferencesFeatureState *released = ptr;
      ptr = NULL;
      return released;
    }
  };

  CerrPrecisionGuard cerrPrecisionGuard;
  std::cerr.precision(20); // TODO: remove. just for debug purposes.

  // dense scores
  std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 2

  // state: used to store tree probabilities of partial hypotheses
  // and access the respective tree probabilities of subderivations
  StateOwner stateOwner(new TargetPreferencesFeatureState(m_distinguishStates));
  TargetPreferencesFeatureState *state = stateOwner.ptr;

  size_t nNTs = 1;
  double overallTreeProbability = 0.0;
  bool isGlueGrammarRule = false;

  // read TargetPreferences property
  const TargetPhrase &currTarPhr = hypo.GetCurrTargetPhrase();

  FEATUREVERBOSE(2, "Phrase: " << currTarPhr << std::endl);

  if (const PhraseProperty *property = currTarPhr.GetProperty("TargetPreferences")) {

    const TargetPreferencesPhraseProperty *targetPreferencesPhraseProperty = static_cast<const TargetPreferencesPhraseProperty*>(property);

    nNTs = targetPreferencesPhraseProperty->GetNumberOfNonTerminals();
    // The count includes the left-hand side, so 0 is invalid and would make
    // nNTs-1 wrap around below.
    UTIL_THROW_IF2(nNTs == 0, GetScoreProducerDescription()
                   << ": TargetPreferences property with zero non-terminals. Flawed property?");
    double totalCount = targetPreferencesPhraseProperty->GetTotalCount();

    // get index map for underlying hypotheses
    const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
      currTarPhr.GetAlignNonTerm().GetNonTermIndexMap();

    // retrieve states from previous hypotheses, if any
    // (entries start out as null pointers)
    std::vector< const TargetPreferencesFeatureState* > prevStatesByNonTerminal(nNTs-1);

    if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
      size_t nonTerminalNumber = 0;

      for (size_t phrasePos=0; phrasePos<currTarPhr.GetSize(); ++phrasePos) {
        // consult rule for either word or non-terminal
        const Word &word = currTarPhr.GetWord(phrasePos);
        if ( !word.IsNonTerminal() ) {
          continue;
        }

        // non-terminal: consult subderivation
        UTIL_THROW_IF2(nonTerminalNumber >= prevStatesByNonTerminal.size(), GetScoreProducerDescription()
                       << ": Target phrase has more non-terminals than its TargetPreferences property declares.");
        size_t nonTermIndex = nonTermIndexMap[phrasePos];
        const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
        const TargetPreferencesFeatureState* prevState =
          static_cast<const TargetPreferencesFeatureState*>(prevHypo->GetFFState(featureID));
        prevStatesByNonTerminal[nonTerminalNumber] = prevState;

        IFFEATUREVERBOSE(2) {
          // some log output that is not required in any way for the functionality
          const std::map<size_t,double> &prevHypoTreeProbabilities =
            prevStatesByNonTerminal[nonTerminalNumber]->GetProbabilitiesForLHSLabels();
          FEATUREVERBOSE(2, "Previous tree probs:");
          for (std::map<size_t,double>::const_iterator iter=prevHypoTreeProbabilities.begin();
               iter!=prevHypoTreeProbabilities.end(); ++iter) {
            FEATUREVERBOSE2(2, " " << m_labelsByIndex[iter->first] << " " << iter->second);
          }
          FEATUREVERBOSE2(2, std::endl);
        }

        ++nonTerminalNumber;
      }
    }

    // inspect labelled rule items
    const std::list<TargetPreferencesPhrasePropertyItem> &targetPreferencesItems = targetPreferencesPhraseProperty->GetTargetPreferencesItems();

    for (std::list<TargetPreferencesPhrasePropertyItem>::const_iterator targetPreferencesItem = targetPreferencesItems.begin();
         targetPreferencesItem != targetPreferencesItems.end(); ++targetPreferencesItem) {

      const std::list<size_t> &targetPreferencesRHS = targetPreferencesItem->GetTargetPreferencesRHS();
      const std::list< std::pair<size_t,float> > &targetPreferencesLHSList = targetPreferencesItem->GetTargetPreferencesLHSList();

      // Checked at run time (not only with assert) because the loop below
      // indexes prevStatesByNonTerminal by position in this list.
      UTIL_THROW_IF2(targetPreferencesRHS.size() != nNTs-1, GetScoreProducerDescription()
                     << ": Number of right-hand side labels does not match the number of non-terminals. Flawed property?");

      size_t currentTargetLabelsMismatches = nNTs - 1;
      double matchingLabelsProbabilityProduct = 1.0;

      size_t nonTerminalNumber=0;
      for (std::list<size_t>::const_iterator targetPreferencesRHSIt = targetPreferencesRHS.begin();
           targetPreferencesRHSIt != targetPreferencesRHS.end(); ++targetPreferencesRHSIt, ++nonTerminalNumber) {

        const TargetPreferencesFeatureState *prevState = prevStatesByNonTerminal[nonTerminalNumber];
        UTIL_THROW_IF2(prevState == NULL, GetScoreProducerDescription()
                       << ": Target phrase has fewer non-terminals than its TargetPreferences property declares.");

        bool isLabelMatch = false;
        double matchingLabelsProbability =
          prevState->GetProbabilityForLHSLabel(*targetPreferencesRHSIt, isLabelMatch);
        matchingLabelsProbabilityProduct *= matchingLabelsProbability;

        if ( isLabelMatch ) {
          currentTargetLabelsMismatches -= 1;
        }
      }

      FEATUREVERBOSE(2, "matchingLabelsProbabilityProduct = " << matchingLabelsProbabilityProduct << std::endl);

      // LHS labels seen with this RHS
      for (std::list< std::pair<size_t,float> >::const_iterator targetPreferencesLHSIt = targetPreferencesLHSList.begin();
           targetPreferencesLHSIt != targetPreferencesLHSList.end(); ++targetPreferencesLHSIt) {

        size_t targetPreferenceLHS = targetPreferencesLHSIt->first;

        if ( targetPreferenceLHS == m_GlueTopLabel ) {
          isGlueGrammarRule = true;
        }

        // proceed with the actual probability computations
        double ruleTargetPreferenceCount = targetPreferencesLHSIt->second;
        double ruleTargetPreferenceProbability = ruleTargetPreferenceCount / totalCount;

        FEATUREVERBOSE(2, " ruleTargetPreferenceProbability = " << ruleTargetPreferenceProbability << std::endl);

        double weightedTargetPreferenceRuleProbability = ruleTargetPreferenceProbability * matchingLabelsProbabilityProduct;
        if ( weightedTargetPreferenceRuleProbability != 0 ) {
          state->AddProbabilityForLHSLabel(targetPreferenceLHS, weightedTargetPreferenceRuleProbability);
        }
        overallTreeProbability += weightedTargetPreferenceRuleProbability;
      }
    }

    IFFEATUREVERBOSE(2) {
      FEATUREVERBOSE(2, "overallTreeProbability = " << overallTreeProbability);
      if ( overallTreeProbability > 1.0001 ) { // account for some rounding error
        FEATUREVERBOSE2(2, " -- WARNING: overallTreeProbability > 1");
      }
      FEATUREVERBOSE2(2, std::endl);
    }

    if ( overallTreeProbability != 0 ) {
      UTIL_THROW_IF2(!boost::math::isnormal(overallTreeProbability), GetScoreProducerDescription()
                     << ": Oops. Numerical precision issues.");
      state->NormalizeProbabilitiesForLHSLabels(overallTreeProbability);
    }

  } else {

    // abort with error message if the phrase does not translate an unknown word
    // (an empty phrase has no first word to inspect and is rejected as well)
    UTIL_THROW_IF2(currTarPhr.GetSize() == 0 || !currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
                   << ": Missing TargetPreferences property. Please check phrase table and glue rules.");

    // unknown word
    overallTreeProbability = 1.0;

    for (std::map<size_t,double>::const_iterator iter=m_unknownLHSProbabilities.begin();
         iter!=m_unknownLHSProbabilities.end(); ++iter) {
      // update state
      state->AddProbabilityForLHSLabel(iter->first, iter->second);
    }
  }

  FEATUREVERBOSE(2, "-> OVERALLTREEPROB = " << overallTreeProbability << std::endl);

  // add scores

  // tree probability (preference grammar style)
  newScores[0] = (overallTreeProbability == 0 ? 0 : std::log(overallTreeProbability) );
  if ( m_noMismatches && (overallTreeProbability == 0) && !isGlueGrammarRule ) {
    newScores[0] = -std::numeric_limits<float>::infinity();
  }
  // tree mismatch penalty
  // TODO: deactivate the tree mismatch penalty score component automatically if feature configuration parameter no-mismatches=true
  newScores[1] = (overallTreeProbability == 0 ? 1 : 0 );

  accumulator->PlusEquals(this, newScores);

  // Hand the state over to the caller; the precision guard restores
  // std::cerr when it goes out of scope.
  return stateOwner.Release();
}
}

View File

@ -0,0 +1,121 @@
#pragma once
#include <string>
#include <map>
#include <iostream>
#include <boost/unordered_map.hpp>
#include "StatefulFeatureFunction.h"
#include "FFState.h"
#include "util/exception.hh"
#include <stdint.h>
namespace Moses
{
// Decoder state of the TargetPreferences feature: a map from left-hand side
// label index to a probability value.
//
// The `distinguishStates` flag controls recombination: when it is false,
// hash() and operator== treat all states as identical.
class TargetPreferencesFeatureState : public FFState
{
public:
  TargetPreferencesFeatureState(bool distinguishStates)
    : m_distinguishStates(distinguishStates)
  {}

  // Adds `cost` to the value of `label`, creating the entry if necessary.
  void AddProbabilityForLHSLabel(size_t label, double cost);

  // Divides all stored values by `denominator`.
  void NormalizeProbabilitiesForLHSLabels(double denominator);

  // Read-only access to the complete label -> value map.
  const std::map<size_t,double> &GetProbabilitiesForLHSLabels() const {
    return m_probabilitiesForLHSLabels;
  }

  // Value stored for `label`; `isMatch` reports whether an entry exists.
  double GetProbabilityForLHSLabel(size_t label, bool &isMatch) const;

  size_t hash() const;
  virtual bool operator==(const FFState& other) const;

private:
  const bool m_distinguishStates;
  std::map<size_t,double> m_probabilitiesForLHSLabels;
};
// Stateful feature function scoring target-side label preferences in chart
// decoding. It produces its scores in the ChartHypothesis overload of
// EvaluateWhenApplied; the other evaluation hooks are no-ops, and the
// phrase-based overload throws.
class TargetPreferencesFeature : public StatefulFeatureFunction
{
public:
  TargetPreferencesFeature(const std::string &line);

  ~TargetPreferencesFeature();

  // The feature does not depend on particular factors.
  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  // Fresh state without any label probabilities.
  virtual const FFState* EmptyHypothesisState(const InputType &input) const {
    return new TargetPreferencesFeatureState(m_distinguishStates);
  }

  void SetParameter(const std::string& key, const std::string& value);

  void Load(AllOptions::ptr const& opts);

  // No contribution at this stage.
  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedFutureScore) const
  {}

  // No contribution at this stage.
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const
  {}

  // No contribution at this stage.
  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const
  {}

  // Phrase-based decoding is not supported: this overload always throws.
  FFState* EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    const FFState* prev_state,
    ScoreComponentCollection* accumulator) const {
    UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for phrase-based decoding.");
    return new TargetPreferencesFeatureState(m_distinguishStates);
  }

  // Chart decoding: computes the scores and the new state.
  FFState* EvaluateWhenApplied(
    const ChartHypothesis& cur_hypo,
    int featureID, // used to index the state in the previous hypotheses
    ScoreComponentCollection* accumulator) const;

private:
  // configuration (see SetParameter)
  std::string m_labelSetFile;
  std::string m_unknownLeftHandSideFile;
  size_t m_featureVariant;
  bool m_distinguishStates;
  bool m_noMismatches;

  // label set: label -> index and index -> label
  mutable boost::unordered_map<std::string,size_t> m_labels;
  mutable std::vector<std::string> m_labelsByIndex;

  // indices of special labels
  mutable size_t m_XRHSLabel;
  mutable size_t m_XLHSLabel;
  mutable size_t m_GlueTopLabel;

  // label index -> value used for unknown words
  std::map<size_t,double> m_unknownLHSProbabilities;

  void LoadLabelSet();
  void LoadUnknownLeftHandSideFile();
};
}

View File

@ -1723,8 +1723,8 @@ OutputSurface(std::ostream &out, Hypothesis const& edge, bool const recursive) c
out << *factor;
for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
UTIL_THROW_IF2(factor==NULL,"No factor "<<i<<" at position "<< pos);
out << fd << *factor;
if (factor) out << fd << *factor;
else out << fd << UNKNOWN_FACTOR;
}
if(markUnknown && word.IsOOV()) {

View File

@ -1,123 +0,0 @@
#include "moses/PP/TargetPreferencesPhraseProperty.h"
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <queue>
#include <assert.h>
#include <limits>
namespace Moses
{
// Parses the textual value of a TargetPreferences phrase property into
// m_nNTs, m_totalCount and m_labelItems, then prunes the parsed label
// entries when there are more than N (= 50) of them.
//
// Layout of `value` as read by the code below (whitespace-separated tokens):
//   <number of non-terminals incl. LHS> <total rule count>
//   followed by zero or more items, each of the form
//     - if the number of non-terminals is 1:
//         (<LHS label> <count>)... until the end of the input
//     - otherwise:
//         <RHS label> x (nNTs-1)  <RHS count>  <number of LHS entries>
//         (<LHS label> <count>) x <number of LHS entries>
//
// Throws via UTIL_THROW2 when a token cannot be read.
void TargetPreferencesPhraseProperty::ProcessValue(const std::string &value)
{
std::istringstream tokenizer(value);
if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read number of non-terminals. Flawed property?");
}
// NOTE(review): assert-only validation; this check is compiled out when
// NDEBUG is defined.
assert( m_nNTs > 0 );
if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read overall rule count. Flawed property?");
}
// NOTE(review): assert-only validation, see above.
assert( m_totalCount > 0.0 );
// read labelled rule items
// Collects every LHS count read below; used afterwards to find the pruning
// threshold.
std::priority_queue<float> ruleLabelledCountsPQ;
// NOTE(review): peek() sees trailing whitespace as input, so a value ending
// in whitespace presumably enters the loop once more and fails to read a
// token -- confirm against the property writer.
while (tokenizer.peek() != EOF) {
// NOTE(review): if the exception thrown by UTIL_THROW2 derives from
// std::exception, the catch block at the end of this try also intercepts the
// specific errors raised inside it and replaces them with the generic
// "Read error" message -- confirm against util/exception.hh.
try {
TargetPreferencesPhrasePropertyItem item;
// Without an explicit LHS count in the input, the LHS loop below is bounded
// only by the end of the input.
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
if (m_nNTs == 1) {
item.m_labelsRHSCount = m_totalCount;
} else { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
for (size_t i=0; i<m_nNTs-1; ++i) { // RHS non-terminal labels
size_t labelRHS;
if (! (tokenizer >> labelRHS) ) { // RHS non-terminal label
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read right-hand side label index. Flawed property?");
}
item.m_labelsRHS.push_back(labelRHS);
}
if (! (tokenizer >> item.m_labelsRHSCount)) {
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read right-hand side count. Flawed property?");
}
if (! (tokenizer >> numberOfLHSsGivenRHS)) {
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read number of left-hand sides. Flawed property?");
}
}
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS non-terminal labels seen with this RHS
size_t labelLHS;
if (! (tokenizer >> labelLHS)) { // LHS non-terminal label
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read left-hand side label index. Flawed property?");
}
float ruleLabelledCount;
if (! (tokenizer >> ruleLabelledCount)) {
UTIL_THROW2("TargetPreferencesPhraseProperty: Not able to read count. Flawed property?");
}
item.m_labelsLHSList.push_back( std::make_pair(labelLHS,ruleLabelledCount) );
ruleLabelledCountsPQ.push(ruleLabelledCount);
}
m_labelItems.push_back(item);
} catch (const std::exception &e) {
UTIL_THROW2("TargetPreferencesPhraseProperty: Read error. Flawed property?");
}
}
// keep only top N label vectors
const size_t N=50;
if (ruleLabelledCountsPQ.size() > N) {
// After popping N times, this holds the N-th largest LHS count, which serves
// as the pruning threshold.
float topNRuleLabelledCount = std::numeric_limits<int>::max();
for (size_t i=0; !ruleLabelledCountsPQ.empty() && i<N; ++i) {
topNRuleLabelledCount = ruleLabelledCountsPQ.top();
ruleLabelledCountsPQ.pop();
}
// Number of LHS entries kept so far across all items.
size_t nKept=0;
std::list<TargetPreferencesPhrasePropertyItem>::iterator itemIter=m_labelItems.begin();
while (itemIter!=m_labelItems.end()) {
// An item whose RHS count is below the threshold is dropped as a whole.
if (itemIter->m_labelsRHSCount < topNRuleLabelledCount) {
itemIter = m_labelItems.erase(itemIter);
} else {
std::list< std::pair<size_t,float> >::iterator itemLHSIter=(itemIter->m_labelsLHSList).begin();
while (itemLHSIter!=(itemIter->m_labelsLHSList).end()) {
if (itemLHSIter->second < topNRuleLabelledCount) {
itemLHSIter = (itemIter->m_labelsLHSList).erase(itemLHSIter);
} else {
// Once N entries have been kept, the remainder of this item's LHS list is
// discarded (this also caps ties at the threshold).
if (nKept >= N) {
itemLHSIter = (itemIter->m_labelsLHSList).erase(itemLHSIter,(itemIter->m_labelsLHSList).end());
} else {
++nKept;
++itemLHSIter;
}
}
}
// Items left without any LHS entry are removed.
if ((itemIter->m_labelsLHSList).empty()) {
itemIter = m_labelItems.erase(itemIter);
} else {
++itemIter;
}
}
}
}
};
} // namespace Moses

View File

@ -1,71 +0,0 @@
#pragma once
#include "moses/PP/PhraseProperty.h"
#include "util/exception.hh"
#include <string>
#include <list>
namespace Moses
{
// One entry of a TargetPreferences phrase property: a vector of right-hand
// side label indices together with its count and the left-hand side labels
// (with counts) recorded for it. Filled in by
// TargetPreferencesPhraseProperty, which is a friend for that purpose.
class TargetPreferencesPhrasePropertyItem
{
  friend class TargetPreferencesPhraseProperty;

public:
  TargetPreferencesPhrasePropertyItem() {}

  // Count stored for the right-hand side label vector.
  float GetTargetPreferencesRHSCount() const {
    return m_labelsRHSCount;
  }

  // Right-hand side label indices.
  const std::list<size_t> &GetTargetPreferencesRHS() const {
    return m_labelsRHS;
  }

  // (left-hand side label index, count) pairs for this right-hand side.
  const std::list< std::pair<size_t,float> > &GetTargetPreferencesLHSList() const {
    return m_labelsLHSList;
  }

private:
  float m_labelsRHSCount;
  std::list<size_t> m_labelsRHS; // should be of size nNTs-1 (empty if initial rule, i.e. no right-hand side non-terminals)
  std::list< std::pair<size_t,float> > m_labelsLHSList; // list of left-hand sides for this right-hand side, with counts
};
// Phrase property carrying target label preferences of a rule: the number of
// non-terminals, the total rule count and the list of labelled items. The
// members are populated by ProcessValue().
class TargetPreferencesPhraseProperty : public PhraseProperty
{
public:
  TargetPreferencesPhraseProperty() {}

  // Parses the textual property value.
  virtual void ProcessValue(const std::string &value);

  size_t GetNumberOfNonTerminals() const {
    return m_nNTs;
  }

  float GetTotalCount() const {
    return m_totalCount;
  }

  const std::list<TargetPreferencesPhrasePropertyItem> &GetTargetPreferencesItems() const {
    return m_labelItems;
  }

  // The raw value string is not available for this property; calling this
  // always throws.
  virtual const std::string *GetValueString() const {
    UTIL_THROW2("TargetPreferencesPhraseProperty: value string not available in this phrase property");
    return NULL;
  }

protected:
  size_t m_nNTs;
  float m_totalCount;
  std::list<TargetPreferencesPhrasePropertyItem> m_labelItems;
};
} // namespace Moses

View File

@ -92,15 +92,17 @@ namespace Moses {
}
}
params= param.GetParam("output-factors");
if (params) factor_order = Scan<FactorType>(*params);
if (factor_order.empty()) factor_order.assign(1,0);
if (ReportAllFactors) {
for (size_t i = 1; i < MAX_NUM_FACTORS; ++i)
factor_order.clear();
for (size_t i = 0; i < MAX_NUM_FACTORS; ++i)
factor_order.push_back(i);
} else {
params= param.GetParam("output-factors");
if (params) factor_order = Scan<FactorType>(*params);
if (factor_order.empty()) factor_order.assign(1,0);
}
param.SetParameter(factor_delimiter, "factor-delimiter", std::string("|"));
param.SetParameter(factor_delimiter, "output-factor-delimiter", factor_delimiter);

@ -1 +1 @@
Subproject commit bbea49d71c5b9835d9a777a82085e57a33a0bcf6
Subproject commit 0f892797ae03b37f7bf4470b172de83736bce953

View File

@ -314,7 +314,7 @@ if (!$inverse && defined($partsOfSpeechFile))
# merge target syntactic preferences labels files
if (!$inverse && defined($targetSyntacticPreferencesLabelsFile))
{
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $targetSyntacticPreferencesLabelsFile";
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $targetSyntacticPreferencesLabelsFile";
print STDERR "Merging target syntactic preferences labels files: $cmd \n";
`$cmd`;
}

View File

@ -2378,7 +2378,7 @@ sub create_ini {
print INI "PhraseOrientationFeature";
# find the label of the left-hand side non-terminal in glue rules (target non-terminal set)
my $TOPLABEL = `head -n 1 $___GLUE_GRAMMAR_FILE`;
$TOPLABEL =~ s/.* \|\|\| .* \[(.*)\] \|\|\| .*/\1/;
$TOPLABEL =~ s/.* \|\|\| .* \[(.*)\] \|\|\| .*/$1/;
chomp($TOPLABEL);
print INI " glue-label=$TOPLABEL\n";
}

View File

@ -58,52 +58,24 @@ add_library(kenlm_util OBJECT ${KENLM_UTIL_DOUBLECONVERSION_SOURCE} ${KENLM_UTIL
# Only compile and run unit tests if tests should be run
if(BUILD_TESTING)
# NOTE(review): this block registers the unit tests in two ways -- a
# hand-written foreach() loop and the AddTests()/KenLMAddTest() helper calls
# further down. It reads like the old and new form of the same logic side by
# side; confirm that only one of them is meant to remain.
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
bit_packing_test
file_piece_test
joint_sort_test
multi_intersection_test
probing_hash_table_test
read_compressed_test
sorted_uniform_test
tokenize_piece_test
)
# NOTE(review): the list is assigned a second time below, this time without
# file_piece_test; the second set() replaces the value assigned above.
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
bit_packing_test
joint_sort_test
multi_intersection_test
probing_hash_table_test
read_compressed_test
sorted_uniform_test
tokenize_piece_test
)
# Iterate through the Boost tests list
foreach(test ${KENLM_BOOST_TESTS_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${test} ${test}.cc $<TARGET_OBJECTS:kenlm_util>)
# Require the following compile flag
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# file_piece_test requires an extra command line parameter
# NOTE(review): with the list as last assigned above, file_piece_test is not
# iterated here, so only the else() branch (empty test_params) is taken.
if ("${test}" STREQUAL "file_piece_test")
set(test_params
${CMAKE_CURRENT_SOURCE_DIR}/file_piece.cc
)
else()
set(test_params
)
endif()
# Specify command arguments for how to run each unit test
#
# Assuming that foo was defined via add_executable(foo ...),
# the syntax $<TARGET_FILE:foo> gives the full path to the executable.
#
# The CTest name is the target name plus an extra "_test" suffix
# (e.g. bit_packing_test_test).
add_test(NAME ${test}_test
COMMAND $<TARGET_FILE:${test}> ${test_params})
# Group unit tests together
set_target_properties(${test} PROPERTIES FOLDER "unit_tests")
# End for loop
endforeach(test)
# AddTests is not defined in this excerpt (presumably a project helper). It is
# handed the test list, the kenlm_util object files and the link libraries.
AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
DEPENDS $<TARGET_OBJECTS:kenlm_util>
LIBRARIES ${Boost_LIBRARIES} pthread)
# file_piece_test requires an extra command line parameter
# KenLMAddTest is likewise defined elsewhere; TEST_ARGS carries the path to
# file_piece.cc in the current source directory.
KenLMAddTest(TEST file_piece_test
DEPENDS $<TARGET_OBJECTS:kenlm_util>
LIBRARIES ${Boost_LIBRARIES} pthread
TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/file_piece.cc)
endif()

View File

@ -24,25 +24,23 @@ void Exception::SetLocation(const char *file, unsigned int line, const char *fun
* them down.
*/
std::string old_text;
std::swap(old_text, what_);
StringStream stream;
stream << what_;
stream << file << ':' << line;
if (func) stream << " in " << func << " threw ";
what_.swap(old_text);
what_ << file << ':' << line;
if (func) what_ << " in " << func << " threw ";
if (child_name) {
stream << child_name;
what_ << child_name;
} else {
#ifdef __GXX_RTTI
stream << typeid(this).name();
what_ << typeid(this).name();
#else
stream << "an exception";
what_ << "an exception";
#endif
}
if (condition) {
stream << " because `" << condition << '\'';
what_ << " because `" << condition << '\'';
}
stream << ".\n";
stream << old_text;
what_ << ".\n";
what_ << old_text;
}
namespace {

View File

@ -8,7 +8,7 @@
#include <string>
#include <stdint.h>
// TODO(hieu) delete this
// TODO(hieu): delete this
#include <sstream>
namespace util {
@ -20,7 +20,7 @@ class Exception : public std::exception {
Exception() throw();
virtual ~Exception() throw();
const char *what() const throw() { return what_.c_str(); }
const char *what() const throw() { return what_.str().c_str(); }
// For use by the UTIL_THROW macros.
void SetLocation(
@ -38,7 +38,7 @@ class Exception : public std::exception {
typedef T Identity;
};
std::string what_;
StringStream what_;
};
/* This implements the normal operator<< for Exception and all its children.
@ -46,12 +46,10 @@ class Exception : public std::exception {
* boost::enable_if.
*/
// Appends the streamed form of `data` to the exception's message (e.what_)
// and returns `e` so that further << calls can be chained.
// The return type is spelled through Except::ExceptionTag<Except&>::Identity.
template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
// TODO(hieu): change this to
// StringStream(e.what_) << data;
// TODO(hieu): delete this.
// `data` is first formatted into a temporary std::stringstream ...
std::stringstream moses_hack;
moses_hack << data;
// ... and the resulting text is then appended to the message.
// NOTE(review): both a `+=` append and a `<<` append are present here; they
// suit different types of what_ (std::string vs. a stream-like class) and
// look like the before/after lines of one change -- confirm which is current.
e.what_ += moses_hack.str();
e.what_ << moses_hack.str();
return e;
}

View File

@ -37,38 +37,14 @@ set(KENLM_UTIL_STREAM_SOURCE
if(BUILD_TESTING)
# NOTE(review): the tests are registered in two ways in this block -- by the
# foreach() loop and by the AddTests() call at the end -- and the test list is
# assigned twice with identical contents. It reads like the old and new form
# of the same logic side by side; confirm that only one is meant to remain.
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
io_test
sort_test
stream_test
)
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
io_test
sort_test
stream_test
)
# Iterate through the Boost tests list
foreach(test ${KENLM_BOOST_TESTS_LIST})
# Compile the executable, linking against the requisite dependent object files
add_executable(${test} ${test}.cc $<TARGET_OBJECTS:kenlm_util>)
# Require the following compile flag
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# Specify command arguments for how to run each unit test
#
# Assuming that foo was defined via add_executable(foo ...),
# the syntax $<TARGET_FILE:foo> gives the full path to the executable.
#
# The CTest name is the target name plus an extra "_test" suffix
# (e.g. io_test_test); the test runs the built executable without arguments.
add_test(NAME ${test}_test
COMMAND $<TARGET_FILE:${test}>)
# Group unit tests together
set_target_properties(${test} PROPERTIES FOLDER "unit_tests")
# End for loop
endforeach(test)
# AddTests is not defined in this excerpt (presumably a project helper). It is
# handed the test list, the kenlm_util object files and the link libraries.
AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
DEPENDS $<TARGET_OBJECTS:kenlm_util>
LIBRARIES ${Boost_LIBRARIES} pthread)
endif()

View File

@ -10,14 +10,8 @@ namespace util {
class StringStream : public FakeOStream<StringStream> {
public:
// Semantics: appends to string. Remember to clear first!
StringStream() {}
explicit StringStream()
{}
/*
explicit StringStream(std::string &out)
: out_(out) {}
*/
StringStream &flush() { return *this; }
StringStream &write(const void *data, std::size_t length) {
@ -25,12 +19,11 @@ class StringStream : public FakeOStream<StringStream> {
return *this;
}
const std::string &str() const
{ return out_; }
void str(const std::string &val)
{
out_ = val;
}
const std::string &str() const { return out_; }
void str(const std::string &val) { out_ = val; }
void swap(std::string &str) { std::swap(out_, str); }
protected:
friend class FakeOStream<StringStream>;