mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-21 08:07:14 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
23556697c4
2
Jamroot
2
Jamroot
@ -89,7 +89,7 @@ if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_min
|
||||
requirements += <library>tcmalloc_and_profiler <library>unwind <cflags>-fno-omit-frame-pointer <cxxflags>-fno-omit-frame-pointer ;
|
||||
} else {
|
||||
external-lib tcmalloc_minimal ;
|
||||
requirements += <threading>multi:<library>$(tcmalloc_minimal) ;
|
||||
requirements += <threading>multi:<library>tcmalloc_minimal ;
|
||||
}
|
||||
} else {
|
||||
echo "Tip: install tcmalloc for faster threading. See BUILD-INSTRUCTIONS.txt for more information." ;
|
||||
|
@ -11,12 +11,12 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -72,13 +72,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -65,6 +65,11 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>RuleExtractionOptions.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/RuleExtractionOptions.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>SentenceAlignment.cpp</name>
|
||||
<type>1</type>
|
||||
|
@ -11,11 +11,11 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -64,11 +64,11 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -5,13 +5,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.162355801" moduleId="org.eclipse.cdt.core.settings" name="Debug">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -70,7 +70,6 @@
|
||||
<listOptionValue builtIn="false" value="irstlm"/>
|
||||
<listOptionValue builtIn="false" value="dstruct"/>
|
||||
<listOptionValue builtIn="false" value="dalm"/>
|
||||
<listOptionValue builtIn="false" value="MurmurHash3"/>
|
||||
<listOptionValue builtIn="false" value="flm"/>
|
||||
<listOptionValue builtIn="false" value="oolm"/>
|
||||
<listOptionValue builtIn="false" value="lattice"/>
|
||||
@ -108,13 +107,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.516628324" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -5,13 +5,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.461114338" moduleId="org.eclipse.cdt.core.settings" name="Debug">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -71,7 +71,6 @@
|
||||
<listOptionValue builtIn="false" value="lattice"/>
|
||||
<listOptionValue builtIn="false" value="misc"/>
|
||||
<listOptionValue builtIn="false" value="dalm"/>
|
||||
<listOptionValue builtIn="false" value="MurmurHash3"/>
|
||||
<listOptionValue builtIn="false" value="search"/>
|
||||
<listOptionValue builtIn="false" value="RandLM"/>
|
||||
<listOptionValue builtIn="false" value="OnDiskPt"/>
|
||||
@ -109,13 +108,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.2121690436" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -11,12 +11,12 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -88,13 +88,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.401150096" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -181,10 +181,8 @@ FeatureStats::FeatureStats(const size_t size)
|
||||
|
||||
FeatureStats::~FeatureStats()
|
||||
{
|
||||
if (m_array) {
|
||||
delete [] m_array;
|
||||
m_array = NULL;
|
||||
}
|
||||
delete [] m_array;
|
||||
m_array = NULL;
|
||||
}
|
||||
|
||||
void FeatureStats::Copy(const FeatureStats &stats)
|
||||
|
@ -35,10 +35,8 @@ ScoreStats::ScoreStats(const size_t size)
|
||||
|
||||
ScoreStats::~ScoreStats()
|
||||
{
|
||||
if (m_array) {
|
||||
delete [] m_array;
|
||||
m_array = NULL;
|
||||
}
|
||||
delete [] m_array;
|
||||
m_array = NULL;
|
||||
}
|
||||
|
||||
void ScoreStats::Copy(const ScoreStats &stats)
|
||||
@ -157,4 +155,4 @@ bool operator==(const ScoreStats& s1, const ScoreStats& s2)
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -21,10 +21,8 @@ public:
|
||||
}
|
||||
|
||||
static void Delete() {
|
||||
if (m_instance) {
|
||||
delete m_instance;
|
||||
m_instance = NULL;
|
||||
}
|
||||
delete m_instance;
|
||||
m_instance = NULL;
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -50,7 +50,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "moses/FeatureVector.h"
|
||||
#include "moses/FF/StatefulFeatureFunction.h"
|
||||
#include "moses/FF/StatelessFeatureFunction.h"
|
||||
#include "moses/FF/SyntaxConstraintFeature.h"
|
||||
#include "moses/FF/TreeStructureFeature.h"
|
||||
#include "util/exception.hh"
|
||||
|
||||
using namespace std;
|
||||
@ -395,14 +395,16 @@ void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
|
||||
UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
|
||||
"No output file for tree fragments specified");
|
||||
|
||||
//Tree of full sentence (to stderr)
|
||||
const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
|
||||
for( size_t i=0; i<sff.size(); i++ ) {
|
||||
const StatefulFeatureFunction *ff = sff[i];
|
||||
if (ff->GetScoreProducerDescription() == "SyntaxConstraintFeature0") {
|
||||
const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
|
||||
out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
|
||||
break;
|
||||
//Tree of full sentence
|
||||
const StatefulFeatureFunction* treeStructure = StaticData::Instance().GetTreeStructure();
|
||||
if (treeStructure != NULL) {
|
||||
const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
|
||||
for( size_t i=0; i<sff.size(); i++ ) {
|
||||
if (sff[i] == treeStructure) {
|
||||
const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
|
||||
out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -97,7 +97,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
|
||||
|
||||
targetPhrase->SetTargetLHS(targetLHS);
|
||||
targetPhrase->SetAlignmentInfo("0-0");
|
||||
if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
|
||||
if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.GetTreeStructure() != NULL) {
|
||||
targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
|
||||
}
|
||||
|
||||
|
@ -34,7 +34,7 @@
|
||||
#include "moses/FF/ExternalFeature.h"
|
||||
#include "moses/FF/ConstrainedDecoding.h"
|
||||
#include "moses/FF/CoveredReferenceFeature.h"
|
||||
#include "moses/FF/SyntaxConstraintFeature.h"
|
||||
#include "moses/FF/TreeStructureFeature.h"
|
||||
#include "moses/FF/SoftMatchingFeature.h"
|
||||
#include "moses/FF/HyperParameterAsWeight.h"
|
||||
|
||||
@ -174,7 +174,7 @@ FeatureRegistry::FeatureRegistry()
|
||||
MOSES_FNAME(ConstrainedDecoding);
|
||||
MOSES_FNAME(CoveredReferenceFeature);
|
||||
MOSES_FNAME(ExternalFeature);
|
||||
MOSES_FNAME(SyntaxConstraintFeature);
|
||||
MOSES_FNAME(TreeStructureFeature);
|
||||
MOSES_FNAME(SoftMatchingFeature);
|
||||
MOSES_FNAME(HyperParameterAsWeight);
|
||||
|
||||
|
@ -52,8 +52,7 @@ LexicalReordering::LexicalReordering(const std::string &line)
|
||||
|
||||
LexicalReordering::~LexicalReordering()
|
||||
{
|
||||
if(m_table)
|
||||
delete m_table;
|
||||
delete m_table;
|
||||
delete m_configuration;
|
||||
}
|
||||
|
||||
|
@ -1,186 +0,0 @@
|
||||
#include "SyntaxConstraintFeature.h"
|
||||
#include "moses/ScoreComponentCollection.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/ChartHypothesis.h"
|
||||
#include "moses/TargetPhrase.h"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
InternalTree::InternalTree(const std::string & line, const bool terminal) {
|
||||
|
||||
size_t found = line.find_first_of("[] ");
|
||||
m_isTerminal = terminal;
|
||||
|
||||
if (found == line.npos) {
|
||||
m_value = line;
|
||||
}
|
||||
|
||||
else {
|
||||
AddSubTree(line, 0);
|
||||
}
|
||||
}
|
||||
|
||||
size_t InternalTree::AddSubTree(const std::string & line, size_t pos) {
|
||||
|
||||
std::string value = "";
|
||||
char token = 0;
|
||||
|
||||
while (token != ']' && pos != std::string::npos)
|
||||
{
|
||||
size_t oldpos = pos;
|
||||
pos = line.find_first_of("[] ", pos);
|
||||
if (pos == std::string::npos) break;
|
||||
token = line[pos];
|
||||
value = line.substr(oldpos,pos-oldpos);
|
||||
|
||||
if (token == '[') {
|
||||
if (m_value.size() > 0) {
|
||||
TreePointer child(new InternalTree(value, false));
|
||||
m_children.push_back(child);
|
||||
pos = child->AddSubTree(line, pos+1);
|
||||
}
|
||||
else {
|
||||
if (value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
pos = AddSubTree(line, pos+1);
|
||||
}
|
||||
}
|
||||
else if (token == ' ' || token == ']') {
|
||||
if (value.size() > 0 && ! m_value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
else if (value.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
TreePointer child(new InternalTree(value, true));
|
||||
m_children.push_back(child);
|
||||
}
|
||||
if (token == ' ') {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
if (m_children.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (pos == std::string::npos) {
|
||||
return line.size();
|
||||
}
|
||||
return min(line.size(),pos+1);
|
||||
|
||||
}
|
||||
|
||||
std::string InternalTree::GetString() const {
|
||||
|
||||
std::string ret = " ";
|
||||
|
||||
if (!m_isTerminal) {
|
||||
ret += "[";
|
||||
}
|
||||
|
||||
ret += m_value;
|
||||
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it)
|
||||
{
|
||||
ret += (*it)->GetString();
|
||||
}
|
||||
|
||||
if (!m_isTerminal) {
|
||||
ret += "]";
|
||||
}
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
void InternalTree::Combine(const std::vector<TreePointer> &previous) {
|
||||
|
||||
std::vector<TreePointer>::iterator it;
|
||||
bool found = false;
|
||||
leafNT next_leafNT(this);
|
||||
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
|
||||
found = next_leafNT(it);
|
||||
if (found) {
|
||||
*it = *it_prev;
|
||||
}
|
||||
else {
|
||||
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
FFState* SyntaxConstraintFeature::EvaluateChart(const ChartHypothesis& cur_hypo
|
||||
, int featureID /* used to index the state in the previous hypotheses */
|
||||
, ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
std::string tree;
|
||||
bool found = 0;
|
||||
cur_hypo.GetCurrTargetPhrase().GetProperty("Tree", tree, found);
|
||||
|
||||
TreePointer mytree (new InternalTree(tree));
|
||||
|
||||
//get subtrees (in target order)
|
||||
std::vector<TreePointer> previous_trees;
|
||||
for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
|
||||
const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
|
||||
if (word.IsNonTerminal()) {
|
||||
size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
|
||||
const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermInd);
|
||||
const TreeState* prev = dynamic_cast<const TreeState*>(prevHypo->GetFFState(featureID));
|
||||
const TreePointer prev_tree = prev->GetTree();
|
||||
previous_trees.push_back(prev_tree);
|
||||
}
|
||||
}
|
||||
|
||||
mytree->Combine(previous_trees);
|
||||
|
||||
|
||||
return new TreeState(mytree);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
315
moses/FF/TreeStructureFeature.cpp
Normal file
315
moses/FF/TreeStructureFeature.cpp
Normal file
@ -0,0 +1,315 @@
|
||||
#include "TreeStructureFeature.h"
|
||||
#include "moses/StaticData.h"
|
||||
#include "moses/ScoreComponentCollection.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/ChartHypothesis.h"
|
||||
#include "moses/TargetPhrase.h"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// Constructs a tree node from its textual form.
//   line     - either a single bare label (no '[', ']' or ' ' characters) or
//              bracketed tree text that is handed to AddSubTree().
//   terminal - initial value of the node's terminal flag.
// The numeric nonterminal label starts out as 0.
InternalTree::InternalTree(const std::string & line, const bool terminal):
  m_value_nt(0),
  m_isTerminal(terminal)
{
  const bool isBareLabel = (line.find_first_of("[] ") == std::string::npos);

  if (isBareLabel) {
    // Nothing to parse: the whole line is this node's label.
    m_value = line;
    return;
  }

  // Structured input: parse it from the first character.
  AddSubTree(line, 0);
}
|
||||
|
||||
size_t InternalTree::AddSubTree(const std::string & line, size_t pos) {
|
||||
|
||||
std::string value = "";
|
||||
char token = 0;
|
||||
|
||||
while (token != ']' && pos != std::string::npos)
|
||||
{
|
||||
size_t oldpos = pos;
|
||||
pos = line.find_first_of("[] ", pos);
|
||||
if (pos == std::string::npos) break;
|
||||
token = line[pos];
|
||||
value = line.substr(oldpos,pos-oldpos);
|
||||
|
||||
if (token == '[') {
|
||||
if (m_value.size() > 0) {
|
||||
TreePointer child(new InternalTree(value, false));
|
||||
m_children.push_back(child);
|
||||
pos = child->AddSubTree(line, pos+1);
|
||||
}
|
||||
else {
|
||||
if (value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
pos = AddSubTree(line, pos+1);
|
||||
}
|
||||
}
|
||||
else if (token == ' ' || token == ']') {
|
||||
if (value.size() > 0 && ! m_value.size() > 0) {
|
||||
m_value = value;
|
||||
}
|
||||
else if (value.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
TreePointer child(new InternalTree(value, true));
|
||||
m_children.push_back(child);
|
||||
}
|
||||
if (token == ' ') {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
if (m_children.size() > 0) {
|
||||
m_isTerminal = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (pos == std::string::npos) {
|
||||
return line.size();
|
||||
}
|
||||
return min(line.size(),pos+1);
|
||||
|
||||
}
|
||||
|
||||
std::string InternalTree::GetString() const {
|
||||
|
||||
std::string ret = " ";
|
||||
|
||||
if (!m_isTerminal) {
|
||||
ret += "[";
|
||||
}
|
||||
|
||||
ret += m_value;
|
||||
for (std::vector<TreePointer>::const_iterator it = m_children.begin(); it != m_children.end(); ++it)
|
||||
{
|
||||
ret += (*it)->GetString();
|
||||
}
|
||||
|
||||
if (!m_isTerminal) {
|
||||
ret += "]";
|
||||
}
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
void InternalTree::Combine(const std::vector<TreePointer> &previous) {
|
||||
|
||||
std::vector<TreePointer>::iterator it;
|
||||
bool found = false;
|
||||
leafNT next_leafNT(this);
|
||||
for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
|
||||
found = next_leafNT(it);
|
||||
if (found) {
|
||||
*it = *it_prev;
|
||||
}
|
||||
else {
|
||||
std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Looks for a direct child whose string label equals `label`.
// On success returns true with `it` pointing at the first such child;
// otherwise returns false with `it` equal to m_children.end().
bool InternalTree::FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const {
  it = m_children.begin();
  while (it != m_children.end()) {
    if ((*it)->GetLabel() == label) {
      return true;
    }
    ++it;
  }
  return false;
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetLabel() == label) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Looks for a direct child whose numeric nonterminal label equals `label`.
// On success returns true with `it` pointing at the first such child;
// otherwise returns false with `it` equal to m_children.end().
bool InternalTree::FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const {
  it = m_children.begin();
  while (it != m_children.end()) {
    if ((*it)->GetNTLabel() == label) {
      return true;
    }
    ++it;
  }
  return false;
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetNTLabel() == label) {
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if ((*it)->GetNTLabel() == label) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(label, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool InternalTree::FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(labels, it2)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool InternalTree::RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const {
|
||||
for (it = m_children.begin(); it != m_children.end(); ++it) {
|
||||
if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) {
|
||||
parent = this;
|
||||
return true;
|
||||
}
|
||||
std::vector<TreePointer>::const_iterator it2;
|
||||
if ((*it)->RecursiveSearch(labels, it2, parent)) {
|
||||
it = it2;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void TreeStructureFeature::Load() {
|
||||
|
||||
// syntactic constraints can be hooked in here.
|
||||
m_constraints = NULL;
|
||||
m_labelset = NULL;
|
||||
|
||||
StaticData &staticData = StaticData::InstanceNonConst();
|
||||
staticData.SetTreeStructure(this);
|
||||
}
|
||||
|
||||
|
||||
// define NT labels (ints) that are mapped from strings for quicker comparison.
|
||||
void TreeStructureFeature::AddNTLabels(TreePointer root) const {
|
||||
std::string label = root->GetLabel();
|
||||
|
||||
if (root->IsTerminal()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::map<std::string, NTLabel>::const_iterator it = m_labelset->string_to_label.find(label);
|
||||
if (it != m_labelset->string_to_label.end()) {
|
||||
root->SetNTLabel(it->second);
|
||||
}
|
||||
|
||||
std::vector<TreePointer> children = root->GetChildren();
|
||||
for (std::vector<TreePointer>::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) {
|
||||
AddNTLabels(*it2);
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the tree state for a chart hypothesis.
// Reads the "Tree" property of the hypothesis' current target phrase, parses
// it into an InternalTree, collects the trees stored in the states of the
// previous hypotheses for each nonterminal (in target order), optionally
// obtains sparse feature names from m_constraints, and splices the collected
// trees into the new tree with Combine().
//   cur_hypo    - hypothesis being scored.
//   featureID   - index of this feature's state in the previous hypotheses.
//   accumulator - receives +1 for every sparse feature name produced.
// Returns a newly allocated TreeState holding the combined tree.
// Throws (via UTIL_THROW2) when the target phrase has no "Tree" property or
// when a previous hypothesis' state at featureID is not a TreeState.
FFState* TreeStructureFeature::EvaluateChart(const ChartHypothesis& cur_hypo
                                   , int featureID /* used to index the state in the previous hypotheses */
                                   , ScoreComponentCollection* accumulator) const
{
  const TargetPhrase &targetPhrase = cur_hypo.GetCurrTargetPhrase();

  std::string tree;
  bool found = false;
  targetPhrase.GetProperty("Tree", tree, found);
  if (!found) {
    UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
  }

  TreePointer mytree (new InternalTree(tree));

  if (m_labelset) {
    AddNTLabels(mytree);
  }

  //get subtrees (in target order)
  std::vector<TreePointer> previous_trees;
  for (size_t pos = 0; pos < targetPhrase.GetSize(); ++pos) {
    const Word &word = targetPhrase.GetWord(pos);
    if (!word.IsNonTerminal()) {
      continue;
    }
    size_t nonTermInd = targetPhrase.GetAlignNonTerm().GetNonTermIndexMap()[pos];
    const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermInd);
    const TreeState* prev = dynamic_cast<const TreeState*>(prevHypo->GetFFState(featureID));
    // dynamic_cast yields NULL when the stored state is of another type;
    // fail loudly instead of dereferencing a null pointer.
    if (prev == NULL) {
      UTIL_THROW2("Error: TreeStructureFeature expected a TreeState in the previous hypothesis");
    }
    previous_trees.push_back(prev->GetTree());
  }

  // The constraints must see the tree before the subtrees are spliced in.
  std::vector<std::string> sparse_features;
  if (m_constraints) {
    sparse_features = m_constraints->SyntacticRules(mytree, previous_trees);
  }
  mytree->Combine(previous_trees);

  //sparse scores
  for (std::vector<std::string>::const_iterator feature=sparse_features.begin(); feature != sparse_features.end(); ++feature) {
    accumulator->PlusEquals(this, *feature, 1);
  }
  return new TreeState(mytree);
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include "StatefulFeatureFunction.h"
|
||||
#include "FFState.h"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
@ -12,14 +13,25 @@ namespace Moses
|
||||
|
||||
class InternalTree;
|
||||
typedef boost::shared_ptr<InternalTree> TreePointer;
|
||||
typedef int NTLabel;
|
||||
|
||||
class InternalTree
|
||||
{
|
||||
std::string m_value;
|
||||
NTLabel m_value_nt;
|
||||
std::vector<TreePointer> m_children;
|
||||
bool m_isTerminal;
|
||||
public:
|
||||
InternalTree(const std::string & line, const bool terminal = false);
|
||||
InternalTree(const InternalTree & tree):
|
||||
m_value(tree.m_value),
|
||||
m_isTerminal(tree.m_isTerminal) {
|
||||
const std::vector<TreePointer> & children = tree.m_children;
|
||||
for (std::vector<TreePointer>::const_iterator it = children.begin(); it != children.end(); it++) {
|
||||
TreePointer child (new InternalTree(**it));
|
||||
m_children.push_back(child);
|
||||
}
|
||||
}
|
||||
size_t AddSubTree(const std::string & line, size_t start);
|
||||
|
||||
std::string GetString() const;
|
||||
@ -27,6 +39,17 @@ public:
|
||||
const std::string & GetLabel() const {
|
||||
return m_value;
|
||||
}
|
||||
|
||||
// optionally identify label by int instead of string;
|
||||
// allows abstraction if multiple nonterminal strings should map to same label.
|
||||
const NTLabel & GetNTLabel() const {
|
||||
return m_value_nt;
|
||||
}
|
||||
|
||||
void SetNTLabel(NTLabel value) {
|
||||
m_value_nt = value;
|
||||
}
|
||||
|
||||
size_t GetLength() const {
|
||||
return m_children.size();
|
||||
}
|
||||
@ -45,6 +68,8 @@ public:
|
||||
return (!m_isTerminal && m_children.size() == 0);
|
||||
}
|
||||
|
||||
// different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents.
|
||||
// can be used for formulating syntax constraints.
|
||||
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
@ -53,6 +78,41 @@ public:
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const std::string & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
|
||||
// use NTLabel for search to reduce number of string comparisons / deal with synonymous labels
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it) const;
|
||||
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const NTLabel & label, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
|
||||
// pass vector of possible labels to search
|
||||
// if found, 'it' is iterator to first tree node that matches search string
|
||||
bool FlatSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
|
||||
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it) const;
|
||||
|
||||
// if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node
|
||||
bool RecursiveSearch(const std::vector<NTLabel> & labels, std::vector<TreePointer>::const_iterator & it, InternalTree const* &parent) const;
|
||||
|
||||
|
||||
};
|
||||
|
||||
// mapping from string nonterminal label to int representation.
|
||||
// allows abstraction if multiple nonterminal strings should map to same label.
|
||||
struct LabelSet
|
||||
{
|
||||
public:
|
||||
std::map<std::string, NTLabel> string_to_label;
|
||||
};
|
||||
|
||||
|
||||
// class to implement language-specific syntactic constraints.
|
||||
// the method SyntacticRules must return a vector of strings (each identifying a constraint violation), which are then made into sparse features.
|
||||
class SyntaxConstraints
|
||||
{
|
||||
public:
|
||||
virtual std::vector<std::string> SyntacticRules(TreePointer root, const std::vector<TreePointer> &previous) = 0;
|
||||
virtual ~SyntaxConstraints() {};
|
||||
};
|
||||
|
||||
|
||||
@ -71,18 +131,23 @@ public:
|
||||
int Compare(const FFState& other) const {return 0;};
|
||||
};
|
||||
|
||||
class SyntaxConstraintFeature : public StatefulFeatureFunction
|
||||
class TreeStructureFeature : public StatefulFeatureFunction
|
||||
{
|
||||
SyntaxConstraints* m_constraints;
|
||||
LabelSet* m_labelset;
|
||||
public:
|
||||
SyntaxConstraintFeature(const std::string &line)
|
||||
TreeStructureFeature(const std::string &line)
|
||||
:StatefulFeatureFunction(0, line) {
|
||||
ReadParameters();
|
||||
}
|
||||
~TreeStructureFeature() {delete m_constraints;};
|
||||
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
|
||||
return new TreeState(TreePointer());
|
||||
}
|
||||
|
||||
void AddNTLabels(TreePointer root) const;
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
@ -105,6 +170,7 @@ public:
|
||||
int /* featureID - used to index the state in the previous hypotheses */,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
void Load();
|
||||
};
|
||||
|
||||
// Python-like generator that yields next nonterminal leaf on every call
|
@ -9,8 +9,8 @@
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/InputFileStream.h"
|
||||
#include "util/exception.hh"
|
||||
#include "ChartState.h"
|
||||
#include "util/exception.hh"
|
||||
#include "moses/ChartHypothesis.h"
|
||||
#include "moses/ChartManager.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -58,6 +58,16 @@ public:
|
||||
delete state;
|
||||
}
|
||||
|
||||
void reset(const DALMState &from){
|
||||
delete state;
|
||||
state = new DALM::State(*from.state);
|
||||
}
|
||||
|
||||
// Replace the held DALM state with 's'. The previously held state is
// deleted, and 's' is stored and will itself be deleted by a later reset.
void reset(DALM::State *s){
  // Re-installing the pointer we already hold must not delete it.
  if (s == state) return;
  delete state;
  state = s;
}
|
||||
|
||||
virtual int Compare(const FFState& other) const{
|
||||
const DALMState &o = static_cast<const DALMState &>(other);
|
||||
if(state->get_count() < o.state->get_count()) return -1;
|
||||
@ -74,6 +84,83 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
class DALMChartState : public FFState
|
||||
{
|
||||
private:
|
||||
const ChartHypothesis &hypo;
|
||||
DALM::Fragment *prefixFragments;
|
||||
unsigned short prefixLength;
|
||||
float prefixScore;
|
||||
DALMState *rightContext;
|
||||
bool isLarge;
|
||||
|
||||
public:
|
||||
DALMChartState(
|
||||
const ChartHypothesis &hypo,
|
||||
DALM::Fragment *prefixFragments,
|
||||
unsigned short prefixLength,
|
||||
float prefixScore,
|
||||
DALMState *rightContext,
|
||||
bool isLarge)
|
||||
: hypo(hypo),
|
||||
prefixFragments(prefixFragments),
|
||||
prefixLength(prefixLength),
|
||||
prefixScore(prefixScore),
|
||||
rightContext(rightContext),
|
||||
isLarge(isLarge)
|
||||
{}
|
||||
|
||||
virtual ~DALMChartState(){
|
||||
delete [] prefixFragments;
|
||||
delete rightContext;
|
||||
}
|
||||
|
||||
unsigned short GetPrefixLength() const{
|
||||
return prefixLength;
|
||||
}
|
||||
|
||||
const DALM::Fragment *GetPrefixFragments() const{
|
||||
return prefixFragments;
|
||||
}
|
||||
|
||||
float GetPrefixScore() const{
|
||||
return prefixScore;
|
||||
}
|
||||
|
||||
const DALMState *GetRightContext() const{
|
||||
return rightContext;
|
||||
}
|
||||
|
||||
bool LargeEnough() const{
|
||||
return isLarge;
|
||||
}
|
||||
|
||||
virtual int Compare(const FFState& other) const{
|
||||
const DALMChartState &o = static_cast<const DALMChartState &>(other);
|
||||
// prefix
|
||||
if (hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
|
||||
if (prefixLength != o.prefixLength){
|
||||
return (prefixLength < o.prefixLength)?-1:1;
|
||||
} else {
|
||||
if(prefixLength > 0){
|
||||
DALM::Fragment &f = prefixFragments[prefixLength-1];
|
||||
DALM::Fragment &of = o.prefixFragments[prefixLength-1];
|
||||
int ret = DALM::compare_fragments(f, of);
|
||||
if(ret != 0) return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// suffix
|
||||
size_t inputSize = hypo.GetManager().GetSource().GetSize();
|
||||
if (hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
|
||||
int ret = o.rightContext->Compare(*rightContext);
|
||||
if (ret != 0) return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
LanguageModelDALM::LanguageModelDALM(const std::string &line)
|
||||
:LanguageModel(line)
|
||||
{
|
||||
@ -96,7 +183,7 @@ void LanguageModelDALM::Load()
|
||||
/////////////////////
|
||||
// READING INIFILE //
|
||||
/////////////////////
|
||||
string inifile= m_filePath + "/dalm.ini";
|
||||
string inifile= m_filePath + "/dalm.ini";
|
||||
|
||||
string model; // Path to the double-array file.
|
||||
string words; // Path to the vocabulary file.
|
||||
@ -104,8 +191,8 @@ void LanguageModelDALM::Load()
|
||||
read_ini(inifile.c_str(), model, words, wordstxt);
|
||||
|
||||
model = m_filePath + "/" + model;
|
||||
words = m_filePath + "/" + words;
|
||||
wordstxt = m_filePath + "/" + wordstxt;
|
||||
words = m_filePath + "/" + words;
|
||||
wordstxt = m_filePath + "/" + wordstxt;
|
||||
|
||||
UTIL_THROW_IF(model.empty() || words.empty() || wordstxt.empty(),
|
||||
util::FileOpenException,
|
||||
@ -150,60 +237,40 @@ void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float
|
||||
size_t phraseSize = phrase.GetSize();
|
||||
if (!phraseSize) return;
|
||||
|
||||
DALMState *dalm_state = new DALMState(m_nGramOrder);
|
||||
|
||||
size_t currPos = 0;
|
||||
size_t hist_count = 0;
|
||||
DALMState *dalm_state = new DALMState(m_nGramOrder);
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
|
||||
if(phrase.GetWord(0).GetFactor(m_factorType) == m_beginSentenceFactor){
|
||||
m_lm->init_state(*state);
|
||||
currPos++;
|
||||
hist_count++;
|
||||
}
|
||||
|
||||
while (currPos < phraseSize) {
|
||||
const Word &word = phrase.GetWord(currPos);
|
||||
hist_count++;
|
||||
|
||||
if (word.IsNonTerminal()) {
|
||||
// do nothing. reset ngram. needed to score target phrases during pt loading in chart decoding
|
||||
dalm_state->refresh();
|
||||
state->refresh();
|
||||
hist_count = 0;
|
||||
} else {
|
||||
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
|
||||
// do nothing, don't include prob for <s> unigram
|
||||
if (currPos != 0) {
|
||||
UTIL_THROW2("Either your data contains <s> in a position other than the first word or your language model is missing <s>. Did you build your ARPA using IRSTLM and forget to run add-start-end.sh?");
|
||||
}
|
||||
m_lm->init_state(*dalm_state->get_state());
|
||||
} else {
|
||||
LMResult result = GetValue(word, dalm_state->get_state());
|
||||
fullScore += result.score;
|
||||
if (hist_count >= m_nGramOrder) ngramScore += result.score;
|
||||
if (result.unknown) ++oovCount;
|
||||
}
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
float score = m_lm->query(wid, *state);
|
||||
fullScore += score;
|
||||
if (hist_count >= m_nGramOrder) ngramScore += score;
|
||||
if (wid==m_vocab->unk()) ++oovCount;
|
||||
}
|
||||
|
||||
currPos++;
|
||||
}
|
||||
|
||||
fullScore = TransformLMScore(fullScore);
|
||||
ngramScore = TransformLMScore(ngramScore);
|
||||
delete dalm_state;
|
||||
}
|
||||
|
||||
// Query the language model for vocabulary id 'wid' in the context
// 'finalState'. The query is passed *finalState by reference.
// The result carries the transformed score and whether 'wid' is the
// vocabulary's unknown-word id.
LMResult LanguageModelDALM::GetValue(DALM::VocabId wid, DALM::State* finalState) const{
  LMResult result;

  // last word is unk?
  result.unknown = (m_vocab->unk() == wid);

  // calc score.
  result.score = TransformLMScore(m_lm->query(wid, *finalState));

  return result;
}
|
||||
|
||||
// Convenience overload: maps the word's factor (m_factorType) to a DALM
// vocabulary id and delegates to the VocabId overload.
LMResult LanguageModelDALM::GetValue(const Word &word, DALM::State* finalState) const
{
  return GetValue(GetVocabId(word.GetFactor(m_factorType)), finalState);
}
|
||||
|
||||
FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const{
|
||||
// In this function, we only compute the LM scores of n-grams that overlap a
|
||||
// phrase boundary. Phrase-internal scores are taken directly from the
|
||||
@ -222,28 +289,28 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
|
||||
const std::size_t adjust_end = std::min(end, begin + m_nGramOrder - 1);
|
||||
|
||||
DALMState *dalm_state = new DALMState(*dalm_ps);
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
|
||||
std::size_t position = begin;
|
||||
float score = 0.0;
|
||||
for(; position < adjust_end; position++){
|
||||
score += GetValue(hypo.GetWord(position), dalm_state->get_state()).score;
|
||||
for(std::size_t position=begin; position < adjust_end; position++){
|
||||
score += m_lm->query(GetVocabId(hypo.GetWord(position).GetFactor(m_factorType)), *state);
|
||||
}
|
||||
|
||||
if (hypo.IsSourceCompleted()) {
|
||||
// Score end of sentence.
|
||||
std::vector<DALM::VocabId> indices(m_nGramOrder-1);
|
||||
const DALM::VocabId *last = LastIDs(hypo, &indices.front());
|
||||
m_lm->set_state(&indices.front(), (last-&indices.front()), *dalm_state->get_state());
|
||||
m_lm->set_state(&indices.front(), (last-&indices.front()), *state);
|
||||
|
||||
float s = GetValue(wid_end, dalm_state->get_state()).score;
|
||||
score += s;
|
||||
score += m_lm->query(wid_end, *state);
|
||||
} else if (adjust_end < end) {
|
||||
// Get state after adding a long phrase.
|
||||
std::vector<DALM::VocabId> indices(m_nGramOrder-1);
|
||||
const DALM::VocabId *last = LastIDs(hypo, &indices.front());
|
||||
m_lm->set_state(&indices.front(), (last-&indices.front()), *dalm_state->get_state());
|
||||
m_lm->set_state(&indices.front(), (last-&indices.front()), *state);
|
||||
}
|
||||
|
||||
score = TransformLMScore(score);
|
||||
if (OOVFeatureEnabled()) {
|
||||
std::vector<float> scores(2);
|
||||
scores[0] = score;
|
||||
@ -257,129 +324,184 @@ FFState *LanguageModelDALM::Evaluate(const Hypothesis &hypo, const FFState *ps,
|
||||
}
|
||||
|
||||
FFState *LanguageModelDALM::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const{
|
||||
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, m_nGramOrder);
|
||||
// initialize language model context state
|
||||
DALMState *dalm_state = new DALMState(m_nGramOrder);
|
||||
DALM::State *state = dalm_state->get_state();
|
||||
|
||||
size_t contextSize = m_nGramOrder-1;
|
||||
DALM::Fragment *prefixFragments = new DALM::Fragment[contextSize];
|
||||
unsigned short prefixLength = 0;
|
||||
bool isLarge = false;
|
||||
|
||||
// initial language model scores
|
||||
float prefixScore = 0.0; // not yet final for initial words (lack context)
|
||||
float finalizedScore = 0.0; // finalized, has sufficient context
|
||||
float hypoScore = 0.0; // total hypothesis score.
|
||||
|
||||
const TargetPhrase &targetPhrase = hypo.GetCurrTargetPhrase();
|
||||
size_t hypoSize = targetPhrase.GetSize();
|
||||
|
||||
// get index map for underlying hypotheses
|
||||
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
|
||||
hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
|
||||
targetPhrase.GetAlignNonTerm().GetNonTermIndexMap();
|
||||
|
||||
size_t phrasePos = 0;
|
||||
|
||||
// beginning of sentence.
|
||||
if(hypoSize > 0){
|
||||
const Word &word = targetPhrase.GetWord(0);
|
||||
if(!word.IsNonTerminal()){
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
if(word.GetFactor(m_factorType) == m_beginSentenceFactor){
|
||||
m_lm->init_state(*state);
|
||||
// state is finalized.
|
||||
isLarge = true;
|
||||
}else{
|
||||
if(isLarge){
|
||||
float score = m_lm->query(wid, *state);
|
||||
hypoScore += score;
|
||||
}else{
|
||||
float score = m_lm->query(wid, *state, prefixFragments[prefixLength]);
|
||||
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
}
|
||||
}else{
|
||||
// special case: rule starts with non-terminal -> copy everything
|
||||
size_t nonTermIndex = nonTermIndexMap[0];
|
||||
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
|
||||
|
||||
const DALMChartState* prevState =
|
||||
static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
|
||||
|
||||
// get prefixScore and hypoScore
|
||||
prefixScore = prevState->GetPrefixScore();
|
||||
hypoScore = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
|
||||
|
||||
// get language model state
|
||||
dalm_state->reset(*prevState->GetRightContext());
|
||||
state = dalm_state->get_state();
|
||||
|
||||
prefixLength = prevState->GetPrefixLength();
|
||||
const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
|
||||
std::memcpy(prefixFragments, prevPrefixFragments, sizeof(DALM::Fragment)*prefixLength);
|
||||
isLarge = prevState->LargeEnough();
|
||||
}
|
||||
phrasePos++;
|
||||
}
|
||||
|
||||
// loop over rule
|
||||
for (size_t phrasePos = 0, wordPos = 0;
|
||||
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
|
||||
phrasePos++) {
|
||||
for (; phrasePos < hypoSize; phrasePos++) {
|
||||
// consult rule for either word or non-terminal
|
||||
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
|
||||
const Word &word = targetPhrase.GetWord(phrasePos);
|
||||
|
||||
// regular word
|
||||
if (!word.IsNonTerminal()) {
|
||||
// beginning of sentence symbol <s>? -> just update state
|
||||
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
|
||||
UTIL_THROW_IF2(phrasePos != 0,
|
||||
"Sentence start symbol must be at the beginning of sentence");
|
||||
m_lm->init_state(*dalm_state->get_state());
|
||||
}
|
||||
// score a regular word added by the rule
|
||||
else {
|
||||
updateChartScore( &prefixScore, &finalizedScore, GetValue(word, dalm_state->get_state()).score, ++wordPos );
|
||||
}
|
||||
DALM::VocabId wid = GetVocabId(word.GetFactor(m_factorType));
|
||||
if (isLarge) {
|
||||
hypoScore += m_lm->query(wid, *state);
|
||||
}else{
|
||||
float score = m_lm->query(wid, *state, prefixFragments[prefixLength]);
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
}
|
||||
|
||||
// non-terminal, add phrase from underlying hypothesis
|
||||
// internal non-terminal
|
||||
else {
|
||||
// look up underlying hypothesis
|
||||
size_t nonTermIndex = nonTermIndexMap[phrasePos];
|
||||
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
|
||||
|
||||
const LanguageModelChartState* prevState =
|
||||
static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID));
|
||||
const DALMChartState* prevState =
|
||||
static_cast<const DALMChartState*>(prevHypo->GetFFState(featureID));
|
||||
|
||||
size_t subPhraseLength = prevState->GetNumTargetTerminals();
|
||||
// special case: rule starts with non-terminal -> copy everything
|
||||
if (phrasePos == 0) {
|
||||
size_t prevPrefixLength = prevState->GetPrefixLength();
|
||||
const DALM::Fragment *prevPrefixFragments = prevState->GetPrefixFragments();
|
||||
DALM::Gap gap(*state);
|
||||
// score its prefix
|
||||
for(size_t prefixPos = 0; prefixPos < prevPrefixLength; prefixPos++) {
|
||||
const DALM::Fragment &f = prevPrefixFragments[prefixPos];
|
||||
|
||||
// get prefixScore and finalizedScore
|
||||
prefixScore = prevState->GetPrefixScore();
|
||||
finalizedScore = prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] - prefixScore;
|
||||
|
||||
// get language model state
|
||||
delete dalm_state;
|
||||
dalm_state = new DALMState( *static_cast<DALMState*>(prevState->GetRightContext()) );
|
||||
wordPos += subPhraseLength;
|
||||
if (isLarge) {
|
||||
hypoScore += m_lm->query(f, *state, gap);
|
||||
} else {
|
||||
float score = m_lm->query(f, *state, gap, prefixFragments[prefixLength]);
|
||||
prefixScore += score;
|
||||
hypoScore += score;
|
||||
prefixLength++;
|
||||
if(prefixLength >= contextSize) isLarge = true;
|
||||
}
|
||||
gap.succ();
|
||||
}
|
||||
|
||||
// internal non-terminal
|
||||
else {
|
||||
// score its prefix
|
||||
size_t wpos = wordPos;
|
||||
for(size_t prefixPos = 0;
|
||||
prefixPos < m_nGramOrder-1 // up to LM order window
|
||||
&& prefixPos < subPhraseLength; // up to length
|
||||
prefixPos++) {
|
||||
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
|
||||
updateChartScore( &prefixScore, &finalizedScore, GetValue(word, dalm_state->get_state()).score, ++wpos );
|
||||
}
|
||||
wordPos += subPhraseLength;
|
||||
|
||||
// check if we are dealing with a large sub-phrase
|
||||
if (subPhraseLength > m_nGramOrder - 1) {
|
||||
// add its finalized language model score
|
||||
finalizedScore +=
|
||||
prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] // full score
|
||||
- prevState->GetPrefixScore(); // - prefix score
|
||||
|
||||
// copy language model state
|
||||
delete dalm_state;
|
||||
dalm_state = new DALMState( *static_cast<DALMState*>(prevState->GetRightContext()) );
|
||||
}
|
||||
}
|
||||
// check if we are dealing with a large sub-phrase
|
||||
if (prevState->LargeEnough()) {
|
||||
// add its language model score
|
||||
hypoScore += UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
|
||||
hypoScore -= prevState->GetPrefixScore(); // remove overwrapped score.
|
||||
// copy language model state
|
||||
dalm_state->reset(*prevState->GetRightContext());
|
||||
state = dalm_state->get_state();
|
||||
} else {
|
||||
DALM::State *state_new = new DALM::State(*prevState->GetRightContext()->get_state());
|
||||
m_lm->set_state(*state_new, *state, gap);
|
||||
dalm_state->reset(state_new);
|
||||
state = dalm_state->get_state();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// assign combined score to score breakdown
|
||||
out->Assign(this, prefixScore + finalizedScore);
|
||||
out->Assign(this, TransformLMScore(hypoScore));
|
||||
|
||||
ret->Set(prefixScore, dalm_state);
|
||||
return ret;
|
||||
return new DALMChartState(hypo, prefixFragments, prefixLength, prefixScore, dalm_state, isLarge);
|
||||
}
|
||||
|
||||
bool LanguageModelDALM::IsUseable(const FactorMask &mask) const
|
||||
{
|
||||
bool ret = mask[m_factorType];
|
||||
return ret;
|
||||
return mask[m_factorType];
|
||||
}
|
||||
|
||||
void LanguageModelDALM::CreateVocabMapping(const std::string &wordstxt)
|
||||
{
|
||||
InputFileStream vocabStrm(wordstxt);
|
||||
|
||||
std::vector< std::pair<std::size_t, DALM::VocabId> > vlist;
|
||||
string line;
|
||||
std::size_t max_fid = 0;
|
||||
while(getline(vocabStrm, line)) {
|
||||
const Factor *factor = FactorCollection::Instance().AddFactor(line);
|
||||
std::size_t fid = factor->GetId();
|
||||
DALM::VocabId wid = m_vocab->lookup(line.c_str());
|
||||
|
||||
VocabMap::value_type entry(factor, wid);
|
||||
m_vocabMap.insert(entry);
|
||||
vlist.push_back(std::pair<std::size_t, DALM::VocabId>(fid, wid));
|
||||
if(max_fid < fid) max_fid = fid;
|
||||
}
|
||||
|
||||
for(std::size_t i = 0; i < m_vocabMap.size(); i++){
|
||||
m_vocabMap[i] = m_vocab->unk();
|
||||
}
|
||||
|
||||
m_vocabMap.resize(max_fid+1, m_vocab->unk());
|
||||
std::vector< std::pair<std::size_t, DALM::VocabId> >::iterator it = vlist.begin();
|
||||
while(it != vlist.end()){
|
||||
std::pair<std::size_t, DALM::VocabId> &entry = *it;
|
||||
m_vocabMap[entry.first] = entry.second;
|
||||
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
// Map a factor to its DALM vocabulary id.
// Factor ids outside the table yield the vocabulary's unknown-word id.
//
// NOTE(review): m_vocabMap is treated as a vector indexed by factor id; the
// earlier bimap lookup returned on both branches and made this lookup
// unreachable, so it has been removed -- confirm against the declaration.
DALM::VocabId LanguageModelDALM::GetVocabId(const Factor *factor) const
{
  std::size_t fid = factor->GetId();
  return (m_vocabMap.size() > fid)? m_vocabMap[fid] : m_vocab->unk();
}
|
||||
|
||||
void LanguageModelDALM::SetParameter(const std::string& key, const std::string& value)
|
||||
@ -395,13 +517,4 @@ void LanguageModelDALM::SetParameter(const std::string& key, const std::string&
|
||||
}
|
||||
}
|
||||
|
||||
void LanguageModelDALM::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const
|
||||
{
|
||||
if (wordPos < m_nGramOrder) {
|
||||
*prefixScore += score;
|
||||
} else {
|
||||
*finalizedScore += score;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -53,17 +53,12 @@ protected:
|
||||
DALM::LM *m_lm;
|
||||
DALM::VocabId wid_start, wid_end;
|
||||
|
||||
typedef boost::bimap<const Factor *, DALM::VocabId> VocabMap;
|
||||
mutable VocabMap m_vocabMap;
|
||||
mutable std::vector<DALM::VocabId> m_vocabMap;
|
||||
|
||||
void CreateVocabMapping(const std::string &wordstxt);
|
||||
DALM::VocabId GetVocabId(const Factor *factor) const;
|
||||
|
||||
private:
|
||||
LMResult GetValue(DALM::VocabId wid, DALM::State* finalState) const;
|
||||
LMResult GetValue(const Word &word, DALM::State* finalState) const;
|
||||
void updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const;
|
||||
|
||||
// Convert last words of hypothesis into vocab ids, returning an end pointer.
|
||||
DALM::VocabId *LastIDs(const Hypothesis &hypo, DALM::VocabId *indices) const {
|
||||
DALM::VocabId *index = indices;
|
||||
|
@ -94,9 +94,16 @@ if $(with-nplm) {
|
||||
local with-dalm = [ option.get "with-dalm" ] ;
|
||||
if $(with-dalm) {
|
||||
lib dalm : : <search>$(with-dalm)/lib ;
|
||||
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
|
||||
obj DALM.o : DALMWrapper.cpp dalm MurmurHash3 ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
|
||||
alias dalmALIAS : DALM.o dalm MurmurHash3 : : : <define>LM_DALM ;
|
||||
|
||||
if [ path.exists $(with-dalm)/lib/libMurmurHash3.a ] {
|
||||
lib MurmurHash3 : : <search>$(with-dalm)/lib ;
|
||||
alias dalm-libs : dalm MurmurHash3 ;
|
||||
} else {
|
||||
alias dalm-libs : dalm ;
|
||||
}
|
||||
|
||||
obj DALM.o : DALMWrapper.cpp dalm-libs ..//headers : <include>$(with-dalm)/include <include>$(with-dalm)/darts-clone ;
|
||||
alias dalmALIAS : DALM.o dalm-libs : : : <define>LM_DALM ;
|
||||
dependencies += dalmALIAS ;
|
||||
lmmacros += LM_DALM ;
|
||||
}
|
||||
|
@ -66,6 +66,7 @@ StaticData::StaticData()
|
||||
,m_lmEnableOOVFeature(false)
|
||||
,m_isAlwaysCreateDirectTranslationOption(false)
|
||||
,m_currentWeightSetting("default")
|
||||
,m_treeStructure(NULL)
|
||||
{
|
||||
m_xmlBrackets.first="<";
|
||||
m_xmlBrackets.second=">";
|
||||
@ -1184,5 +1185,52 @@ void StaticData::CheckLEGACYPT()
|
||||
}
|
||||
|
||||
|
||||
void StaticData::ResetWeights(const std::string &denseWeights, const std::string &sparseFile)
|
||||
{
|
||||
m_allWeights = ScoreComponentCollection();
|
||||
|
||||
// dense weights
|
||||
string name("");
|
||||
vector<float> weights;
|
||||
vector<string> toks = Tokenize(denseWeights);
|
||||
for (size_t i = 0; i < toks.size(); ++i) {
|
||||
const string &tok = toks[i];
|
||||
|
||||
if (tok.substr(tok.size() - 1, 1) == "=") {
|
||||
// start of new feature
|
||||
|
||||
if (name != "") {
|
||||
// save previous ff
|
||||
const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
|
||||
m_allWeights.Assign(&ff, weights);
|
||||
weights.clear();
|
||||
}
|
||||
|
||||
name = tok.substr(0, tok.size() - 1);
|
||||
} else {
|
||||
// a weight for curr ff
|
||||
float weight = Scan<float>(toks[i]);
|
||||
weights.push_back(weight);
|
||||
}
|
||||
}
|
||||
|
||||
const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
|
||||
m_allWeights.Assign(&ff, weights);
|
||||
|
||||
// sparse weights
|
||||
InputFileStream sparseStrme(sparseFile);
|
||||
string line;
|
||||
while (getline(sparseStrme, line)) {
|
||||
vector<string> toks = Tokenize(line);
|
||||
UTIL_THROW_IF2(toks.size() != 2, "Incorrect sparse weight format. Should be FFName_spareseName weight");
|
||||
|
||||
vector<string> names = Tokenize(toks[0], "_");
|
||||
UTIL_THROW_IF2(names.size() != 2, "Incorrect sparse weight name. Should be FFName_spareseName");
|
||||
|
||||
const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(names[0]);
|
||||
m_allWeights.Assign(&ff, names[1], Scan<float>(toks[1]));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
@ -221,6 +221,8 @@ protected:
|
||||
std::map<Word, std::set<Word> > m_soft_matches_map;
|
||||
std::map<Word, std::set<Word> > m_soft_matches_map_reverse;
|
||||
|
||||
const StatefulFeatureFunction* m_treeStructure;
|
||||
|
||||
public:
|
||||
|
||||
bool IsAlwaysCreateDirectTranslationOption() const {
|
||||
@ -756,6 +758,20 @@ public:
|
||||
|
||||
bool AdjacentOnly() const
|
||||
{ return m_adjacentOnly; }
|
||||
|
||||
|
||||
void ResetWeights(const std::string &denseWeights, const std::string &sparseFile);
|
||||
|
||||
|
||||
// need global access for output of tree structure
|
||||
const StatefulFeatureFunction* GetTreeStructure() const {
|
||||
return m_treeStructure;
|
||||
}
|
||||
|
||||
void SetTreeStructure(const StatefulFeatureFunction* treeStructure) {
|
||||
m_treeStructure = treeStructure;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -43,10 +43,10 @@ public:
|
||||
alpha_[i] = i * log10(0.4);
|
||||
}
|
||||
// Release the buffers this model holds. vocab_ is deleted only when
// bAdapting_ is set; otherwise the pointer is just cleared.
~OnlineRLM() {
  // Each buffer is released exactly once (alpha_ and cache_ were each
  // deleted twice before). Deleting a null pointer is a no-op, so no guard
  // is needed.
  delete[] alpha_;
  if(bAdapting_) delete vocab_;
  else vocab_ = NULL;
  delete cache_;
  delete bPrefix_;
  delete bHit_;
}
|
||||
|
@ -58,8 +58,7 @@ const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY(
|
||||
|
||||
size_t hash = hash_value(src);
|
||||
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
|
||||
|
||||
CacheColl::iterator iter;
|
||||
iter = cache.find(hash);
|
||||
|
||||
if (iter == cache.end()) {
|
||||
@ -179,7 +178,7 @@ void PhraseDictionary::ReduceCache() const
|
||||
|
||||
// find cutoff for last used time
|
||||
priority_queue< clock_t > lastUsedTimes;
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iter;
|
||||
CacheColl::iterator iter;
|
||||
iter = cache.begin();
|
||||
while( iter != cache.end() ) {
|
||||
lastUsedTimes.push( iter->second.second );
|
||||
@ -193,7 +192,7 @@ void PhraseDictionary::ReduceCache() const
|
||||
iter = cache.begin();
|
||||
while( iter != cache.end() ) {
|
||||
if (iter->second.second < cutoffLastUsedTime) {
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*,clock_t> >::iterator iterRemove = iter++;
|
||||
CacheColl::iterator iterRemove = iter++;
|
||||
delete iterRemove->second.first;
|
||||
cache.erase(iterRemove);
|
||||
} else iter++;
|
||||
|
@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
#include <boost/thread/tss.hpp>
|
||||
@ -54,7 +55,7 @@ class ChartCellCollectionBase;
|
||||
class ChartRuleLookupManager;
|
||||
class ChartParser;
|
||||
|
||||
class CacheColl : public std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
|
||||
class CacheColl : public boost::unordered_map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >
|
||||
{
|
||||
// 1st = hash of source phrase/ address of phrase-table node
|
||||
// 2nd = all translations
|
||||
|
@ -59,7 +59,7 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
|
||||
|
||||
CacheColl &cache = GetCache();
|
||||
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
|
||||
CacheColl::iterator iter;
|
||||
iter = cache.find(hash);
|
||||
|
||||
if (iter != cache.end()) {
|
||||
|
@ -165,7 +165,7 @@ const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollection(
|
||||
CacheColl &cache = GetCache();
|
||||
size_t hash = (size_t) ptNode->GetFilePos();
|
||||
|
||||
std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
|
||||
CacheColl::iterator iter;
|
||||
|
||||
iter = cache.find(hash);
|
||||
|
||||
|
@ -47,8 +47,8 @@ ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
|
||||
m_count(count),
|
||||
m_pcfgSum(pcfgSum)
|
||||
{
|
||||
assert(phraseSource.empty());
|
||||
assert(phraseTarget.empty());
|
||||
assert(phraseSource->empty());
|
||||
assert(phraseTarget->empty());
|
||||
|
||||
m_count = count;
|
||||
m_pcfgSum = pcfgSum;
|
||||
|
@ -235,8 +235,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
|
||||
// SCORES ...
|
||||
string directScores, directSparseScores, indirectScores, indirectSparseScores;
|
||||
breakdownCoreAndSparse( itemDirect[2], directScores, directSparseScores );
|
||||
breakdownCoreAndSparse( itemIndirect[2], indirectScores, indirectSparseScores );
|
||||
breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
|
||||
breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );
|
||||
|
||||
vector<string> directCounts = tokenize(itemDirect[4].c_str());
|
||||
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
|
||||
@ -307,7 +307,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
}
|
||||
|
||||
// alignment
|
||||
fileConsolidated << " ||| " << itemDirect[3];
|
||||
fileConsolidated << " ||| " << itemDirect[2];
|
||||
|
||||
// counts, for debugging
|
||||
fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
|
||||
|
@ -166,8 +166,9 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
|
||||
void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g)
|
||||
{
|
||||
Write(rule,false);
|
||||
m_fwd << " Tree ";
|
||||
m_fwd << " {{Tree ";
|
||||
g.PrintTree(m_fwd);
|
||||
m_fwd << "}}";
|
||||
m_fwd << std::endl;
|
||||
m_inv << std::endl;
|
||||
}
|
||||
|
@ -506,7 +506,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
const ScoreFeatureManager& featureManager,
|
||||
const MaybeLog& maybeLogProb )
|
||||
{
|
||||
assert(phrasePair.isValid());
|
||||
assert(phrasePair.IsValid());
|
||||
|
||||
const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource();
|
||||
float count = phrasePair.GetCount();
|
||||
@ -555,6 +555,51 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
phraseTableFile << " ||| ";
|
||||
}
|
||||
|
||||
// alignment
|
||||
if ( hierarchicalFlag ) {
|
||||
// always output alignment if hiero style
|
||||
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
|
||||
std::vector<std::string> alignment;
|
||||
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
|
||||
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
|
||||
if ( bestAlignmentT2S->at(j).size() != 1 ) {
|
||||
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
|
||||
phraseTableFile.flush();
|
||||
assert(bestAlignmentT2S->at(j).size() == 1);
|
||||
}
|
||||
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
|
||||
//phraseTableFile << sourcePos << "-" << j << " ";
|
||||
std::stringstream point;
|
||||
point << sourcePos << "-" << j;
|
||||
alignment.push_back(point.str());
|
||||
} else {
|
||||
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
||||
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
||||
size_t sourcePos = *setIter;
|
||||
std::stringstream point;
|
||||
point << sourcePos << "-" << j;
|
||||
alignment.push_back(point.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
// now print all alignments, sorted by source index
|
||||
sort(alignment.begin(), alignment.end());
|
||||
for (size_t i = 0; i < alignment.size(); ++i) {
|
||||
phraseTableFile << alignment[i] << " ";
|
||||
}
|
||||
} else if ( !inverseFlag && wordAlignmentFlag) {
|
||||
// alignment info in pb model
|
||||
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
|
||||
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
||||
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
||||
size_t sourcePos = *setIter;
|
||||
phraseTableFile << sourcePos << "-" << j << " ";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
phraseTableFile << " ||| ";
|
||||
|
||||
// lexical translation probability
|
||||
if (lexFlag) {
|
||||
double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S );
|
||||
@ -596,53 +641,6 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
phraseTableFile << " " << i->first << " " << i->second;
|
||||
}
|
||||
|
||||
phraseTableFile << " ||| ";
|
||||
|
||||
// output alignment info
|
||||
if ( !inverseFlag ) {
|
||||
if ( hierarchicalFlag ) {
|
||||
// always output alignment if hiero style
|
||||
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
|
||||
std::vector<std::string> alignment;
|
||||
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
|
||||
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
|
||||
if ( bestAlignmentT2S->at(j).size() != 1 ) {
|
||||
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
|
||||
phraseTableFile.flush();
|
||||
assert(bestAlignmentT2S->at(j).size() == 1);
|
||||
}
|
||||
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
|
||||
//phraseTableFile << sourcePos << "-" << j << " ";
|
||||
std::stringstream point;
|
||||
point << sourcePos << "-" << j;
|
||||
alignment.push_back(point.str());
|
||||
} else {
|
||||
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
||||
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
||||
size_t sourcePos = *setIter;
|
||||
std::stringstream point;
|
||||
point << sourcePos << "-" << j;
|
||||
alignment.push_back(point.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
// now print all alignments, sorted by source index
|
||||
sort(alignment.begin(), alignment.end());
|
||||
for (size_t i = 0; i < alignment.size(); ++i) {
|
||||
phraseTableFile << alignment[i] << " ";
|
||||
}
|
||||
} else if (wordAlignmentFlag) {
|
||||
// alignment info in pb model
|
||||
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
|
||||
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
||||
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
||||
size_t sourcePos = *setIter;
|
||||
phraseTableFile << sourcePos << "-" << j << " ";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// counts
|
||||
phraseTableFile << " ||| " << totalCount << " " << count;
|
||||
if (kneserNeyFlag)
|
||||
|
@ -236,9 +236,8 @@ tokenize-tuning
|
||||
factorize-tuning
|
||||
in: tokenized-tuning
|
||||
out: factorized-tuning
|
||||
rerun-on-change: TRAINING:output-factors
|
||||
default-name: lm/interpolate-tuning.factored
|
||||
pass-unless: factors
|
||||
pass-unless: TRAINING:output-factors
|
||||
parallelizable: yes
|
||||
error: can't open
|
||||
error: incompatible number of words in factor
|
||||
|
@ -981,6 +981,9 @@ sub define_step {
|
||||
elsif ($DO_STEP[$i] eq 'TRAINING:create-config' || $DO_STEP[$i] eq 'TRAINING:create-config-interpolated-lm') {
|
||||
&define_training_create_config($i);
|
||||
}
|
||||
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:factorize-tuning') {
|
||||
&define_interpolated_lm_factorize_tuning($i);
|
||||
}
|
||||
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:interpolate') {
|
||||
&define_interpolated_lm_interpolate($i);
|
||||
}
|
||||
@ -1512,6 +1515,21 @@ sub define_lm_factorize {
|
||||
&create_step($step_id,$cmd);
|
||||
}
|
||||
|
||||
# Define the INTERPOLATED-LM:factorize-tuning step.
# The step's command first creates the versioned temp directory and then
# appends whatever factorize_one_language() produces for the OUTPUT-FACTOR
# section; the finished command is registered through create_step().
sub define_interpolated_lm_factorize_tuning {
    my $step_id = shift;

    # NOTE(review): the value of this setting is not used below; the lookup
    # is kept in its original position because the helper presumably
    # validates the configuration -- confirm before removing.
    my $moses_script_dir = &check_backoff_and_get("TUNING:moses-script-dir");

    # get_output_and_input() returns the output first, then the input
    my ($step_output, $step_input) = &get_output_and_input($step_id);
    my $output_factors = &check_backoff_and_get_array("TRAINING:output-factors");

    # NOTE(review): value unused here as well; kept for the same reason.
    my $working_dir = &check_and_get("GENERAL:working-dir");

    my $temp_dir = &check_and_get("INPUT-FACTOR:temp-dir") . "." . $VERSION;
    my $factorize_cmd = &factorize_one_language("OUTPUT-FACTOR",
                                                $step_input,
                                                $step_output,
                                                $output_factors,
                                                $step_id);
    my $cmd = "mkdir -p $temp_dir\n" . $factorize_cmd;

    &create_step($step_id, $cmd);
}
|
||||
|
||||
sub define_splitter_train {
|
||||
my ($step_id,$set) = @_;
|
||||
|
||||
@ -1986,6 +2004,10 @@ sub define_training_extract_phrases {
|
||||
if (&get("TRAINING:use-ghkm")) {
|
||||
$cmd .= "-ghkm ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:ghkm-tree-fragments")) {
|
||||
$cmd .= "-ghkm-tree-fragments ";
|
||||
}
|
||||
}
|
||||
|
||||
my $extract_settings = &get("TRAINING:extract-settings");
|
||||
@ -2013,6 +2035,12 @@ sub define_training_build_ttable {
|
||||
$cmd .= "-no-word-alignment " if defined($word_alignment) && $word_alignment eq "no";
|
||||
|
||||
$cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
|
||||
|
||||
if (&get("TRAINING:hierarchical-rule-set")) {
|
||||
if (&get("TRAINING:ghkm-tree-fragments")) {
|
||||
$cmd .= "-ghkm-tree-fragments ";
|
||||
}
|
||||
}
|
||||
|
||||
&create_step($step_id,$cmd);
|
||||
}
|
||||
@ -2267,6 +2295,7 @@ sub define_interpolated_lm_interpolate {
|
||||
$interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
|
||||
my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
|
||||
my $group = &get("INTERPOLATED-LM:group");
|
||||
my $scripts = &check_backoff_and_get("TUNING:moses-script-dir");
|
||||
|
||||
my $cmd = "";
|
||||
|
||||
@ -2299,9 +2328,12 @@ sub define_interpolated_lm_interpolate {
|
||||
$group_string =~ s/ $//;
|
||||
$group_string .= " ";
|
||||
while($group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
|
||||
die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
|
||||
if ! defined($POSITION{$1});
|
||||
$numbered_string .= $POSITION{$1}.$2;
|
||||
# die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
|
||||
# if ! defined($POSITION{$1});
|
||||
# detect that elsewhere!
|
||||
if (defined($POSITION{$1})) {
|
||||
$numbered_string .= $POSITION{$1}.$2;
|
||||
}
|
||||
$group_string = $3;
|
||||
}
|
||||
chop($numbered_string);
|
||||
@ -2313,7 +2345,12 @@ sub define_interpolated_lm_interpolate {
|
||||
$name .= ".$$FACTOR[$factor]" if defined($FACTOR);
|
||||
$name .= ".order$order";
|
||||
}
|
||||
$cmd .= "$interpolation_script --tuning $tuning --name $name --srilm $srilm_dir --lm $lm_list";
|
||||
my $factored_tuning = $tuning;
|
||||
if (&backoff_and_get("TRAINING:output-factors")) {
|
||||
$factored_tuning = "$tuning.factor$factor";
|
||||
$cmd .= "$scripts/training/reduce-factors.perl --corpus $tuning --reduced $factored_tuning --factor $factor\n";
|
||||
}
|
||||
$cmd .= "$interpolation_script --tuning $factored_tuning --name $name --srilm $srilm_dir --lm $lm_list";
|
||||
$cmd .= " --group \"$numbered_string\"" if defined($group);
|
||||
$cmd .= "\n";
|
||||
}
|
||||
|
@ -86,15 +86,23 @@ sub split_xml {
|
||||
my $i = 0;
|
||||
$MARKUP[0] = "";
|
||||
while($line =~ /\S/) {
|
||||
# XML tag
|
||||
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
|
||||
$MARKUP[$i] .= $1." ";
|
||||
$line = $2;
|
||||
}
|
||||
# non-XML text
|
||||
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
|
||||
$WORD[$i++] = $1;
|
||||
$MARKUP[$i] = "";
|
||||
$line = $2;
|
||||
}
|
||||
# '<' or '>' occurs in word, but it's not an XML tag
|
||||
elsif ($line =~ /^\s*(\S+)(.*)$/) {
|
||||
$WORD[$i++] = $1;
|
||||
$MARKUP[$i] = "";
|
||||
$line = $2;
|
||||
}
|
||||
else {
|
||||
die("ERROR: huh? $line\n");
|
||||
}
|
||||
|
@ -70,15 +70,23 @@ sub split_xml {
|
||||
my $i = 0;
|
||||
$MARKUP[0] = "";
|
||||
while($line =~ /\S/) {
|
||||
# XML tag
|
||||
if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
|
||||
$MARKUP[$i] .= $1." ";
|
||||
$line = $2;
|
||||
}
|
||||
# non-XML text
|
||||
elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) {
|
||||
$WORD[$i++] = $1;
|
||||
$MARKUP[$i] = "";
|
||||
$line = $2;
|
||||
}
|
||||
# '<' or '>' occurs in word, but it's not an XML tag
|
||||
elsif ($line =~ /^\s*(\S+)(.*)$/) {
|
||||
$WORD[$i++] = $1;
|
||||
$MARKUP[$i] = "";
|
||||
$line = $2;
|
||||
}
|
||||
else {
|
||||
die("ERROR: huh? $line\n");
|
||||
}
|
||||
|
@ -124,14 +124,14 @@ class FlexScore:
|
||||
line = self.phrase_pairs[src][target]
|
||||
flexscore_l = b"{0:.6g}".format(self.flexprob_l[src][target])
|
||||
flexscore_r = b"{0:.6g}".format(self.flexprob_r[src][target])
|
||||
line[2] += b' ' + flexscore_l + b' ' + flexscore_r
|
||||
line[3] += b' ' + flexscore_l + b' ' + flexscore_r
|
||||
|
||||
if self.hierarchical:
|
||||
try:
|
||||
flexscore_d = b"{0:.6g}".format(self.flexprob_d[src][target])
|
||||
except KeyError:
|
||||
flexscore_d = b"1"
|
||||
line[2] += b' ' + flexscore_d
|
||||
line[3] += b' ' + flexscore_d
|
||||
|
||||
return b' ||| '.join(line) + b'\n'
|
||||
|
||||
|
@ -1087,7 +1087,9 @@ if($___RETURN_BEST_DEV) {
|
||||
if(defined $sparse_weights_file) {
|
||||
$best_sparse_file = "run$bestit.sparse-weights";
|
||||
}
|
||||
create_config($___CONFIG_ORIG, "./moses.ini", get_featlist_from_file("run$bestit.dense"),
|
||||
my $best_featlist = get_featlist_from_file("run$bestit.dense");
|
||||
$best_featlist->{"untuneables"} = $featlist->{"untuneables"};
|
||||
create_config($___CONFIG_ORIG, "./moses.ini", $best_featlist,
|
||||
$bestit, $bestbleu, $best_sparse_file);
|
||||
}
|
||||
else {
|
||||
|
109
scripts/training/reduce-factors.perl
Executable file
109
scripts/training/reduce-factors.perl
Executable file
@ -0,0 +1,109 @@
|
||||
#!/usr/bin/perl -w

# reduce-factors.perl -- write a copy of a factored corpus in which every
# token keeps only the requested factors.
#
# Usage:
#   reduce-factors.perl --corpus FILE --reduced-corpus FILE --factor LIST
#
#   --corpus          input corpus
#   --reduced-corpus  output file (also accepted as --reduced)
#   --factor          comma-separated factor indices to keep, e.g. "0,2"

use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);

my $___FACTOR_DELIMITER = "|";

# utilities
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";

my ($CORPUS,$REDUCED,$FACTOR);
die("ERROR: wrong syntax when invoking reduce-factors")
    unless &GetOptions('corpus=s' => \$CORPUS,
                       # "reduced" is spelled out as an alias so that callers
                       # using --reduced do not depend on auto-abbreviation
                       'reduced-corpus|reduced=s' => \$REDUCED,
                       'factor=s' => \$FACTOR);

# All three options are required; fail with a clear message instead of
# running into uninitialized-value warnings further down.
die("ERROR: missing --corpus when invoking reduce-factors")
    unless defined($CORPUS);
die("ERROR: missing --reduced-corpus when invoking reduce-factors")
    unless defined($REDUCED);
die("ERROR: missing --factor when invoking reduce-factors")
    unless defined($FACTOR);

&reduce_factors($CORPUS,$REDUCED,$FACTOR);
|
||||
|
||||
# from train-model.perl
#
# Write a copy of the corpus $full to $reduced in which every token keeps
# only the factors listed in $factors.
#
#   $full     input corpus, opened through open_or_zcat()
#   $reduced  output path
#   $factors  comma-separated factor indices (e.g. "0,2"); they are written
#             in ascending numeric order
#
# If $reduced (or $reduced.gz) already exists it is reused as is. If the
# requested factors are exactly the factors found in the first token of the
# corpus, only a symlink to the input is created.
#
# Dies if the corpus is empty, if a token lacks a requested factor, or if
# the output cannot be written. On such a failure the lock file and the
# incomplete output are removed, so that later runs neither wait forever
# nor reuse a truncated file.
sub reduce_factors {
    my ($full,$reduced,$factors) = @_;

    my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);

    print "Reducing factors to produce $reduced @ ".`date`;
    # wait as long as a lock file for this output exists (it is created
    # further down while the output is being written)
    while(-e $reduced.".lock") {
        sleep(10);
    }
    if (-e $reduced) {
        print STDERR " $reduced in place, reusing\n";
        return;
    }
    if (-e $reduced.".gz") {
        print STDERR " $reduced.gz in place, reusing\n";
        return;
    }

    # peek at input, to check if we are asked to produce exactly the
    # available factors
    my $inh = open_or_zcat($full);
    my $firstline = <$inh>;
    die "Corpus file $full is empty" unless $firstline;
    close $inh;
    # pick first word
    $firstline =~ s/^\s*//;
    $firstline =~ s/\s.*//;
    # count factors: number of delimiters in the first word
    # (tr/// does not interpolate variables, hence the match in list context)
    my $maxfactorindex = () = $firstline =~ /\Q$___FACTOR_DELIMITER\E/g;
    if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
        # create just symlink; preserving compression
        my $realfull = $full;
        if (!-e $realfull && -e $realfull.".gz") {
            $realfull .= ".gz";
            $reduced =~ s/(\.gz)?$/.gz/;
        }
        # built-in symlink: no shell involved, and no dependency on the
        # safesystem() helper, which this script does not define
        symlink($realfull, $reduced)
            or die "Failed to create symlink $realfull -> $reduced: $!";
        return;
    }

    # The default is to select the needed factors
    my $lock = $reduced.".lock";
    open(my $lockh, '>', $lock) or die "ERROR: Can't create $lock: $!";
    close($lockh);

    my $completed = eval {
        my $corpus_in = open_or_zcat($full);
        open(my $reduced_out, '>', $reduced) or die "ERROR: Can't write $reduced";
        my $nr = 0;
        while (my $line = <$corpus_in>) {
            $nr++;
            print STDERR "." if $nr % 10000 == 0;
            print STDERR "($nr)" if $nr % 100000 == 0;
            chomp $line;
            $line =~ s/ +/ /g;
            $line =~ s/^ //;
            $line =~ s/ $//;
            my @reduced_tokens;
            foreach my $token (split ' ', $line) {
                # \Q causes to disable metacharacters in regex
                my @FACTOR = split /\Q$___FACTOR_DELIMITER\E/, $token;
                my @kept;
                foreach my $outfactor (@INCLUDE) {
                    my $value = $FACTOR[$outfactor];
                    die "ERROR: Couldn't find factor $outfactor in token \"$token\" in $full LINE $nr" if !defined $value;
                    push @kept, $value;
                }
                push @reduced_tokens, join($___FACTOR_DELIMITER, @kept);
            }
            print {$reduced_out} join(" ", @reduced_tokens), "\n";
        }
        print STDERR "\n";
        # buffered write errors only surface when the handle is closed
        close($reduced_out) or die "ERROR: Can't write $reduced: $!";
        close($corpus_in);
        1;
    };
    my $error = $@;

    # always release the lock, also when something went wrong
    unlink($lock);
    if (!$completed) {
        # do not leave a truncated file behind that would later be reused
        unlink($reduced);
        die $error;
    }
    return;
}
|
||||
|
||||
# Open a corpus file for reading and return the file handle.
#
#   $fn  path of the file; if it does not exist but "$fn.gz" or "$fn.bz2"
#        does, that file is opened instead
#
# Files ending in .gz are read through $ZCAT and files ending in .bz2
# through $BZCAT; everything else is opened directly. Dies if the file (or
# the decompression command) cannot be opened.
sub open_or_zcat {
    my $fn = shift;
    $fn = $fn.".gz" if ! -e $fn && -e $fn.".gz";
    $fn = $fn.".bz2" if ! -e $fn && -e $fn.".bz2";

    # $read is only used to describe the attempted open in the error message
    my $read = $fn;
    my @decompress;
    if ($fn =~ /\.bz2$/) {
        $read = "$BZCAT $fn|";
        @decompress = split(' ', $BZCAT);
    } elsif ($fn =~ /\.gz$/) {
        $read = "$ZCAT $fn|";
        @decompress = split(' ', $ZCAT);
    }

    my $hdl;
    if (@decompress) {
        # list-form pipe open: the file name is passed to the command as a
        # single argument and is never interpreted by a shell
        open($hdl, '-|', @decompress, $fn) or die "Can't read $fn ($read)";
    } else {
        # three-argument open: the file name is taken literally (no mode
        # characters, no stripping of surrounding whitespace)
        open($hdl, '<', $fn) or die "Can't read $fn ($read)";
    }
    return $hdl;
}
|
@ -752,7 +752,7 @@ sub reduce_factors {
|
||||
$firstline =~ s/^\s*//;
|
||||
$firstline =~ s/\s.*//;
|
||||
# count factors
|
||||
my $maxfactorindex = $firstline =~ tr/|/|/;
|
||||
my $maxfactorindex = $firstline =~ tr/$___FACTOR_DELIMITER/$___FACTOR_DELIMITER/;
|
||||
if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
|
||||
# create just symlink; preserving compression
|
||||
my $realfull = $full;
|
||||
@ -785,7 +785,7 @@ sub reduce_factors {
|
||||
$first = 0;
|
||||
my $first_factor = 1;
|
||||
foreach my $outfactor (@INCLUDE) {
|
||||
print OUT "|" unless $first_factor;
|
||||
print OUT $___FACTOR_DELIMITER unless $first_factor;
|
||||
$first_factor = 0;
|
||||
my $out = $FACTOR[$outfactor];
|
||||
die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
|
||||
@ -1785,19 +1785,19 @@ sub get_generation {
|
||||
while(<E>) {
|
||||
chomp;
|
||||
foreach (split) {
|
||||
my @FACTOR = split(/\|/);
|
||||
my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
|
||||
|
||||
my ($source,$target);
|
||||
my $first_factor = 1;
|
||||
foreach my $factor (split(/,/,$factor_e_source)) {
|
||||
$source .= "|" unless $first_factor;
|
||||
$source .= $___FACTOR_DELIMITER unless $first_factor;
|
||||
$first_factor = 0;
|
||||
$source .= $FACTOR[$factor];
|
||||
}
|
||||
|
||||
$first_factor = 1;
|
||||
foreach my $factor (split(/,/,$factor_e)) {
|
||||
$target .= "|" unless $first_factor;
|
||||
$target .= $___FACTOR_DELIMITER unless $first_factor;
|
||||
$first_factor = 0;
|
||||
$target .= $FACTOR[$factor];
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user