mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
source labels: integration into EMS
This commit is contained in:
parent
cda9d1d5ae
commit
c27cbf55ea
@ -9,6 +9,7 @@
|
|||||||
#include "moses/PP/TreeStructurePhraseProperty.h"
|
#include "moses/PP/TreeStructurePhraseProperty.h"
|
||||||
#include "moses/PP/SpanLengthPhraseProperty.h"
|
#include "moses/PP/SpanLengthPhraseProperty.h"
|
||||||
#include "moses/PP/NonTermContextProperty.h"
|
#include "moses/PP/NonTermContextProperty.h"
|
||||||
|
#include "moses/PP/OrientationPhraseProperty.h"
|
||||||
|
|
||||||
namespace Moses
|
namespace Moses
|
||||||
{
|
{
|
||||||
@ -59,6 +60,7 @@ PhrasePropertyFactory::PhrasePropertyFactory()
|
|||||||
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
|
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
|
||||||
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
|
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
|
||||||
MOSES_PNAME2("NonTermContext", NonTermContextProperty);
|
MOSES_PNAME2("NonTermContext", NonTermContextProperty);
|
||||||
|
MOSES_PNAME2("Orientation", OrientationPhraseProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
PhrasePropertyFactory::~PhrasePropertyFactory()
|
PhrasePropertyFactory::~PhrasePropertyFactory()
|
||||||
|
@ -16,12 +16,12 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
|
|||||||
std::istringstream tokenizer(value);
|
std::istringstream tokenizer(value);
|
||||||
|
|
||||||
if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
||||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property?");
|
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
|
||||||
}
|
}
|
||||||
assert( m_nNTs > 0 );
|
assert( m_nNTs > 0 );
|
||||||
|
|
||||||
if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
|
if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
|
||||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property?");
|
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
|
||||||
}
|
}
|
||||||
assert( m_totalCount > 0.0 );
|
assert( m_totalCount > 0.0 );
|
||||||
|
|
||||||
@ -32,7 +32,7 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
|
|||||||
std::priority_queue<float> ruleLabelledCountsPQ;
|
std::priority_queue<float> ruleLabelledCountsPQ;
|
||||||
|
|
||||||
while (tokenizer.peek() != EOF) {
|
while (tokenizer.peek() != EOF) {
|
||||||
try {
|
// try {
|
||||||
|
|
||||||
SourceLabelsPhrasePropertyItem item;
|
SourceLabelsPhrasePropertyItem item;
|
||||||
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
||||||
@ -46,28 +46,28 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
|
|||||||
for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
|
for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
|
||||||
size_t sourceLabelRHS;
|
size_t sourceLabelRHS;
|
||||||
if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
|
if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
|
||||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property?");
|
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
|
||||||
}
|
}
|
||||||
item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
|
item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
|
if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
|
||||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property?");
|
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (! (tokenizer >> numberOfLHSsGivenRHS)) {
|
if (! (tokenizer >> numberOfLHSsGivenRHS)) {
|
||||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property?");
|
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
|
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
|
||||||
size_t sourceLabelLHS;
|
size_t sourceLabelLHS;
|
||||||
if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
|
if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
|
||||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property?");
|
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
|
||||||
}
|
}
|
||||||
float ruleSourceLabelledCount;
|
float ruleSourceLabelledCount;
|
||||||
if (! (tokenizer >> ruleSourceLabelledCount)) {
|
if (! (tokenizer >> ruleSourceLabelledCount)) {
|
||||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property?");
|
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
|
||||||
}
|
}
|
||||||
item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
|
item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
|
||||||
ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
|
ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
|
||||||
@ -75,9 +75,9 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
|
|||||||
|
|
||||||
m_sourceLabelItems.push_back(item);
|
m_sourceLabelItems.push_back(item);
|
||||||
|
|
||||||
} catch (const std::exception &e) {
|
// } catch (const std::exception &e) {
|
||||||
UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
|
// UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
// keep only top N label vectors
|
// keep only top N label vectors
|
||||||
|
159
phrase-extract/PropertiesConsolidator.cpp
Normal file
159
phrase-extract/PropertiesConsolidator.cpp
Normal file
@ -0,0 +1,159 @@
|
|||||||
|
/***********************************************************************
|
||||||
|
Moses - factored phrase-based language decoder
|
||||||
|
Copyright (C) University of Edinburgh
|
||||||
|
|
||||||
|
This library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with this library; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
***********************************************************************/
|
||||||
|
|
||||||
|
#include "PropertiesConsolidator.h"
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include <limits>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "moses/Util.h"
|
||||||
|
#include "phrase-extract/InputFileStream.h"
|
||||||
|
#include "phrase-extract/OutputFileStream.h"
|
||||||
|
|
||||||
|
|
||||||
|
namespace MosesTraining
|
||||||
|
{
|
||||||
|
|
||||||
|
void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
|
||||||
|
{
|
||||||
|
Moses::InputFileStream inFile(sourceLabelSetFile);
|
||||||
|
|
||||||
|
// read source label set
|
||||||
|
m_sourceLabels.clear();
|
||||||
|
std::string line;
|
||||||
|
while (getline(inFile, line)) {
|
||||||
|
std::istringstream tokenizer(line);
|
||||||
|
std::string label;
|
||||||
|
size_t index;
|
||||||
|
try {
|
||||||
|
tokenizer >> label >> index;
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
|
||||||
|
}
|
||||||
|
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
|
||||||
|
UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
|
||||||
|
}
|
||||||
|
|
||||||
|
inFile.Close();
|
||||||
|
|
||||||
|
m_sourceLabelsFlag = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
|
||||||
|
{
|
||||||
|
if ( propertiesString.empty() ) {
|
||||||
|
return propertiesString;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ostringstream out;
|
||||||
|
std::vector<std::string> toks;
|
||||||
|
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
|
||||||
|
for (size_t i = 1; i < toks.size(); ++i) {
|
||||||
|
std::string &tok = toks[i];
|
||||||
|
if (tok.empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
size_t endPos = tok.rfind("}");
|
||||||
|
tok = tok.substr(0, endPos - 1);
|
||||||
|
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
||||||
|
assert(keyValue.size() == 2);
|
||||||
|
|
||||||
|
if ( !keyValue[0].compare("SourceLabels") ) {
|
||||||
|
|
||||||
|
if ( m_sourceLabelsFlag ) {
|
||||||
|
|
||||||
|
// SourceLabels additional property: replace strings with vocabulary indices
|
||||||
|
out << " {{" << keyValue[0];
|
||||||
|
|
||||||
|
std::istringstream tokenizer(keyValue[1]);
|
||||||
|
|
||||||
|
size_t nNTs;
|
||||||
|
double totalCount;
|
||||||
|
|
||||||
|
if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
||||||
|
UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
|
||||||
|
<< "Flawed SourceLabels property?");
|
||||||
|
}
|
||||||
|
assert( nNTs > 0 );
|
||||||
|
out << " " << nNTs;
|
||||||
|
|
||||||
|
if (! (tokenizer >> totalCount)) { // second token: overall rule count
|
||||||
|
UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
|
||||||
|
<< "Flawed SourceLabels property?");
|
||||||
|
}
|
||||||
|
assert( totalCount > 0.0 );
|
||||||
|
out << " " << totalCount;
|
||||||
|
|
||||||
|
while (tokenizer.peek() != EOF) {
|
||||||
|
try {
|
||||||
|
|
||||||
|
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
||||||
|
|
||||||
|
std::string token;
|
||||||
|
|
||||||
|
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
|
||||||
|
for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
|
||||||
|
tokenizer >> token; // RHS source non-terminal label
|
||||||
|
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
|
||||||
|
UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
|
||||||
|
out << " " << found->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
tokenizer >> token; // sourceLabelsRHSCount
|
||||||
|
out << " " << token;
|
||||||
|
|
||||||
|
tokenizer >> numberOfLHSsGivenRHS;
|
||||||
|
out << " " << numberOfLHSsGivenRHS;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
|
||||||
|
tokenizer >> token; // LHS source non-terminal label
|
||||||
|
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
|
||||||
|
UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
|
||||||
|
out << " " << found->second;
|
||||||
|
|
||||||
|
tokenizer >> token; // ruleSourceLabelledCount
|
||||||
|
out << " " << token;
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
UTIL_THROW2("Flawed item in SourceLabels property?");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out << "}}";
|
||||||
|
|
||||||
|
} else { // don't process source labels additional property
|
||||||
|
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// output other additional property
|
||||||
|
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace MosesTraining
|
||||||
|
|
48
phrase-extract/PropertiesConsolidator.h
Normal file
48
phrase-extract/PropertiesConsolidator.h
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
/***********************************************************************
|
||||||
|
Moses - factored phrase-based language decoder
|
||||||
|
Copyright (C) University of Edinburgh
|
||||||
|
|
||||||
|
This library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with this library; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
***********************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
|
||||||
|
namespace MosesTraining
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
 * Post-processes the additional-properties string of phrase table entries.
 *
 * By default properties are passed through. After
 * ActivateSourceLabelsProcessing() has loaded a label set, the label strings
 * of "SourceLabels" properties are replaced by their indices in that set.
 */
class PropertiesConsolidator
{
public:

  /// Creates a consolidator with SourceLabels processing switched off.
  PropertiesConsolidator() : m_sourceLabelsFlag(false) {}

  /// Reads the label set ("<label> <index>" per line) from the given file
  /// and switches SourceLabels processing on.
  void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);

  /// Returns the rewritten properties string for one phrase table entry.
  std::string ProcessPropertiesString(const std::string &propertiesString) const;

private:

  /// true once a source label set has been loaded
  bool m_sourceLabelsFlag;
  /// source label string -> index in the label set
  std::map<std::string,size_t> m_sourceLabels;

};
|
||||||
|
|
||||||
|
} // namespace MosesTraining
|
||||||
|
|
@ -28,6 +28,7 @@
|
|||||||
#include "tables-core.h"
|
#include "tables-core.h"
|
||||||
#include "InputFileStream.h"
|
#include "InputFileStream.h"
|
||||||
#include "OutputFileStream.h"
|
#include "OutputFileStream.h"
|
||||||
|
#include "PropertiesConsolidator.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -37,13 +38,14 @@ bool phraseCountFlag = false;
|
|||||||
bool lowCountFlag = false;
|
bool lowCountFlag = false;
|
||||||
bool goodTuringFlag = false;
|
bool goodTuringFlag = false;
|
||||||
bool kneserNeyFlag = false;
|
bool kneserNeyFlag = false;
|
||||||
|
bool sourceLabelsFlag = false;
|
||||||
bool logProbFlag = false;
|
bool logProbFlag = false;
|
||||||
inline float maybeLogProb( float a )
|
inline float maybeLogProb( float a )
|
||||||
{
|
{
|
||||||
return logProbFlag ? log(a) : a;
|
return logProbFlag ? log(a) : a;
|
||||||
}
|
}
|
||||||
|
|
||||||
void processFiles( char*, char*, char*, char* );
|
void processFiles( char*, char*, char*, char*, char* );
|
||||||
void loadCountOfCounts( char* );
|
void loadCountOfCounts( char* );
|
||||||
void breakdownCoreAndSparse( string combined, string &core, string &sparse );
|
void breakdownCoreAndSparse( string combined, string &core, string &sparse );
|
||||||
bool getLine( istream &fileP, vector< string > &item );
|
bool getLine( istream &fileP, vector< string > &item );
|
||||||
@ -57,13 +59,14 @@ int main(int argc, char* argv[])
|
|||||||
<< "consolidating direct and indirect rule tables\n";
|
<< "consolidating direct and indirect rule tables\n";
|
||||||
|
|
||||||
if (argc < 4) {
|
if (argc < 4) {
|
||||||
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] \n";
|
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] \n";
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
char* &fileNameDirect = argv[1];
|
char* &fileNameDirect = argv[1];
|
||||||
char* &fileNameIndirect = argv[2];
|
char* &fileNameIndirect = argv[2];
|
||||||
char* &fileNameConsolidated = argv[3];
|
char* &fileNameConsolidated = argv[3];
|
||||||
char* fileNameCountOfCounts;
|
char* fileNameCountOfCounts;
|
||||||
|
char* fileNameSourceLabelSet;
|
||||||
|
|
||||||
for(int i=4; i<argc; i++) {
|
for(int i=4; i<argc; i++) {
|
||||||
if (strcmp(argv[i],"--Hierarchical") == 0) {
|
if (strcmp(argv[i],"--Hierarchical") == 0) {
|
||||||
@ -114,13 +117,21 @@ int main(int argc, char* argv[])
|
|||||||
} else if (strcmp(argv[i],"--LogProb") == 0) {
|
} else if (strcmp(argv[i],"--LogProb") == 0) {
|
||||||
logProbFlag = true;
|
logProbFlag = true;
|
||||||
cerr << "using log-probabilities\n";
|
cerr << "using log-probabilities\n";
|
||||||
|
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
|
||||||
|
sourceLabelsFlag = true;
|
||||||
|
if (i+1==argc) {
|
||||||
|
cerr << "ERROR: specify source label set file!\n";
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
fileNameSourceLabelSet = argv[++i];
|
||||||
|
cerr << "processing source labels property\n";
|
||||||
} else {
|
} else {
|
||||||
cerr << "ERROR: unknown option " << argv[i] << endl;
|
cerr << "ERROR: unknown option " << argv[i] << endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
|
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
|
||||||
}
|
}
|
||||||
|
|
||||||
vector< float > countOfCounts;
|
vector< float > countOfCounts;
|
||||||
@ -169,7 +180,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
|
|||||||
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
|
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
|
||||||
}
|
}
|
||||||
|
|
||||||
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
|
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
|
||||||
{
|
{
|
||||||
if (goodTuringFlag || kneserNeyFlag)
|
if (goodTuringFlag || kneserNeyFlag)
|
||||||
loadCountOfCounts( fileNameCountOfCounts );
|
loadCountOfCounts( fileNameCountOfCounts );
|
||||||
@ -198,6 +209,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// create properties consolidator
|
||||||
|
// (in case any additional phrase property requires further processing)
|
||||||
|
MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
|
||||||
|
if (sourceLabelsFlag) {
|
||||||
|
propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
|
||||||
|
}
|
||||||
|
|
||||||
// loop through all extracted phrase translations
|
// loop through all extracted phrase translations
|
||||||
int i=0;
|
int i=0;
|
||||||
while(true) {
|
while(true) {
|
||||||
@ -307,12 +325,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
|||||||
// counts, for debugging
|
// counts, for debugging
|
||||||
fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
|
fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
|
||||||
|
|
||||||
// count bin feature (as a sparse feature)
|
// sparse features
|
||||||
fileConsolidated << " |||";
|
fileConsolidated << " |||";
|
||||||
if (directSparseScores.compare("") != 0)
|
if (directSparseScores.compare("") != 0)
|
||||||
fileConsolidated << " " << directSparseScores;
|
fileConsolidated << " " << directSparseScores;
|
||||||
if (indirectSparseScores.compare("") != 0)
|
if (indirectSparseScores.compare("") != 0)
|
||||||
fileConsolidated << " " << indirectSparseScores;
|
fileConsolidated << " " << indirectSparseScores;
|
||||||
|
// count bin feature (as a sparse feature)
|
||||||
if (sparseCountBinFeatureFlag) {
|
if (sparseCountBinFeatureFlag) {
|
||||||
bool foundBin = false;
|
bool foundBin = false;
|
||||||
for(size_t i=0; i < countBin.size(); i++) {
|
for(size_t i=0; i < countBin.size(); i++) {
|
||||||
@ -332,9 +351,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
|||||||
}
|
}
|
||||||
|
|
||||||
// arbitrary key-value pairs
|
// arbitrary key-value pairs
|
||||||
fileConsolidated << " ||| ";
|
fileConsolidated << " |||";
|
||||||
if (itemDirect.size() >= 6) {
|
if (itemDirect.size() >= 6) {
|
||||||
fileConsolidated << itemDirect[5];
|
//if (sourceLabelsFlag) {
|
||||||
|
fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
|
||||||
|
//} else {
|
||||||
|
// fileConsolidated << itemDirect[5];
|
||||||
|
//}
|
||||||
}
|
}
|
||||||
|
|
||||||
fileConsolidated << endl;
|
fileConsolidated << endl;
|
||||||
|
@ -617,9 +617,8 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string sourceTopLabel = "TOPLABEL";
|
size_t sourceLabelGlueTop = 0;
|
||||||
std::string sourceSLabel = "S";
|
size_t sourceLabelGlueX = 1;
|
||||||
std::string sourceSomeLabel = "SOMELABEL";
|
|
||||||
|
|
||||||
// basic rules
|
// basic rules
|
||||||
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||";
|
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||";
|
||||||
@ -627,7 +626,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
out << " {{Tree [" << topLabel << " <s>]}}";
|
out << " {{Tree [" << topLabel << " <s>]}}";
|
||||||
}
|
}
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
out << " {{SourceLabels 1 1 " << sourceTopLabel << " 1}}";
|
out << " {{SourceLabels 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||||
}
|
}
|
||||||
out << std::endl;
|
out << std::endl;
|
||||||
|
|
||||||
@ -636,7 +635,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}";
|
out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}";
|
||||||
}
|
}
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
out << " {{SourceLabels 2 1 " << sourceTopLabel << " 1 1 " << sourceTopLabel << " 1}}";
|
out << " {{SourceLabels 2 1 " << sourceLabelGlueTop << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||||
}
|
}
|
||||||
out << std::endl;
|
out << std::endl;
|
||||||
|
|
||||||
@ -648,7 +647,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}";
|
out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}";
|
||||||
}
|
}
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
out << " {{SourceLabels 2 1 " << sourceSLabel << " 1 1 " << sourceTopLabel << " 1}}";
|
out << " {{SourceLabels 2 1 " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||||
}
|
}
|
||||||
out << std::endl;
|
out << std::endl;
|
||||||
}
|
}
|
||||||
@ -661,7 +660,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
|
out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
|
||||||
}
|
}
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
out << " {{SourceLabels 3 2.718 " << sourceTopLabel << " " << sourceSomeLabel << " 2.718 1 " << sourceTopLabel << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL"
|
out << " {{SourceLabels 3 2.718 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 2.718 1 " << sourceLabelGlueTop << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL"
|
||||||
}
|
}
|
||||||
out << std::endl;
|
out << std::endl;
|
||||||
}
|
}
|
||||||
@ -672,7 +671,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
|
out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
|
||||||
}
|
}
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
out << " {{SourceLabels 3 1 " << sourceTopLabel << " " << sourceSomeLabel << " 1 1 " << sourceTopLabel << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
|
out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
|
||||||
}
|
}
|
||||||
out << std::endl;
|
out << std::endl;
|
||||||
}
|
}
|
||||||
|
@ -1860,7 +1860,7 @@ sub define_tuning_tune {
|
|||||||
$cmd .= " --lambdas \"$lambda\"" if $lambda;
|
$cmd .= " --lambdas \"$lambda\"" if $lambda;
|
||||||
$cmd .= " --continue" if $tune_continue;
|
$cmd .= " --continue" if $tune_continue;
|
||||||
$cmd .= " --skip-decoder" if $skip_decoder;
|
$cmd .= " --skip-decoder" if $skip_decoder;
|
||||||
$cmd .= " --inputtype $tune_inputtype" if $tune_inputtype;
|
$cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
|
||||||
|
|
||||||
my $qsub_args = &get_qsub_args("TUNING");
|
my $qsub_args = &get_qsub_args("TUNING");
|
||||||
$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
|
$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
|
||||||
@ -2217,6 +2217,10 @@ sub define_training_extract_phrases {
|
|||||||
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
|
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
|
||||||
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (&get("TRAINING:ghkm-source-labels")) {
|
||||||
|
$cmd .= "-ghkm-source-labels ";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
my $extract_settings = &get("TRAINING:extract-settings");
|
my $extract_settings = &get("TRAINING:extract-settings");
|
||||||
@ -2254,6 +2258,11 @@ sub define_training_build_ttable {
|
|||||||
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
|
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
|
||||||
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
||||||
}
|
}
|
||||||
|
if (&get("TRAINING:ghkm-source-labels")) {
|
||||||
|
$cmd .= "-ghkm-source-labels ";
|
||||||
|
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
|
||||||
|
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
&create_step($step_id,$cmd);
|
&create_step($step_id,$cmd);
|
||||||
@ -2438,6 +2447,12 @@ sub define_training_create_config {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (&get("TRAINING:ghkm-source-labels")) {
|
||||||
|
$cmd .= "-ghkm-source-labels ";
|
||||||
|
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
|
||||||
|
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
||||||
|
}
|
||||||
|
|
||||||
# sparse lexical features provide additional content for config file
|
# sparse lexical features provide additional content for config file
|
||||||
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
|
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
|
||||||
|
|
||||||
@ -3412,7 +3427,7 @@ sub check_backoff_and_get_array {
|
|||||||
# the following two functions deal with getting information about
|
# the following two functions deal with getting information about
|
||||||
# files that are passed between steps. this are either specified
|
# files that are passed between steps. this are either specified
|
||||||
# in the meta file (default) or in the configuration file (here called
|
# in the meta file (default) or in the configuration file (here called
|
||||||
# 'specified', in the step management refered to as 'given').
|
# 'specified', in the step management referred to as 'given').
|
||||||
|
|
||||||
sub get_specified_or_default_file {
|
sub get_specified_or_default_file {
|
||||||
my ($specified_module,$specified_set,$specified_parameter,
|
my ($specified_module,$specified_set,$specified_parameter,
|
||||||
|
@ -219,14 +219,14 @@ foreach (@children) {
|
|||||||
waitpid($_, 0);
|
waitpid($_, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
# glue rules
|
# merge glue rules
|
||||||
if (defined($glueFile)) {
|
if (defined($glueFile)) {
|
||||||
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
|
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
|
||||||
print STDERR "Merging glue rules: $cmd \n";
|
print STDERR "Merging glue rules: $cmd \n";
|
||||||
print STDERR `$cmd`;
|
print STDERR `$cmd`;
|
||||||
}
|
}
|
||||||
|
|
||||||
# phrase orientation priors (GHKM extraction)
|
# merge phrase orientation priors (GHKM extraction)
|
||||||
if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
|
if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
|
||||||
print STDERR "Merging phrase orientation priors\n";
|
print STDERR "Merging phrase orientation priors\n";
|
||||||
|
|
||||||
|
@ -27,10 +27,22 @@ my $scoreCmd = $ARGV[2];
|
|||||||
my $extractFile = $ARGV[3]; # 1st arg of extract argument
|
my $extractFile = $ARGV[3]; # 1st arg of extract argument
|
||||||
my $lexFile = $ARGV[4];
|
my $lexFile = $ARGV[4];
|
||||||
my $ptHalf = $ARGV[5]; # output
|
my $ptHalf = $ARGV[5]; # output
|
||||||
|
my $inverse = 0;
|
||||||
|
my $sourceLabelsFile;
|
||||||
|
|
||||||
my $otherExtractArgs= "";
|
my $otherExtractArgs= "";
|
||||||
for (my $i = 6; $i < $#ARGV; ++$i)
|
for (my $i = 6; $i < $#ARGV; ++$i)
|
||||||
{
|
{
|
||||||
|
if ($ARGV[$i] eq '--SourceLabels') {
|
||||||
|
$sourceLabelsFile = $ARGV[++$i];
|
||||||
|
$otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
if ($ARGV[$i] eq '--Inverse') {
|
||||||
|
$inverse = 1;
|
||||||
|
$otherExtractArgs .= $ARGV[$i] ." ";
|
||||||
|
next;
|
||||||
|
}
|
||||||
$otherExtractArgs .= $ARGV[$i] ." ";
|
$otherExtractArgs .= $ARGV[$i] ." ";
|
||||||
}
|
}
|
||||||
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs
|
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs
|
||||||
@ -258,6 +270,14 @@ if (-e $cocPath)
|
|||||||
close(FHCOC);
|
close(FHCOC);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# merge source label files
|
||||||
|
if (!$inverse && defined($sourceLabelsFile))
|
||||||
|
{
|
||||||
|
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $sourceLabelsFile";
|
||||||
|
print STDERR "Merging source label files: $cmd \n";
|
||||||
|
`$cmd`;
|
||||||
|
}
|
||||||
|
|
||||||
$cmd = "rm -rf $TMPDIR \n";
|
$cmd = "rm -rf $TMPDIR \n";
|
||||||
print STDERR $cmd;
|
print STDERR $cmd;
|
||||||
systemCheck($cmd);
|
systemCheck($cmd);
|
||||||
|
@ -127,8 +127,8 @@ my $___NOCASE = 0;
|
|||||||
# Use "--nonorm" to non normalize translation before computing scores
|
# Use "--nonorm" to non normalize translation before computing scores
|
||||||
my $___NONORM = 0;
|
my $___NONORM = 0;
|
||||||
|
|
||||||
# set 0 if input type is text, set 1 if input type is confusion network
|
# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
|
||||||
my $___INPUTTYPE = 0;
|
my $___INPUTTYPE;
|
||||||
|
|
||||||
|
|
||||||
my $mertdir = undef; # path to new mert directory
|
my $mertdir = undef; # path to new mert directory
|
||||||
@ -1228,14 +1228,18 @@ sub run_decoder {
|
|||||||
|
|
||||||
if (defined $___JOBS && $___JOBS > 0) {
|
if (defined $___JOBS && $___JOBS > 0) {
|
||||||
die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
|
die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
|
||||||
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
|
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
|
||||||
|
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
|
||||||
|
$decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
|
||||||
} else {
|
} else {
|
||||||
my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE";
|
my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
|
||||||
if ($___HG_MIRA) {
|
if ($___HG_MIRA) {
|
||||||
safesystem("rm -rf $hypergraph_dir");
|
safesystem("rm -rf $hypergraph_dir");
|
||||||
$nbest_list_cmd = "-output-search-graph-hypergraph true gz";
|
$nbest_list_cmd = "-output-search-graph-hypergraph true gz";
|
||||||
}
|
}
|
||||||
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
|
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG";
|
||||||
|
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
|
||||||
|
$decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
|
||||||
}
|
}
|
||||||
|
|
||||||
print STDERR "Executing: $decoder_cmd \n";
|
print STDERR "Executing: $decoder_cmd \n";
|
||||||
@ -1309,7 +1313,9 @@ sub get_featlist_from_moses {
|
|||||||
print STDERR "Using cached features list: $featlistfn\n";
|
print STDERR "Using cached features list: $featlistfn\n";
|
||||||
} else {
|
} else {
|
||||||
print STDERR "Asking moses for feature names and values from $___CONFIG\n";
|
print STDERR "Asking moses for feature names and values from $___CONFIG\n";
|
||||||
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
|
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
|
||||||
|
$cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
|
||||||
|
$cmd .= " -show-weights > $featlistfn";
|
||||||
print STDERR "Executing: $cmd\n";
|
print STDERR "Executing: $cmd\n";
|
||||||
safesystem($cmd) or die "Failed to run moses with the config $configfn";
|
safesystem($cmd) or die "Failed to run moses with the config $configfn";
|
||||||
}
|
}
|
||||||
|
@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
|
|||||||
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
|
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
|
||||||
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
|
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
|
||||||
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
|
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
|
||||||
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
|
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
|
||||||
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
|
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
|
||||||
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
||||||
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
|
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
|
||||||
@ -112,6 +112,8 @@ $_HELP = 1
|
|||||||
'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
|
'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
|
||||||
'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
|
'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
|
||||||
'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
|
'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
|
||||||
|
'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
|
||||||
|
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
|
||||||
'pcfg' => \$_PCFG,
|
'pcfg' => \$_PCFG,
|
||||||
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
|
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
|
||||||
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
|
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
|
||||||
@ -1427,10 +1429,15 @@ sub extract_phrase {
|
|||||||
$cmd .= " --PCFG" if $_PCFG;
|
$cmd .= " --PCFG" if $_PCFG;
|
||||||
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
|
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
|
||||||
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
|
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
|
||||||
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
|
if (defined($_GHKM))
|
||||||
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
{
|
||||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
|
||||||
if (!defined($_GHKM)) {
|
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
||||||
|
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||||
|
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
|
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
|
||||||
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
|
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
|
||||||
$cmd .= " --MaxSpan $max_length";
|
$cmd .= " --MaxSpan $max_length";
|
||||||
@ -1609,6 +1616,7 @@ sub score_phrase_phrase_extract {
|
|||||||
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
|
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
|
||||||
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
||||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||||
|
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||||
$cmd .= " $DOMAIN" if $DOMAIN;
|
$cmd .= " $DOMAIN" if $DOMAIN;
|
||||||
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
||||||
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
||||||
@ -1659,6 +1667,7 @@ sub score_phrase_phrase_extract {
|
|||||||
$cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
|
$cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
|
||||||
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
|
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
|
||||||
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
|
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
|
||||||
|
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||||
|
|
||||||
$cmd .= " | gzip -c > $ttable_file.gz";
|
$cmd .= " | gzip -c > $ttable_file.gz";
|
||||||
|
|
||||||
@ -2164,6 +2173,7 @@ sub create_ini {
|
|||||||
print INI "WordPenalty\n";
|
print INI "WordPenalty\n";
|
||||||
print INI "PhrasePenalty\n";
|
print INI "PhrasePenalty\n";
|
||||||
print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
|
print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
|
||||||
|
print INI "SoftSourceSyntacticConstraintsFeature sourceLabelSetFile=$_GHKM_SOURCE_LABELS_FILE\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||||
print INI $feature_spec;
|
print INI $feature_spec;
|
||||||
|
|
||||||
print INI "\n# dense weights for feature functions\n";
|
print INI "\n# dense weights for feature functions\n";
|
||||||
@ -2171,6 +2181,7 @@ sub create_ini {
|
|||||||
print INI "UnknownWordPenalty0= 1\n";
|
print INI "UnknownWordPenalty0= 1\n";
|
||||||
print INI "WordPenalty0= -1\n";
|
print INI "WordPenalty0= -1\n";
|
||||||
print INI "PhrasePenalty0= 0.2\n";
|
print INI "PhrasePenalty0= 0.2\n";
|
||||||
|
print INI "SoftSourceSyntacticConstraintsFeature0= 0.3 -0.3 -0.3\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||||
print INI $weight_spec;
|
print INI $weight_spec;
|
||||||
close(INI);
|
close(INI);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user