mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-07-14 14:50:41 +03:00
beautify
This commit is contained in:
parent
2f3cd5e2fe
commit
20b3e8929e
@ -221,7 +221,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
|
||||
phrase.AddWord(word);
|
||||
|
||||
if (retSourceTarget == 1) {
|
||||
out = word;
|
||||
out = word;
|
||||
}
|
||||
}
|
||||
|
||||
@ -232,7 +232,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
|
||||
phrase.AddWord(word);
|
||||
|
||||
if (retSourceTarget == 2) {
|
||||
out = word;
|
||||
out = word;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -34,7 +34,8 @@ namespace MosesTuning
|
||||
#define CHILD_STDOUT_WRITE pipefds_output[1]
|
||||
|
||||
MeteorScorer::MeteorScorer(const string& config)
|
||||
: StatisticsBasedScorer("METEOR",config) {
|
||||
: StatisticsBasedScorer("METEOR",config)
|
||||
{
|
||||
meteor_jar = getConfig("jar", "");
|
||||
meteor_lang = getConfig("lang", "en");
|
||||
meteor_task = getConfig("task", "tune");
|
||||
@ -88,7 +89,8 @@ MeteorScorer::MeteorScorer(const string& config)
|
||||
m_from_meteor = new ifdstream(CHILD_STDOUT_READ);
|
||||
}
|
||||
|
||||
MeteorScorer::~MeteorScorer() {
|
||||
MeteorScorer::~MeteorScorer()
|
||||
{
|
||||
// Cleanup IO
|
||||
delete m_to_meteor;
|
||||
delete m_from_meteor;
|
||||
@ -171,7 +173,8 @@ float MeteorScorer::calculateScore(const vector<int>& comps) const
|
||||
// Meteor unsupported, throw error if used
|
||||
|
||||
MeteorScorer::MeteorScorer(const string& config)
|
||||
: StatisticsBasedScorer("METEOR",config) {
|
||||
: StatisticsBasedScorer("METEOR",config)
|
||||
{
|
||||
throw runtime_error("Meteor unsupported, requires GLIBCXX");
|
||||
}
|
||||
|
||||
|
@ -20,7 +20,7 @@ class ifdstream;
|
||||
class ScoreStats;
|
||||
|
||||
/**
|
||||
* Meteor scoring
|
||||
* Meteor scoring
|
||||
*
|
||||
* https://github.com/mjdenkowski/meteor
|
||||
* http://statmt.org/wmt11/pdf/WMT07.pdf
|
||||
|
@ -35,7 +35,7 @@ PreProcessFilter::PreProcessFilter(const string& filterCommand)
|
||||
m_fromFilter(NULL)
|
||||
{
|
||||
#if defined __MINGW32__
|
||||
//TODO(jie): replace this function with boost implementation
|
||||
//TODO(jie): replace this function with boost implementation
|
||||
#else
|
||||
// Child error signal install
|
||||
// sigaction is the replacement for the traditional signal() method
|
||||
|
@ -132,7 +132,7 @@ IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
|
||||
m_alignmentInfoStream = new std::ofstream(staticData.GetAlignmentOutputFile().c_str());
|
||||
m_alignmentInfoCollector = new Moses::OutputCollector(m_alignmentInfoStream);
|
||||
UTIL_THROW_IF2(!m_alignmentInfoStream->good(),
|
||||
"File for alignment output could not be opened: " << staticData.GetAlignmentOutputFile());
|
||||
"File for alignment output could not be opened: " << staticData.GetAlignmentOutputFile());
|
||||
}
|
||||
|
||||
if (!staticData.GetOutputUnknownsFile().empty()) {
|
||||
@ -140,7 +140,7 @@ IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
|
||||
m_unknownsCollector = new Moses::OutputCollector(m_unknownsStream);
|
||||
UTIL_THROW_IF2(!m_unknownsStream->good(),
|
||||
"File for unknowns words could not be opened: " <<
|
||||
staticData.GetOutputUnknownsFile());
|
||||
staticData.GetOutputUnknownsFile());
|
||||
}
|
||||
}
|
||||
|
||||
@ -188,7 +188,7 @@ InputType*IOWrapper::GetInput(InputType* inputType)
|
||||
void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors)
|
||||
{
|
||||
UTIL_THROW_IF2(outputFactorOrder.size() == 0,
|
||||
"Cannot be empty phrase");
|
||||
"Cannot be empty phrase");
|
||||
if (reportAllFactors == true) {
|
||||
out << phrase;
|
||||
} else {
|
||||
@ -197,12 +197,12 @@ void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<Fa
|
||||
const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
|
||||
out << *factor;
|
||||
UTIL_THROW_IF2(factor == NULL,
|
||||
"Empty factor 0 at position " << pos);
|
||||
"Empty factor 0 at position " << pos);
|
||||
|
||||
for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
|
||||
const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
|
||||
UTIL_THROW_IF2(factor == NULL,
|
||||
"Empty factor " << i << " at position " << pos);
|
||||
"Empty factor " << i << " at position " << pos);
|
||||
|
||||
out << "|" << *factor;
|
||||
}
|
||||
@ -246,7 +246,7 @@ void IOWrapper::OutputBestHypo(const std::vector<const Factor*>& mbrBestHypo, l
|
||||
for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
|
||||
const Factor *factor = mbrBestHypo[i];
|
||||
UTIL_THROW_IF(factor == NULL, util::Exception,
|
||||
"No factor at position " << i );
|
||||
"No factor at position " << i );
|
||||
|
||||
cout << *factor << " ";
|
||||
}
|
||||
@ -403,7 +403,7 @@ void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &
|
||||
// recursive
|
||||
const search::Applied *child = applied->Children();
|
||||
for (size_t i = 0; i < applied->GetArity(); i++) {
|
||||
OutputTranslationOptions(out, applicationContext, child++, sentence, translationId);
|
||||
OutputTranslationOptions(out, applicationContext, child++, sentence, translationId);
|
||||
}
|
||||
}
|
||||
|
||||
@ -459,7 +459,7 @@ void IOWrapper::OutputTreeFragmentsTranslationOptions(std::ostream &out, Applica
|
||||
// recursive
|
||||
const search::Applied *child = applied->Children();
|
||||
for (size_t i = 0; i < applied->GetArity(); i++) {
|
||||
OutputTreeFragmentsTranslationOptions(out, applicationContext, child++, sentence, translationId);
|
||||
OutputTreeFragmentsTranslationOptions(out, applicationContext, child++, sentence, translationId);
|
||||
}
|
||||
}
|
||||
|
||||
@ -476,7 +476,7 @@ void IOWrapper::OutputDetailedTranslationReport(
|
||||
|
||||
OutputTranslationOptions(out, applicationContext, hypo, sentence, translationId);
|
||||
UTIL_THROW_IF2(m_detailOutputCollector == NULL,
|
||||
"No ouput file for detailed reports specified");
|
||||
"No ouput file for detailed reports specified");
|
||||
m_detailOutputCollector->Write(translationId, out.str());
|
||||
}
|
||||
|
||||
@ -493,7 +493,7 @@ void IOWrapper::OutputDetailedTranslationReport(
|
||||
|
||||
OutputTranslationOptions(out, applicationContext, applied, sentence, translationId);
|
||||
UTIL_THROW_IF2(m_detailOutputCollector == NULL,
|
||||
"No ouput file for detailed reports specified");
|
||||
"No ouput file for detailed reports specified");
|
||||
m_detailOutputCollector->Write(translationId, out.str());
|
||||
}
|
||||
|
||||
@ -510,18 +510,18 @@ void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
|
||||
|
||||
OutputTreeFragmentsTranslationOptions(out, applicationContext, hypo, sentence, translationId);
|
||||
UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
|
||||
"No output file for tree fragments specified");
|
||||
"No output file for tree fragments specified");
|
||||
|
||||
//Tree of full sentence
|
||||
const StatefulFeatureFunction* treeStructure = StaticData::Instance().GetTreeStructure();
|
||||
if (treeStructure != NULL) {
|
||||
const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
|
||||
for( size_t i=0; i<sff.size(); i++ ) {
|
||||
if (sff[i] == treeStructure) {
|
||||
if (sff[i] == treeStructure) {
|
||||
const TreeState* tree = dynamic_cast<const TreeState*>(hypo->GetFFState(i));
|
||||
out << "Full Tree " << translationId << ": " << tree->GetTree()->GetString() << "\n";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -542,7 +542,7 @@ void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
|
||||
|
||||
OutputTreeFragmentsTranslationOptions(out, applicationContext, applied, sentence, translationId);
|
||||
UTIL_THROW_IF2(m_detailTreeFragmentsOutputCollector == NULL,
|
||||
"No output file for tree fragments specified");
|
||||
"No output file for tree fragments specified");
|
||||
|
||||
//Tree of full sentence
|
||||
//TODO: incremental search doesn't support stateful features
|
||||
@ -581,7 +581,7 @@ void IOWrapper::OutputDetailedAllTranslationReport(
|
||||
}
|
||||
}
|
||||
UTIL_THROW_IF2(m_detailAllOutputCollector == NULL,
|
||||
"No output file for details specified");
|
||||
"No output file for details specified");
|
||||
m_detailAllOutputCollector->Write(translationId, out.str());
|
||||
}
|
||||
|
||||
@ -609,7 +609,7 @@ void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)
|
||||
|
||||
// delete 1st & last
|
||||
UTIL_THROW_IF2(outPhrase.GetSize() < 2,
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
|
||||
outPhrase.RemoveWord(0);
|
||||
outPhrase.RemoveWord(outPhrase.GetSize() - 1);
|
||||
@ -641,7 +641,7 @@ void IOWrapper::OutputBestHypo(search::Applied applied, long translationId)
|
||||
Incremental::ToPhrase(applied, outPhrase);
|
||||
// delete 1st & last
|
||||
UTIL_THROW_IF2(outPhrase.GetSize() < 2,
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
outPhrase.RemoveWord(0);
|
||||
outPhrase.RemoveWord(outPhrase.GetSize() - 1);
|
||||
out << outPhrase.GetStringRep(StaticData::Instance().GetOutputFactorOrder());
|
||||
@ -730,7 +730,7 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long tran
|
||||
|
||||
// delete 1st & last
|
||||
UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
|
||||
outputPhrase.RemoveWord(0);
|
||||
outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
|
||||
@ -805,7 +805,7 @@ void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList,
|
||||
}
|
||||
|
||||
bool includeWordAlignment =
|
||||
StaticData::Instance().PrintAlignmentInfoInNbest();
|
||||
StaticData::Instance().PrintAlignmentInfoInNbest();
|
||||
|
||||
for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
|
||||
p != nBestList.end(); ++p) {
|
||||
@ -816,7 +816,7 @@ void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList,
|
||||
|
||||
// delete <s> and </s>
|
||||
UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
outputPhrase.RemoveWord(0);
|
||||
outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
|
||||
|
||||
@ -858,7 +858,7 @@ void IOWrapper::OutputNBestList(const std::vector<search::Applied> &nbest, long
|
||||
Incremental::PhraseAndFeatures(*i, outputPhrase, features);
|
||||
// <s> and </s>
|
||||
UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
"Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
|
||||
|
||||
outputPhrase.RemoveWord(0);
|
||||
outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
|
||||
@ -980,9 +980,9 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
|
||||
}
|
||||
|
||||
size_t IOWrapper::OutputAlignmentNBest(
|
||||
Alignments &retAlign,
|
||||
const Moses::ChartKBestExtractor::Derivation &derivation,
|
||||
size_t startTarget)
|
||||
Alignments &retAlign,
|
||||
const Moses::ChartKBestExtractor::Derivation &derivation,
|
||||
size_t startTarget)
|
||||
{
|
||||
const ChartHypothesis &hypo = derivation.edge.head->hypothesis;
|
||||
|
||||
@ -1023,7 +1023,7 @@ size_t IOWrapper::OutputAlignmentNBest(
|
||||
// Recursively look thru child hypos
|
||||
size_t currStartTarget = startTarget + totalTargetSize;
|
||||
size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
|
||||
currStartTarget);
|
||||
currStartTarget);
|
||||
targetOffsets[targetPos] = targetSize;
|
||||
|
||||
totalTargetSize += targetSize;
|
||||
@ -1114,7 +1114,7 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
|
||||
size_t targetInd = 0;
|
||||
for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
|
||||
if (tp.GetWord(targetPos).IsNonTerminal()) {
|
||||
UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
|
||||
UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
|
||||
size_t sourceInd = targetPos2SourceInd[targetPos];
|
||||
size_t sourcePos = sourceInd2pos[sourceInd];
|
||||
|
||||
|
@ -234,8 +234,7 @@ static void ShowWeights()
|
||||
const StatefulFeatureFunction *ff = sff[i];
|
||||
if (ff->IsTuneable()) {
|
||||
PrintFeatureWeight(ff);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
|
||||
}
|
||||
}
|
||||
@ -243,8 +242,7 @@ static void ShowWeights()
|
||||
const StatelessFeatureFunction *ff = slf[i];
|
||||
if (ff->IsTuneable()) {
|
||||
PrintFeatureWeight(ff);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
|
||||
}
|
||||
}
|
||||
|
@ -253,17 +253,17 @@ public:
|
||||
if ( appendSuffix ) {
|
||||
fileName << "." << compression;
|
||||
}
|
||||
boost::iostreams::filtering_ostream *file
|
||||
= new boost::iostreams::filtering_ostream;
|
||||
boost::iostreams::filtering_ostream *file
|
||||
= new boost::iostreams::filtering_ostream;
|
||||
|
||||
if ( compression == "gz" ) {
|
||||
file->push( boost::iostreams::gzip_compressor() );
|
||||
} else if ( compression == "bz2" ) {
|
||||
file->push( boost::iostreams::bzip2_compressor() );
|
||||
} else if ( compression != "txt" ) {
|
||||
TRACE_ERR("Unrecognized hypergraph compression format ("
|
||||
<< compression
|
||||
<< ") - using uncompressed plain txt" << std::endl);
|
||||
TRACE_ERR("Unrecognized hypergraph compression format ("
|
||||
<< compression
|
||||
<< ") - using uncompressed plain txt" << std::endl);
|
||||
compression = "txt";
|
||||
}
|
||||
|
||||
@ -274,10 +274,10 @@ public:
|
||||
manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
|
||||
file -> flush();
|
||||
} else {
|
||||
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber
|
||||
<< " because the output file " << fileName.str()
|
||||
<< " is not open or not ready for writing"
|
||||
<< std::endl);
|
||||
TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber
|
||||
<< " because the output file " << fileName.str()
|
||||
<< " is not open or not ready for writing"
|
||||
<< std::endl);
|
||||
}
|
||||
file -> pop();
|
||||
delete file;
|
||||
@ -504,8 +504,7 @@ static void ShowWeights()
|
||||
const StatefulFeatureFunction *ff = sff[i];
|
||||
if (ff->IsTuneable()) {
|
||||
PrintFeatureWeight(ff);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
|
||||
}
|
||||
}
|
||||
@ -513,8 +512,7 @@ static void ShowWeights()
|
||||
const StatelessFeatureFunction *ff = slf[i];
|
||||
if (ff->IsTuneable()) {
|
||||
PrintFeatureWeight(ff);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
|
||||
}
|
||||
}
|
||||
@ -585,7 +583,7 @@ int main(int argc, char** argv)
|
||||
#ifdef HAVE_PROTOBUF
|
||||
GOOGLE_PROTOBUF_VERIFY_VERSION;
|
||||
#endif
|
||||
|
||||
|
||||
// echo command line, if verbose
|
||||
IFVERBOSE(1) {
|
||||
TRACE_ERR("command: ");
|
||||
@ -604,15 +602,15 @@ int main(int argc, char** argv)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::cerr <<"Before StaticData::LoadDataStatic" << std::endl;
|
||||
std::cerr <<"Before StaticData::LoadDataStatic" << std::endl;
|
||||
// initialize all "global" variables, which are stored in StaticData
|
||||
// note: this also loads models such as the language model, etc.
|
||||
if (!StaticData::LoadDataStatic(&params, argv[0])) {
|
||||
exit(1);
|
||||
}
|
||||
std::cerr <<"After StaticData::LoadDataStatic" << std::endl;
|
||||
std::cerr <<"After StaticData::LoadDataStatic" << std::endl;
|
||||
|
||||
std::cerr <<"Before ShowWeights" << std::endl;
|
||||
std::cerr <<"Before ShowWeights" << std::endl;
|
||||
// setting "-show-weights" -> just dump out weights and exit
|
||||
if (params.isParamSpecified("show-weights")) {
|
||||
ShowWeights();
|
||||
|
@ -55,9 +55,9 @@ DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
|
||||
m_domain.load(domainFile);
|
||||
}
|
||||
|
||||
void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const
|
||||
void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const
|
||||
{
|
||||
std::string value = m_domain.getDomainOfSentence(sentenceId);
|
||||
phrasePair.AddProperty(m_propertyKey, value, count);
|
||||
@ -69,13 +69,13 @@ void DomainFeature::add(const ScoreFeatureContext& context,
|
||||
{
|
||||
const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
|
||||
assert( domainCount != NULL );
|
||||
add(*domainCount,
|
||||
context.phrasePair.GetCount(),
|
||||
context.maybeLog,
|
||||
add(*domainCount,
|
||||
context.phrasePair.GetCount(),
|
||||
context.maybeLog,
|
||||
denseValues, sparseValues);
|
||||
}
|
||||
|
||||
void SubsetDomainFeature::add(const map<string,float>& domainCount,
|
||||
void SubsetDomainFeature::add(const map<string,float>& domainCount,
|
||||
float count,
|
||||
const MaybeLog& maybeLog,
|
||||
std::vector<float>& denseValues,
|
||||
|
@ -35,8 +35,8 @@ public:
|
||||
|
||||
DomainFeature(const std::string& domainFile);
|
||||
|
||||
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const;
|
||||
|
||||
void add(const ScoreFeatureContext& context,
|
||||
|
@ -29,7 +29,8 @@
|
||||
using namespace std;
|
||||
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
||||
extern Vocabulary vcbT;
|
||||
@ -38,23 +39,23 @@ extern Vocabulary vcbS;
|
||||
extern bool hierarchicalFlag;
|
||||
|
||||
|
||||
ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
|
||||
const PHRASE *phraseTarget,
|
||||
ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum ) :
|
||||
m_phraseSource(phraseSource),
|
||||
m_phraseTarget(phraseTarget),
|
||||
m_count(count),
|
||||
m_pcfgSum(pcfgSum)
|
||||
ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
|
||||
const PHRASE *phraseTarget,
|
||||
ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum ) :
|
||||
m_phraseSource(phraseSource),
|
||||
m_phraseTarget(phraseTarget),
|
||||
m_count(count),
|
||||
m_pcfgSum(pcfgSum)
|
||||
{
|
||||
assert(phraseSource->empty());
|
||||
assert(phraseTarget->empty());
|
||||
|
||||
m_count = count;
|
||||
m_pcfgSum = pcfgSum;
|
||||
|
||||
|
||||
std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
|
||||
m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
|
||||
m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
|
||||
|
||||
m_lastTargetToSourceAlignment = insertedAlignment.first;
|
||||
m_lastCount = m_count;
|
||||
@ -64,29 +65,30 @@ ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
|
||||
}
|
||||
|
||||
|
||||
ExtractionPhrasePair::~ExtractionPhrasePair( ) {
|
||||
ExtractionPhrasePair::~ExtractionPhrasePair( )
|
||||
{
|
||||
Clear();
|
||||
}
|
||||
|
||||
|
||||
// return value: true if the given alignment was seen for the first time and thus will be stored,
|
||||
// false if it was present already (the pointer may thus be deleted)
|
||||
bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum )
|
||||
bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum )
|
||||
{
|
||||
m_count += count;
|
||||
m_pcfgSum += pcfgSum;
|
||||
|
||||
m_lastCount = count;
|
||||
m_lastPcfgSum = pcfgSum;
|
||||
|
||||
|
||||
std::map<ALIGNMENT*,float>::iterator iter = m_lastTargetToSourceAlignment;
|
||||
if ( *(iter->first) == *targetToSourceAlignment ) {
|
||||
iter->second += count;
|
||||
return false;
|
||||
} else {
|
||||
std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
|
||||
m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
|
||||
m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
|
||||
if ( !insertedAlignment.second ) {
|
||||
// the alignment already exists: increment count
|
||||
insertedAlignment.first->second += count;
|
||||
@ -105,7 +107,7 @@ void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
|
||||
m_pcfgSum += pcfgSum;
|
||||
m_lastTargetToSourceAlignment->second += count;
|
||||
// properties
|
||||
for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
|
||||
for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
|
||||
iter !=m_properties.end(); ++iter ) {
|
||||
LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
|
||||
(*lastPropertyValue)->second += count;
|
||||
@ -116,7 +118,7 @@ void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
|
||||
}
|
||||
|
||||
|
||||
// Check for lexical match
|
||||
// Check for lexical match
|
||||
// and in case of SCFG rules for equal non-terminal alignment.
|
||||
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
|
||||
const PHRASE *otherPhraseTarget,
|
||||
@ -132,9 +134,9 @@ bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
|
||||
return MatchesAlignment( otherTargetToSourceAlignment );
|
||||
}
|
||||
|
||||
// Check for lexical match
|
||||
// Check for lexical match
|
||||
// and in case of SCFG rules for equal non-terminal alignment.
|
||||
// Set boolean indicators.
|
||||
// Set boolean indicators.
|
||||
// (Note that we check in the order: target - source - alignment
|
||||
// and do not touch the subsequent boolean indicators once a previous one has been set to false.)
|
||||
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
|
||||
@ -194,7 +196,7 @@ bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlign
|
||||
return true;
|
||||
}
|
||||
|
||||
void ExtractionPhrasePair::Clear()
|
||||
void ExtractionPhrasePair::Clear()
|
||||
{
|
||||
delete m_phraseSource;
|
||||
delete m_phraseTarget;
|
||||
@ -218,7 +220,7 @@ void ExtractionPhrasePair::Clear()
|
||||
m_lastCount = 0.0f;
|
||||
m_lastPcfgSum = 0.0f;
|
||||
m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();
|
||||
|
||||
|
||||
m_isValid = false;
|
||||
}
|
||||
|
||||
@ -252,7 +254,7 @@ const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
|
||||
|
||||
std::map<ALIGNMENT*,float>::const_iterator bestAlignment = m_targetToSourceAlignments.end();
|
||||
|
||||
for (std::map<ALIGNMENT*,float>::const_iterator iter=m_targetToSourceAlignments.begin();
|
||||
for (std::map<ALIGNMENT*,float>::const_iterator iter=m_targetToSourceAlignments.begin();
|
||||
iter!=m_targetToSourceAlignments.end(); ++iter) {
|
||||
if ( (iter->second > bestAlignmentCount) ||
|
||||
( (iter->second == bestAlignmentCount) &&
|
||||
@ -281,7 +283,7 @@ const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string
|
||||
|
||||
PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end();
|
||||
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
iter!=allPropertyValues->end(); ++iter) {
|
||||
if ( (iter->second > bestPropertyCount) ||
|
||||
( (iter->second == bestPropertyCount) &&
|
||||
@ -308,7 +310,7 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
|
||||
}
|
||||
|
||||
std::ostringstream oss;
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
iter!=allPropertyValues->end(); ++iter) {
|
||||
if (iter!=allPropertyValues->begin()) {
|
||||
oss << " ";
|
||||
|
@ -24,20 +24,22 @@
|
||||
#include <set>
|
||||
#include <map>
|
||||
|
||||
namespace MosesTraining {
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
|
||||
typedef std::vector< std::set<size_t> > ALIGNMENT;
|
||||
|
||||
|
||||
class ExtractionPhrasePair {
|
||||
class ExtractionPhrasePair
|
||||
{
|
||||
|
||||
protected:
|
||||
|
||||
typedef std::map<std::string,float> PROPERTY_VALUES;
|
||||
typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;
|
||||
|
||||
|
||||
|
||||
bool m_isValid;
|
||||
|
||||
const PHRASE *m_phraseSource;
|
||||
@ -47,8 +49,8 @@ protected:
|
||||
float m_pcfgSum;
|
||||
|
||||
std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
|
||||
std::map<std::string,
|
||||
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
|
||||
std::map<std::string,
|
||||
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
|
||||
|
||||
float m_lastCount;
|
||||
float m_lastPcfgSum;
|
||||
@ -56,14 +58,14 @@ protected:
|
||||
|
||||
public:
|
||||
|
||||
ExtractionPhrasePair( const PHRASE *phraseSource,
|
||||
const PHRASE *phraseTarget,
|
||||
ALIGNMENT *targetToSourceAlignment,
|
||||
ExtractionPhrasePair( const PHRASE *phraseSource,
|
||||
const PHRASE *phraseTarget,
|
||||
ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum );
|
||||
|
||||
~ExtractionPhrasePair();
|
||||
|
||||
bool Add( ALIGNMENT *targetToSourceAlignment,
|
||||
bool Add( ALIGNMENT *targetToSourceAlignment,
|
||||
float count, float pcfgSum );
|
||||
|
||||
void IncrementPrevious( float count, float pcfgSum );
|
||||
@ -91,7 +93,7 @@ public:
|
||||
const PHRASE *GetSource() const {
|
||||
return m_phraseSource;
|
||||
}
|
||||
|
||||
|
||||
const PHRASE *GetTarget() const {
|
||||
return m_phraseTarget;
|
||||
}
|
||||
@ -126,10 +128,9 @@ public:
|
||||
|
||||
void AddProperties( const std::string &str, float count );
|
||||
|
||||
void AddProperty( const std::string &key, const std::string &value, float count )
|
||||
{
|
||||
void AddProperty( const std::string &key, const std::string &value, float count ) {
|
||||
std::map<std::string,
|
||||
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
|
||||
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
|
||||
if ( iter == m_properties.end() ) {
|
||||
// key not found: insert property key and value
|
||||
PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
|
||||
|
@ -8,7 +8,8 @@ namespace MosesTraining
|
||||
|
||||
void InternalStructFeature::add(const ScoreFeatureContext& context,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const {
|
||||
std::map<std::string,float>& sparseValues) const
|
||||
{
|
||||
const std::map<std::string,float> *allTrees = context.phrasePair.GetProperty("Tree"); // or would we rather want to take the most frequent one only?
|
||||
for ( std::map<std::string,float>::const_iterator iter=allTrees->begin();
|
||||
iter!=allTrees->end(); ++iter ) {
|
||||
@ -19,24 +20,26 @@ void InternalStructFeature::add(const ScoreFeatureContext& context,
|
||||
void InternalStructFeatureDense::add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const {
|
||||
//cout<<"Dense: "<<*internalStruct<<endl;
|
||||
size_t start=0;
|
||||
int countNP=0;
|
||||
while((start = treeFragment->find("NP", start)) != string::npos) {
|
||||
countNP += count;
|
||||
start+=2; //length of "NP"
|
||||
}
|
||||
//should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
|
||||
//should use this but don't know what it does? -> maybeLog( (bitmap == i) ? 2.718 : 1 )
|
||||
denseValues.push_back(exp(countNP));
|
||||
std::map<std::string,float>& sparseValues) const
|
||||
{
|
||||
//cout<<"Dense: "<<*internalStruct<<endl;
|
||||
size_t start=0;
|
||||
int countNP=0;
|
||||
while((start = treeFragment->find("NP", start)) != string::npos) {
|
||||
countNP += count;
|
||||
start+=2; //length of "NP"
|
||||
}
|
||||
//should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
|
||||
//should use this but don't know what it does? -> maybeLog( (bitmap == i) ? 2.718 : 1 )
|
||||
denseValues.push_back(exp(countNP));
|
||||
|
||||
}
|
||||
|
||||
void InternalStructFeatureSparse::add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const {
|
||||
std::map<std::string,float>& sparseValues) const
|
||||
{
|
||||
//cout<<"Sparse: "<<*internalStruct<<endl;
|
||||
if(treeFragment->find("VBZ")!=std::string::npos)
|
||||
sparseValues["NTVBZ"] += count;
|
||||
|
@ -21,20 +21,20 @@ namespace MosesTraining
|
||||
class InternalStructFeature : public ScoreFeature
|
||||
{
|
||||
public:
|
||||
InternalStructFeature() : m_type(0) {};
|
||||
/** Add the values for this feature function. */
|
||||
void add(const ScoreFeatureContext& context,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
InternalStructFeature() : m_type(0) {};
|
||||
/** Add the values for this feature function. */
|
||||
void add(const ScoreFeatureContext& context,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
|
||||
|
||||
protected:
|
||||
/** Overridden in subclass */
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const = 0;
|
||||
int m_type;
|
||||
/** Overridden in subclass */
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const = 0;
|
||||
int m_type;
|
||||
};
|
||||
|
||||
class InternalStructFeatureDense : public InternalStructFeature
|
||||
@ -45,10 +45,10 @@ public:
|
||||
m_type=1;
|
||||
} //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
|
||||
protected:
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
};
|
||||
|
||||
class InternalStructFeatureSparse : public InternalStructFeature
|
||||
@ -59,10 +59,10 @@ public:
|
||||
m_type=2;
|
||||
}// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
|
||||
protected:
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
virtual void add(const std::string *treeFragment,
|
||||
float count,
|
||||
std::vector<float>& denseValues,
|
||||
std::map<std::string,float>& sparseValues) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -77,12 +77,12 @@ void ScoreFeatureManager::configure(const std::vector<std::string> args)
|
||||
}
|
||||
sparseDomainAdded = true;
|
||||
m_includeSentenceId = true;
|
||||
} else if(args[i] == "--TreeFeatureSparse"){
|
||||
//MARIA
|
||||
m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
|
||||
} else if(args[i] == "--TreeFeatureDense"){
|
||||
//MARIA
|
||||
m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
|
||||
} else if(args[i] == "--TreeFeatureSparse") {
|
||||
//MARIA
|
||||
m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
|
||||
} else if(args[i] == "--TreeFeatureDense") {
|
||||
//MARIA
|
||||
m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
|
||||
} else {
|
||||
UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]);
|
||||
}
|
||||
@ -91,9 +91,9 @@ void ScoreFeatureManager::configure(const std::vector<std::string> args)
|
||||
|
||||
}
|
||||
|
||||
void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const
|
||||
void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const
|
||||
{
|
||||
for (size_t i = 0; i < m_features.size(); ++i) {
|
||||
m_features[i]->addPropertiesToPhrasePair(phrasePair, count, sentenceId);
|
||||
|
@ -84,10 +84,10 @@ class ScoreFeature
|
||||
public:
|
||||
|
||||
/** Some features might need to store properties in ExtractionPhrasePair,
|
||||
* e.g. to pass along external information loaded by a feature
|
||||
* e.g. to pass along external information loaded by a feature
|
||||
* which may distinguish several phrase occurrences based on sentence ID */
|
||||
virtual void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
virtual void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const {};
|
||||
|
||||
/** Add the values for this feature function. */
|
||||
@ -113,10 +113,10 @@ public:
|
||||
void configure(const std::vector<std::string> args);
|
||||
|
||||
/** Some features might need to store properties in ExtractionPhrasePair,
|
||||
* e.g. to pass along external information loaded by a feature
|
||||
* e.g. to pass along external information loaded by a feature
|
||||
* which may distinguish several phrase occurrences based on sentence ID */
|
||||
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
||||
float count,
|
||||
int sentenceId) const;
|
||||
|
||||
/** Add all the features */
|
||||
|
@ -92,9 +92,9 @@ class ExtractTask
|
||||
public:
|
||||
ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFileOrientation)
|
||||
:m_sentence(sentence),
|
||||
m_options(initoptions),
|
||||
m_extractFileOrientation(extractFileOrientation)
|
||||
{}
|
||||
m_options(initoptions),
|
||||
m_extractFileOrientation(extractFileOrientation)
|
||||
{}
|
||||
void Run();
|
||||
private:
|
||||
void extract(SentenceAlignment &);
|
||||
@ -151,11 +151,11 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
options.initInstanceWeightsFile(argv[++i]);
|
||||
} else if (strcmp(argv[i], "--Debug") == 0) {
|
||||
options.debug = true;
|
||||
options.debug = true;
|
||||
} else if (strcmp(argv[i], "--MinPhraseLength") == 0) {
|
||||
options.minPhraseLength = atoi(argv[++i]);
|
||||
options.minPhraseLength = atoi(argv[++i]);
|
||||
} else if (strcmp(argv[i], "--Separator") == 0) {
|
||||
options.separator = argv[++i];
|
||||
options.separator = argv[++i];
|
||||
} else if(strcmp(argv[i],"--model") == 0) {
|
||||
if (i+1 >= argc) {
|
||||
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
|
||||
@ -605,16 +605,14 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
|
||||
|
||||
int getClass(const std::string &str)
|
||||
{
|
||||
size_t pos = str.find("swap");
|
||||
if (pos == str.npos) {
|
||||
return 0;
|
||||
}
|
||||
else if (pos == 0) {
|
||||
return 1;
|
||||
}
|
||||
else {
|
||||
return 2;
|
||||
}
|
||||
size_t pos = str.find("swap");
|
||||
if (pos == str.npos) {
|
||||
return 0;
|
||||
} else if (pos == 0) {
|
||||
return 1;
|
||||
} else {
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
|
||||
@ -635,19 +633,19 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
// start
|
||||
m_extractFileOrientation << "<s> ";
|
||||
for(int fi=0; fi<startF; fi++) {
|
||||
m_extractFileOrientation << sentence.source[fi] << " ";
|
||||
m_extractFileOrientation << sentence.source[fi] << " ";
|
||||
}
|
||||
m_extractFileOrientation << sep << " ";
|
||||
|
||||
// middle
|
||||
for(int fi=startF; fi<=endF; fi++) {
|
||||
m_extractFileOrientation << sentence.source[fi] << " ";
|
||||
m_extractFileOrientation << sentence.source[fi] << " ";
|
||||
}
|
||||
m_extractFileOrientation << sep << " ";
|
||||
|
||||
// end
|
||||
for(int fi=endF+1; fi<sentence.source.size(); fi++) {
|
||||
m_extractFileOrientation << sentence.source[fi] << " ";
|
||||
m_extractFileOrientation << sentence.source[fi] << " ";
|
||||
}
|
||||
m_extractFileOrientation << "</s> ";
|
||||
|
||||
@ -655,7 +653,7 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
// target
|
||||
/*
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
m_extractFileOrientation << sentence.target[ei] << " ";
|
||||
m_extractFileOrientation << sentence.target[ei] << " ";
|
||||
}
|
||||
*/
|
||||
m_extractFileOrientation << endl;
|
||||
|
@ -68,7 +68,7 @@ std::map<std::string,float> sourceLHSCounts;
|
||||
std::map<std::string, std::map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
|
||||
|
||||
std::set<std::string> sourceLabelSet;
|
||||
std::map<std::string,size_t> sourceLabels;
|
||||
std::map<std::string,size_t> sourceLabels;
|
||||
std::vector<std::string> sourceLabelsByIndex;
|
||||
|
||||
Vocabulary vcbT;
|
||||
@ -79,12 +79,12 @@ Vocabulary vcbS;
|
||||
std::vector<std::string> tokenize( const char [] );
|
||||
|
||||
void processLine( std::string line,
|
||||
int lineID, bool includeSentenceIdFlag, int &sentenceId,
|
||||
int lineID, bool includeSentenceIdFlag, int &sentenceId,
|
||||
PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
|
||||
std::string &additionalPropertiesString,
|
||||
float &count, float &pcfgSum );
|
||||
void writeCountOfCounts( const std::string &fileNameCountOfCounts );
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
|
||||
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb );
|
||||
void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog );
|
||||
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
|
||||
@ -100,7 +100,7 @@ void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, co
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::cerr << "Score v2.1 -- "
|
||||
std::cerr << "Score v2.1 -- "
|
||||
<< "scoring methods for extracted rules" << std::endl;
|
||||
|
||||
ScoreFeatureManager featureManager;
|
||||
@ -155,7 +155,7 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
|
||||
unalignedFWFlag = true;
|
||||
if (i+1==argc) {
|
||||
std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
|
||||
std::cerr << "ERROR: specify function words file for unaligned function word penalty!" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
fileNameFunctionWords = argv[++i];
|
||||
@ -224,8 +224,8 @@ int main(int argc, char* argv[])
|
||||
Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
|
||||
bool success = outputFile->Open(fileNamePhraseTable);
|
||||
if (!success) {
|
||||
std::cerr << "ERROR: could not open file phrase table file "
|
||||
<< fileNamePhraseTable << std::endl;
|
||||
std::cerr << "ERROR: could not open file phrase table file "
|
||||
<< fileNamePhraseTable << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
phraseTableFile = outputFile;
|
||||
@ -251,12 +251,12 @@ int main(int argc, char* argv[])
|
||||
tmpPhraseSource = new PHRASE();
|
||||
tmpPhraseTarget = new PHRASE();
|
||||
tmpTargetToSourceAlignment = new ALIGNMENT();
|
||||
processLine( std::string(line),
|
||||
processLine( std::string(line),
|
||||
i, featureManager.includeSentenceId(), tmpSentenceId,
|
||||
tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
|
||||
tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
|
||||
tmpAdditionalPropertiesString,
|
||||
tmpCount, tmpPcfgSum);
|
||||
phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
|
||||
phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
|
||||
tmpTargetToSourceAlignment,
|
||||
tmpCount, tmpPcfgSum );
|
||||
phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
|
||||
@ -288,14 +288,16 @@ int main(int argc, char* argv[])
|
||||
tmpPhraseTarget = new PHRASE();
|
||||
tmpTargetToSourceAlignment = new ALIGNMENT();
|
||||
tmpAdditionalPropertiesString.clear();
|
||||
processLine( std::string(line),
|
||||
processLine( std::string(line),
|
||||
i, featureManager.includeSentenceId(), tmpSentenceId,
|
||||
tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
|
||||
tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment,
|
||||
tmpAdditionalPropertiesString,
|
||||
tmpCount, tmpPcfgSum);
|
||||
tmpCount, tmpPcfgSum);
|
||||
|
||||
bool matchesPrevious = false;
|
||||
bool sourceMatch = true; bool targetMatch = true; bool alignmentMatch = true; // be careful with these,
|
||||
bool sourceMatch = true;
|
||||
bool targetMatch = true;
|
||||
bool alignmentMatch = true; // be careful with these,
|
||||
// ExtractionPhrasePair::Matches() checks them in order and does not continue with the others
|
||||
// once the first of them has been found to have to be set to false
|
||||
|
||||
@ -330,7 +332,7 @@ int main(int argc, char* argv[])
|
||||
if ( !phrasePairsWithSameSource.empty() &&
|
||||
!sourceMatch ) {
|
||||
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
|
||||
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
||||
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
||||
iter!=phrasePairsWithSameSource.end(); ++iter) {
|
||||
delete *iter;
|
||||
}
|
||||
@ -347,8 +349,8 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
|
||||
tmpTargetToSourceAlignment,
|
||||
phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget,
|
||||
tmpTargetToSourceAlignment,
|
||||
tmpCount, tmpPcfgSum );
|
||||
phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount );
|
||||
featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId );
|
||||
@ -364,7 +366,7 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
|
||||
processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
|
||||
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
||||
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
||||
iter!=phrasePairsWithSameSource.end(); ++iter) {
|
||||
delete *iter;
|
||||
}
|
||||
@ -384,7 +386,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
|
||||
void processLine( std::string line,
|
||||
int lineID, bool includeSentenceIdFlag, int &sentenceId,
|
||||
int lineID, bool includeSentenceIdFlag, int &sentenceId,
|
||||
PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment,
|
||||
std::string &additionalPropertiesString,
|
||||
float &count, float &pcfgSum )
|
||||
@ -474,7 +476,7 @@ void writeCountOfCounts( const string &fileNameCountOfCounts )
|
||||
}
|
||||
|
||||
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
|
||||
void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile,
|
||||
const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
|
||||
{
|
||||
if (phrasePairsWithSameSource.size() == 0) {
|
||||
@ -486,23 +488,23 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa
|
||||
//std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl;
|
||||
|
||||
// loop through phrase pairs
|
||||
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
||||
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
||||
iter!=phrasePairsWithSameSource.end(); ++iter) {
|
||||
// add to total count
|
||||
totalSource += (*iter)->GetCount();
|
||||
}
|
||||
|
||||
// output the distinct phrase pairs, one at a time
|
||||
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
||||
for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin();
|
||||
iter!=phrasePairsWithSameSource.end(); ++iter) {
|
||||
// add to total count
|
||||
outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb );
|
||||
}
|
||||
}
|
||||
|
||||
void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
float totalCount, int distinctCount,
|
||||
ostream &phraseTableFile,
|
||||
void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
float totalCount, int distinctCount,
|
||||
ostream &phraseTableFile,
|
||||
const ScoreFeatureManager& featureManager,
|
||||
const MaybeLog& maybeLogProb )
|
||||
{
|
||||
@ -557,45 +559,45 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
|
||||
// alignment
|
||||
if ( hierarchicalFlag ) {
|
||||
// always output alignment if hiero style
|
||||
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
|
||||
std::vector<std::string> alignment;
|
||||
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
|
||||
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
|
||||
if ( bestAlignmentT2S->at(j).size() != 1 ) {
|
||||
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
|
||||
phraseTableFile.flush();
|
||||
assert(bestAlignmentT2S->at(j).size() == 1);
|
||||
}
|
||||
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
|
||||
//phraseTableFile << sourcePos << "-" << j << " ";
|
||||
std::stringstream point;
|
||||
point << sourcePos << "-" << j;
|
||||
alignment.push_back(point.str());
|
||||
} else {
|
||||
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
||||
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
||||
size_t sourcePos = *setIter;
|
||||
std::stringstream point;
|
||||
point << sourcePos << "-" << j;
|
||||
alignment.push_back(point.str());
|
||||
}
|
||||
// always output alignment if hiero style
|
||||
assert(phraseTarget->size() == bestAlignmentT2S->size()+1);
|
||||
std::vector<std::string> alignment;
|
||||
for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) {
|
||||
if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) {
|
||||
if ( bestAlignmentT2S->at(j).size() != 1 ) {
|
||||
std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl;
|
||||
phraseTableFile.flush();
|
||||
assert(bestAlignmentT2S->at(j).size() == 1);
|
||||
}
|
||||
}
|
||||
// now print all alignments, sorted by source index
|
||||
sort(alignment.begin(), alignment.end());
|
||||
for (size_t i = 0; i < alignment.size(); ++i) {
|
||||
phraseTableFile << alignment[i] << " ";
|
||||
}
|
||||
} else if ( !inverseFlag && wordAlignmentFlag) {
|
||||
// alignment info in pb model
|
||||
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
|
||||
size_t sourcePos = *(bestAlignmentT2S->at(j).begin());
|
||||
//phraseTableFile << sourcePos << "-" << j << " ";
|
||||
std::stringstream point;
|
||||
point << sourcePos << "-" << j;
|
||||
alignment.push_back(point.str());
|
||||
} else {
|
||||
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
||||
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
||||
size_t sourcePos = *setIter;
|
||||
phraseTableFile << sourcePos << "-" << j << " ";
|
||||
std::stringstream point;
|
||||
point << sourcePos << "-" << j;
|
||||
alignment.push_back(point.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
// now print all alignments, sorted by source index
|
||||
sort(alignment.begin(), alignment.end());
|
||||
for (size_t i = 0; i < alignment.size(); ++i) {
|
||||
phraseTableFile << alignment[i] << " ";
|
||||
}
|
||||
} else if ( !inverseFlag && wordAlignmentFlag) {
|
||||
// alignment info in pb model
|
||||
for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) {
|
||||
for ( std::set<size_t>::iterator setIter = (bestAlignmentT2S->at(j)).begin();
|
||||
setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) {
|
||||
size_t sourcePos = *setIter;
|
||||
phraseTableFile << sourcePos << "-" << j << " ";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
phraseTableFile << " ||| ";
|
||||
@ -646,7 +648,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
if (kneserNeyFlag)
|
||||
phraseTableFile << " " << distinctCount;
|
||||
|
||||
if ((treeFragmentsFlag) &&
|
||||
if ((treeFragmentsFlag) &&
|
||||
!inverseFlag) {
|
||||
phraseTableFile << " |||";
|
||||
}
|
||||
@ -671,7 +673,7 @@ bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *al
|
||||
// skip
|
||||
} else {
|
||||
const std::set<size_t> &sourceSet = alignmentTargetToSource->at(currTarget);
|
||||
for (std::set<size_t>::const_iterator iter = sourceSet.begin();
|
||||
for (std::set<size_t>::const_iterator iter = sourceSet.begin();
|
||||
iter != sourceSet.end(); ++iter) {
|
||||
size_t currSource = *iter;
|
||||
|
||||
@ -808,9 +810,9 @@ void LexicalTable::load( const string &fileName )
|
||||
|
||||
std::vector<string> token = tokenize( line );
|
||||
if (token.size() != 3) {
|
||||
std::cerr << "line " << i << " in " << fileName
|
||||
<< " has wrong number of tokens, skipping:" << std::endl
|
||||
<< token.size() << " " << token[0] << " " << line << std::endl;
|
||||
std::cerr << "line " << i << " in " << fileName
|
||||
<< " has wrong number of tokens, skipping:" << std::endl
|
||||
<< token.size() << " " << token[0] << " " << line << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -889,15 +891,16 @@ void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget,
|
||||
|
||||
|
||||
void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget,
|
||||
const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment) {
|
||||
// typedef std::vector< std::set<size_t> > ALIGNMENT;
|
||||
const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment)
|
||||
{
|
||||
// typedef std::vector< std::set<size_t> > ALIGNMENT;
|
||||
|
||||
outSourceToTargetAlignment->clear();
|
||||
size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size());
|
||||
outSourceToTargetAlignment->resize(numberOfSourceSymbols);
|
||||
// add alignment point
|
||||
for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) {
|
||||
for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin();
|
||||
for ( std::set<size_t>::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin();
|
||||
setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) {
|
||||
size_t sourcePosition = *setIter;
|
||||
outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition);
|
||||
|
Loading…
Reference in New Issue
Block a user