Support for printing out word-to-word alignments (in addition to phrase-to-phrase alignments), as contained in the phrase table.
If PT contains word-to-word alignments between source and target phrases,
Moses can optionally output them in the nbest and in the log file (if verbose).
W2w alignments from source to target and from target to source can differ,
if they differ in the PT.

Detailed documentation will be added in the Moses webpages very soon.


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1886 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
nicolabertoldi 2008-09-12 18:09:06 +00:00
parent e376f9f994
commit dd6c36640b
26 changed files with 1699 additions and 254 deletions

View File

@ -134,8 +134,6 @@ void IOWrapper::Initialization(const std::vector<FactorType> &inputFactorOrder
if (staticData.GetOutputWordGraph())
{
string fileName = staticData.GetParam("output-word-graph")[0];
bool outputNBest = Scan<bool>(staticData.GetParam("output-word-graph")[1]);
std::ofstream *file = new std::ofstream;
m_outputWordGraphStream = file;
file->open(fileName.c_str());
@ -144,8 +142,8 @@ void IOWrapper::Initialization(const std::vector<FactorType> &inputFactorOrder
// search graph output
if (staticData.GetOutputSearchGraph())
{
std::ofstream *file = new std::ofstream;
string fileName = staticData.GetParam("output-search-graph")[0];
std::ofstream *file = new std::ofstream;
m_outputSearchGraphStream = file;
file->open(fileName.c_str());
}
@ -211,6 +209,48 @@ void OutputSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<
}
}
void OutputWordAlignment(std::ostream &out, const TargetPhrase &phrase, size_t srcoffset, size_t trgoffset, FactorDirection direction)
{
size_t size = phrase.GetSize();
if (size){
out << " ";
/* out << phrase;
out << " ===> offset: (" << srcoffset << "," << trgoffset << ")";
out << " ===> size: (" << phrase.GetAlignmentPair().GetAlignmentPhrase(Input).GetSize() << ","
<< phrase.GetAlignmentPair().GetAlignmentPhrase(Output).GetSize() << ") ===> ";
*/
AlignmentPhrase alignphrase=phrase.GetAlignmentPair().GetAlignmentPhrase(direction);
/* alignphrase.print(out,0);
out << " ===> ";
// out << alignphrase << " ===> ";
*/
if (direction == Input){
alignphrase.Shift(trgoffset);
alignphrase.print(out,srcoffset);
}
else{
alignphrase.Shift(srcoffset);
alignphrase.print(out,trgoffset);
}
/*
// out << alignphrase << " ===> ";
out << "\n";
*/
}
}
/** Write the word-to-word alignment of a whole hypothesis chain.
 *  Recurses to the previous hypothesis first, so the phrases are written
 *  from the oldest hypothesis to the given one. A NULL hypothesis ends the
 *  recursion and writes nothing.
 *  \param out        stream to write to
 *  \param hypo       last hypothesis of the chain (may be NULL)
 *  \param direction  which of the two alignment phrases to print
 */
void OutputWordAlignment(std::ostream &out, const Hypothesis *hypo, FactorDirection direction)
{
  if (hypo == NULL)
    return;

  const size_t sourceStart = hypo->GetCurrSourceWordsRange().GetStartPos();
  const size_t targetStart = hypo->GetCurrTargetWordsRange().GetStartPos();

  OutputWordAlignment(out, hypo->GetPrevHypo(), direction);
  OutputWordAlignment(out, hypo->GetCurrTargetPhrase(), sourceStart, targetStart, direction);
}
void IOWrapper::Backtrack(const Hypothesis *hypo){
if (hypo->GetPrevHypo() != NULL) {
@ -281,6 +321,7 @@ void IOWrapper::OutputNBestList(const TrellisPathList &nBestList, long translati
{
bool labeledOutput = StaticData::Instance().IsLabeledNBestList();
bool includeAlignment = StaticData::Instance().NBestIncludesAlignment();
bool includeWordAlignment = StaticData::Instance().PrintAlignmentInfoInNbest();
TrellisPathList::const_iterator iter;
for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter)
@ -404,28 +445,52 @@ void IOWrapper::OutputNBestList(const TrellisPathList &nBestList, long translati
}
}
// total
*m_nBestStream << "||| " << path.GetTotalScore();
//phrase-to-phrase alignment
if (includeAlignment) {
*m_nBestStream << " |||";
for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--)
{
const Hypothesis &edge = *edges[currEdge];
const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
WordsRange targetRange = path.GetTargetWordsRange(edge);
*m_nBestStream << " " << sourceRange.GetStartPos();
if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
*m_nBestStream << "-" << sourceRange.GetEndPos();
*m_nBestStream << " |||";
for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--)
{
const Hypothesis &edge = *edges[currEdge];
const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
WordsRange targetRange = path.GetTargetWordsRange(edge);
*m_nBestStream << " " << sourceRange.GetStartPos();
if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
*m_nBestStream << "-" << sourceRange.GetEndPos();
}
*m_nBestStream << "=" << targetRange.GetStartPos();
if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
*m_nBestStream << "-" << targetRange.GetEndPos();
}
}
*m_nBestStream << "=" << targetRange.GetStartPos();
if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
*m_nBestStream << "-" << targetRange.GetEndPos();
}
if (includeWordAlignment){
//word-to-word alignment (source-to-target)
*m_nBestStream << " |||";
for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--)
{
const Hypothesis &edge = *edges[currEdge];
WordsRange targetRange = path.GetTargetWordsRange(edge);
OutputWordAlignment(*m_nBestStream, edge.GetCurrTargetPhrase(),edge.GetCurrSourceWordsRange().GetStartPos(),targetRange.GetStartPos(), Input);
}
//word-to-word alignment (target-to-source)
*m_nBestStream << " |||";
for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--)
{
const Hypothesis &edge = *edges[currEdge];
WordsRange targetRange = path.GetTargetWordsRange(edge);
OutputWordAlignment(*m_nBestStream, edge.GetCurrTargetPhrase(),edge.GetCurrSourceWordsRange().GetStartPos(),targetRange.GetStartPos(), Output);
}
}
}
*m_nBestStream << endl;
*m_nBestStream << endl;
}
*m_nBestStream<<std::flush;
}

View File

@ -0,0 +1,97 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <algorithm>
#include "AlignmentElement.h"
using namespace std;
//! build an element holding every position found in the given set
AlignmentElement::AlignmentElement(const ContainerType &alignInfo)
{
  m_collection.insert(alignInfo.begin(), alignInfo.end());
}
//! build an element from a vector of positions; duplicates collapse because the storage is a set
AlignmentElement::AlignmentElement(const std::vector<AlignmentElementType> &alignInfo)
{
  m_collection.insert(alignInfo.begin(), alignInfo.end());
}
//! copy constructor: the new element holds the same positions as alignInfo
AlignmentElement::AlignmentElement(const AlignmentElement &alignInfo)
  : m_collection(alignInfo.m_collection)
{
}
/** Assignment: make this element hold exactly the positions of alignInfo.
 *  The previous implementation inserted alignInfo's positions into the
 *  existing set without clearing it first, so the result was the union of
 *  both elements rather than a copy.
 *  \param alignInfo element to copy from
 *  \return reference to this element
 */
AlignmentElement& AlignmentElement::operator=(const AlignmentElement& alignInfo)
{
  if (this != &alignInfo)
    m_collection = alignInfo.m_collection; // replaces, rather than extends, the old content
  return *this;
}
void AlignmentElement::Shift(int shift)
{
ContainerType newColl;
ContainerType::const_iterator iter;
for (iter = m_collection.begin() ; iter != m_collection.end() ; ++iter){
if (*iter!=-1) newColl.insert(*iter + shift);
else newColl.insert(*iter);
}
m_collection = newColl;
}
std::ostream& operator<<(std::ostream& out, const AlignmentElement &alignElement)
{
const AlignmentElement::ContainerType &elemSet = alignElement.GetCollection();
// out << "(";
if (elemSet.size() > 0)
{
AlignmentElement::ContainerType::const_iterator iter = elemSet.begin();
out << *iter;
for (++iter ; iter != elemSet.end() ; ++iter)
out << "," << *iter;
}
// out << ")";
return out;
}
void AlignmentElement::SetIntersect(const AlignmentElement &otherElement)
{
ContainerType newElement;
set_intersection(m_collection.begin() , m_collection.end()
,otherElement.begin() , otherElement.end()
,inserter(newElement , newElement.begin()) );
m_collection = newElement;
}
void AlignmentElement::SetUniformAlignment(size_t otherPhraseSize)
{
for (size_t pos = 0 ; pos < otherPhraseSize ; ++pos)
m_collection.insert(pos);
}
TO_STRING_BODY(AlignmentElement);

View File

@ -0,0 +1,105 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <iostream>
#include <set>
#include <vector>
#include "Util.h"
typedef short int AlignmentElementType;
//! set of alignments of 1 word
class AlignmentElement
{
friend std::ostream& operator<<(std::ostream& out, const AlignmentElement &alignElement);
protected:
typedef std::set<AlignmentElementType> ContainerType;
ContainerType m_collection;
public:
typedef ContainerType::iterator iterator;
typedef ContainerType::const_iterator const_iterator;
const_iterator begin() const { return m_collection.begin(); }
const_iterator end() const { return m_collection.end(); }
AlignmentElement(){};
~AlignmentElement(){};
//! inital constructor from parsed info from phrase table
AlignmentElement(const ContainerType &alignInfo);
AlignmentElement(const std::vector<AlignmentElementType> &alignInfo);
AlignmentElement(const AlignmentElement &alignInfo);
AlignmentElement& operator=(const AlignmentElement &copy);
//! number of words this element aligns to
size_t GetSize() const
{
return m_collection.size();
}
bool IsEmpty() const
{
return m_collection.empty();
}
//! return internal collection of elements
const ContainerType &GetCollection() const
{
return m_collection;
}
/** compare all alignments for this word.
* Return true iff both words are aligned to the same words
*/
bool Equals(const AlignmentElement &compare) const
{
return m_collection == compare.GetCollection();
}
/** used by the unknown word handler.
* Set alignment to 0
*/
void SetIdentityAlignment()
{
m_collection.insert(0);
}
/** align to all elements on other side, where the size of the other
* phrase is otherPhraseSize. Used when element has no alignment info
*/
void SetUniformAlignment(size_t otherPhraseSize);
/** set intersect with other element. Used when applying trans opt to a hypo
*/
void SetIntersect(const AlignmentElement &otherElement);
void Add(size_t pos)
{
m_collection.insert(pos);
}
// shift alignment so that it is comparitable to another alignment.
void Shift(int shift);
TO_STRING();
};

View File

@ -0,0 +1,97 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "AlignmentPair.h"
#include "AlignmentPhrase.h"
#include "WordsRange.h"
using namespace std;
//! back-inserter onto the source alignment vector for Input, onto the target one otherwise
AlignmentPhraseInserter AlignmentPair::GetInserter(FactorDirection direction)
{
  if (direction == Input)
    return AlignmentPhraseInserter(m_sourceAlign.GetVector());
  return AlignmentPhraseInserter(m_targetAlign.GetVector());
}
void AlignmentPair::SetIdentityAlignment()
{
AlignmentElement alignment;
alignment.SetIdentityAlignment();
m_sourceAlign.Add(alignment);
m_targetAlign.Add(alignment);
}
bool AlignmentPair::IsCompatible(const AlignmentPair &compare
, size_t sourceStart
, size_t targetStart) const
{
// source
bool ret = GetAlignmentPhrase(Input).IsCompatible(
compare.GetAlignmentPhrase(Input)
, sourceStart
, targetStart);
if (!ret)
return false;
// target
return GetAlignmentPhrase(Output).IsCompatible(
compare.GetAlignmentPhrase(Output)
, targetStart
, sourceStart);
}
/** Add newAlignment to both sides of this pair.
 *  The source side is shifted by the target start and placed at the source
 *  start; the target side is shifted by the source start and placed at the
 *  target start.
 */
void AlignmentPair::Add(const AlignmentPair &newAlignment
                        , const WordsRange &sourceRange
                        , const WordsRange &targetRange)
{
  const size_t sourceStart = sourceRange.GetStartPos();
  const size_t targetStart = targetRange.GetStartPos();

  m_sourceAlign.Add(newAlignment.m_sourceAlign, targetStart, sourceStart);
  m_targetAlign.Add(newAlignment.m_targetAlign, sourceStart, targetStart);
}
/** Merge newAlignment into both sides of this pair.
 *  Same shift/start arrangement as Add(): the source side is shifted by the
 *  target start and merged at the source start, and vice versa.
 */
void AlignmentPair::Merge(const AlignmentPair &newAlignment, const WordsRange &sourceRange, const WordsRange &targetRange)
{
  const size_t sourceStart = sourceRange.GetStartPos();
  const size_t targetStart = targetRange.GetStartPos();

  m_sourceAlign.Merge(newAlignment.m_sourceAlign, targetStart, sourceStart);
  m_targetAlign.Merge(newAlignment.m_targetAlign, sourceStart, targetStart);
}
TO_STRING_BODY(AlignmentPair);
std::ostream& operator<<(std::ostream &out, const AlignmentPair &alignmentPair)
{
// out << "f2e: " << alignmentPair.m_sourceAlign << ""
// << " , e2f: " << alignmentPair.m_targetAlign << " ";
out << "f2e: ";
alignmentPair.m_sourceAlign.print(out);
out << " , e2f: ";
alignmentPair.m_targetAlign.print(out);
out << " ";
return out;
}

107
moses/src/AlignmentPair.h Normal file
View File

@ -0,0 +1,107 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <iostream>
#include <vector>
#include <iterator>
#include "TypeDef.h"
#include "Util.h"
#include "AlignmentPhrase.h"
typedef std::back_insert_iterator<AlignmentPhrase::CollectionType> AlignmentPhraseInserter;
/** represent the alignment info between source and target phrase */
class AlignmentPair
{
friend std::ostream& operator<<(std::ostream&, const AlignmentPair&);
protected:
AlignmentPhrase m_sourceAlign, m_targetAlign;
public:
// constructor
AlignmentPair()
{}
// constructor, init source size. used in hypo
AlignmentPair(size_t sourceSize)
:m_sourceAlign(sourceSize)
{}
// constructor, by copy
AlignmentPair(const AlignmentPair& a){
m_sourceAlign=a.GetAlignmentPhrase(Input);
m_targetAlign=a.GetAlignmentPhrase(Output);
};
// constructor, by copy
AlignmentPair(const AlignmentPhrase& a, const AlignmentPhrase& b){
SetAlignmentPhrase(a,b);
};
~AlignmentPair(){};
/** get the back_insert_iterator to the source or target alignment vector so that
* they could be populated
*/
AlignmentPhraseInserter GetInserter(FactorDirection direction);
const AlignmentPhrase &GetAlignmentPhrase(FactorDirection direction) const
{
return (direction == Input) ? m_sourceAlign : m_targetAlign;
}
AlignmentPhrase &GetAlignmentPhrase(FactorDirection direction)
{
return (direction == Input) ? m_sourceAlign : m_targetAlign;
}
void SetAlignmentPhrase(FactorDirection direction, const AlignmentPhrase& a)
{
if (direction == Input) m_sourceAlign=a;
else m_targetAlign=a;
}
void SetAlignmentPhrase(const AlignmentPhrase& a, const AlignmentPhrase& b)
{
m_sourceAlign=a;
m_targetAlign=b;
}
/** used by the unknown word handler.
* Set alignment to 0
*/
void SetIdentityAlignment();
//! call Merge() for source and and Add() target alignment phrase
void Add(const AlignmentPair &newAlignment, const WordsRange &sourceRange, const WordsRange &targetRange);
//! call Merge for both source and target alignment phrase
void Merge(const AlignmentPair &newAlignment, const WordsRange &sourceRange, const WordsRange &targetRange);
bool IsCompatible(const AlignmentPair &compare
, size_t sourceStart
, size_t targetStart) const;
TO_STRING();
};

View File

@ -0,0 +1,229 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "AlignmentPhrase.h"
#include "WordsRange.h"
#include "WordsBitmap.h"
#include "UserMessage.h"
using namespace std;
/** Fill Align with the textual form of an alignment in which none of the
 *  Size words is aligned: a leading blank followed by Size copies of "() ".
 *  Any previous content of Align is discarded.
 */
void EmptyAlignment(std::string &Align, size_t Size)
{
  Align.assign(1, ' ');
  for (size_t remaining = Size; remaining > 0; --remaining)
    Align.append("() ");
}
/** Fill Align with the textual form of a uniform alignment: for each of the
 *  fSize words one group "(0,1,...,eSize-1) ", i.e. every word is aligned to
 *  every position of the other side. Any previous content of Align is discarded.
 */
void UniformAlignment(std::string &Align, size_t fSize, size_t eSize)
{
  // every word gets the same group, so it is rendered only once
  std::ostringstream group;
  group << "(";
  for (size_t epos = 0 ; epos < eSize ; ++epos) {
    if (epos != 0)
      group << ",";
    group << epos;
  }
  group << ") ";
  const std::string oneWord = group.str();

  std::string result;
  for (size_t fpos = 0 ; fpos < fSize ; ++fpos)
    result += oneWord;

  Align = result;
}
//! deep copy: every existing element is cloned, missing (NULL) slots stay NULL
AlignmentPhrase::AlignmentPhrase(const AlignmentPhrase &copy)
{
  m_collection.reserve(copy.m_collection.size());

  CollectionType::const_iterator it;
  for (it = copy.m_collection.begin() ; it != copy.m_collection.end() ; ++it)
  {
    AlignmentElement *cloned = NULL;
    if (*it != NULL)
      cloned = new AlignmentElement(**it);
    m_collection.push_back(cloned);
  }
}
/** Assignment: make this phrase a deep copy of the given one.
 *  The previous implementation resized the vector and overwrote the stored
 *  pointers without deleting the elements they pointed to, leaking every
 *  element held before the assignment (and all of them on self-assignment).
 *  Here the copy is built first by the copy constructor and then swapped in;
 *  the old elements end up in the temporary and are handed to its destructor.
 *  \param copy phrase to copy from
 *  \return reference to this phrase
 */
AlignmentPhrase& AlignmentPhrase::operator=(const AlignmentPhrase &copy)
{
  if (this != &copy)
  {
    AlignmentPhrase replacement(copy);            // deep copy; may throw before *this is touched
    m_collection.swap(replacement.m_collection);  // old elements now belong to the temporary
  }
  return *this;
}
//! create a phrase with size slots, none of which holds an element yet (all NULL)
AlignmentPhrase::AlignmentPhrase(size_t size)
  : m_collection(size, static_cast<AlignmentElement*>(NULL))
{
}
//! destructor: hands the element vector to RemoveAllInColl
//! NOTE(review): RemoveAllInColl is defined outside this file; presumably it deletes every stored element -- confirm in Util.h
AlignmentPhrase::~AlignmentPhrase()
{
RemoveAllInColl(m_collection);
}
/** Compare the elements of this phrase, starting at mergePosStart, with the
 *  elements of compare after shifting them by shiftPos.
 *  Positions of this phrase that hold no element are skipped. Fixes over the
 *  previous implementation:
 *   - a skipped position did not advance the index into this phrase, so the
 *     first missing element silently ended the whole comparison;
 *   - mergePosStart beyond the end of this phrase made the unsigned
 *     subtraction wrap around and led to out-of-range accesses;
 *   - the range assert ran only after the position had been accessed.
 *  \param compare        phrase to compare against
 *  \param mergePosStart  first position of this phrase to compare
 *  \param shiftPos       amount added to the positions of compare's elements
 *  \return false as soon as one pair of elements differs, true otherwise
 */
bool AlignmentPhrase::IsCompatible(const AlignmentPhrase &compare, size_t mergePosStart, size_t shiftPos) const
{
  // nothing of this phrase overlaps, hence nothing can contradict
  if (mergePosStart >= GetSize())
    return true;

  const size_t compareSize = std::min(GetSize() - mergePosStart , compare.GetSize());

  for (size_t posCompare = 0 ; posCompare < compareSize ; ++posCompare)
  {
    // the two indices always advance together
    const size_t posThis = mergePosStart + posCompare;
    assert(posThis < GetSize());

    if (!Exists(posThis))
      continue;

    const AlignmentElement &alignThis = GetElement(posThis);
    // NOTE(review): assumes compare holds an element at every position -- confirm with callers
    AlignmentElement alignCompare = compare.GetElement(posCompare);

    // shift alignment
    alignCompare.Shift( static_cast<int>(shiftPos) );

    if (!alignThis.Equals(alignCompare))
      return false;
  }

  return true;
}
void AlignmentPhrase::Add(const AlignmentPhrase &newAlignment, size_t shift, size_t startPos)
{
size_t insertPos = startPos;
for (size_t pos = 0 ; pos < newAlignment.GetSize() ; ++pos)
{
// shift alignment
AlignmentElement alignElement = newAlignment.GetElement(pos);
alignElement.Shift( (int)shift );
if (insertPos >= GetSize())
{ // probably doing target. append alignment to end
assert(insertPos == GetSize());
Add(alignElement);
}
else
{
if (Exists(insertPos))
{ // add
m_collection[insertPos]->SetIntersect(alignElement);
}
else
m_collection[insertPos] = new AlignmentElement(alignElement);
}
insertPos++;
}
}
void AlignmentPhrase::Shift(size_t shift)
{
for (size_t pos = 0 ; pos < GetSize() ; ++pos)
{
// shift alignment
GetElement(pos).Shift( (int)shift );
}
}
void AlignmentPhrase::Merge(const AlignmentPhrase &newAlignment, size_t shift, size_t startPos)
{
assert(startPos < GetSize());
size_t insertPos = startPos;
for (size_t pos = 0 ; pos < newAlignment.GetSize() ; ++pos)
{
// shift alignment
AlignmentElement alignElement = newAlignment.GetElement(pos);
alignElement.Shift( (int)shift );
// merge elements to only contain co-joined elements
GetElement(insertPos).SetIntersect(alignElement);
insertPos++;
}
}
//! add every position listed in uniformAlignmentTarget to every element of this phrase
void AlignmentPhrase::AddUniformAlignmentElement(std::list<size_t> &uniformAlignmentTarget)
{
  typedef std::list<size_t>::const_iterator TargetIter;

  for (TargetIter target = uniformAlignmentTarget.begin() ; target != uniformAlignmentTarget.end() ; ++target)
  {
    for (size_t pos = 0 ; pos < GetSize() ; ++pos)
      GetElement(pos).Add(*target);
  }
}
std::ostream& operator<<(std::ostream& out, const AlignmentPhrase &alignmentPhrase)
{
for (size_t pos = 0 ; pos < alignmentPhrase.GetSize() ; ++pos)
{
if (alignmentPhrase.Exists(pos))
{
if (pos) out << " ";
const AlignmentElement &alignElement = alignmentPhrase.GetElement(pos);
out << alignElement;
}
else{
stringstream strme;
strme << "No alignment at position " << pos;
UserMessage::Add(strme.str());
abort();
}
}
return out;
}
void AlignmentPhrase::print(std::ostream& out, size_t offset) const
{
for (size_t pos = 0 ; pos < GetSize() ; ++pos)
{
if (Exists(pos))
{
if (pos) out << " ";
out << pos+offset << "=";
const AlignmentElement &alignElement = GetElement(pos);
out << alignElement;
}
else{
stringstream strme;
strme << "No alignment at position " << pos;
UserMessage::Add(strme.str());
abort();
// out << pos+offset << "=";
}
}
}
TO_STRING_BODY(AlignmentPhrase);

103
moses/src/AlignmentPhrase.h Normal file
View File

@ -0,0 +1,103 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <iostream>
#include <vector>
#include "AlignmentElement.h"
#include "Util.h"
void EmptyAlignment(std::string &Align, size_t Size);
void UniformAlignment(std::string &Align, size_t fSize, size_t eSize);
class WordsRange;
class WordsBitmap;
//! alignments of each word in a phrase
class AlignmentPhrase
{
friend std::ostream& operator<<(std::ostream& out, const AlignmentPhrase &alignmentPhrase);
public:
typedef std::vector<AlignmentElement*> CollectionType;
protected:
CollectionType m_collection;
public:
AlignmentPhrase(){};
AlignmentPhrase(size_t size);
/** copy constructor */
AlignmentPhrase(const AlignmentPhrase &copy);
AlignmentPhrase& operator=(const AlignmentPhrase&);
/** destructor */
~AlignmentPhrase();
/** compare with another alignment phrase, return true if the other alignment phrase is a
* subset of this. Used to see whether a trans opt can be used to expand a hypo
*/
bool IsCompatible(const AlignmentPhrase &compare, size_t mergePosStart, size_t shiftPos) const;
//! add newAlignment to end of this alignment phrase, offsetting by newAlignmentRange.GetStartPos()
void Add(const AlignmentPhrase &newAlignment, size_t shift, size_t startPos);
/*< merge newAlignment to this alignment phrase, offsetting by newAlignmentRange.GetStartPos().
Use intersection of each alignment element
*/
void Merge(const AlignmentPhrase &newAlignment, size_t shift, size_t startPos);
void Shift(size_t shift);
size_t GetSize() const
{
return m_collection.size();
}
CollectionType &GetVector()
{
return m_collection;
}
void Add(const AlignmentElement &element)
{
m_collection.push_back(new AlignmentElement(element));
}
// add elements which didn't have alignments, so are set to uniform on the other side
void AddUniformAlignmentElement(std::list<size_t> &uniformAlignmentTarget);
AlignmentElement &GetElement(size_t pos)
{ return *m_collection[pos]; }
const AlignmentElement &GetElement(size_t pos) const
{ return *m_collection[pos]; }
bool Exists(size_t pos) const
{
return m_collection[pos] != NULL;
}
void print(std::ostream& out, size_t offset=0) const;
TO_STRING();
};

View File

@ -31,8 +31,7 @@ static const OFF_T InvalidOffT=-1;
// these functions work only for bitwise read/write-able types
template<typename T> inline size_t fWrite(FILE* f,const T& t) {
if(fwrite(&t,sizeof(t),1,f)!=1) {
TRACE_ERR("ERROR:: fwrite!\n");abort();}
if(fwrite(&t,sizeof(t),1,f)!=1) {TRACE_ERR("ERROR:: fwrite!\n");abort();}
return sizeof(t);
}
@ -60,10 +59,37 @@ template<typename C> inline size_t fWriteVector(FILE* f,const C& v) {
}
template<typename C> inline void fReadVector(FILE* f, C& v) {
UINT32 s;fRead(f,s);v.resize(s);
UINT32 s;fRead(f,s);
v.resize(s);
size_t r=fread(&(*v.begin()),sizeof(typename C::value_type),s,f);
if(r!=s) {
TRACE_ERR("ERROR: freadVec! "<<r<<" "<<s<<"\n");abort();}
if(r!=s) {TRACE_ERR("ERROR: freadVec! "<<r<<" "<<s<<"\n");abort();}
}
inline size_t fWriteString(FILE* f,const char* e, UINT32 s) {
size_t rv=fWrite(f,s);
if(fwrite(e,sizeof(char),s,f)!=s) {TRACE_ERR("ERROR:: fwrite!\n");abort();}
return rv+sizeof(char)*s;
}
inline void fReadString(FILE* f,std::string& e) {
UINT32 s;fRead(f,s);
char* a=new char[s+1];
if(fread(a,sizeof(char),s,f)!=s) {TRACE_ERR("ERROR: fread!\n");abort();}
a[s]='\0';
e.assign(a);
}
// write a vector of strings: the element count first, then each string length-prefixed.
// returns the total number of bytes reported by the individual writes
inline size_t fWriteStringVector(FILE* f,const std::vector<std::string>& v) {
  UINT32 s=v.size();
  size_t totrv=fWrite(f,s);
  for (size_t i=0;i<s;i++){
    const std::string& item=v.at(i);
    totrv+=fWriteString(f,item.c_str(),item.size());
  }
  return totrv;
}
// read a vector of strings: the element count first, then that many length-prefixed strings
inline void fReadStringVector(FILE* f, std::vector<std::string>& v) {
  UINT32 s;
  fRead(f,s);
  v.resize(s);
  for (size_t i=0;i<s;i++){
    fReadString(f,v.at(i));
  }
}
// current position of f as reported by FTELLO
inline OFF_T fTell(FILE* f) {
  return FTELLO(f);
}

View File

@ -57,7 +57,8 @@ Hypothesis::Hypothesis(InputType const& source, const TargetPhrase &emptyTarget)
, m_languageModelStates(StaticData::Instance().GetLMSize(), LanguageModelSingleFactor::UnknownState)
, m_arcList(NULL)
, m_id(0)
, m_lmstats(NULL)
, m_lmstats(NULL)
, m_alignPair(source.GetSize())
{ // used for initial seeding of trans process
// initialize scores
//_hash_computed = false;
@ -85,7 +86,8 @@ Hypothesis::Hypothesis(const Hypothesis &prevHypo, const TranslationOption &tran
, m_languageModelStates(prevHypo.m_languageModelStates)
, m_arcList(NULL)
, m_id(s_HypothesesCreated++)
, m_lmstats(NULL)
, m_lmstats(NULL)
, m_alignPair(prevHypo.m_alignPair)
{
// assert that we are not extending our hypothesis by retranslating something
// that this hypothesis has already translated!
@ -462,7 +464,11 @@ void Hypothesis::PrintHypothesis() const
TRACE_ERR( "\tbase score "<< (m_prevHypo->m_totalScore - m_prevHypo->m_futureScore) <<endl);
TRACE_ERR( "\tcovering "<<m_currSourceWordsRange.GetStartPos()<<"-"<<m_currSourceWordsRange.GetEndPos()<<": "
<< *m_sourcePhrase <<endl);
TRACE_ERR( "\ttranslated as: "<<m_targetPhrase<<endl); // <<" => translation cost "<<m_score[ScoreType::PhraseTrans];
TRACE_ERR( "\ttranslated as: "<<(Phrase&) m_targetPhrase<<endl); // <<" => translation cost "<<m_score[ScoreType::PhraseTrans];
if (PrintAlignmentInfo()){
TRACE_ERR( "\tsource-target word alignment: "<< m_targetPhrase.GetAlignmentPair().GetAlignmentPhrase(Input) << endl); // <<" => source to target word-to-word alignment
TRACE_ERR( "\ttarget-source word alignment: "<< m_targetPhrase.GetAlignmentPair().GetAlignmentPhrase(Output) << endl); // <<" => target to source word-to-word alignment
}
if (m_wordDeleted) TRACE_ERR( "\tword deleted"<<endl);
// TRACE_ERR( "\tdistance: "<<GetCurrSourceWordsRange().CalcDistortion(m_prevHypo->GetCurrSourceWordsRange())); // << " => distortion cost "<<(m_score[ScoreType::Distortion]*weightDistortion)<<endl;
// TRACE_ERR( "\tlanguage model cost "); // <<m_score[ScoreType::LanguageModelScore]<<endl;
@ -526,6 +532,16 @@ ostream& operator<<(ostream& out, const Hypothesis& hypothesis)
// scores
out << " [total=" << hypothesis.GetTotalScore() << "]";
out << " " << hypothesis.GetScoreBreakdown();
// alignment
if (hypothesis.PrintAlignmentInfo()){
out << " [f2e:";
hypothesis.SourceAlignmentToStream(out);
out << "]";
out << " [e2f:";
hypothesis.TargetAlignmentToStream(out);
out << "]";
}
return out;
}

View File

@ -36,6 +36,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "LexicalReordering.h"
#include "InputType.h"
#include "ObjectPool.h"
#include "AlignmentPair.h"
class SquareMatrix;
class StaticData;
@ -61,7 +62,8 @@ protected:
static ObjectPool<Hypothesis> s_objectPool;
const Hypothesis* m_prevHypo; /*! backpointer to previous hypothesis (from which this one was created) */
const Phrase &m_targetPhrase; /*! target phrase being created at the current decoding step */
// const Phrase &m_targetPhrase; /*! target phrase being created at the current decoding step */
const TargetPhrase &m_targetPhrase; /*! target phrase being created at the current decoding step */
Phrase const* m_sourcePhrase; /*! input sentence */
WordsBitmap m_sourceCompleted; /*! keeps track of which words have been translated so far */
//TODO: how to integrate this into confusion network framework; what if
@ -76,6 +78,7 @@ protected:
std::vector<LanguageModelSingleFactor::State> m_languageModelStates; /*! relevant history for language model scoring -- used for recombination */
const Hypothesis *m_winningHypo;
ArcList *m_arcList; /*! all arcs that end at the same trellis point as this hypothesis */
AlignmentPair m_alignPair;
const TranslationOption *m_transOpt;
int m_id; /*! numeric ID of this hypothesis, used for logging */
@ -117,7 +120,8 @@ public:
void PrintHypothesis( const InputType &source, float weightDistortion, float weightWordPenalty) const;
/** return target phrase used to create this hypothesis */
const Phrase &GetCurrTargetPhrase() const
// const Phrase &GetCurrTargetPhrase() const
const TargetPhrase &GetCurrTargetPhrase() const
{
return m_targetPhrase;
}
@ -211,14 +215,49 @@ public:
// GenerateNGramCompareHash();
// return _hash;
// }
void ToStream(std::ostream& out) const
{
if (m_prevHypo != NULL)
{
m_prevHypo->ToStream(out);
}
out << GetCurrTargetPhrase();
out << (Phrase) GetCurrTargetPhrase();
}
/** whether alignment info should be printed; forwards to PrintAlignmentInfo() of the current target phrase */
inline bool PrintAlignmentInfo() const
{
  return GetCurrTargetPhrase().PrintAlignmentInfo();
}
void SourceAlignmentToStream(std::ostream& out) const
{
if (m_prevHypo != NULL)
{
m_prevHypo->SourceAlignmentToStream(out);
AlignmentPhrase alignSourcePhrase=GetCurrTargetPhrase().GetAlignmentPair().GetAlignmentPhrase(Input);
alignSourcePhrase.Shift(m_currTargetWordsRange.GetStartPos());
out << " ";
/*
out << "\nGetCurrTargetPhrase(): " << GetCurrTargetPhrase();
out << "\nm_currTargetWordsRange: " << m_currTargetWordsRange << "->";
*/
alignSourcePhrase.print(out,m_currSourceWordsRange.GetStartPos());
}
}
void TargetAlignmentToStream(std::ostream& out) const
{
if (m_prevHypo != NULL)
{
m_prevHypo->TargetAlignmentToStream(out);
AlignmentPhrase alignTargetPhrase=GetCurrTargetPhrase().GetAlignmentPair().GetAlignmentPhrase(Output);
alignTargetPhrase.Shift(m_currSourceWordsRange.GetStartPos());
out << " ";
/*
out << "\nGetCurrTargetPhrase(): " << GetCurrTargetPhrase();
out << "\nm_currSourceWordsRange: " << m_currSourceWordsRange << "->";
*/
alignTargetPhrase.print(out,m_currTargetWordsRange.GetStartPos());
}
}
TO_STRING();
@ -247,6 +286,16 @@ public:
float GetTotalScore() const { return m_totalScore; }
float GetScore() const { return m_totalScore-m_futureScore; }
//! read-only access to the AlignmentPair stored in this hypothesis (m_alignPair)
//! NOTE(review): presumably records which source words align to each target word -- confirm against AlignmentPair.h
const AlignmentPair &GetAlignmentPair() const
{
return m_alignPair;
}
//! target span that trans opt would populate if applied to this hypo. Used for alignment check
size_t GetNextStartPos(const TranslationOption &transOpt) const;
std::vector<std::vector<unsigned int> > *GetLMStats() const
{
return m_lmstats;

View File

@ -53,8 +53,6 @@ bool LanguageModelSRI::Load(const std::string &filePath
, float weight
, size_t nGramOrder)
{
FactorCollection &factorCollection = FactorCollection::Instance();
m_srilmVocab = new Vocab();
m_srilmModel = new Ngram(*m_srilmVocab, nGramOrder);
m_factorType = factorType;

View File

@ -47,7 +47,6 @@ Manager::Manager(InputType const& source, SearchAlgorithm searchAlgorithm)
,m_start(clock())
,interrupted_flag(0)
{
VERBOSE(1, "Translating: " << m_source << endl);
const StaticData &staticData = StaticData::Instance();
staticData.InitializeBeforeSentenceProcessing(source);
}
@ -72,6 +71,7 @@ Manager::~Manager()
*/
void Manager::ProcessSentence()
{
//VERBOSE(2,"m_source:" << m_source <<"\n");
const StaticData &staticData = StaticData::Instance();
staticData.ResetSentenceStats(m_source);
const vector <DecodeGraph*>
@ -83,7 +83,6 @@ void Manager::ProcessSentence()
// 2. initial hypothesis factors are given in the sentence
//CreateTranslationOptions(m_source, phraseDictionary, lmListInitial);
m_transOptColl->CreateTranslationOptions(decodeStepVL);
m_search->ProcessSentence();
}
@ -199,23 +198,21 @@ void OutputWordGraph(std::ostream &outputWordGraphStream, const Hypothesis *hypo
const StaticData &staticData = StaticData::Instance();
const Hypothesis *prevHypo = hypo->GetPrevHypo();
const Phrase *sourcePhrase = hypo->GetSourcePhrase();
const Phrase &targetPhrase = hypo->GetCurrTargetPhrase();
outputWordGraphStream << "J=" << linkId++
<< "\tS=" << prevHypo->GetId()
<< "\tE=" << hypo->GetId()
<< "\ta=";
outputWordGraphStream << "J=" << linkId++
<< "\tS=" << prevHypo->GetId()
<< "\tE=" << hypo->GetId()
<< "\ta=";
// phrase table scores
const std::vector<PhraseDictionary*> &phraseTables = staticData.GetPhraseDictionaries();
std::vector<PhraseDictionary*>::const_iterator iterPhraseTable;
for (iterPhraseTable = phraseTables.begin() ; iterPhraseTable != phraseTables.end() ; ++iterPhraseTable)
{
// phrase table scores
const std::vector<PhraseDictionary*> &phraseTables = staticData.GetPhraseDictionaries();
std::vector<PhraseDictionary*>::const_iterator iterPhraseTable;
for (iterPhraseTable = phraseTables.begin() ; iterPhraseTable != phraseTables.end() ; ++iterPhraseTable)
{
const PhraseDictionary *phraseTable = *iterPhraseTable;
vector<float> scores = hypo->GetScoreBreakdown().GetScoresForProducer(phraseTable);
outputWordGraphStream << scores[0];
vector<float>::const_iterator iterScore;
for (iterScore = ++scores.begin() ; iterScore != scores.end() ; ++iterScore)

View File

@ -113,6 +113,7 @@ public:
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)
{
cerr << "AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)" << endl;
assert(GetTargetPhraseCollection(source)==0);
VERBOSE(2, "adding unk source phrase "<<source<<"\n");
@ -131,6 +132,7 @@ public:
TargetPhraseCollection const*
GetTargetPhraseCollection(Phrase const &src) const
{
assert(m_dict);
if(src.GetSize()==0) return 0;
@ -155,7 +157,10 @@ public:
// get target phrases in string representation
std::vector<StringTgtCand> cands;
m_dict->GetTargetCandidates(srcString,cands);
std::vector<StringWordAlignmentCand> swacands;
std::vector<StringWordAlignmentCand> twacands;
// m_dict->GetTargetCandidates(srcString,cands);
m_dict->GetTargetCandidates(srcString,cands,swacands,twacands);
if(cands.empty())
{
return 0;
@ -166,36 +171,39 @@ public:
// convert into TargetPhrases
for(size_t i=0;i<cands.size();++i)
{
TargetPhrase targetPhrase(Output);
StringTgtCand::first_type const& factorStrings=cands[i].first;
StringTgtCand::second_type const& probVector=cands[i].second;
std::vector<float> scoreVector(probVector.size());
std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
TransformScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
FloorScore);
CreateTargetPhrase(targetPhrase,factorStrings,scoreVector);
costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),
tCands.size()));
tCands.push_back(targetPhrase);
}
TargetPhraseCollection *rv=PruneTargetCandidates(tCands,costs);
{
TargetPhrase targetPhrase(Output);
StringTgtCand::first_type const& factorStrings=cands[i].first;
StringTgtCand::second_type const& probVector=cands[i].second;
StringWordAlignmentCand::second_type const& swaVector=swacands[i].second;
StringWordAlignmentCand::second_type const& twaVector=twacands[i].second;
std::vector<float> scoreVector(probVector.size());
std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
TransformScore);
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
FloorScore);
// CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,&src);
CreateTargetPhrase(targetPhrase,factorStrings,scoreVector,swaVector,twaVector,&src); costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),
tCands.size()));
tCands.push_back(targetPhrase);
}
TargetPhraseCollection *rv;
rv=PruneTargetCandidates(tCands,costs);
if(rv->IsEmpty())
{
delete rv;
return 0;
}
{
delete rv;
return 0;
}
else
{
if(useCache) piter.first->second=rv;
m_tgtColls.push_back(rv);
return rv;
}
{
if(useCache) piter.first->second=rv;
m_tgtColls.push_back(rv);
return rv;
}
}
@ -226,7 +234,14 @@ public:
m_dict->Create(in,filePath);
}
TRACE_ERR( "reading bin ttable\n");
m_dict->Read(filePath);
// m_dict->Read(filePath);
bool res=m_dict->Read(filePath);
if (!res) {
stringstream strme;
strme << "bin ttable was read in a wrong way\n";
UserMessage::Add(strme.str());
exit(1);
}
}
typedef PhraseDictionaryTree::PrefixPtr PPtr;
@ -257,13 +272,36 @@ public:
};
/** Fill a TargetPhrase from the string form of one phrase-table candidate.
 *  This overload attaches no word alignment information.
 *  \param targetPhrase phrase to populate; one word is appended per entry of factorStrings
 *  \param factorStrings one string per target word, factors separated by the configured factor delimiter
 *  \param scoreVector scores handed to TargetPhrase::SetScore
 *  \param srcPtr source phrase to record on the target phrase (may be 0)
 */
void CreateTargetPhrase(TargetPhrase& targetPhrase,
                        StringTgtCand::first_type const& factorStrings,
                        StringTgtCand::second_type const& scoreVector,
                        Phrase const* srcPtr=0) const
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  const std::string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  for (size_t wordIdx = 0; wordIdx < factorStrings.size(); ++wordIdx)
  {
    // split the surface string of this word into its factors
    std::vector<std::string> factors = TokenizeMultiCharSeparator(*factorStrings[wordIdx], factorDelimiter);

    Word &word = targetPhrase.AddWord();
    // NOTE(review): factors is indexed without a bounds check; assumes every word
    // carries at least m_output.size() factors
    for (size_t factorIdx = 0; factorIdx < m_output.size(); ++factorIdx)
    {
      word[m_output[factorIdx]] = factorCollection.AddFactor(Output, m_output[factorIdx], factors[factorIdx]);
    }
  }

  targetPhrase.SetScore(m_obj, scoreVector, m_weights, m_weightWP, *m_languageModels);
  targetPhrase.SetSourcePhrase(srcPtr);
}
void CreateTargetPhrase(TargetPhrase& targetPhrase,
StringTgtCand::first_type const& factorStrings,
StringTgtCand::second_type const& scoreVector,
StringWordAlignmentCand::second_type const& swaVector,
StringWordAlignmentCand::second_type const& twaVector,
Phrase const* srcPtr=0) const
{
FactorCollection &factorCollection = FactorCollection::Instance();
for(size_t k=0;k<factorStrings.size();++k)
{
@ -274,6 +312,8 @@ public:
}
targetPhrase.SetScore(m_obj, scoreVector, m_weights, m_weightWP, *m_languageModels);
targetPhrase.SetSourcePhrase(srcPtr);
targetPhrase.CreateAlignmentInfo(swaVector, twaVector);
}

View File

@ -88,6 +88,10 @@ Parameter::Parameter()
AddParam("cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
AddParam("search-algorithm", "", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing. (default = 0)");
AddParam("constraint","","Target sentence to produce");
AddParam("use-alignment-info", "Use word-to-word alignment: actually it is only used to output the word-to-word alignment. Word-to-word alignments are taken from the phrase table if any. Default is false.");
AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
}
Parameter::~Parameter()

View File

@ -33,6 +33,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "StaticData.h"
#include "WordsRange.h"
#include "UserMessage.h"
#include "AlignmentPair.h"
using namespace std;
@ -45,7 +46,7 @@ bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
, float weightWP)
{
const StaticData &staticData = StaticData::Instance();
m_tableLimit = tableLimit;
m_filePath = filePath;
@ -83,26 +84,41 @@ bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
stringstream strme;
strme << "Syntax error at " << filePath << ":" << line_num;
UserMessage::Add(strme.str());
return false;
abort();
}
bool isLHSEmpty = (tokens[1].find_first_not_of(" \t", 0) == string::npos);
string sourcePhraseString, targetPhraseString;
string scoreString;
string sourceAlignString, targetAlignString;
sourcePhraseString=tokens[0];
targetPhraseString=tokens[1];
if (numElement==3){
scoreString=tokens[2];
}
else{
sourceAlignString=tokens[2];
targetAlignString=tokens[3];
scoreString=tokens[4];
}
bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty target, skipping\n");
continue;
}
const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
if (tokens[0] != prevSourcePhrase)
phraseVector = Phrase::Parse(tokens[0], input, factorDelimiter);
if (sourcePhraseString != prevSourcePhrase)
phraseVector = Phrase::Parse(sourcePhraseString, input, factorDelimiter);
vector<float> scoreVector = Tokenize<float>(tokens[(numElement==3) ? 2 : 4]);
vector<float> scoreVector = Tokenize<float>(scoreString);
if (scoreVector.size() != m_numScoreComponent)
{
stringstream strme;
strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
UserMessage::Add(strme.str());
return false;
abort();
}
// assert(scoreVector.size() == m_numScoreComponent);
@ -111,8 +127,27 @@ bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
sourcePhrase.CreateFromString( input, phraseVector);
//target
TargetPhrase targetPhrase(Output);
targetPhrase.CreateFromString( output, tokens[1], factorDelimiter);
targetPhrase.SetSourcePhrase(&sourcePhrase);
targetPhrase.CreateFromString( output, targetPhraseString, factorDelimiter);
if (!staticData.UseAlignmentInfo()){
UniformAlignment(sourceAlignString, sourcePhrase.GetSize(), targetPhrase.GetSize());
UniformAlignment(targetAlignString, targetPhrase.GetSize(), sourcePhrase.GetSize());
/*
EmptyAlignment(sourceAlignString, sourcePhrase.GetSize());
EmptyAlignment(targetAlignString, targetPhrase.GetSize());
*/
}
else if (numElement==3){
stringstream strme;
strme << "You are using AlignmentInfo, but this info not available in the Phrase Table. Only " <<numElement<<" fields on line " << line_num;
UserMessage::Add(strme.str());
return false;
}
// alignment info
targetPhrase.CreateAlignmentInfo(sourceAlignString, targetAlignString);
// component score, for n-best output
std::vector<float> scv(scoreVector.size());
std::transform(scoreVector.begin(),scoreVector.end(),scv.begin(),TransformScore);

View File

@ -8,12 +8,6 @@
#include <fstream>
#include <string>
#include "PrefixTree.h"
#include "File.h"
#include "ObjectPool.h"
#include "LVoc.h"
#include "TypeDef.h"
#include "Util.h"
template<typename T>
std::ostream& operator<<(std::ostream& out,const std::vector<T>& x)
@ -25,52 +19,95 @@ std::ostream& operator<<(std::ostream& out,const std::vector<T>& x)
return out;
}
typedef std::vector<float> Scores;
typedef PrefixTreeF<LabelId,OFF_T> PTF;
class TgtCand {
IPhrase e;
Scores sc;
WordAlignments m_sourceAlignment, m_targetAlignment;
public:
TgtCand() {}
TgtCand(const IPhrase& a, const Scores& b
, const WordAlignments &sourceAlignment, const WordAlignments &targetAlignment)
: e(a)
, sc(b)
, m_sourceAlignment(sourceAlignment)
, m_targetAlignment(targetAlignment)
{}
TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
TgtCand(FILE* f) {readBin(f);}
void writeBin(FILE* f) const
{
fWriteVector(f,e);
fWriteVector(f,sc);
}
void readBin(FILE* f)
{
fReadVector(f,e);
fReadVector(f,sc);
}
// Write this candidate to f including its word alignments.
// Field order: phrase (e), scores (sc), source alignment, target alignment.
// This is the same order in which readBinWithAlignment() reads the fields back.
void writeBinWithAlignment(FILE* f) const
{
fWriteVector(f,e);
fWriteVector(f,sc);
fWriteStringVector(f, m_sourceAlignment);
fWriteStringVector(f, m_targetAlignment);
}
// Read this candidate from f including its word alignments.
// Field order: phrase (e), scores (sc), source alignment, target alignment.
// This is the same order in which writeBinWithAlignment() writes the fields.
void readBinWithAlignment(FILE* f)
{
fReadVector(f,e);
fReadVector(f,sc);
fReadStringVector(f, m_sourceAlignment);
fReadStringVector(f, m_targetAlignment);
}
const IPhrase& GetPhrase() const {return e;}
const Scores& GetScores() const {return sc;}
void writeBin(FILE* f) const {fWriteVector(f,e);fWriteVector(f,sc);}
void readBin(FILE* f) {fReadVector(f,e);fReadVector(f,sc);}
const WordAlignments& GetSourceAlignment() const {return m_sourceAlignment;}
const WordAlignments& GetTargetAlignment() const {return m_targetAlignment;}
};
class TgtCands : public std::vector<TgtCand> {
typedef std::vector<TgtCand> MyBase;
public:
TgtCands() : MyBase() {}
void writeBin(FILE* f) const
{
unsigned s=size();fWrite(f,s);
unsigned s=size();
fWrite(f,s);
for(size_t i=0;i<s;++i) MyBase::operator[](i).writeBin(f);
}
void writeBinWithAlignment(FILE* f) const
{
unsigned s=size();
fWrite(f,s);
for(size_t i=0;i<s;++i) MyBase::operator[](i).writeBinWithAlignment(f);
}
void readBin(FILE* f)
{
unsigned s;fRead(f,s);resize(s);
for(size_t i=0;i<s;++i) MyBase::operator[](i).readBin(f);
}
};
struct PPimp {
PTF const*p;unsigned idx;bool root;
PPimp(PTF const* x,unsigned i,bool b) : p(x),idx(i),root(b) {}
bool isValid() const {return root || (p && idx<p->size());}
bool isRoot() const {return root;}
PTF const* ptr() const {return p;}
void readBinWithAlignment(FILE* f)
{
unsigned s;fRead(f,s);resize(s);
for(size_t i=0;i<s;++i) MyBase::operator[](i).readBinWithAlignment(f);
}
};
PhraseDictionaryTree::PrefixPtr::operator bool() const
{
return imp && imp->isValid();
@ -91,11 +128,19 @@ struct PDTimp {
ObjectPool<PPimp> pPool;
// a comparison with the Boost MemPools might be useful
bool usewordalign;
bool printwordalign;
PDTimp() : os(0),ot(0) {PTF::setDefault(InvalidOffT);}
PDTimp() : os(0),ot(0), usewordalign(false), printwordalign(false) {PTF::setDefault(InvalidOffT);}
~PDTimp() {if(os) fClose(os);if(ot) fClose(ot);FreeMemory();}
// setter/getter for the usewordalign flag
inline void UseWordAlignment(bool a)
{
  usewordalign = a;
}
inline bool UseWordAlignment()
{
  return usewordalign;
}

// setter/getter for the printwordalign flag
inline void PrintWordAlignment(bool a)
{
  printwordalign = a;
}
inline bool PrintWordAlignment()
{
  return printwordalign;
}
void FreeMemory()
{
for(Data::iterator i=data.begin();i!=data.end();++i) (*i).free();
@ -113,20 +158,24 @@ struct PDTimp {
OFF_T tCandOffset=data[f[0]]->find(f);
if(tCandOffset==InvalidOffT) return;
fSeek(ot,tCandOffset);
tgtCands.readBin(ot);
if (UseWordAlignment()) tgtCands.readBinWithAlignment(ot);
else tgtCands.readBin(ot);
}
typedef PhraseDictionaryTree::PrefixPtr PPtr;
void GetTargetCandidates(PPtr p,TgtCands& tgtCands)
void GetTargetCandidates(PPtr p,TgtCands& tgtCands)
{
assert(p);
if(p.imp->isRoot()) return;
OFF_T tCandOffset=p.imp->ptr()->getData(p.imp->idx);
if(tCandOffset==InvalidOffT) return;
fSeek(ot,tCandOffset);
tgtCands.readBin(ot);
if (UseWordAlignment()) tgtCands.readBinWithAlignment(ot);
else tgtCands.readBin(ot);
}
void PrintTgtCand(const TgtCands& tcands,std::ostream& out) const;
// convert target candidates from internal data structure to the external one
@ -142,6 +191,25 @@ struct PDTimp {
rv.push_back(StringTgtCand(vs,i->GetScores()));
}
}
// convert target candidates from internal data structure to the external one
void ConvertTgtCand(const TgtCands& tcands,std::vector<StringTgtCand>& rv,
std::vector<StringWordAlignmentCand>& swa,
std::vector<StringWordAlignmentCand>& twa) const
{
for(TgtCands::const_iterator i=tcands.begin();i!=tcands.end();++i)
{
const IPhrase& iphrase=i->GetPhrase();
std::vector<std::string const*> vs;
vs.reserve(iphrase.size());
for(size_t j=0;j<iphrase.size();++j)
vs.push_back(&tv.symbol(iphrase[j]));
rv.push_back(StringTgtCand(vs,i->GetScores()));
swa.push_back(StringWordAlignmentCand(vs,(i->GetSourceAlignment())));
twa.push_back(StringWordAlignmentCand(vs,(i->GetTargetAlignment())));
}
}
PPtr GetRoot()
{
@ -182,11 +250,42 @@ struct PDTimp {
int PDTimp::Read(const std::string& fn)
{
std::string ifs(fn+".binphr.srctree"),
ift(fn+".binphr.tgtdata"),
ifi(fn+".binphr.idx"),
ifsv(fn+".binphr.srcvoc"),
iftv(fn+".binphr.tgtvoc");
const StaticData &staticData = StaticData::Instance();
std::string ifs, ift, ifi, ifsv, iftv;
if (staticData.UseAlignmentInfo()){//asking for word-to-word alignment
if (!FileExists(fn+".binphr.srctree.wa") || !FileExists(fn+".binphr.tgtdata.wa")){
// ERROR
std::stringstream strme;
strme << "You are asking for word alignment but the binary phrase table does not contain any alignment info. Please check if you had generated the correct phrase table with word alignment (.wa)\n";
UserMessage::Add(strme.str());
return false;
}
ifs=fn+".binphr.srctree.wa";
ift=fn+".binphr.tgtdata.wa";
ifi=fn+".binphr.idx";
ifsv=fn+".binphr.srcvoc";
iftv=fn+".binphr.tgtvoc";
UseWordAlignment(true);
}
else{
if (!FileExists(fn+".binphr.srctree") || !FileExists(fn+".binphr.tgtdata")){
// ERROR
std::stringstream strme;
strme << "You are asking binary phrase table without word alignments but the file do not exist. Please check if you had generated the correct phrase table without word alignment (" << (fn+".binphr.srctree") << "," << (fn+".binphr.tgtdata")<< ")\n";
UserMessage::Add(strme.str());
return false;
}
ifs=fn+".binphr.srctree";
ift=fn+".binphr.tgtdata";
ifi=fn+".binphr.idx";
ifsv=fn+".binphr.srcvoc";
iftv=fn+".binphr.tgtvoc";
UseWordAlignment(false);
}
FILE *ii=fOpen(ifi.c_str(),"rb");
fReadVector(ii,srcOffsets);
@ -210,13 +309,22 @@ int PDTimp::Read(const std::string& fn)
void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const
{
for(size_t i=0;i<tcand.size();++i)
{
out<<i<<" -- "<<tcand[i].GetScores()<<" -- ";
const IPhrase& iphr=tcand[i].GetPhrase();
for(size_t j=0;j<iphr.size();++j)
out<<tv.symbol(iphr[j])<<" ";
out<<'\n';
}
{
Scores sc=tcand[i].GetScores();
WordAlignments srcAlign=tcand[i].GetSourceAlignment();
WordAlignments trgAlign=tcand[i].GetTargetAlignment();
const IPhrase& iphr=tcand[i].GetPhrase();
out << i << " -- " << sc << " -- ";
for(size_t j=0;j<iphr.size();++j) out << tv.symbol(iphr[j])<<" ";
out<< " -- ";
for (size_t j=0;j<srcAlign.size();j++) out << " " << srcAlign[j];
out << " -- ";
for (size_t j=0;j<trgAlign.size();j++) out << " " << trgAlign[j];
out << std::endl;
}
}
////////////////////////////////////////////////////////////
@ -241,6 +349,13 @@ PhraseDictionaryTree::~PhraseDictionaryTree()
{
delete imp;
}
// The four accessors below forward to the implementation object (imp).
void PhraseDictionaryTree::UseWordAlignment(bool a)
{
  imp->UseWordAlignment(a);
}

bool PhraseDictionaryTree::UseWordAlignment()
{
  return imp->UseWordAlignment();
}

void PhraseDictionaryTree::PrintWordAlignment(bool a)
{
  imp->PrintWordAlignment(a);
}

bool PhraseDictionaryTree::PrintWordAlignment()
{
  return imp->PrintWordAlignment();
}
void PhraseDictionaryTree::FreeMemory() const
{
imp->FreeMemory();
@ -262,6 +377,25 @@ GetTargetCandidates(const std::vector<std::string>& src,
imp->ConvertTgtCand(tgtCands,rv);
}
void PhraseDictionaryTree::
GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv,
std::vector<StringWordAlignmentCand>& swa,
std::vector<StringWordAlignmentCand>& twa) const
{
IPhrase f(src.size());
for(size_t i=0;i<src.size();++i)
{
f[i]=imp->sv.index(src[i]);
if(f[i]==InvalidLabelId) return;
}
TgtCands tgtCands;
imp->GetTargetCandidates(f,tgtCands);
imp->ConvertTgtCand(tgtCands,rv,swa,twa);
}
void PhraseDictionaryTree::
PrintTargetCandidates(const std::vector<std::string>& src,
std::ostream& out) const
@ -280,7 +414,6 @@ PrintTargetCandidates(const std::vector<std::string>& src,
TgtCands tcand;
imp->GetTargetCandidates(f,tcand);
out<<"there are "<<tcand.size()<<" target candidates\n";
imp->PrintTgtCand(tcand,out);
}
@ -294,7 +427,12 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
ofi(out+".binphr.idx"),
ofsv(out+".binphr.srcvoc"),
oftv(out+".binphr.tgtvoc");
if (PrintWordAlignment()){
ofn+=".wa";
oft+=".wa";
}
FILE *os=fOpen(ofn.c_str(),"wb"),
*ot=fOpen(oft.c_str(),"wb");
@ -309,108 +447,175 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
while(getline(inFile, line))
{
++lnc;
std::vector<std::string> tokens = TokenizeMultiCharSeparator( line , "|||" );
if (numElement == NOT_FOUND)
{ // init numElement
numElement = tokens.size();
assert(numElement == 3 || numElement == 5);
}
if (tokens.size() != numElement)
{
++lnc;
std::stringstream strme;
strme << "Syntax error at line " << lnc << " : " << line;
UserMessage::Add(strme.str());
abort();
}
std::string sourcePhraseString, targetPhraseString;
std::string scoreString;
std::string sourceAlignString, targetAlignString;
sourcePhraseString=tokens[0];
targetPhraseString=tokens[1];
if (numElement==3){
scoreString=tokens[2];
}
else{
sourceAlignString=tokens[2];
targetAlignString=tokens[3];
scoreString=tokens[4];
}
IPhrase f,e;
Scores sc;
WordAlignments sourceAlignment, targetAlignment;
std::vector<std::string> tokens = TokenizeMultiCharSeparator( line , "|||" );
if (numElement == NOT_FOUND)
{ // init numElement
numElement = tokens.size();
assert(numElement == 3 || numElement == 5);
}
else if (tokens.size() != numElement)
std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
for (size_t i = 0 ; i < wordVec.size() ; ++i)
f.push_back(imp->sv.add(wordVec[i]));
wordVec = Tokenize(targetPhraseString);
for (size_t i = 0 ; i < wordVec.size() ; ++i)
e.push_back(imp->tv.add(wordVec[i]));
if (!PrintWordAlignment()){// word-to-word alignment are not used, create empty word-to-word alignment
EmptyAlignment(sourceAlignString, f.size());
EmptyAlignment(targetAlignString, e.size());
}
else if (numElement==3){
stringstream strme;
strme << "You are asking for AlignmentInfo, but this info not available in the Phrase Table. Only " <<numElement<<" fields on line " << lnc << " : " << line;
strme << endl << "Deleting files " << ofn << " and " << oft << "..." << endl;
if( remove( ofn.c_str() ) != 0 ) strme << "Error deleting file " << ofn;
else strme << "File " << ofn << " successfully deleted";
strme << endl;
if( remove( oft.c_str() ) != 0 ) strme << "Error deleting file " << oft;
else strme << "File " << oft << " successfully deleted";
strme << endl;
UserMessage::Add(strme.str());
exit(1);
}
//change "()" into "(-1)" for both source and target word-to-word alignments
std::string emtpyAlignStr="()";
std::string replaceAlignStr="(-1)";
sourceAlignString=Replace(sourceAlignString,emtpyAlignStr,replaceAlignStr);
targetAlignString=Replace(targetAlignString,emtpyAlignStr,replaceAlignStr);
//remove all "(" from both source and target word-to-word alignments
emtpyAlignStr="(";
replaceAlignStr="";
sourceAlignString=Replace(sourceAlignString,emtpyAlignStr,replaceAlignStr);
targetAlignString=Replace(targetAlignString,emtpyAlignStr,replaceAlignStr);
//remove all ")" from both source and target word-to-word alignments
emtpyAlignStr=")";
replaceAlignStr="";
sourceAlignString=Replace(sourceAlignString,emtpyAlignStr,replaceAlignStr);
targetAlignString=Replace(targetAlignString,emtpyAlignStr,replaceAlignStr);
sourceAlignment = Tokenize(sourceAlignString);
targetAlignment = Tokenize(targetAlignString);
// while(is>>w && w!="|||") sc.push_back(atof(w.c_str()));
// Mauro: to handle 0 probs in phrase tables
std::vector<float> scoreVector = Tokenize<float>(scoreString);
for (size_t i = 0 ; i < scoreVector.size() ; ++i)
{
float tmp = scoreVector[i];
sc.push_back(((tmp>0.0)?tmp:(float)1.0e-38));
}
if(f.empty())
{
TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
continue;
}
if(currFirstWord==InvalidLabelId) currFirstWord=f[0];
if(currF.empty())
{
++count;
currF=f;
// insert src phrase in prefix tree
assert(psa);
PSA::Data& d=psa->insert(f);
if(d==InvalidOffT) d=fTell(ot);
else
{
std::stringstream strme;
strme << "Syntax error at line " << lnc << " : " << line;
UserMessage::Add(strme.str());
TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '"
<<line<<"'\nf: "<<f<<"\n");
abort();
}
IPhrase f,e;Scores sc;
std::vector<std::string> wordVec = Tokenize(tokens[0]);
for (size_t i = 0 ; i < wordVec.size() ; ++i)
f.push_back(imp->sv.add(wordVec[i]));
}
wordVec = Tokenize(tokens[1]);
for (size_t i = 0 ; i < wordVec.size() ; ++i)
e.push_back(imp->tv.add(wordVec[i]));
// while(is>>w && w!="|||") sc.push_back(atof(w.c_str()));
// Mauro: to handle 0 probs in phrase tables
std::vector<float> scoreVector = Tokenize<float>(tokens[(numElement==3) ? 2 : 4]);
for (size_t i = 0 ; i < scoreVector.size() ; ++i)
if(currF!=f)
{
// new src phrase
currF=f;
if (PrintWordAlignment())
tgtCands.writeBinWithAlignment(ot);
else
tgtCands.writeBin(ot);
tgtCands.clear();
if(++count%10000==0)
{
float tmp = scoreVector[i];
sc.push_back(((tmp>0.0)?tmp:(float)1.0e-38));
TRACE_ERR(".");
if(count%500000==0) TRACE_ERR("[phrase:"<<count<<"]\n");
}
if(f[0]!=currFirstWord)
{
// write src prefix tree to file and clear
PTF pf;
if(currFirstWord>=vo.size())
vo.resize(currFirstWord+1,InvalidOffT);
vo[currFirstWord]=fTell(os);
pf.create(*psa,os);
// clear
delete psa;psa=new PSA;
currFirstWord=f[0];
}
if(f.empty())
{
TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
continue;
}
if(currFirstWord==InvalidLabelId) currFirstWord=f[0];
if(currF.empty())
{
++count;
currF=f;
// insert src phrase in prefix tree
assert(psa);
PSA::Data& d=psa->insert(f);
if(d==InvalidOffT) d=fTell(ot);
else
{
TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '"
<<line<<"'\nf: "<<f<<"\n");
abort();
}
}
if(currF!=f)
{
// new src phrase
currF=f;
tgtCands.writeBin(ot);
tgtCands.clear();
if(++count%10000==0)
{
TRACE_ERR(".");
if(count%500000==0) TRACE_ERR("[phrase:"<<count<<"]\n");
}
if(f[0]!=currFirstWord)
{
// write src prefix tree to file and clear
PTF pf;
if(currFirstWord>=vo.size())
vo.resize(currFirstWord+1,InvalidOffT);
vo[currFirstWord]=fTell(os);
pf.create(*psa,os);
// clear
delete psa;psa=new PSA;
currFirstWord=f[0];
}
// insert src phrase in prefix tree
assert(psa);
PSA::Data& d=psa->insert(f);
if(d==InvalidOffT) d=fTell(ot);
else
{
TRACE_ERR("ERROR: xsource phrase already inserted (B)!\nline(" << lnc << "): '"
<<line<<"'\nf: "<<f<<"\n");
abort();
}
}
tgtCands.push_back(TgtCand(e,sc));
assert(currFirstWord!=InvalidLabelId);
// insert src phrase in prefix tree
assert(psa);
PSA::Data& d=psa->insert(f);
if(d==InvalidOffT) d=fTell(ot);
else
{
TRACE_ERR("ERROR: xsource phrase already inserted (B)!\nline(" << lnc << "): '"
<<line<<"'\nf: "<<f<<"\n");
abort();
}
}
tgtCands.writeBin(ot);tgtCands.clear();
tgtCands.push_back(TgtCand(e,sc, sourceAlignment, targetAlignment));
assert(currFirstWord!=InvalidLabelId);
}
if (PrintWordAlignment())
tgtCands.writeBinWithAlignment(ot);
else
tgtCands.writeBin(ot);
tgtCands.clear();
PTF pf;
if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT);
vo[currFirstWord]=fTell(os);
@ -484,8 +689,17 @@ GetTargetCandidates(PrefixPtr p,
imp->ConvertTgtCand(tcands,rv);
}
std::string PhraseDictionaryTree::GetScoreProducerDescription() const
void PhraseDictionaryTree::
GetTargetCandidates(PrefixPtr p,
std::vector<StringTgtCand>& rv,
std::vector<StringWordAlignmentCand>& swa,
std::vector<StringWordAlignmentCand>& twa) const
{
return "Phrase dictionary tree";
TgtCands tcands;
imp->GetTargetCandidates(p,tcands);
imp->ConvertTgtCand(tcands,rv,swa,twa);
}
// fixed description string for this score producer
std::string PhraseDictionaryTree::GetScoreProducerDescription() const
{
  return "Phrase dictionary tree";
}

View File

@ -8,11 +8,21 @@
#include "TypeDef.h"
#include "Dictionary.h"
#include "PrefixTree.h"
#include "File.h"
#include "ObjectPool.h"
#include "LVoc.h"
#include "TypeDef.h"
#include "Util.h"
#include "StaticData.h"
class Phrase;
class Word;
class ConfusionNet;
typedef std::pair<std::vector<std::string const*>,std::vector<float> > StringTgtCand;
typedef PrefixTreeF<LabelId,OFF_T> PTF;
class PDTimp;
class PPimp;
@ -24,7 +34,14 @@ class PhraseDictionaryTree : public Dictionary {
PhraseDictionaryTree(const PhraseDictionaryTree&); //not implemented
void operator=(const PhraseDictionaryTree&); //not implemented
public:
PhraseDictionaryTree(size_t numScoreComponent);
PhraseDictionaryTree(size_t numScoreComponent);
void UseWordAlignment(bool a);
bool UseWordAlignment();
void PrintWordAlignment(bool a);
bool PrintWordAlignment();
virtual ~PhraseDictionaryTree();
@ -48,10 +65,16 @@ public:
// print target candidates for a given phrase, mainly for debugging
void PrintTargetCandidates(const std::vector<std::string>& src,
std::ostream& out) const;
// get the target candidates for a given phrase
void GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv) const;
// get the target candidates for a given phrase
void GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv,
std::vector<StringWordAlignmentCand>& swa,
std::vector<StringWordAlignmentCand>& twa) const;
/*****************************
* access to prefix tree *
@ -81,6 +104,10 @@ public:
// requirement: the pointer has to evaluate to true
void GetTargetCandidates(PrefixPtr p,
std::vector<StringTgtCand>& rv) const;
void GetTargetCandidates(PrefixPtr p,
std::vector<StringTgtCand>& rv,
std::vector<StringWordAlignmentCand>& swa,
std::vector<StringWordAlignmentCand>& twa) const;
// print target candidates for a given prefix pointer to a stream, mainly
// for debugging

View File

@ -55,8 +55,6 @@ bool PhraseDictionaryTreeAdaptor::Load(const std::vector<FactorType> &input
, float weightWP
)
{
FactorCollection &factorCollection = FactorCollection::Instance();
if(m_numScoreComponent!=weight.size()) {
stringstream strme;
strme << "ERROR: mismatch of number of scaling factors: "<<weight.size()
@ -84,6 +82,7 @@ PhraseDictionaryTreeAdaptor::GetTargetPhraseCollection(Phrase const &src) const
{
return imp->GetTargetPhraseCollection(src);
}
TargetPhraseCollection const*
PhraseDictionaryTreeAdaptor::GetTargetPhraseCollection(InputType const& src,WordsRange const &range) const
{

View File

@ -115,7 +115,21 @@ bool StaticData::LoadData(Parameter *parameter)
if (m_parameter->GetParam("factor-delimiter").size() > 0) {
m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
}
//word-to-word alignment
SetBooleanParameter( &m_UseAlignmentInfo, "use-alignment-info", false );
SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
if (!m_UseAlignmentInfo && m_PrintAlignmentInfo){
TRACE_ERR("--print-alignment-info should only be used together with \"--use-alignment-info true\". Continue forcing to false.\n");
m_PrintAlignmentInfo=false;
}
if (!m_UseAlignmentInfo && m_PrintAlignmentInfoNbest){
TRACE_ERR("--print-alignment-info-in-n-best should only be used together with \"--use-alignment-info true\". Continue forcing to false.\n");
m_PrintAlignmentInfoNbest=false;
}
// n-best
if (m_parameter->GetParam("n-best-list").size() >= 2)
{
@ -731,6 +745,7 @@ bool StaticData::LoadPhraseTables()
IFVERBOSE(1)
PrintUserTime(string("Start loading PhraseTable ") + filePath);
std::cerr << "filePath: " << filePath << std::endl;
if (!FileExists(filePath+".binphr.idx"))
{ // memory phrase table
VERBOSE(2,"using standard phrase tables");

View File

@ -111,7 +111,10 @@ protected:
bool m_isDetailedTranslationReportingEnabled;
bool m_onlyDistinctNBest;
bool m_computeLMBackoffStats;
bool m_UseAlignmentInfo;
bool m_PrintAlignmentInfo;
bool m_PrintAlignmentInfoNbest;
mutable std::auto_ptr<SentenceStats> m_sentenceStats;
std::string m_factorDelimiter; //! by default, |, but it can be changed
size_t m_maxFactorIdx[2]; //! number of factors on source and target side
@ -392,6 +395,10 @@ public:
const UnknownWordPenaltyProducer *GetUnknownWordPenaltyProducer() const { return m_unknownWordPenaltyProducer; }
bool UseDistortionFutureCosts() const {return m_useDistortionFutureCosts;}
//! value of the m_UseAlignmentInfo flag (word-to-word alignment handling switch)
bool UseAlignmentInfo() const
{
	return m_UseAlignmentInfo;
}
//! overwrite the m_UseAlignmentInfo flag with \c a
void UseAlignmentInfo(bool a)
{
	m_UseAlignmentInfo = a;
}
//! value of the m_PrintAlignmentInfo flag
bool PrintAlignmentInfo() const
{
	return m_PrintAlignmentInfo;
}
//! value of the m_PrintAlignmentInfoNbest flag
bool PrintAlignmentInfoInNbest() const
{
	return m_PrintAlignmentInfoNbest;
}
bool GetDistinctNBest() const {return m_onlyDistinctNBest;}
const std::string& GetFactorDelimiter() const {return m_factorDelimiter;}
size_t GetMaxNumFactors(FactorDirection direction) const { return m_maxFactorIdx[(size_t)direction]+1; }

View File

@ -33,19 +33,30 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
bool TargetPhrase::wordalignflag=StaticData::Instance().UseAlignmentInfo();
bool TargetPhrase::printalign=StaticData::Instance().PrintAlignmentInfo();
//bool TargetPhrase::wordalignflag;
//bool TargetPhrase::printalign;
/** Build an empty target phrase for the given factor direction.
 *  All three scores start at zero and no source phrase is attached.
 *  The class-wide flags wordalignflag / printalign are re-read from
 *  StaticData on every construction.
 */
TargetPhrase::TargetPhrase(FactorDirection direction)
	: Phrase(direction)
	, m_transScore(0.0)
	, m_ngramScore(0.0)
	, m_fullScore(0.0)
	, m_sourcePhrase(0)
{
	// NOTE(review): presumably repeated here because the static initialisers of
	// these flags may run before StaticData has loaded its parameters - confirm.
	wordalignflag = StaticData::Instance().UseAlignmentInfo();
	printalign    = StaticData::Instance().PrintAlignmentInfo();
}
/** Scoring used when creating translations of unknown words:
 *  translation and n-gram scores are zeroed, and the full score is the
 *  negated word-penalty weight taken from StaticData.
 */
void TargetPhrase::SetScore()
{
	m_ngramScore = 0;
	m_transScore = 0;
	m_fullScore  = -(StaticData::Instance().GetWeightWordPenalty());
}
void TargetPhrase::SetAlignment()
{
m_alignmentPair.SetIdentityAlignment();
}
void TargetPhrase::SetScore(float score)
{
//we use an existing score producer to figure out information for score setting (number of scores and weights)
@ -72,24 +83,24 @@ void TargetPhrase::SetScore(float score)
//Now we have what we need to call the full SetScore method
SetScore(prod,scoreVector,weights,StaticData::Instance().GetWeightWordPenalty(),StaticData::Instance().GetAllLM());
}
void TargetPhrase::SetScore(const ScoreProducer* translationScoreProducer,
const vector<float> &scoreVector, const vector<float> &weightT,
const Scores &scoreVector,
const vector<float> &weightT,
float weightWP, const LMList &languageModels)
{
assert(weightT.size() == scoreVector.size());
// calc average score if non-best
m_transScore = std::inner_product(scoreVector.begin(), scoreVector.end(), weightT.begin(), 0.0f);
m_scoreBreakdown.PlusEquals(translationScoreProducer, scoreVector);
// Replicated from TranslationOptions.cpp
float totalFutureScore = 0;
float totalNgramScore = 0;
float totalFullScore = 0;
LMList::const_iterator lmIter;
for (lmIter = languageModels.begin(); lmIter != languageModels.end(); ++lmIter)
{
@ -99,10 +110,10 @@ void TargetPhrase::SetScore(const ScoreProducer* translationScoreProducer,
{ // contains factors used by this LM
const float weightLM = lm.GetWeight();
float fullScore, nGramScore;
lm.CalcScore(*this, fullScore, nGramScore);
m_scoreBreakdown.Assign(&lm, nGramScore);
// total LM score so far
totalNgramScore += nGramScore * weightLM;
totalFullScore += fullScore * weightLM;
@ -110,9 +121,9 @@ void TargetPhrase::SetScore(const ScoreProducer* translationScoreProducer,
}
}
m_ngramScore = totalNgramScore;
m_fullScore = m_transScore + totalFutureScore + totalFullScore
- (this->GetSize() * weightWP); // word penalty
- (this->GetSize() * weightWP); // word penalty
}
void TargetPhrase::SetWeights(const ScoreProducer* translationScoreProducer, const vector<float> &weightT)
@ -158,11 +169,151 @@ TargetPhrase *TargetPhrase::MergeNext(const TargetPhrase &inputPhrase) const
return clone;
}
/** Helper: parse one side of a phrase-table word-alignment string and append
 *  one AlignmentElement per word through \c inserter.
 *
 *  \param inserter          receives a pointer to each newly allocated AlignmentElement
 *                           (NOTE(review): ownership presumably passes to the
 *                           alignment phrase behind the inserter - confirm)
 *  \param str               alignment string such as "(0) (3) (1,2)"; "()" is read as "(-1)"
 *  \param phraseSize        expected number of tokens in \c str (asserted)
 *  \param otherPhraseSize   size handed to SetUniformAlignment() for words without alignment
 *  \param uniformAlignment  collects the position of every word that received a uniform alignment
 */
void AddAlignmentElement(AlignmentPhraseInserter &inserter
, const string &str
, size_t phraseSize
, size_t otherPhraseSize
, list<size_t> &uniformAlignment)
{
	// "(0) (3) (1,2)"  ->  "(0)" "(3)" "(1,2)"
	const vector<string> alignPhraseVector = Tokenize(str);
	assert (alignPhraseVector.size() == phraseSize) ;

	const size_t inputSize = alignPhraseVector.size();
	for (size_t pos = 0 ; pos < inputSize ; ++pos)
	{
		// Do not declare another 'pos' in this body: it would hide the loop
		// counter and every uniform alignment would be recorded at position 0.
		string alignElementStr = alignPhraseVector[pos];

		// change "()" into "(-1)" for both source and target word-to-word alignments
		alignElementStr = Replace(alignElementStr, "()", "(-1)");
		// remove all "(" and ")" so that only the comma-separated indices remain
		alignElementStr = Replace(alignElementStr, "(", "");
		alignElementStr = Replace(alignElementStr, ")", "");

		// "1,2"  ->  [1] [2]
		AlignmentElement *alignElement = new AlignmentElement(Tokenize<AlignmentElementType>(alignElementStr, ","));

		if (alignElement->GetSize() == 0)
		{ // no alignment info. add uniform alignment, ie. can be aligned to any word
			alignElement->SetUniformAlignment(otherPhraseSize);
			uniformAlignment.push_back(pos);
		}

		**inserter = alignElement;
		(*inserter)++;
	}
}
/** Helper: append one AlignmentElement per entry of \c wa through \c inserter.
 *  Each entry is a comma-separated index list (e.g. "1,2") that is split
 *  directly on ','.
 *
 *  \param inserter          receives a pointer to each newly allocated AlignmentElement
 *  \param wa                per-word alignment strings for this side
 *  \param phraseSize        expected number of entries in \c wa (asserted)
 *  \param otherPhraseSize   size handed to SetUniformAlignment() for words without alignment
 *  \param uniformAlignment  collects the position of every word that received a uniform alignment
 */
void AddAlignmentElement(AlignmentPhraseInserter &inserter
, const WordAlignments &wa
, size_t phraseSize
, size_t otherPhraseSize
, list<size_t> &uniformAlignment)
{
	assert (wa.size() == phraseSize) ;

	const size_t numWords = wa.size();
	size_t wordIdx = 0;
	while (wordIdx < numWords)
	{
		const string &entry = wa[wordIdx];

		// "1,2"  ->  [1] [2]
		AlignmentElement *element = new AlignmentElement(Tokenize<AlignmentElementType>(entry, ","));

		const bool hasNoAlignment = (element->GetSize() == 0);
		if (hasNoAlignment)
		{ // nothing listed for this word: let it align uniformly to the other side
			element->SetUniformAlignment(otherPhraseSize);
			uniformAlignment.push_back(wordIdx);
		}

		**inserter = element;
		(*inserter)++;

		++wordIdx;
	}
}
/** Fill the alignment pair from pre-split word alignments.
 *
 *  When UseWordAlignment() is false, the supplied alignments are ignored:
 *  uniform alignment strings are generated for both sides and handed to the
 *  string overload, because the internal structure requires alignments to be
 *  present. Otherwise \c swa fills the Input side and \c twa the Output side.
 *
 *  \param swa  source-to-target word alignments, one entry per source word
 *  \param twa  target-to-source word alignments, one entry per target word
 */
void TargetPhrase::CreateAlignmentInfo(const WordAlignments &swa
, const WordAlignments &twa)
{
	AlignmentPhraseInserter sourceInserter = m_alignmentPair.GetInserter(Input);
	AlignmentPhraseInserter targetInserter = m_alignmentPair.GetInserter(Output);
	list<size_t> uniformAlignmentSource, uniformAlignmentTarget;

	const size_t sourceSize = m_sourcePhrase->GetSize();
	const size_t targetSize = GetSize();

	if (UseWordAlignment())
	{
		AddAlignmentElement(sourceInserter, swa, sourceSize, targetSize, uniformAlignmentSource);
		AddAlignmentElement(targetInserter, twa, targetSize, sourceSize, uniformAlignmentTarget);
	}
	else
	{ // word alignments switched off: synthesise uniform ones for both sides
		std::string srcAlignStr, trgAlignStr;
		UniformAlignment(srcAlignStr, sourceSize, targetSize);
		UniformAlignment(trgAlignStr, targetSize, sourceSize);
		CreateAlignmentInfo(srcAlignStr, trgAlignStr);
	}

	// propagating uniform alignments to the other side is currently disabled:
	// m_alignmentPair.GetAlignmentPhrase(Output).AddUniformAlignmentElement(uniformAlignmentSource);
	// m_alignmentPair.GetAlignmentPhrase(Input).AddUniformAlignmentElement(uniformAlignmentTarget);
}
/** Fill the alignment pair from the alignment strings of a phrase-table entry.
 *
 *  \param sourceStr  source-side alignment string; must hold one token per source word
 *  \param targetStr  target-side alignment string; must hold one token per target word
 */
void TargetPhrase::CreateAlignmentInfo(const string &sourceStr
, const string &targetStr)
{
	AlignmentPhraseInserter sourceInserter = m_alignmentPair.GetInserter(Input);
	AlignmentPhraseInserter targetInserter = m_alignmentPair.GetInserter(Output);
	list<size_t> uniformAlignmentSource, uniformAlignmentTarget;

	const size_t sourceSize = m_sourcePhrase->GetSize();
	const size_t targetSize = GetSize();

	// Input side: one element per source word
	AddAlignmentElement(sourceInserter, sourceStr, sourceSize, targetSize, uniformAlignmentSource);
	// Output side: one element per target word
	AddAlignmentElement(targetInserter, targetStr, targetSize, sourceSize, uniformAlignmentTarget);

	// propagating uniform alignments to the other side is currently disabled:
	// m_alignmentPair.GetAlignmentPhrase(Output).AddUniformAlignmentElement(uniformAlignmentSource);
	// m_alignmentPair.GetAlignmentPhrase(Input).AddUniformAlignmentElement(uniformAlignmentTarget);
}
TO_STRING_BODY(TargetPhrase);
/** Stream a target phrase exactly once as
 *  "<phrase>, pC=<translation score>, c=<full score>", followed by
 *  ", <alignment pair>" when PrintAlignmentInfo() is enabled.
 */
std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp)
{
	os << static_cast<const Phrase&>(tp);
	os << ", pC=" << tp.m_transScore << ", c=" << tp.m_fullScore;
	if (tp.PrintAlignmentInfo())
		os << ", " << tp.GetAlignmentPair();
	return os;
}

View File

@ -22,15 +22,17 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#pragma once
#include <vector>
#include "TypeDef.h"
#include "Phrase.h"
#include "ScoreComponentCollection.h"
#include "AlignmentPair.h"
class LMList;
class PhraseDictionary;
class GenerationDictionary;
class ScoreProducer;
/** represents an entry on the target side of a phrase table (scores, translation)
/** represents an entry on the target side of a phrase table (scores, translation, alignment)
*/
class TargetPhrase: public Phrase
{
@ -39,11 +41,22 @@ protected:
float m_transScore, m_ngramScore, m_fullScore;
//float m_ngramScore, m_fullScore;
ScoreComponentCollection m_scoreBreakdown;
AlignmentPair m_alignmentPair;
// in case of confusion net, ptr to source phrase
Phrase const* m_sourcePhrase;
static bool wordalignflag;
static bool printalign;
public:
TargetPhrase(FactorDirection direction=Output);
TargetPhrase(FactorDirection direction=Output);
~TargetPhrase(){};
/** used by the unknown word handler.
* Set alignment to 0
*/
void SetAlignment();
//! used by the unknown word handler- these targets
//! don't have a translation score, so wp is the only thing used
@ -62,13 +75,14 @@ public:
* @param weightWP the weight of the word penalty
*
* @TODO should this be part of the constructor? If not, add explanation why not.
*/
*/
void SetScore(const ScoreProducer* translationScoreProducer,
const std::vector<float> &scoreVector,
const Scores &scoreVector,
const std::vector<float> &weightT,
float weightWP,
const LMList &languageModels);
// used when creating translations of unknown words:
void ResetScore();
void SetWeights(const ScoreProducer*, const std::vector<float> &weightT);
@ -104,6 +118,33 @@ public:
{
return m_sourcePhrase;
}
//! mutable access to the alignment pair stored in this phrase
AlignmentPair &GetAlignmentPair()
{
	AlignmentPair &pair = m_alignmentPair;
	return pair;
}
//! read-only access to the alignment pair stored in this phrase
const AlignmentPair &GetAlignmentPair() const
{
	const AlignmentPair &pair = m_alignmentPair;
	return pair;
}
/** Parse the alignment info portion of phrase table string to create alignment info */
void CreateAlignmentInfo(const std::string &sourceStr
, const std::string &targetStr);
void CreateAlignmentInfo(const WordAlignments &swa
, const WordAlignments &twa);
//! set the static wordalignflag to \c a
void UseWordAlignment(bool a)
{
	wordalignflag = a;
}
//! current value of the static wordalignflag
bool UseWordAlignment() const
{
	return wordalignflag;
}
//! set the static printalign flag to \c a
void PrintAlignmentInfo(bool a)
{
	printalign = a;
}
//! current value of the static printalign flag
bool PrintAlignmentInfo() const
{
	return printalign;
}
TO_STRING();
};

View File

@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Util.h"
#include "TypeDef.h"
#include "ScoreComponentCollection.h"
#include "AlignmentPair.h"
#include "StaticData.h"
class PhraseDictionary;
@ -56,7 +57,7 @@ class TranslationOption
protected:
Phrase m_targetPhrase; /*< output phrase when using this translation option */
TargetPhrase m_targetPhrase; /*< output phrase when using this translation option */
Phrase *m_sourcePhrase; /*< input phrase translated by this */
const WordsRange m_sourceWordsRange; /*< word position in the input that are covered by this translation option */
float m_futureScore; /*< estimate of total cost when using this translation option, includes language model probabilities */
@ -97,7 +98,7 @@ public:
void MergeNewFeatures(const Phrase& phrase, const ScoreComponentCollection& score, const std::vector<FactorType>& featuresToMerge);
/** returns target phrase */
inline const Phrase &GetTargetPhrase() const
inline const TargetPhrase &GetTargetPhrase() const
{
return m_targetPhrase;
}
@ -172,7 +173,7 @@ public:
/** Calculate future score and n-gram score of this trans option, plus the score breakdowns */
void CalcScore();
void CacheReorderingProb(const LexicalReordering &lexreordering
, const Score &score);

View File

@ -222,13 +222,16 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,
}
targetPhrase.SetScore();
targetPhrase.SetSourcePhrase(m_unksrc);
targetPhrase.SetSourcePhrase(m_unksrc);
//create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation
targetPhrase.CreateAlignmentInfo("(0)","(0)");
transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase, m_source, 0);
}
else
{ // drop source word. create blank trans opt
TargetPhrase targetPhrase(Output);
targetPhrase.SetSourcePhrase(m_unksrc);
targetPhrase.SetAlignment();
transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase, m_source, 0);
}
@ -337,6 +340,7 @@ void TranslationOptionCollection::CreateTranslationOptions(const vector <DecodeG
// in the phraseDictionary (which is the- possibly filtered-- phrase
// table loaded on initialization), generate TranslationOption objects
// for all phrases
size_t size = m_source.GetSize();
for (size_t startVL = 0 ; startVL < decodeStepVL.size() ; startVL++)
{
@ -440,7 +444,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange(
static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslation
(m_source, *oldPtoc
, startPos, endPos, adhereTableLimit );
// do rest of decode steps
int indexStep = 0;
for (++iterStep ; iterStep != decodeStepList.end() ; ++iterStep)
@ -503,12 +507,11 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange(
if (useCache)
delete sourcePhrase;
} // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))
if ((StaticData::Instance().GetXmlInputType() != XmlPassThrough) && HasXmlOptionsOverlappingRange(startPos,endPos))
{
CreateXmlOptionsForRange(startPos, endPos);
}
}
/** Check if this range overlaps with any XML options. This doesn't need to be an exact match, only an overlap.

View File

@ -169,3 +169,11 @@ typedef uint32_t UINT32;
class TranslationOption;
typedef std::vector<TranslationOption*> TranslationOptionList;
typedef std::vector<float> Scores;
typedef std::vector<std::string> WordAlignments;
typedef std::pair<std::vector<std::string const*>,Scores > StringTgtCand;
typedef std::pair<std::vector<std::string const*>,WordAlignments > StringWordAlignmentCand;

View File

@ -21,6 +21,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#pragma once
#include <iostream>
#include <cassert>
#include <fstream>
#include <sstream>
@ -106,6 +107,17 @@ inline std::vector<T> Scan(const std::vector< std::string > &input)
return output;
}
/** Replace all non-overlapping occurrences of todelStr in str with toaddStr.
 *
 *  The search resumes right after each inserted replacement, so text that
 *  was just inserted is never matched again. This guarantees termination
 *  even when toaddStr contains todelStr, and it does not skip adjacent
 *  matches when toaddStr is empty.
 *
 *  \param str       input string (left untouched)
 *  \param todelStr  substring to look for; if empty, str is returned unchanged
 *  \param toaddStr  text substituted for every match (may be empty)
 *  \return          a copy of str with the substitutions applied
 */
inline std::string Replace(const std::string& str,
                           const std::string& todelStr,
                           const std::string& toaddStr)
{
	std::string newStr = str;
	if (todelStr.empty())
		return newStr; // an empty pattern matches everywhere; nothing sensible to replace

	size_t pos = 0;
	while ((pos = newStr.find(todelStr, pos)) != std::string::npos)
	{
		newStr.replace(pos, todelStr.size(), toaddStr);
		pos += toaddStr.size(); // continue after the replacement, not one char after the match start
	}
	return newStr;
}
/** tokenise input string to vector of string. each element has been separated by a character in the delimiters argument.
The separator can only be 1 character long. The default delimiters are space or tab
*/
@ -259,7 +271,6 @@ void RemoveAllInColl(COLL &coll)
delete (*iter);
}
coll.clear();
}
//! x-platform reference to temp folder