/**
 * Common lossy counting phrase extraction functionality implementation.
 *
 * Note: The bulk of this unit is based on Philipp Koehn's code from
 * phrase-extract/extract.cpp.
 *
 * (C) Moses: http://www.statmt.org/moses/
 * (C) Ceslav Przywara, UFAL MFF UK, 2011
 *
 * $Id$
 */

#include <iostream>
#include <iomanip>
#include <sstream>
#include <algorithm> // std::sort, std::min, std::max
#include <cstring>   // memcmp, strcmp

#include "phrase-extract.h"
#include "ISS.h"
// I'm using my own version of SafeGetline (without "using namespace std;"):
#include "SafeGetline.h"


#define LINE_MAX_LENGTH 60000


//////// Helping functions ////////

// For sorted output.
typedef std::pair<indexed_phrases_pair_t, PhrasePairsLossyCounter::frequency_t> output_pair_t;
typedef std::vector<output_pair_t> output_vector_t;

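/**
 * Comparison functor for sorting collected phrase pairs before output:
 * compares by source phrase first (or by target phrase first when inverted),
 * then by the other phrase, then by the word alignment points.
 */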
class PhraseComp {

  /** @var If true, sort by target phrase first. */
  bool _inverted;

  bool compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b);

  int comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b);

public:
  PhraseComp(bool inverted): _inverted(inverted) {}

  bool operator()(const output_pair_t& a, const output_pair_t& b);
};


void processSortedOutput(OutputProcessor& processor);

void processUnsortedOutput(OutputProcessor& processor);

void flushPhrasePair(OutputProcessor& processor, const indexed_phrases_pair_t& indexedPhrasePair, PhrasePairsLossyCounter::frequency_t frequency, int mode);


//////// Define variables declared as extern in the header /////////////////////

bool allModelsOutputFlag = false;

bool wordModel = false; // IBM word model.
REO_MODEL_TYPE wordType = REO_MSD;
bool phraseModel = false; // Std phrase-based model.
REO_MODEL_TYPE phraseType = REO_MSD;
bool hierModel = false; // Hierarchical model.
REO_MODEL_TYPE hierType = REO_MSD;

int maxPhraseLength = 0; // E.g. 7
bool translationFlag = true; // Generate extract and extract.inv
bool orientationFlag = false; // Ordering info needed?
bool sortedOutput = false; // Sort output?

// One lossy counter per phrase pair length (index = max of source and target length).
LossyCountersVector lossyCounters;

#ifdef GET_COUNTS_ONLY
std::vector<size_t> phrasePairsCounters;
#endif


//////// Internal module variables /////////////////////////////////////////////

IndexedStringsStorage<word_index_t> strings;
IndexedStringsStorage<orientation_info_index_t> orientations;


//////// Untouched Philipp Koehn's code :) /////////////////////////////////////

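/**
 * Word-based reordering model: classifies the orientation of a phrase pair
 * (LEFT, RIGHT, DRIGHT, DLEFT or UNKNOWN) with respect to the neighbouring
 * word alignment points; called with swapped arguments to get the orientation
 * towards the following phrase instead of the preceding one.
 */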
REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
                           bool connectedLeftTop, bool connectedRightTop,
                           int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                           bool (*ge)(int, int), bool (*lt)(int, int) )
{
  if (connectedLeftTop && !connectedRightTop)
    return LEFT;
  if (modelType == REO_MONO)
    return UNKNOWN;
  if (!connectedLeftTop && connectedRightTop)
    return RIGHT;
  if (modelType == REO_MSD)
    return UNKNOWN;
  for (int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
    connectedLeftTop = isAligned(sentence, indexF, startE-unit);
  for (int indexF=endF+2*unit; (*lt)(indexF,countF) && !connectedRightTop; indexF=indexF+unit)
    connectedRightTop = isAligned(sentence, indexF, startE-unit);
  if (connectedLeftTop && !connectedRightTop)
    return DRIGHT;
  else if (!connectedLeftTop && connectedRightTop)
    return DLEFT;
  return UNKNOWN;
}

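/**
 * Phrase-based reordering model: same classification as the word model, but
 * adjacency is checked against the corners of previously extracted phrases
 * (inBottomRight/inBottomLeft) rather than single word alignment points.
 */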
// to be called with countF-1 instead of countF
REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
                              bool connectedLeftTop, bool connectedRightTop,
                              int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                              bool (*ge)(int, int), bool (*lt)(int, int),
                              const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft)
{
  HSentenceVertices::const_iterator it;

  if ((connectedLeftTop && !connectedRightTop) ||
      //(startE == 0 && startF == 0) ||
      //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) ||
      ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
       it->second.find(startF-unit) != it->second.end()))
    return LEFT;
  if (modelType == REO_MONO)
    return UNKNOWN;
  if ((!connectedLeftTop && connectedRightTop) ||
      ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end()))
    return RIGHT;
  if (modelType == REO_MSD)
    return UNKNOWN;
  connectedLeftTop = false;
  for (int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
    if (connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
                           it->second.find(indexF) != it->second.end())
      return DRIGHT;
  connectedRightTop = false;
  for (int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit)
    if (connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
                            it->second.find(indexF) != it->second.end())
      return DLEFT;
  return UNKNOWN;
}

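/**
 * Hierarchical reordering model: like the phrase model, but also considers
 * blocks that exceeded the phrase length limit (outBottomRight/outBottomLeft)
 * and falls back to the phrase-level orientation passed in as phraseOrient.
 */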
// to be called with countF-1 instead of countF
REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
                            bool connectedLeftTop, bool connectedRightTop,
                            int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                            bool (*ge)(int, int), bool (*lt)(int, int),
                            const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft,
                            const HSentenceVertices & outBottomRight, const HSentenceVertices & outBottomLeft,
                            REO_POS phraseOrient)
{
  HSentenceVertices::const_iterator it;

  if (phraseOrient == LEFT ||
      (connectedLeftTop && !connectedRightTop) ||
      //(startE == 0 && startF == 0) ||
      //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) ||
      ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
       it->second.find(startF-unit) != it->second.end()) ||
      ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
       it->second.find(startF-unit) != it->second.end()))
    return LEFT;
  if (modelType == REO_MONO)
    return UNKNOWN;
  if (phraseOrient == RIGHT ||
      (!connectedLeftTop && connectedRightTop) ||
      ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
       it->second.find(endF + unit) != it->second.end()) ||
      ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() &&
       it->second.find(endF + unit) != it->second.end()))
    return RIGHT;
  if (modelType == REO_MSD)
    return UNKNOWN;
  if (phraseOrient != UNKNOWN)
    return phraseOrient;
  connectedLeftTop = false;
  for (int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) {
    if ((connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
                            it->second.find(indexF) != it->second.end()) ||
        (connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
                            it->second.find(indexF) != it->second.end()))
      return DRIGHT;
  }
  connectedRightTop = false;
  for (int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) {
    if ((connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
                             it->second.find(indexF) != it->second.end()) ||
        (connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() &&
                             it->second.find(indexF) != it->second.end()))
      return DLEFT;
  }
  return UNKNOWN;
}

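/**
 * Records the corner point (x, y) in the given vertex map, creating the
 * per-row set on first insertion.
 */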
void insertVertex( HSentenceVertices & corners, int x, int y )
{
  std::set<int> tmp;
  tmp.insert(x);
  std::pair< HSentenceVertices::iterator, bool > ret = corners.insert( std::pair<int, std::set<int> > (y, tmp) );
  if (ret.second == false) {
    ret.first->second.insert(x);
  }
}

// Records all four corners of a phrase block in the corresponding vertex maps.
void insertPhraseVertices(
  HSentenceVertices & topLeft,
  HSentenceVertices & topRight,
  HSentenceVertices & bottomLeft,
  HSentenceVertices & bottomRight,
  int startF, int startE, int endF, int endE)
{
  insertVertex(topLeft, startF, startE);
  insertVertex(topRight, endF, startE);
  insertVertex(bottomLeft, startF, endE);
  insertVertex(bottomRight, endF, endE);
}

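/**
 * Maps an orientation value to its output label ("mono", "swap", "dright",
 * "dleft"); UNKNOWN is mapped according to the reordering model type.
 */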
std::string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
{
  switch(orient) {
  case LEFT:
    return "mono";
    break;
  case RIGHT:
    return "swap";
    break;
  case DRIGHT:
    return "dright";
    break;
  case DLEFT:
    return "dleft";
    break;
  case UNKNOWN:
    switch(modelType) {
    case REO_MONO:
      return "nomono";
      break;
    case REO_MSD:
      return "other";
      break;
    case REO_MSLR:
      return "dright";
      break;
    }
    break;
  }
}

bool ge(int first, int second)
{
  return first >= second;
}

bool le(int first, int second)
{
  return first <= second;
}

bool lt(int first, int second)
{
  return first < second;
}

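/**
 * Returns true if source word fi is aligned to target word ei. The positions
 * (-1, -1) and (source length, target length) act as virtual sentence
 * boundary points and are treated as aligned.
 */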
bool isAligned ( SentenceAlignment &sentence, int fi, int ei )
{
  if (ei == -1 && fi == -1)
    return true;
  if (ei <= -1 || fi <= -1)
    return false;
  if (ei == sentence.target.size() && fi == sentence.source.size())
    return true;
  if (ei >= sentence.target.size() || fi >= sentence.source.size())
    return false;
  for (int i=0; i<sentence.alignedToT[ei].size(); i++)
    if (sentence.alignedToT[ei][i] == fi)
      return true;
  return false;
}

//////// END OF untouched Philipp Koehn's code :) //////////////////////////////


/////// Slightly modified Philipp Koehn's code :) //////////////////////////////

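/**
 * Extracts all phrase pairs consistent with the word alignment of the given
 * sentence pair. Without the phrase/hierarchical reordering models the pairs
 * are added directly via addPhrase(); otherwise they are first collected so
 * that orientation information can be computed from neighbouring phrases.
 */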
void extract(SentenceAlignment &sentence) {

  int countE = sentence.target.size();
  int countF = sentence.source.size();

  HPhraseVector inboundPhrases;

  HSentenceVertices inTopLeft;
  HSentenceVertices inTopRight;
  HSentenceVertices inBottomLeft;
  HSentenceVertices inBottomRight;

  HSentenceVertices outTopLeft;
  HSentenceVertices outTopRight;
  HSentenceVertices outBottomLeft;
  HSentenceVertices outBottomRight;

  HSentenceVertices::const_iterator it;

  bool relaxLimit = hierModel;
  bool buildExtraStructure = phraseModel || hierModel;

  // check alignments for target phrase startE...endE
  // loop over extracted phrases which are compatible with the word-alignments
  for (int startE = 0; startE < countE; startE++) {
    for (
      int endE = startE;
      ((endE < countE) && (relaxLimit || (endE < (startE + maxPhraseLength))));
      endE++
    ) {

      int minF = 9999;
      int maxF = -1;
      std::vector< int > usedF = sentence.alignedCountS;

      for (int ei = startE; ei <= endE; ei++) {
        for (int i = 0; i < sentence.alignedToT[ei].size(); i++) {
          int fi = sentence.alignedToT[ei][i];
          if (fi < minF) {
            minF = fi;
          }
          if (fi > maxF) {
            maxF = fi;
          }
          usedF[ fi ]--;
        }
      }

      if (maxF >= 0 && // aligned to any source words at all
          (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits

        // check if source words are aligned to out of bound target words
        bool out_of_bounds = false;

        for (int fi=minF; fi<=maxF && !out_of_bounds; fi++) {
          if (usedF[fi]>0) {
            // cout << "out of bounds: " << fi << "\n";
            out_of_bounds = true;
          }
        }

        // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
        if (!out_of_bounds) {
          // start point of source phrase may retreat over unaligned
          for (int startF=minF;
               (startF>=0 &&
                (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
                (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
               startF--
          )
            // end point of source phrase may advance over unaligned
            for (int endF=maxF;
                 (endF<countF &&
                  (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
                  (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
                 endF++
            ) { // at this point we have extracted a phrase
              if (buildExtraStructure) { // phrase || hier
                if (endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
                  inboundPhrases.push_back(
                    HPhrase(HPhraseVertex(startF,startE), HPhraseVertex(endF,endE))
                  );
                  insertPhraseVertices(
                    inTopLeft, inTopRight, inBottomLeft, inBottomRight,
                    startF, startE, endF, endE
                  );
                } else {
                  insertPhraseVertices(
                    outTopLeft, outTopRight, outBottomLeft, outBottomRight,
                    startF, startE, endF, endE
                  );
                }
              } else {
                std::string orientationInfo = "";
                if (orientationFlag && wordModel) { // Added orientationFlag check.
                  REO_POS wordPrevOrient, wordNextOrient;
                  bool connectedLeftTopP  = isAligned( sentence, startF-1, startE-1 );
                  bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
                  bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
                  bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
                  wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
                  wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
                  orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
                }
                addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
              }
            }
        }
      }
    }
  } // end of main for loop

  if (buildExtraStructure) { // phrase || hier
    std::string orientationInfo = "";
    REO_POS wordPrevOrient, wordNextOrient, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient;

    for (int i = 0; i < inboundPhrases.size(); i++) {
      int startF = inboundPhrases[i].first.first;
      int startE = inboundPhrases[i].first.second;
      int endF = inboundPhrases[i].second.first;
      int endE = inboundPhrases[i].second.second;

      if ( orientationFlag ) { // Added orientationFlag check.

        bool connectedLeftTopP  = isAligned( sentence, startF-1, startE-1 );
        bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
        bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
        bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );

        if (wordModel) {
          wordPrevOrient = getOrientWordModel(sentence, wordType,
                                              connectedLeftTopP, connectedRightTopP,
                                              startF, endF, startE, endE, countF, 0, 1,
                                              &ge, &lt);
          wordNextOrient = getOrientWordModel(sentence, wordType,
                                              connectedLeftTopN, connectedRightTopN,
                                              endF, startF, endE, startE, 0, countF, -1,
                                              &lt, &ge);
        }
        if (phraseModel) {
          phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
                                                  connectedLeftTopP, connectedRightTopP,
                                                  startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
          phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
                                                  connectedLeftTopN, connectedRightTopN,
                                                  endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
        } else {
          phrasePrevOrient = phraseNextOrient = UNKNOWN;
        }
        if (hierModel) {
          hierPrevOrient = getOrientHierModel(sentence, hierType,
                                              connectedLeftTopP, connectedRightTopP,
                                              startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
          hierNextOrient = getOrientHierModel(sentence, hierType,
                                              connectedLeftTopN, connectedRightTopN,
                                              endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
        }

        orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
                          ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
                          ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
      }

      addPhrase(sentence, startE, endE, startF, endF, orientationInfo);

    } // end of for loop through inbound phrases

  } // end if buildExtraStructure

} // end of extract()


/**
 * Registers a single extracted phrase pair with the lossy counter that
 * handles its length (or, with GET_COUNTS_ONLY, just counts it).
 *
 * @param sentence
 * @param startE
 * @param endE
 * @param startF
 * @param endF
 * @param orientationInfo
 */
void addPhrase(SentenceAlignment &sentence, int startE, int endE, int startF, int endF, std::string &orientationInfo) {

#ifdef GET_COUNTS_ONLY
  // Just get the length of phrase pair (which is now defined as maximum of the two).
  phrasePairsCounters[std::max(endF - startF, endE - startE) + 1] += 1; // Don't forget +1 (span is inclusive)!
#else
  alignment_t alignment;

  // alignment
  for (int ei = startE; ei <= endE; ++ei) {
    for (int i = 0; i < sentence.alignedToT[ei].size(); ++i) {
      int fi = sentence.alignedToT[ei][i];
      alignment.push_back(alignment_t::value_type(fi-startF, ei-startE));
    }
  }

  indexed_phrases_pair_t::phrase_t srcPhraseIndices, tgtPhraseIndices;

  // source phrase
  for (int fi = startF; fi <= endF; ++fi) {
    srcPhraseIndices.push_back(strings.put(sentence.source[fi].c_str()));
  }

  // target phrase
  for (int ei = startE; ei <= endE; ++ei) {
    tgtPhraseIndices.push_back(strings.put(sentence.target[ei].c_str()));
  }

  // TODO: Allow for switching between min and max here.
  size_t idx = std::max(srcPhraseIndices.size(), tgtPhraseIndices.size());

  // Add phrase pair.
  lossyCounters[idx]->lossyCounter.add(indexed_phrases_pair_t(srcPhraseIndices, tgtPhraseIndices, orientations.put(orientationInfo.c_str()), alignment));
  //
  if ( lossyCounters[idx]->lossyCounter.aboutToPrune() ) {
    // Next addition will lead to pruning, inform:
    std::cerr << 'P' << idx << std::flush;
  }
#endif
} // end of addPhrase()


/////// Lossy Counting related code ////////////////////////////////////////////

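/**
 * Reads the target, source and alignment files line by line, builds a
 * SentenceAlignment for each triple and runs phrase extraction on it.
 * A progress dot is printed to stderr every 10000 sentences.
 */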
void readInput(std::istream& eFile, std::istream& fFile, std::istream& aFile) {

  // Note: moved out of the loop.
  char englishString[LINE_MAX_LENGTH];
  char foreignString[LINE_MAX_LENGTH];
  char alignmentString[LINE_MAX_LENGTH];

  int i = 0;

  while (true) {
    // Report progress?
    if (++i%10000 == 0) std::cerr << "." << std::flush;

    SAFE_GETLINE(eFile, englishString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (eFile.eof()) break;
    SAFE_GETLINE(fFile, foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE(aFile, alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);

    SentenceAlignment sentence;

    if (sentence.create(englishString, foreignString, alignmentString, i)) {
      extract(sentence);
    }
  }

}


// Dispatches output processing to the sorted or unsorted variant.
void processOutput(OutputProcessor& processor) {
  if ( sortedOutput ) {
    processSortedOutput(processor);
  }
  else {
    processUnsortedOutput(processor);
  }
}


bool PhraseComp::operator()(const output_pair_t& a, const output_pair_t& b) {

  int cmp = _inverted ? comparePhrases(a.first.tgtPhrase(), b.first.tgtPhrase()) : comparePhrases(a.first.srcPhrase(), b.first.srcPhrase());

  if ( cmp == 0 ) {
    // First part of pairs matches, compare the second part.
    cmp = _inverted ? comparePhrases(a.first.srcPhrase(), b.first.srcPhrase()) : comparePhrases(a.first.tgtPhrase(), b.first.tgtPhrase());

    if ( cmp == 0 ) {
      // Also second part matches, compare alignments.
      return compareAlignments(a.first, b.first);
    }
    else {
      return cmp < 0;
    }
  }
  else {
    return cmp < 0;
  }

}


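/**
 * Lexicographic comparison of the alignment points of two phrase pairs;
 * when _inverted is set, the target-side coordinate of each point is
 * compared before the source-side one. Returns true if a sorts before b.
 */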
bool PhraseComp::compareAlignments(const indexed_phrases_pair_t& a, const indexed_phrases_pair_t& b) {

  size_t aSize = a.alignmentLength();
  size_t bSize = b.alignmentLength();
  size_t min = std::min(aSize, bSize);
  const indexed_phrases_pair_t::alignment_point_t * aAlignment = a.alignmentData();
  const indexed_phrases_pair_t::alignment_point_t * bAlignment = b.alignmentData();

  int cmp = 0;
  for ( size_t i = 0; i < min; ++i ) {
    // Important: alignments have to be eventually inverted as well!
    if ( _inverted ) {
      // Inverted = compare TGT phrase alignment points first.
      cmp = memcmp(aAlignment + i*2 + 1, bAlignment + i*2 + 1, sizeof(indexed_phrases_pair_t::alignment_point_t));
    }
    else {
      // NOT inverted = compare SRC phrase alignment points first.
      cmp = memcmp(aAlignment + i*2, bAlignment + i*2, sizeof(indexed_phrases_pair_t::alignment_point_t));
    }
    if ( cmp == 0 ) {
      if ( _inverted ) {
        // Inverted = compare SRC phrase alignment points second.
        cmp = memcmp(aAlignment + i*2, bAlignment + i*2, sizeof(indexed_phrases_pair_t::alignment_point_t));
      }
      else {
        // NOT inverted = compare TGT phrase alignment points second.
        cmp = memcmp(aAlignment + i*2 + 1, bAlignment + i*2 + 1, sizeof(indexed_phrases_pair_t::alignment_point_t));
      }
      if ( cmp != 0 ) {
        return cmp < 0;
      } // Otherwise continue looping.
    }
    else {
      return cmp < 0;
    }
  }

  // Note: LC_ALL=C GNU sort treats shorter item as lesser than longer one.
  return (cmp == 0) ? (aSize < bSize) : (cmp < 0);

}


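/**
 * Compares two indexed phrases word by word (via the string storage). When
 * one phrase is a prefix of the other, the shorter one is ordered as if it
 * were followed by the "|||" field separator, mimicking a plain-text sort
 * of extract file lines.
 */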
int PhraseComp::comparePhrases(const indexed_phrases_pair_t::phrase_t& a, const indexed_phrases_pair_t::phrase_t& b) {

  size_t aSize = a.size();
  size_t bSize = b.size();
  size_t min = std::min(aSize, bSize);
  int cmp = 0;

  for ( size_t i = 0; i < min; ++i ) {
    cmp = strcmp(strings.get(a[i]), strings.get(b[i]));
    if ( cmp != 0 ) {
      return cmp;
    }
  }

  if ( aSize == bSize ) {
    return 0;
  }

  if ( aSize < bSize ) {
    return strcmp("|||", strings.get(b[min]));
  }
  else {
    return strcmp(strings.get(a[min]), "|||");
  }

}


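/**
 * Drains all lossy counters into a single vector, then emits every surviving
 * phrase pair twice: once sorted by source phrase and once sorted by target
 * phrase (mode 1 and mode -1, respectively).
 */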
void processSortedOutput(OutputProcessor& processor) {

  output_vector_t output;

  LossyCountersVector::value_type current = NULL, prev = NULL;

  for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0.
    current = lossyCounters[i];
    if ( current != prev ) {
      PhrasePairsLossyCounter& lossyCounter = current->lossyCounter;
      for ( PhrasePairsLossyCounter::erasing_iterator phraseIter = lossyCounter.beginErase(); phraseIter != lossyCounter.endErase(); ++phraseIter ) {
        // Store and...
        output.push_back(std::make_pair(phraseIter.item(), phraseIter.frequency()));
        // ...update counters.
        current->outputMass += phraseIter.frequency();
        current->outputSize += 1;
      }
      //
      prev = current;
      //delete current;
    }
  }

  // Sort by source phrase.
  std::sort(output.begin(), output.end(), PhraseComp(false));

  // Print.
  for ( output_vector_t::const_iterator iter = output.begin(); iter != output.end(); ++iter ) {
    flushPhrasePair(processor, iter->first, iter->second, 1);
  }

  // Sort by target phrase.
  std::sort(output.begin(), output.end(), PhraseComp(true));

  // Print.
  for ( output_vector_t::const_iterator iter = output.begin(); iter != output.end(); ++iter ) {
    flushPhrasePair(processor, iter->first, iter->second, -1);
  }

}


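/**
 * Streams the contents of each lossy counter straight to the output
 * processor (mode 0), without collecting or sorting the phrase pairs.
 */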
void processUnsortedOutput(OutputProcessor& processor) {

  LossyCountersVector::value_type current = NULL, prev = NULL;

  for ( size_t i = 1; i < lossyCounters.size(); ++i ) { // Intentionally skip 0.

    current = lossyCounters[i];

    if ( current != prev ) {

      const PhrasePairsLossyCounter& lossyCounter = current->lossyCounter;

      for ( PhrasePairsLossyCounter::const_iterator phraseIter = lossyCounter.begin(); phraseIter != lossyCounter.end(); ++phraseIter ) {
        // Flush and...
        flushPhrasePair(processor, phraseIter.item(), phraseIter.frequency(), 0);
        // ...update counters.
        current->outputMass += phraseIter.frequency();
        current->outputSize += 1;
      }

      //
      prev = current;
    }
  }

}


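/**
 * Reconstructs the source and target phrase strings from their word indices
 * and hands the pair (with orientation info, alignment and frequency) to the
 * output processor functor.
 */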
void flushPhrasePair(OutputProcessor& processor, const indexed_phrases_pair_t& indexedPhrasePair, PhrasePairsLossyCounter::frequency_t frequency, int mode = 0) {

  const indexed_phrases_pair_t::phrase_t srcPhraseIndices = indexedPhrasePair.srcPhrase();
  const indexed_phrases_pair_t::phrase_t tgtPhraseIndices = indexedPhrasePair.tgtPhrase();

  std::string srcPhrase, tgtPhrase;

  for ( indexed_phrases_pair_t::phrase_t::const_iterator indexIter = srcPhraseIndices.begin(); indexIter != srcPhraseIndices.end(); ++indexIter ) {
    srcPhrase += std::string(strings.get(*indexIter)) + " ";
  }
  srcPhrase.resize(srcPhrase.size() - 1); // Trim the trailing " ".

  for ( indexed_phrases_pair_t::phrase_t::const_iterator indexIter = tgtPhraseIndices.begin(); indexIter != tgtPhraseIndices.end(); ++indexIter ) {
    tgtPhrase += std::string(strings.get(*indexIter)) + " ";
  }
  tgtPhrase.resize(tgtPhrase.size() - 1); // Trim the trailing " ".

  // Actual processing is done via call to functor:
  processor(srcPhrase, tgtPhrase, orientations.get(indexedPhrasePair.orientationInfo()), indexedPhrasePair.alignment(), frequency, mode);
}


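/**
 * Prints a summary table of the lossy counting run to stderr: for each range
 * of phrase pair lengths sharing one counter, the number of unique and total
 * phrase pairs written out, the total number seen (N), the output/input
 * ratio, the pruning thresholds and the maximum counting error.
 */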
void printStats(void) {

  // Total counters.
  size_t outputMass = 0, outputSize = 0, N = 0;

  const std::string hline = "####################################################################################################################";

  std::cerr << "Lossy Counting Phrase Extraction statistics:" << std::endl;

  // Print header: | 3 | 15 | 15 | 15 | 7 | 10 | 10 | 10 |
  std::cerr
      << hline << std::endl
      << "# length # unique out # total out # total in (N) # out/in (%) # pos. thr. # neg. thr. # max. err. #" << std::endl
      << hline << std::endl;

  LossyCountersVector::value_type current = NULL, prev = NULL;
  size_t from = 1, to = 1;

  for ( size_t i = 1; i <= lossyCounters.size(); ++i ) { // Intentionally skip 0, intentionally increment till == size().

    current = (i < lossyCounters.size()) ? lossyCounters[i] : NULL;

    if ( (current == NULL) || ((current != prev) && (prev != NULL)) ) {
      // Time to print.
      to = i-1;

      // Increment overall stats.
      outputMass += prev->outputMass;
      outputSize += prev->outputSize;
      N += prev->lossyCounter.count();

      // Print.
      if ( from == to ) {
        std::cerr << "# " << std::setw(6) << to << " # ";
      }
      else {
        std::stringstream strStr;
        strStr << from << "-" << to;
        std::cerr << "# " << std::setw(6) << strStr.str() << " # ";
      }
      // Print the rest of record.
      std::cerr
          << std::setw(15) << prev->outputSize << " # "
          << std::setw(15) << prev->outputMass << " # "
          << std::setw(15) << prev->lossyCounter.count() << " # "
          << std::setw(10) << std::setprecision(4) << (static_cast<double>(prev->outputMass) / static_cast<double>(prev->lossyCounter.count())) * 100 << " # "
          << std::setw(10) << prev->lossyCounter.threshold(true) << " # "
          << std::setw(10) << prev->lossyCounter.threshold() << " # "
          << std::setw(10) << prev->lossyCounter.maxError() << " #"
          << std::endl << hline << std::endl;

      from = i;
    }

    prev = current;

  }

  // Print summary:
  std::cerr
      << "# TOTAL # "
      << std::setw(15) << outputSize << " # "
      << std::setw(15) << outputMass << " # "
      << std::setw(15) << N << " # "
      << std::setw(10) << std::setprecision(4) << (static_cast<double>(outputMass) / static_cast<double>(N)) * 100 << " #"
      << std::endl
      << "#############################################################################" << std::endl;

}