mosesdecoder/scripts/training/phrase-extract/PhraseAlignment.cpp

/*
 *  PhraseAlignment.cpp
 *  extract
 *
 *  Created by Hieu Hoang on 28/07/2010.
 *  Copyright 2010 __MyCompanyName__. All rights reserved.
 *
 */

#include <sstream>
#include "PhraseAlignment.h"
#include "SafeGetline.h"
#include "tables-core.h"
#include "score.h"

using namespace std;

// Vocabularies declared here but not defined in this file. create() stores
// the tokens of the target phrase in vcbT and those of the source phrase in
// vcbS; match() and Compare() look target words up again through vcbT.
extern Vocabulary vcbT;
extern Vocabulary vcbS;

// Not defined in this file. When true, createAlignVec() sizes the alignment
// vectors one entry shorter than the phrases, and match()/Compare() also
// compare the alignment points of non-terminals.
extern bool hierarchicalFlag;

//! Convert a string to a value of type T by stream extraction. Used for
//! reading floats, ints etc. from files.
//! \param input text holding the value to parse
//! \return the extracted value; a value-initialised T (0 for arithmetic
//!         types) when the input is empty and nothing can be extracted
template<typename T>
inline T Scan(const std::string &input)
{
	std::stringstream stream(input);
	// Value-initialise the result: operator>> does not touch its target when
	// the stream is already exhausted (e.g. empty input), which previously
	// meant returning an indeterminate value.
	T ret = T();
	stream >> ret;
	return ret;
}


//! Element-wise overload of Scan: converts every string of \p input to T.
//! \param output resized to input.size(); element i receives the result of
//!               Scan<T>(input[i])
//! \param input  strings to convert
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
	output.resize(input.size());
	typename std::vector<T>::iterator dst = output.begin();
	for (std::vector< std::string >::const_iterator src = input.begin();
	     src != input.end(); ++src, ++dst)
	{
		*dst = Scan<T>( *src );
	}
}


//! Split \p str into tokens and append them to \p output.
//! Runs of delimiter characters count as a single separator, so no empty
//! tokens are produced; leading and trailing delimiters are ignored.
//! \param output     vector the tokens are appended to (not cleared first)
//! \param str        text to split
//! \param delimiters set of characters that separate tokens
inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  // Start of the first token: first character that is not a delimiter.
  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);

  while (tokenStart != std::string::npos) {
    // The token runs up to the next delimiter (or to the end of the string,
    // in which case tokenEnd is npos and substr() takes the remainder).
    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
    output.push_back(str.substr(tokenStart, tokenEnd - tokenStart));

    // Skip the delimiters that follow to reach the next token, if any.
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
  }
}

// Typed variant of Tokenize: splits the input into string tokens and then
// converts every token to T with Scan.
//   output     - receives one converted value per token
//   input      - text to split
//   delimiters - set of characters that separate tokens
template<typename T>
inline void Tokenize( std::vector<T> &output
										 , const std::string &input
										 , const std::string& delimiters = " \t")
{
	std::vector<std::string> pieces;
	Tokenize(pieces, input, delimiters);
	Scan<T>(output, pieces);
}

// Read in one phrase-pair line and store it in this object.
//
// The tokens of the line are assigned to "|||"-separated fields:
//   1. source phrase     (stored through vcbS into phraseS)
//   2. target phrase     (stored through vcbT into phraseT)
//   3. alignment points  ("s-t" pairs)
//   4. count             (optional; set to 1 when the line ends after field 3)
//   5. non-terminal lengths (optional; handed to addNTLength)
//
// line   - text of the phrase pair
// lineID - identifier of the line, only used in diagnostics
//
// Alignment points that cannot be parsed, or that fall outside the phrases
// or the alignment vectors, are reported on stderr and skipped. A line with
// fewer than 3 or more than 5 fields is reported as faulty.
void PhraseAlignment::create( char line[], int lineID )
{
  assert(phraseS.empty());
  assert(phraseT.empty());

  //cerr << "processing " << line;
  vector< string > token = tokenize( line );
  int item = 1;
  for (size_t j=0; j<token.size(); j++) {
    if (token[j] == "|||") item++;
    else if (item == 1) { // source phrase
      phraseS.push_back( vcbS.storeIfNew( token[j] ) );
    }

    else if (item == 2) { // target phrase
      phraseT.push_back( vcbT.storeIfNew( token[j] ) );
    }
    else if (item == 3) { // alignment
      int s = 0, t = 0;
      // Both numbers must be converted; otherwise s/t hold no usable value.
      if (sscanf(token[j].c_str(), "%d-%d", &s, &t) != 2) {
        cerr << "WARNING: phrase pair " << lineID
             << " has malformed alignment point \"" << token[j] << "\"\n";
        continue;
      }

      // Negative values wrap to huge size_t values and fail these checks too.
      bool inBounds = static_cast<size_t>(t) < phraseT.size()
                      && static_cast<size_t>(s) < phraseS.size();
      if (inBounds) {
        // first alignment point? -> initialize
        createAlignVec(phraseS.size(), phraseT.size());

        // The alignment vectors can be shorter than the phrases (hierarchical
        // mode), so the index must also be valid for them before it is used.
        inBounds = static_cast<size_t>(t) < alignedToT.size()
                   && static_cast<size_t>(s) < alignedToS.size();
      }

      if (!inBounds) {
        cerr << "WARNING: phrase pair " << lineID
             << " has alignment point (" << s << ", " << t
             << ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";
      } else {
        // add alignment point
        alignedToT[t].insert( s );
        alignedToS[s].insert( t );
      }
    } else if (item == 4) { // count
      // count keeps its previous value when the token is not a number
      if (sscanf(token[j].c_str(), "%f", &count) != 1) {
        cerr << "WARNING: phrase pair " << lineID
             << " has malformed count \"" << token[j] << "\"\n";
      }
    }
    else if (item == 5) { // non-term lengths
      addNTLength(token[j]);
    }
  }

  // always need alignment vectors, even if no alignment point was stored
  createAlignVec(phraseS.size(), phraseT.size());

  if (item == 3) {
    count = 1.0;
  }
  if (item < 3 || item > 5) {
    cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
  }
}

void PhraseAlignment::addNTLength(const std::string &tok)
{
  vector< string > tokens;
  
  Tokenize(tokens, tok, "=");
  assert(tokens.size() == 2);
  
  size_t sourcePos = Scan<size_t>(tokens[0]);
  assert(sourcePos < phraseS.size());
  
  vector< size_t > ntLengths;
  Tokenize<size_t>(ntLengths, tokens[1], ",");
  assert(ntLengths.size() == 2);
  
  m_ntLengths[sourcePos] = std::pair<size_t, size_t>(ntLengths[0], ntLengths[1]);
}

// Make sure both alignment vectors exist, even when there is no alignment
// information. A vector that is still empty is resized; one that already
// has entries is left alone.
//
// sourceSize - number of symbols in the source phrase
// targetSize - number of symbols in the target phrase
//
// In hierarchical mode each vector gets one entry fewer than its phrase.
// The size is clamped at zero so an empty phrase cannot wrap the unsigned
// subtraction around into a huge resize request.
void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)
{
  // number of symbols per phrase that get no alignment slot
  const size_t unaligned = hierarchicalFlag ? 1 : 0;

  // in case of no align info. always need align info, even if blank
  if (alignedToT.size() == 0) {
    size_t numTgtSymbols = (targetSize > unaligned ? targetSize - unaligned : 0);
    alignedToT.resize(numTgtSymbols);
  }

  if (alignedToS.size() == 0) {
    size_t numSrcSymbols = (sourceSize > unaligned ? sourceSize - unaligned : 0);
    alignedToS.resize(numSrcSymbols);
  }
}

// Empty both phrases and both alignment vectors.
// NOTE(review): count and m_ntLengths are not reset here - confirm that no
// caller relies on clear() to discard them.
void PhraseAlignment::clear()
{
  alignedToS.clear();
  alignedToT.clear();
  phraseT.clear();
  phraseS.clear();
}

// Check whether two phrase pairs are identical, word alignment included.
// Returns true when `other` is this very object, or when the target, the
// source and both alignment vectors all compare equal.
bool PhraseAlignment::equals( const PhraseAlignment& other )
{
  if (this == &other) {
    return true;
  }

  // evaluated left to right; stops at the first member that differs
  const bool differs = other.GetTarget() != GetTarget()
                       || other.GetSource() != GetSource()
                       || other.alignedToT != alignedToT
                       || other.alignedToS != alignedToS;
  return !differs;
}

// Check whether the word alignments of two phrase pairs "match",
// i.e. they do not differ in the alignment of non-terminals.
// Phrase pairs with a different target or source never match. Outside
// hierarchical mode, equal target and source are sufficient.
bool PhraseAlignment::match( const PhraseAlignment& other )
{
  if (this == &other) return true;
  if (other.GetTarget() != GetTarget()) return false;
  if (other.GetSource() != GetSource()) return false;
  if (!hierarchicalFlag) return true;

  assert(phraseT.size() == alignedToT.size() + 1);
  assert(alignedToT.size() == other.alignedToT.size());

  // loop over all words (note: 0 = left hand side of rule)
  const size_t numSymbols = phraseT.size() - 1;
  for (size_t pos = 0; pos < numSymbols; ++pos) {
    if (!isNonTerminal( vcbT.getWord( phraseT[pos] ) )) {
      continue; // only non-terminals are compared
    }

    // a non-terminal needs exactly one alignment point on both sides ...
    const bool bothSingle = alignedToT[pos].size() == 1
                            && other.alignedToT[pos].size() == 1;
    // ... and the two points must be the same
    if (!bothSingle ||
        *(alignedToT[pos].begin()) != *(other.alignedToT[pos].begin())) {
      return false;
    }
  }
  return true;
}

// Three-way comparison of two phrase pairs.
//
// Returns 0 when `other` is this very object, otherwise -1 / +1 according to
// the first difference found, checked in this order: target, source and (in
// hierarchical mode only) the alignment points of the target non-terminals.
// Returns 0 when no difference is found.
//
// For a non-terminal that is aligned on both sides the result is decided by
// the first alignment point of each side. If those are equal but either side
// has more than one point, the result is +1.
//
// A non-terminal without any alignment point is handled explicitly instead of
// dereferencing begin() of an empty set: an unaligned side orders before an
// aligned one.
// NOTE(review): when both sides are unaligned the result is +1, mirroring the
// "equal first point but not exactly one point" case - confirm this is the
// ordering callers want.
int PhraseAlignment::Compare(const PhraseAlignment &other) const
{
  if (this == &other) // comparing with itself
    return 0;

  if (GetTarget() != other.GetTarget())
    return ( GetTarget() < other.GetTarget() ) ? -1 : +1;

  if (GetSource() != other.GetSource())
    return ( GetSource() < other.GetSource() ) ? -1 : +1;

  if (!hierarchicalFlag)
    return 0;

  // loop over all words (note: 0 = left hand side of rule)
  // An empty phrase must not wrap size()-1 around to a huge bound.
  const size_t numSymbols = phraseT.empty() ? 0 : phraseT.size() - 1;
  for (size_t i = 0; i < numSymbols; i++) {
    if (!isNonTerminal( vcbT.getWord( phraseT[i] ) ))
      continue;

    const bool thisEmpty = alignedToT[i].empty();
    const bool otherEmpty = other.alignedToT[i].empty();
    if (thisEmpty || otherEmpty) {
      // no alignment point to read on at least one side
      return (thisEmpty && !otherEmpty) ? -1 : +1;
    }

    // both sets are non-empty, so begin() can be dereferenced
    size_t thisAlign = *(alignedToT[i].begin());
    size_t otherAlign = *(other.alignedToT[i].begin());

    if (alignedToT[i].size() != 1 ||
        other.alignedToT[i].size() != 1 ||
        thisAlign != otherAlign)
    {
      int ret = (thisAlign < otherAlign) ? -1 : +1;
      return ret;
    }
  }
  return 0;

}
separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`/*`
			`* PhraseAlignment.cpp`
			`* extract`
			`*`
			`* Created by Hieu Hoang on 28/07/2010.`
			`* Copyright 2010 __MyCompanyName__. All rights reserved.`
			`*`
			`*/`

print out span widths of non-terms. Extra argument --OutputNTLengths git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4224 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-14 14:23:14 +04:00			`#include <sstream>`
separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`#include "PhraseAlignment.h"`
			`#include "SafeGetline.h"`
			`#include "tables-core.h"`
			`#include "score.h"`

			`using namespace std;`

			`extern Vocabulary vcbT;`
			`extern Vocabulary vcbS;`

			`extern bool hierarchicalFlag;`

print out span widths of non-terms. Extra argument --OutputNTLengths git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4224 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-14 14:23:14 +04:00			`//! convert string to variable of type T. Used to reading floats, int etc from files`
			`template<typename T>`
			`inline T Scan(const std::string &input)`
			`{`
			`std::stringstream stream(input);`
			`T ret;`
			`stream >> ret;`
			`return ret;`
			`}`


			`//! speeded up version of above`
			`template<typename T>`
			`inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)`
			`{`
			`output.resize(input.size());`
			`for (size_t i = 0 ; i < input.size() ; i++)`
			`{`
			`output[i] = Scan<T>( input[i] );`
			`}`
			`}`


			`inline void Tokenize(std::vector<std::string> &output`
			`, const std::string& str`
			`, const std::string& delimiters = " \t")`
			`{`
			`// Skip delimiters at beginning.`
			`std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);`
			`// Find first "non-delimiter".`
			`std::string::size_type pos = str.find_first_of(delimiters, lastPos);`

			`while (std::string::npos != pos \|\| std::string::npos != lastPos) {`
			`// Found a token, add it to the vector.`
			`output.push_back(str.substr(lastPos, pos - lastPos));`
			`// Skip delimiters. Note the "not_of"`
			`lastPos = str.find_first_not_of(delimiters, pos);`
			`// Find next "non-delimiter"`
			`pos = str.find_first_of(delimiters, lastPos);`
			`}`
			`}`

			`// speeded up version of above`
			`template<typename T>`
			`inline void Tokenize( std::vector<T> &output`
			`, const std::string &input`
			`, const std::string& delimiters = " \t")`
			`{`
			`std::vector<std::string> stringVector;`
			`Tokenize(stringVector, input, delimiters);`
			`return Scan<T>(output, stringVector );`
			`}`

separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`// read in a phrase pair and store it`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`void PhraseAlignment::create( char line[], int lineID )`
separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`{`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`assert(phraseS.empty());`
			`assert(phraseT.empty());`

			`//cerr << "processing " << line;`
			`vector< string > token = tokenize( line );`
			`int item = 1;`
Merged in fixes for compiler warnings regarding comparing signed and unsigned integer expressions. 2012-05-10 16:48:51 +04:00			`for (size_t j=0; j<token.size(); j++) {`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`if (token[j] == "\|\|\|") item++;`
			`else if (item == 1) { // source phrase`
			`phraseS.push_back( vcbS.storeIfNew( token[j] ) );`
			`}`

			`else if (item == 2) { // target phrase`
			`phraseT.push_back( vcbT.storeIfNew( token[j] ) );`
			`}`
			`else if (item == 3) { // alignment`
			`int s,t;`
			`sscanf(token[j].c_str(), "%d-%d", &s, &t);`
Merged in fixes for compiler warnings regarding comparing signed and unsigned integer expressions. 2012-05-10 16:48:51 +04:00			`if ((size_t)t >= phraseT.size() \|\| (size_t)s >= phraseS.size()) {`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`cerr << "WARNING: phrase pair " << lineID`
			`<< " has alignment point (" << s << ", " << t`
			`<< ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";`
			`} else {`
			`// first alignment point? -> initialize`
			`createAlignVec(phraseS.size(), phraseT.size());`

			`// add alignment point`
			`alignedToT[t].insert( s );`
			`alignedToS[s].insert( t );`
			`}`
			`} else if (item == 4) { // count`
			`sscanf(token[j].c_str(), "%f", &count);`
			`}`
print out span widths of non-terms. Extra argument --OutputNTLengths git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4224 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-14 14:23:14 +04:00			`else if (item == 5) { // non-term lengths`
			`addNTLength(token[j]);`
			`}`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`}`

			`createAlignVec(phraseS.size(), phraseT.size());`

			`if (item == 3) {`
			`count = 1.0;`
			`}`
print out span widths of non-terms. Extra argument --OutputNTLengths git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4224 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-14 14:23:14 +04:00			`if (item < 3 \|\| item > 5) {`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`cerr << "ERROR: faulty line " << lineID << ": " << line << endl;`
			`}`
separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`}`

print out span widths of non-terms. Extra argument --OutputNTLengths git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4224 1f5c12ca-751b-0410-a591-d2e778427230 2011-09-14 14:23:14 +04:00			`void PhraseAlignment::addNTLength(const std::string &tok)`
			`{`
			`vector< string > tokens;`

			`Tokenize(tokens, tok, "=");`
			`assert(tokens.size() == 2);`

			`size_t sourcePos = Scan<size_t>(tokens[0]);`
			`assert(sourcePos < phraseS.size());`

			`vector< size_t > ntLengths;`
			`Tokenize<size_t>(ntLengths, tokens[1], ",");`
			`assert(ntLengths.size() == 2);`

			`m_ntLengths[sourcePos] = std::pair<size_t, size_t>(ntLengths[0], ntLengths[1]);`
			`}`

bug in Good turing git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3372 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 02:49:37 +04:00			`void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)`
			`{`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`// in case of no align info. always need align info, even if blank`
			`if (alignedToT.size() == 0) {`
			`size_t numTgtSymbols = (hierarchicalFlag ? targetSize-1 : targetSize);`
			`alignedToT.resize(numTgtSymbols);`
			`}`

			`if (alignedToS.size() == 0) {`
			`size_t numSrcSymbols = (hierarchicalFlag ? sourceSize-1 : sourceSize);`
			`alignedToS.resize(numSrcSymbols);`
			`}`
bug in Good turing git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3372 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 02:49:37 +04:00			`}`

run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`void PhraseAlignment::clear()`
bug in Good turing git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3372 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 02:49:37 +04:00			`{`
When scoring phrase pairs, store copies of the active pairs' PHRASE objects instead of inserting them into a PhraseTable. In a test on a 21GB target-syntax extract file, this reduced user time from 195 to 120 mins. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3777 1f5c12ca-751b-0410-a591-d2e778427230 2010-12-15 02:49:57 +03:00			`phraseS.clear();`
			`phraseT.clear();`
separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`alignedToT.clear();`
			`alignedToS.clear();`
			`}`

			`// check if two word alignments between a phrase pair are the same`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`bool PhraseAlignment::equals( const PhraseAlignment& other )`
separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`{`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`if (this == &other) return true;`
			`if (other.GetTarget() != GetTarget()) return false;`
			`if (other.GetSource() != GetSource()) return false;`
			`if (other.alignedToT != alignedToT) return false;`
			`if (other.alignedToS != alignedToS) return false;`
			`return true;`
separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`}`

			`// check if two word alignments between a phrase pairs "match"`
			`// i.e. they do not differ in the alignment of non-termimals`
			`bool PhraseAlignment::match( const PhraseAlignment& other )`
			`{`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`if (this == &other) return true;`
			`if (other.GetTarget() != GetTarget()) return false;`
			`if (other.GetSource() != GetSource()) return false;`
			`if (!hierarchicalFlag) return true;`

separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`assert(phraseT.size() == alignedToT.size() + 1);`
			`assert(alignedToT.size() == other.alignedToT.size());`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00
			`// loop over all words (note: 0 = left hand side of rule)`
Merged in fixes for compiler warnings regarding comparing signed and unsigned integer expressions. 2012-05-10 16:48:51 +04:00			`for(size_t i=0; i<phraseT.size()-1; i++) {`
run beautify.perl. Consistent formatting for .h & .cpp files git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3902 1f5c12ca-751b-0410-a591-d2e778427230 2011-02-24 16:57:11 +03:00			`if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {`
			`if (alignedToT[i].size() != 1 \|\|`
			`other.alignedToT[i].size() != 1 \|\|`
			`(alignedToT[i].begin()) != (other.alignedToT[i].begin()))`
			`return false;`
			`}`
			`}`
			`return true;`
separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00			`}`

start on speed optimisation for scoring git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4092 1f5c12ca-751b-0410-a591-d2e778427230 2011-07-27 11:55:03 +04:00			`int PhraseAlignment::Compare(const PhraseAlignment &other) const`
			`{`
			`if (this == &other) // comparing with itself`
			`return 0;`

			`if (GetTarget() != other.GetTarget())`
			`return ( GetTarget() < other.GetTarget() ) ? -1 : +1;`

			`if (GetSource() != other.GetSource())`
			`return ( GetSource() < other.GetSource() ) ? -1 : +1;`

optmised version of score program. Original version is slow when source phrase has many target phrases 'cos it scans a large vector. New version puts it into a set. Slight hack in that it const_cast to get items out of the set. For a source with 100k targets, took 1.2sec, versus 2m20sec. Current version can can take days to run. Won't make it the main score program until regression test for score is set up git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4093 1f5c12ca-751b-0410-a591-d2e778427230 2011-07-27 13:40:58 +04:00			`if (!hierarchicalFlag)`
			`return 0;`
start on speed optimisation for scoring git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4092 1f5c12ca-751b-0410-a591-d2e778427230 2011-07-27 11:55:03 +04:00
optmised version of score program. Original version is slow when source phrase has many target phrases 'cos it scans a large vector. New version puts it into a set. Slight hack in that it const_cast to get items out of the set. For a source with 100k targets, took 1.2sec, versus 2m20sec. Current version can can take days to run. Won't make it the main score program until regression test for score is set up git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4093 1f5c12ca-751b-0410-a591-d2e778427230 2011-07-27 13:40:58 +04:00			`// loop over all words (note: 0 = left hand side of rule)`
Merged in fixes for compiler warnings regarding comparing signed and unsigned integer expressions. 2012-05-10 16:48:51 +04:00			`for(size_t i=0; i<phraseT.size()-1; i++) {`
optmised version of score program. Original version is slow when source phrase has many target phrases 'cos it scans a large vector. New version puts it into a set. Slight hack in that it const_cast to get items out of the set. For a source with 100k targets, took 1.2sec, versus 2m20sec. Current version can can take days to run. Won't make it the main score program until regression test for score is set up git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4093 1f5c12ca-751b-0410-a591-d2e778427230 2011-07-27 13:40:58 +04:00			`if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {`
			`size_t thisAlign = *(alignedToT[i].begin());`
			`size_t otherAlign = *(other.alignedToT[i].begin());`
start on speed optimisation for scoring git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4092 1f5c12ca-751b-0410-a591-d2e778427230 2011-07-27 11:55:03 +04:00
optmised version of score program. Original version is slow when source phrase has many target phrases 'cos it scans a large vector. New version puts it into a set. Slight hack in that it const_cast to get items out of the set. For a source with 100k targets, took 1.2sec, versus 2m20sec. Current version can can take days to run. Won't make it the main score program until regression test for score is set up git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4093 1f5c12ca-751b-0410-a591-d2e778427230 2011-07-27 13:40:58 +04:00			`if (alignedToT[i].size() != 1 \|\|`
			`other.alignedToT[i].size() != 1 \|\|`
			`thisAlign != otherAlign)`
			`{`
			`int ret = (thisAlign < otherAlign) ? -1 : +1;`
			`return ret;`
			`}`
			`}`
			`}`
start on speed optimisation for scoring git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4092 1f5c12ca-751b-0410-a591-d2e778427230 2011-07-27 11:55:03 +04:00			`return 0;`

			`}`

separate PhraseAlignment class into separate file git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3369 1f5c12ca-751b-0410-a591-d2e778427230 2010-07-29 01:28:14 +04:00