From 8595b06dce870857f89f8b4389dd79d31b05cc2f Mon Sep 17 00:00:00 2001
From: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>
Date: Fri, 1 Jul 2011 05:40:46 +0000
Subject: [PATCH] rewrite lex prob calc

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4058 1f5c12ca-751b-0410-a591-d2e778427230
---
 scripts/training/phrase-extract/Makefile      |   5 +
 .../training/phrase-extract/extract-lex.cpp   | 172 ++++++++++++++++++
 scripts/training/phrase-extract/extract-lex.h | 112 ++++++++++++
 3 files changed, 289 insertions(+)
 create mode 100644 scripts/training/phrase-extract/extract-lex.cpp
 create mode 100644 scripts/training/phrase-extract/extract-lex.h
diff --git a/scripts/training/phrase-extract/Makefile b/scripts/training/phrase-extract/Makefile
index 63652fcb6..8ab6fbfbf 100644
--- a/scripts/training/phrase-extract/Makefile
+++ b/scripts/training/phrase-extract/Makefile
@@ -13,6 +13,9 @@ extract: tables-core.o SentenceAlignment.o extract.o
 extract-rules: tables-core.o SentenceAlignment.o SentenceAlignmentWithSyntax.o SyntaxTree.o XmlTree.o HoleCollection.o extract-rules.o 
 	$(CXX) $^ -o extract-rules
 
+extract-lex: extract-lex.o  # link the object file; naming the target itself was a circular dependency
+	$(CXX) $^ -o extract-lex
+
 score: tables-core.o AlignmentPhrase.o score.o PhraseAlignment.o InputFileStream.o
 	$(CXX) $^ -lz -o score
 
@@ -27,3 +30,5 @@ relax-parse: tables-core.o SyntaxTree.o XmlTree.o relax-parse.o
 
 statistics: tables-core.o AlignmentPhrase.o statistics.o
 	$(CXX) $^ -o statistics
+
+
diff --git a/scripts/training/phrase-extract/extract-lex.cpp b/scripts/training/phrase-extract/extract-lex.cpp
new file mode 100644
index 000000000..7e878ae9e
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-lex.cpp
@@ -0,0 +1,172 @@
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <vector>
+#include "extract-lex.h"
+
+using namespace std;
+
+// Reads the target, source and alignment files line by line in lockstep, feeds
+// every sentence pair to ExtractLex and finally writes both count tables.
+int main(int argc, char* argv[])
+{
+  cerr << "Starting...\n";
+  // argv[1] to argv[5] are used below, so refuse to run with fewer arguments
+  if (argc < 6) {
+    cerr << "Usage: extract-lex target source alignment lex.s2t lex.t2s\n";
+    return 1;
+  }
+
+  char* &filePathTarget = argv[1];
+  char* &filePathSource = argv[2];
+  char* &filePathAlign  = argv[3];
+  char* &filePathLexS2T = argv[4];
+  char* &filePathLexT2S = argv[5];
+  ifstream streamTarget(filePathTarget);
+  ifstream streamSource(filePathSource);
+  ifstream streamAlign(filePathAlign);
+  ofstream streamLexS2T(filePathLexS2T);
+  ofstream streamLexT2S(filePathLexT2S);
+  if (!streamTarget || !streamSource || !streamAlign || !streamLexS2T || !streamLexT2S) {
+    cerr << "Could not open one of the input or output files\n";
+    return 1;
+  }
+
+  ExtractLex extractSingleton;
+  string lineTarget, lineSource, lineAlign;
+  while (getline(streamTarget, lineTarget))
+  {
+    // the source and alignment files must have a line for every target line
+    istream &isSource = getline(streamSource, lineSource);
+    assert(isSource);
+    istream &isAlign = getline(streamAlign, lineAlign);
+    assert(isAlign);
+
+    vector<string> toksTarget, toksSource, toksAlign;
+    Tokenize(toksTarget, lineTarget);
+    Tokenize(toksSource, lineSource);
+    Tokenize(toksAlign, lineAlign);
+
+    cerr  << endl
+          << toksTarget.size() << " " << lineTarget << endl
+          << toksSource.size() << " " << lineSource << endl 
+          << toksAlign.size() << " " << lineAlign << endl;
+    extractSingleton.Process(toksTarget, toksSource, toksAlign);
+  }
+
+  extractSingleton.Output(streamLexS2T, streamLexT2S);
+  streamLexS2T.close();
+  streamLexT2S.close();
+  cerr << "Finished\n";
+}
+
+const std::string *Vocab::GetOrAdd(const std::string &word)
+{
+  // set::insert hands back the stored element whether or not the word was new
+  return &*m_coll.insert(word).first;
+}
+
+// Counts every alignment link of one sentence pair. Each alignment token has
+// the form "<source index>-<target index>", indexing into the two token lists.
+void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource, vector<string> &toksAlign)
+{
+  const size_t numLinks = toksAlign.size();
+  for (size_t linkInd = 0; linkInd < numLinks; ++linkInd)
+  {
+    // split "i-j" into its two positions
+    vector<size_t> link;
+    Tokenize(link, toksAlign[linkInd], "-");
+    assert(link.size() == 2);
+
+    const size_t sourcePos = link[0];
+    const size_t targetPos = link[1];
+    assert(sourcePos < toksSource.size());
+    assert(targetPos < toksTarget.size());
+
+    // intern both words so that equal words share one string object
+    const string *source = m_vocab.GetOrAdd(toksSource[sourcePos]);
+    const string *target = m_vocab.GetOrAdd(toksTarget[targetPos]);
+
+    Process(target, source);
+  }
+}
+
+float COUNT_INCR = 1; // initial value and per-link increment of every count
+
+void ExtractLex::Process(const std::string *target, const std::string *source)
+{
+  // fresh single-count entries for both words of the aligned pair
+  const WordCount srcEntry(source, COUNT_INCR);
+  const WordCount tgtEntry(target, COUNT_INCR);
+  Process(srcEntry, tgtEntry, m_collS2T);  // source word -> target words
+  Process(tgtEntry, srcEntry, m_collT2S);  // target word -> source words
+}
+
+// Records one co-occurrence of `in` and `out` in `coll`. The key carries the
+// count of `in`; each element of its set carries the count of the (in, out) pair.
+// Both arguments must arrive with a count of COUNT_INCR (asserted below).
+void ExtractLex::Process(const WordCount &in, const WordCount &out, std::map<WordCount, WordCountColl> &coll)
+{
+  assert(in.GetCount() == COUNT_INCR);
+  assert(out.GetCount() == COUNT_INCR);
+
+  WordCountColl *wcColl = NULL;
+  std::map<WordCount, WordCountColl>::iterator iterMap = coll.find(in);
+  if (iterMap == coll.end())
+  {
+    // first sighting of `in`: the stored copy of the key already carries COUNT_INCR
+    wcColl = &coll[in];
+  }
+  else
+  {
+    // m_count is mutable, so the count can be bumped through the const key
+    iterMap->first.AddCount(COUNT_INCR);
+    wcColl = &iterMap->second;
+  }
+  assert(wcColl);
+
+  // A newly inserted pair already holds COUNT_INCR copied from `out`; only bump
+  // the count when the pair existed before, otherwise every pair is over-counted.
+  pair<WordCountColl::iterator, bool> iterSet = wcColl->insert(out);
+  if (!iterSet.second)
+    iterSet.first->AddCount(COUNT_INCR);
+}
+
+void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S)
+{
+  Output(m_collS2T, streamLexS2T); // table keyed by source word
+  Output(m_collT2S, streamLexT2S); // table keyed by target word
+}
+
+// Writes one line per (in, out) entry of `coll` to outStream, in the form
+//   <in word> <out word> <count stored for in> <count stored for out>
+void ExtractLex::Output(const std::map<WordCount, WordCountColl> &coll, std::ofstream &outStream)
+{
+  std::map<WordCount, WordCountColl>::const_iterator iterOuter;
+  for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter)
+  {
+    const WordCount &in = iterOuter->first;
+    const WordCountColl &outColl = iterOuter->second;
+    WordCountColl::const_iterator iterInner;
+    for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner)
+    {
+      // '\n' rather than endl: no need to flush the file after every entry
+      outStream << in.GetString() << " " << iterInner->GetString()
+                << " " << in.GetCount() << " " << iterInner->GetCount() << '\n';
+    }
+  }
+}
+
+std::ostream& operator<<(std::ostream &out, const WordCount &obj)
+{
+  // rendered as word(count)
+  return out << obj.GetString() << "(" << obj.GetCount() << ")";
+}
+
+void WordCount::AddCount(float incr) const
+{
+  m_count += incr; // allowed in a const method because m_count is declared mutable
+  cerr << *this << endl; // traces the updated entry on stderr
+}
+
+
diff --git a/scripts/training/phrase-extract/extract-lex.h b/scripts/training/phrase-extract/extract-lex.h
new file mode 100644
index 000000000..5e186df16
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-lex.h
@@ -0,0 +1,112 @@
+#pragma once
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+//! convert string to variable of type T. Used for reading floats, ints etc. from files
+template<typename T>
+inline T Scan(const std::string &input)
+{
+	std::stringstream stream(input);
+	T ret = T(); // value-initialised: stays well-defined (0 for numbers) if extraction reads nothing
+	stream >> ret;
+	return ret;
+}
+
+
+//! element-wise version of the above: converts every string of input into output
+template<typename T>
+inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
+{
+	// output is overwritten: it ends up with exactly input.size() elements
+	const size_t count = input.size();
+	output.resize(count);
+	for (size_t idx = 0; idx != count; ++idx)
+		output[idx] = Scan<T>( input[idx] );
+}
+	
+
+//! Splits str at any character of delimiters and appends the tokens to output.
+//! Runs of delimiters never produce empty tokens; output is not cleared first.
+inline void Tokenize(std::vector<std::string> &output
+                     , const std::string& str
+                     , const std::string& delimiters = " \t")
+{
+  typedef std::string::size_type Pos;
+
+  // first character of the next token, or npos when only delimiters remain
+  Pos tokBegin = str.find_first_not_of(delimiters, 0);
+  while (tokBegin != std::string::npos) {
+    // first delimiter after the token, or npos when it runs to the end of str
+    const Pos tokEnd = str.find_first_of(delimiters, tokBegin);
+    // for tokEnd == npos the length wraps around and substr clamps it to the end
+    output.push_back(str.substr(tokBegin, tokEnd - tokBegin));
+    tokBegin = str.find_first_not_of(delimiters, tokEnd);
+  }
+}
+
+// typed version of above: tokens are converted to T with Scan, overwriting output
+template<typename T>
+inline void Tokenize( std::vector<T> &output
+										 , const std::string &input
+										 , const std::string& delimiters = " \t")
+{
+	std::vector<std::string> pieces;
+	Tokenize(pieces, input, delimiters);
+	Scan<T>(output, pieces);
+}
+
+//! A word (pointer to a string owned elsewhere) paired with a running count.
+class WordCount
+{
+	friend std::ostream& operator<<(std::ostream&, const WordCount&);
+public:
+  const std::string *m_str;
+  //! mutable so the count can be bumped on objects stored as set elements or map keys
+  mutable float m_count;
+
+  WordCount(const std::string *str, float count)
+  :m_str(str)
+  ,m_count(count)
+  {}
+
+  void AddCount(float incr) const;
+  //! returned by reference: avoids copying the word on every call
+  const std::string &GetString() const
+  { return *m_str; }
+  float GetCount() const
+  { return m_count; }
+
+	//! orders by pointer address, not by text: equal words must share one string object
+	inline bool operator<(const WordCount &other) const
+	{ return m_str < other.m_str; }
+};
+
+class Vocab
+{
+  std::set<std::string> m_coll; // every distinct word passed to GetOrAdd
+public:
+  const std::string *GetOrAdd(const std::string &word); // returns the address of the stored copy of word
+};
+
+typedef std::set<WordCount> WordCountColl;
+
+class ExtractLex
+{
+  Vocab m_vocab; // holds the strings that the WordCount entries point to
+  std::map<WordCount, WordCountColl> m_collS2T, m_collT2S; // keyed by source word / by target word
+
+  void Process(const std::string *target, const std::string *source); // counts one aligned word pair in both tables
+  void Process(const WordCount &in, const WordCount &out, std::map<WordCount, WordCountColl> &coll); // updates one table
+  void Output(const std::map<WordCount, WordCountColl> &coll, std::ofstream &outStream); // writes one table
+
+public:
+  void Process(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource, std::vector<std::string> &toksAlign); // one sentence pair
+  void Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S); // writes both tables
+
+};
+