Wrapping OnDiskPt: added OnDiskPt/Util.h containing the tokenization and query procedures, so that they can be called from the Python wrapper.

This commit is contained in:
Wilker Aziz 2012-11-13 17:27:33 +01:00
parent c767ee264f
commit 6bbf6db180
7 changed files with 1056 additions and 485 deletions

93
OnDiskPt/Util.h Normal file
View File

@ -0,0 +1,93 @@
#pragma once
#include <cassert>
#include <string>
#include <vector>
#include "OnDiskWrapper.h"
#include "Phrase.h"
#include "SourcePhrase.h"
#include "Word.h"
#include "PhraseNode.h"
namespace OnDiskPt
{
/**
 * Convert a single token into one or two words and append them to a phrase.
 *
 * A token that starts with '[' and ends with ']' is treated as a
 * non-terminal; anything else is a terminal.  A non-terminal token may hold
 * two bracketed labels back to back (a second '[' found at index 2 or
 * later), in which case the part before the second '[' is the source label
 * and the remainder is the target label.
 *
 * Declared inline because it is defined in a header: without it, including
 * this file from more than one translation unit violates the One Definition
 * Rule and fails at link time.
 *
 * @param phrase            phrase the new word(s) are appended to
 * @param token             token text
 * @param addSourceNonTerm  for a two-label non-terminal, append the source label
 * @param addTargetNonTerm  for a two-label non-terminal, append the target label
 * @param onDiskWrapper     supplies the vocabulary passed to Word::CreateFromString
 */
inline void Tokenize(Phrase &phrase
                     , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
                     , OnDiskWrapper &onDiskWrapper)
{
  const size_t tokSize = token.size();

  // Non-terminals are wrapped in square brackets; the size check keeps the
  // index accesses valid for an empty token.
  const bool nonTerm = tokSize > 0 && token[0] == '[' && token[tokSize - 1] == ']';

  // Start of the second label, if any.  The search begins at index 2 so the
  // opening bracket of the first label is never matched.
  const size_t splitPos = nonTerm ? token.find('[', 2) : std::string::npos;

  if (splitPos == std::string::npos) {
    // Either a terminal, or a non-terminal with only 1 word (lhs):
    // the whole token becomes a single word.
    WordPtr word(new Word());
    word->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(word);
    return;
  }

  // source & target non-terms
  if (addSourceNonTerm) {
    const std::string sourceStr = token.substr(0, splitPos);
    WordPtr word(new Word());
    word->CreateFromString(sourceStr, onDiskWrapper.GetVocab());
    phrase.AddWord(word);
  }

  if (addTargetNonTerm) {
    const std::string targetStr = token.substr(splitPos);
    WordPtr word(new Word());
    word->CreateFromString(targetStr, onDiskWrapper.GetVocab());
    phrase.AddWord(word);
  }
}
/**
 * Build a SourcePhrase from a sequence of already-split tokens.
 *
 * Every token except the last is tokenized with both the source and the
 * target non-terminal enabled; the last token is tokenized with only the
 * target non-terminal enabled (it is the LHS non-term).  An empty token list
 * yields an empty SourcePhrase.
 *
 * Declared inline because it is defined in a header and would otherwise be
 * multiply defined when the header is included from several translation units.
 *
 * @param tokens         tokens of one query line
 * @param onDiskWrapper  forwarded to the per-token Tokenize overload
 * @return the assembled source phrase
 */
inline SourcePhrase Tokenize(const std::vector<std::string>& tokens, OnDiskWrapper &onDiskWrapper)
{
  SourcePhrase sourcePhrase;

  if (!tokens.empty()) {
    const std::vector<std::string>::const_iterator last = tokens.end() - 1;

    for (std::vector<std::string>::const_iterator token = tokens.begin(); token != last; ++token) {
      Tokenize(sourcePhrase, *token, true, true, onDiskWrapper);
    }

    // last position. LHS non-term
    Tokenize(sourcePhrase, *last, false, true, onDiskWrapper);
  }

  return sourcePhrase;
}
/**
 * Walk the source-side tree along the words of a source phrase.
 *
 * Starts at the wrapper's root source node and follows one GetChild() step
 * per word.  The walk stops at the first word that has no child.
 *
 * Declared inline because it is defined in a header and would otherwise be
 * multiply defined when the header is included from several translation units.
 *
 * NOTE(review): ownership of the nodes returned by GetChild() is not visible
 * from this header -- confirm whether intermediate/returned nodes must be freed.
 *
 * @param sourcePhrase   phrase whose words are looked up in order
 * @param onDiskWrapper  provides the root node and is passed to GetChild()
 * @return the node reached after the last word, the root node for an empty
 *         phrase, or NULL if some word had no matching child
 */
inline const PhraseNode* Query(const SourcePhrase& sourcePhrase, OnDiskWrapper& onDiskWrapper)
{
  const PhraseNode *node = &onDiskWrapper.GetRootSourceNode();
  assert(node);

  // Descend one level per word; a NULL child ends the walk early.
  for (size_t pos = 0; node != NULL && pos < sourcePhrase.GetSize(); ++pos) {
    const Word &word = sourcePhrase.GetWord(pos);
    node = node->GetChild(word, onDiskWrapper);
  }

  return node;
}
}

View File

@ -9,6 +9,7 @@
#include "moses/Util.h"
#include "OnDiskWrapper.h"
#include "SourcePhrase.h"
#include "Util.h"
using namespace std;
using namespace OnDiskPt;
@ -17,55 +18,6 @@ void usage();
typedef unsigned int uint;
// Turns one token into word(s) appended to `phrase`.
// A token of the form "[...]" is a non-terminal; if it holds a second '['
// (searched from index 2) it is split into a source part and a target part,
// each appended only when the matching flag is set.  Any other token is a
// terminal and is appended as a single word.
void Tokenize(OnDiskPt::Phrase &phrase
              , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
              , OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
  const size_t length = token.size();

  // The closing bracket is only inspected once the opening one has matched.
  const bool bracketed = token.compare(0, 1, "[") == 0
                         && token.compare(length - 1, 1, "]") == 0;

  if (!bracketed) {
    // term
    WordPtr plainWord(new Word());
    plainWord->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(plainWord);
    return;
  }

  // non-term
  const size_t secondOpen = token.find_first_of("[", 2);
  const string firstPart = token.substr(0, secondOpen);

  if (secondOpen == string::npos) {
    // lhs - only 1 word
    WordPtr lhsWord(new Word());
    lhsWord->CreateFromString(firstPart, onDiskWrapper.GetVocab());
    phrase.AddWord(lhsWord);
    return;
  }

  // source & target non-terms
  if (addSourceNonTerm) {
    WordPtr sourceWord(new Word());
    sourceWord->CreateFromString(firstPart, onDiskWrapper.GetVocab());
    phrase.AddWord(sourceWord);
  }

  const string secondPart = token.substr(secondOpen, length - secondOpen);
  if (addTargetNonTerm) {
    WordPtr targetWord(new Word());
    targetWord->CreateFromString(secondPart, onDiskWrapper.GetVocab());
    phrase.AddWord(targetWord);
  }
}
int main(int argc, char **argv)
{
int tableLimit = 20;
@ -89,52 +41,20 @@ int main(int argc, char **argv)
if(ttable == "")
usage();
OnDiskWrapper onDiskWrapper;
OnDiskWrapper onDiskWrapper;
bool retDb = onDiskWrapper.BeginLoad(ttable);
CHECK(retDb);
cerr << "Ready..." << endl;
CHECK(retDb);
cerr << "Ready..." << endl;
std::string line;
while(getline(std::cin, line)) {
std::vector<std::string> tokens;
tokens = Moses::Tokenize(line, " ");
cerr << "line: " << line << endl;
// create source phrase
SourcePhrase sourcePhrase;
for (size_t pos = 0; pos < tokens.size(); ++pos)
{
const string &tok = tokens[pos];
if (pos == tokens.size() - 1)
{ // last position. LHS non-term
Tokenize(sourcePhrase, tok, false, true, onDiskWrapper);
}
else
{
Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
}
}
const PhraseNode *node = &onDiskWrapper.GetRootSourceNode();
cerr << "node=" << node << endl;
assert(node);
for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos)
{
const Word &word = sourcePhrase.GetWord(pos);
cerr << word << " ";
node = node->GetChild(word, onDiskWrapper);
cerr << "node=" << node << endl;
if (node == NULL)
{
break;
}
}
cerr << "line: " << line << endl;
SourcePhrase sourcePhrase = Tokenize(tokens, onDiskWrapper);
const PhraseNode* node = Query(sourcePhrase, onDiskWrapper);
if (node)
{ // source phrase points to a bunch of rules
@ -148,8 +68,6 @@ int main(int argc, char **argv)
cerr << " ";
targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
cerr << endl;
}
}
else
@ -162,12 +80,11 @@ int main(int argc, char **argv)
}
cerr << "Finished." << endl;
}
void usage()
{
std::cerr << "Usage: queryOnDiskPt [-n <nscores>] [-a] -t <ttable>\n"
std::cerr << "Usage: queryOnDiskPt [-n <nscores>] [-a] -t <ttable>\n"
"-tlimit <table limit> max number of rules per source phrase (default: 20)\n"
"-t <ttable> phrase table\n";
exit(1);

View File

@ -1,6 +1,5 @@
from libcpp.string cimport string
from libcpp.vector cimport vector
from libcpp.pair cimport pair
ctypedef string* str_pointer
ctypedef string* const_str_pointer "const str_pointer"
@ -16,8 +15,6 @@ cdef extern from 'PhraseDictionaryTree.h' namespace 'Moses':
Tokens fnames
Scores fvalues
cdef cppclass PhraseDictionaryTree:
PhraseDictionaryTree(unsigned nscores)
void UseWordAlignment(bint use)

View File

@ -0,0 +1,45 @@
from libcpp.string cimport string
from libcpp.vector cimport vector
# Cython declarations for the parts of the C++ OnDiskPt namespace that are
# exposed to the Python wrapper.

# Forward declarations: the classes are named here without members so that
# they can be referenced in the signatures declared further down.
cdef extern from 'Word.h' namespace 'OnDiskPt':
cdef cppclass Word
cdef extern from 'Phrase.h' namespace 'OnDiskPt':
cdef cppclass Phrase
cdef extern from 'SourcePhrase.h' namespace 'OnDiskPt':
cdef cppclass SourcePhrase
# TargetPhrase and the early OnDiskWrapper forward declaration are disabled.
#cdef extern from 'TargetPhrase.h' namespace 'OnDiskPt':
# cdef cppclass TargetPhrase
#cdef extern from 'OnDiskWrapper.h' namespace 'OnDiskPt':
# cdef cppclass OnDiskWrapper
cdef extern from 'TargetPhraseCollection.h' namespace 'OnDiskPt':
cdef cppclass TargetPhraseCollection
cdef extern from 'PhraseNode.h' namespace 'OnDiskPt':
cdef cppclass PhraseNode
# cdef cppclass PhraseNodePointer 'PhaseNode*'
cdef extern from 'OnDiskWrapper.h' namespace 'OnDiskPt':
cdef cppclass OnDiskWrapper
# PhraseNode: a node of the source tree; both methods take the wrapper.
cdef extern from 'PhraseNode.h' namespace 'OnDiskPt':
cdef cppclass PhraseNode:
# Child node for `word`.
PhraseNode* GetChild(Word& word, OnDiskWrapper& wrapper)
# Target phrases stored at this node; `tableLimit` is passed through to C++.
TargetPhraseCollection* GetTargetPhraseCollection(unsigned tableLimit, OnDiskWrapper& wrapper)
# OnDiskWrapper: default-constructible; BeginLoad returns a boolean status.
cdef extern from 'OnDiskWrapper.h' namespace 'OnDiskPt':
cdef cppclass OnDiskWrapper:
OnDiskWrapper()
bint BeginLoad(string& path)
PhraseNode& GetRootSourceNode()
# Free functions from Util.h; only the vector overload of Tokenize is exposed,
# the per-token overload is left commented out.
cdef extern from 'Util.h' namespace 'OnDiskPt':
# cdef void Tokenize(Phrase& phrase, string& token, bint addSourceNonTerm, bint addTargetNonTerm, OnDiskWrapper& wrapper)
cdef SourcePhrase Tokenize(vector[string]& tokens, OnDiskWrapper& wrapper)

File diff suppressed because it is too large Load Diff

View File

@ -5,6 +5,7 @@ from libcpp.vector cimport vector
import os
import cython
cimport cdictree
cimport condiskpt
cpdef int fsign(float x):
"""Simply returns the sign of float x (zero is assumed +), it's defined here just so one gains a little bit with static typing"""
@ -87,7 +88,7 @@ cdef class PhraseDictionaryTree(object):
self.delimiters = delimiters
self.__tree = new cdictree.PhraseDictionaryTree(nscores)
self.__tree.UseWordAlignment(wa)
self.__tree.Read(string(path))
self.__tree.Read(path)
def __dealloc__(self):
del self.__tree
@ -115,7 +116,7 @@ cdef class PhraseDictionaryTree(object):
If 'cmp' is defined the return list is sorted.
If 'top' is defined, only the top elements will be returned."""
cdef bytes text = as_str(line)
cdef vector[string] fphrase = cdictree.Tokenize(string(text), string(self.delimiters))
cdef vector[string] fphrase = cdictree.Tokenize(text, self.delimiters)
cdef vector[cdictree.StringTgtCand]* rv = new vector[cdictree.StringTgtCand]()
cdef vector[string]* wa = NULL
cdef list phrases
@ -134,4 +135,20 @@ cdef class PhraseDictionaryTree(object):
return phrases[0:top]
else:
return phrases
cdef class OnDiskWrapper(object):
    """Python handle on a C++ OnDiskPt::OnDiskWrapper.

    The C++ object is allocated in __cinit__ and released in __dealloc__.
    """

    cdef condiskpt.OnDiskWrapper* wrapper
    cdef readonly bytes delimiters

    def __cinit__(self, bytes path, delimiters = ' \t'):
        """Allocate the C++ wrapper and load the table found at 'path'.

        'delimiters' are the characters used to split query lines into tokens.
        Raises IOError if BeginLoad reports failure.
        """
        self.delimiters = delimiters
        self.wrapper = new condiskpt.OnDiskWrapper()
        # BeginLoad returns a status flag; do not carry on with a wrapper that failed to load.
        if not self.wrapper.BeginLoad(string(path)):
            raise IOError('Could not load on-disk phrase table: %s' % path)

    def __dealloc__(self):
        # Release the C++ object created in __cinit__ (same pattern as PhraseDictionaryTree).
        del self.wrapper

    def query(self, line):
        """Split 'line' into tokens using self.delimiters.

        The lookup itself is not implemented yet (see the commented-out code),
        so nothing is returned.
        """
        cdef bytes text = as_str(line)
        cdef vector[string] ftokens = cdictree.Tokenize(text, self.delimiters)
        #print 'query:', ftokens
        #cdef condiskpt.SourcePhrase fphrase = condiskpt.Tokenize(ftokens, self.wrapper[0])

View File

@ -4,7 +4,7 @@ import sys
import os
mosesdir = os.path.abspath('../../')
includes = [mosesdir, os.path.join(mosesdir, 'moses')]
includes = [mosesdir, os.path.join(mosesdir, 'moses'), os.path.join(mosesdir, 'OnDiskPt')]
libdir = os.path.join(mosesdir, 'lib')
# options