mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
Wrapping OnDiskPt, Added OnDiskPt/Util.h with the tokenization procedure, so that they can be called from the Python wrap.
This commit is contained in:
parent
c767ee264f
commit
6bbf6db180
93
OnDiskPt/Util.h
Normal file
93
OnDiskPt/Util.h
Normal file
@ -0,0 +1,93 @@
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "OnDiskWrapper.h"
|
||||
#include "Phrase.h"
|
||||
#include "SourcePhrase.h"
|
||||
#include "Word.h"
|
||||
#include "PhraseNode.h"
|
||||
|
||||
|
||||
namespace OnDiskPt
|
||||
{
|
||||
|
||||
/**
 * Converts one token into Word object(s) and appends them to phrase.
 *
 * A token that starts with '[' and ends with ']' is treated as a
 * non-terminal; any other token is a terminal and is added as one word.
 * A non-terminal that contains a further '[' at index 2 or later
 * (e.g. "[X][Y]") is split at that bracket into a source part and a target
 * part, each of which is added only if the corresponding flag is set.
 * A non-terminal without such a bracket is added whole (the LHS case).
 *
 * @param phrase            phrase the new word(s) are appended to
 * @param token             text of the token to convert
 * @param addSourceNonTerm  add the part before the split of a two-part non-terminal
 * @param addTargetNonTerm  add the part from the split onwards of a two-part non-terminal
 * @param onDiskWrapper     provides the vocabulary handed to Word::CreateFromString
 *
 * Declared inline because the definition lives in a header: without it,
 * including Util.h from more than one translation unit produces
 * multiple-definition errors at link time.
 */
inline void Tokenize(Phrase &phrase
                     , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
                     , OnDiskWrapper &onDiskWrapper)
{
  const size_t tokSize = token.size();

  // The emptiness test comes first so that tokSize - 1 can never wrap around.
  const bool nonTerm = tokSize > 0 && token[0] == '[' && token[tokSize - 1] == ']';

  // Start of the second bracketed label, if there is one. The search begins
  // at index 2 so the opening bracket of the first label is never matched.
  const size_t splitPos = nonTerm ? token.find('[', 2) : std::string::npos;

  if (splitPos == std::string::npos) {
    // term, or lhs non-term - only 1 word, made from the whole token
    WordPtr word(new Word());
    word->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(word);
    return;
  }

  // source & target non-terms
  if (addSourceNonTerm) {
    const std::string sourceStr = token.substr(0, splitPos);
    WordPtr word(new Word());
    word->CreateFromString(sourceStr, onDiskWrapper.GetVocab());
    phrase.AddWord(word);
  }

  if (addTargetNonTerm) {
    const std::string targetStr = token.substr(splitPos);
    WordPtr word(new Word());
    word->CreateFromString(targetStr, onDiskWrapper.GetVocab());
    phrase.AddWord(word);
  }
}
|
||||
|
||||
/**
 * Builds a SourcePhrase from a sequence of tokens.
 *
 * Every token except the last is converted with both non-terminal flags
 * set. The last token is converted with addSourceNonTerm = false, because
 * the final position holds the LHS non-terminal.
 *
 * @param tokens         tokens of one input line; may be empty
 * @param onDiskWrapper  forwarded to the single-token Tokenize overload
 * @return the assembled phrase; empty when tokens is empty
 *
 * Declared inline because the definition lives in a header: without it,
 * including Util.h from more than one translation unit produces
 * multiple-definition errors at link time.
 */
inline SourcePhrase Tokenize(const std::vector<std::string>& tokens, OnDiskWrapper &onDiskWrapper)
{
  SourcePhrase sourcePhrase;

  if (!tokens.empty()) {
    const size_t lastPos = tokens.size() - 1;

    for (size_t pos = 0; pos < lastPos; ++pos) {
      Tokenize(sourcePhrase, tokens[pos], true, true, onDiskWrapper);
    }

    // last position. LHS non-term
    Tokenize(sourcePhrase, tokens[lastPos], false, true, onDiskWrapper);
  }

  return sourcePhrase;
}
|
||||
|
||||
/**
 * Looks up a source phrase by walking down from the root source node,
 * following one child per word.
 *
 * @param sourcePhrase   words to follow, in order
 * @param onDiskWrapper  provides the root source node and is passed on to
 *                       PhraseNode::GetChild
 * @return the node reached after the last word, or NULL as soon as
 *         GetChild returns NULL for a word. For an empty phrase the root
 *         node itself is returned.
 *
 * Declared inline because the definition lives in a header: without it,
 * including Util.h from more than one translation unit produces
 * multiple-definition errors at link time.
 */
inline const PhraseNode* Query(const SourcePhrase& sourcePhrase, OnDiskWrapper& onDiskWrapper)
{
  const PhraseNode *node = &onDiskWrapper.GetRootSourceNode();
  assert(node);

  // The node test in the loop condition ends the walk at the first missing
  // child and keeps the function defined even when assert() is compiled out.
  const size_t numWords = sourcePhrase.GetSize();
  for (size_t pos = 0; node != NULL && pos < numWords; ++pos) {
    const Word &word = sourcePhrase.GetWord(pos);
    node = node->GetChild(word, onDiskWrapper);
  }

  return node;
}
|
||||
|
||||
}
|
@ -9,6 +9,7 @@
|
||||
#include "moses/Util.h"
|
||||
#include "OnDiskWrapper.h"
|
||||
#include "SourcePhrase.h"
|
||||
#include "Util.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace OnDiskPt;
|
||||
@ -17,55 +18,6 @@ void usage();
|
||||
|
||||
typedef unsigned int uint;
|
||||
|
||||
// Converts one token into Word object(s) appended to phrase.
// A token wrapped in '[' ... ']' is a non-terminal; if it holds a second '['
// at index 2 or later it is split there into a source and a target part,
// each added only when its flag is set. Any other token is added as one word.
void Tokenize(OnDiskPt::Phrase &phrase
              , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
              , OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
  const size_t len = token.size();

  // The closing bracket is only inspected once the opening bracket is
  // known to be present, so len is at least 1 in the second comparison.
  const bool isNonTerm = token.compare(0, 1, "[") == 0
                         && token.compare(len - 1, 1, "]") == 0;

  if (!isNonTerm) {
    // term
    WordPtr termWord(new Word());
    termWord->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(termWord);
    return;
  }

  // non-term
  const size_t secondOpen = token.find_first_of("[", 2);
  const string head = token.substr(0, secondOpen);

  if (secondOpen == string::npos) {
    // lhs - only 1 word
    WordPtr lhsWord(new Word());
    lhsWord->CreateFromString(head, onDiskWrapper.GetVocab());
    phrase.AddWord(lhsWord);
    return;
  }

  // source & target non-terms
  if (addSourceNonTerm) {
    WordPtr sourceWord(new Word());
    sourceWord->CreateFromString(head, onDiskWrapper.GetVocab());
    phrase.AddWord(sourceWord);
  }

  const string tail = token.substr(secondOpen, len - secondOpen);
  if (addTargetNonTerm) {
    WordPtr targetWord(new Word());
    targetWord->CreateFromString(tail, onDiskWrapper.GetVocab());
    phrase.AddWord(targetWord);
  }
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int tableLimit = 20;
|
||||
@ -89,52 +41,20 @@ int main(int argc, char **argv)
|
||||
if(ttable == "")
|
||||
usage();
|
||||
|
||||
OnDiskWrapper onDiskWrapper;
|
||||
OnDiskWrapper onDiskWrapper;
|
||||
bool retDb = onDiskWrapper.BeginLoad(ttable);
|
||||
CHECK(retDb);
|
||||
|
||||
cerr << "Ready..." << endl;
|
||||
|
||||
CHECK(retDb);
|
||||
|
||||
cerr << "Ready..." << endl;
|
||||
|
||||
std::string line;
|
||||
while(getline(std::cin, line)) {
|
||||
std::vector<std::string> tokens;
|
||||
tokens = Moses::Tokenize(line, " ");
|
||||
|
||||
cerr << "line: " << line << endl;
|
||||
|
||||
// create source phrase
|
||||
SourcePhrase sourcePhrase;
|
||||
|
||||
for (size_t pos = 0; pos < tokens.size(); ++pos)
|
||||
{
|
||||
const string &tok = tokens[pos];
|
||||
|
||||
if (pos == tokens.size() - 1)
|
||||
{ // last position. LHS non-term
|
||||
Tokenize(sourcePhrase, tok, false, true, onDiskWrapper);
|
||||
}
|
||||
else
|
||||
{
|
||||
Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
|
||||
}
|
||||
}
|
||||
|
||||
const PhraseNode *node = &onDiskWrapper.GetRootSourceNode();
|
||||
cerr << "node=" << node << endl;
|
||||
assert(node);
|
||||
|
||||
for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos)
|
||||
{
|
||||
const Word &word = sourcePhrase.GetWord(pos);
|
||||
cerr << word << " ";
|
||||
node = node->GetChild(word, onDiskWrapper);
|
||||
cerr << "node=" << node << endl;
|
||||
|
||||
if (node == NULL)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
cerr << "line: " << line << endl;
|
||||
SourcePhrase sourcePhrase = Tokenize(tokens, onDiskWrapper);
|
||||
const PhraseNode* node = Query(sourcePhrase, onDiskWrapper);
|
||||
|
||||
if (node)
|
||||
{ // source phrase points to a bunch of rules
|
||||
@ -148,8 +68,6 @@ int main(int argc, char **argv)
|
||||
cerr << " ";
|
||||
targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
|
||||
cerr << endl;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -162,12 +80,11 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
cerr << "Finished." << endl;
|
||||
|
||||
}
|
||||
|
||||
void usage()
|
||||
{
|
||||
std::cerr << "Usage: queryOnDiskPt [-n <nscores>] [-a] -t <ttable>\n"
|
||||
std::cerr << "Usage: queryOnDiskPt [-n <nscores>] [-a] -t <ttable>\n"
|
||||
"-tlimit <table limit> max number of rules per source phrase (default: 20)\n"
|
||||
"-t <ttable> phrase table\n";
|
||||
exit(1);
|
||||
|
@ -1,6 +1,5 @@
|
||||
from libcpp.string cimport string
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.pair cimport pair
|
||||
|
||||
ctypedef string* str_pointer
|
||||
ctypedef string* const_str_pointer "const str_pointer"
|
||||
@ -16,8 +15,6 @@ cdef extern from 'PhraseDictionaryTree.h' namespace 'Moses':
|
||||
Tokens fnames
|
||||
Scores fvalues
|
||||
|
||||
|
||||
|
||||
cdef cppclass PhraseDictionaryTree:
|
||||
PhraseDictionaryTree(unsigned nscores)
|
||||
void UseWordAlignment(bint use)
|
||||
|
45
contrib/python/moses/condiskpt.pxd
Normal file
45
contrib/python/moses/condiskpt.pxd
Normal file
@ -0,0 +1,45 @@
|
||||
from libcpp.string cimport string
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
# Opaque declarations of the OnDiskPt C++ classes used by this wrapper.
# No members are exposed here; PhraseNode and OnDiskWrapper are declared
# again with their methods further down in this file.
cdef extern from 'Word.h' namespace 'OnDiskPt':
|
||||
cdef cppclass Word
|
||||
|
||||
cdef extern from 'Phrase.h' namespace 'OnDiskPt':
|
||||
cdef cppclass Phrase
|
||||
|
||||
cdef extern from 'SourcePhrase.h' namespace 'OnDiskPt':
|
||||
cdef cppclass SourcePhrase
|
||||
|
||||
# Disabled: TargetPhrase is not exposed to Python yet.
#cdef extern from 'TargetPhrase.h' namespace 'OnDiskPt':
|
||||
# cdef cppclass TargetPhrase
|
||||
|
||||
# Disabled: superseded by the active OnDiskWrapper declaration below.
#cdef extern from 'OnDiskWrapper.h' namespace 'OnDiskPt':
|
||||
# cdef cppclass OnDiskWrapper
|
||||
|
||||
cdef extern from 'TargetPhraseCollection.h' namespace 'OnDiskPt':
|
||||
cdef cppclass TargetPhraseCollection
|
||||
|
||||
cdef extern from 'PhraseNode.h' namespace 'OnDiskPt':
|
||||
cdef cppclass PhraseNode
|
||||
# cdef cppclass PhraseNodePointer 'PhraseNode*'
|
||||
|
||||
cdef extern from 'OnDiskWrapper.h' namespace 'OnDiskPt':
|
||||
cdef cppclass OnDiskWrapper
|
||||
|
||||
# Methods of OnDiskPt::PhraseNode exposed to Cython.
cdef extern from 'PhraseNode.h' namespace 'OnDiskPt':
|
||||
cdef cppclass PhraseNode:
|
||||
# Child node for `word`. Query() in OnDiskPt/Util.h treats a NULL
# result as "no such child" and stops the lookup there.
PhraseNode* GetChild(Word& word, OnDiskWrapper& wrapper)
|
||||
# Target phrases stored at this node.
# NOTE(review): tableLimit presumably caps how many are returned (cf. the
# "-tlimit" option of queryOnDiskPt) - confirm against PhraseNode.h.
TargetPhraseCollection* GetTargetPhraseCollection(unsigned tableLimit, OnDiskWrapper& wrapper)
|
||||
|
||||
# Methods of OnDiskPt::OnDiskWrapper exposed to Cython.
cdef extern from 'OnDiskWrapper.h' namespace 'OnDiskPt':
|
||||
|
||||
cdef cppclass OnDiskWrapper:
|
||||
# Default constructor; a table is attached afterwards with BeginLoad().
OnDiskWrapper()
|
||||
# Loads the table at `path`. The boolean result is what queryOnDiskPt
# checks with CHECK(retDb) immediately after the call.
bint BeginLoad(string& path)
|
||||
# Root of the source-side tree; Query() in OnDiskPt/Util.h starts its
# walk from this node.
PhraseNode& GetRootSourceNode()
|
||||
|
||||
# Free functions from OnDiskPt/Util.h.
cdef extern from 'Util.h' namespace 'OnDiskPt':
|
||||
# Single-token overload, currently not exposed.
# cdef void Tokenize(Phrase& phrase, string& token, bint addSourceNonTerm, bint addTargetNonTerm, OnDiskWrapper& wrapper)
|
||||
# Builds a SourcePhrase from all tokens of a line; the last token is
# converted with addSourceNonTerm = false (see Util.h).
cdef SourcePhrase Tokenize(vector[string]& tokens, OnDiskWrapper& wrapper)
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -5,6 +5,7 @@ from libcpp.vector cimport vector
|
||||
import os
|
||||
import cython
|
||||
cimport cdictree
|
||||
cimport condiskpt
|
||||
|
||||
cpdef int fsign(float x):
|
||||
"""Simply returns the sign of float x (zero is assumed +), it's defined here just so one gains a little bit with static typing"""
|
||||
@ -87,7 +88,7 @@ cdef class PhraseDictionaryTree(object):
|
||||
self.delimiters = delimiters
|
||||
self.__tree = new cdictree.PhraseDictionaryTree(nscores)
|
||||
self.__tree.UseWordAlignment(wa)
|
||||
self.__tree.Read(string(path))
|
||||
self.__tree.Read(path)
|
||||
|
||||
def __dealloc__(self):
|
||||
del self.__tree
|
||||
@ -115,7 +116,7 @@ cdef class PhraseDictionaryTree(object):
|
||||
If 'cmp' is defined the return list is sorted.
|
||||
If 'top' is defined, onlye the top elements will be returned."""
|
||||
cdef bytes text = as_str(line)
|
||||
cdef vector[string] fphrase = cdictree.Tokenize(string(text), string(self.delimiters))
|
||||
cdef vector[string] fphrase = cdictree.Tokenize(text, self.delimiters)
|
||||
cdef vector[cdictree.StringTgtCand]* rv = new vector[cdictree.StringTgtCand]()
|
||||
cdef vector[string]* wa = NULL
|
||||
cdef list phrases
|
||||
@ -134,4 +135,20 @@ cdef class PhraseDictionaryTree(object):
|
||||
return phrases[0:top]
|
||||
else:
|
||||
return phrases
|
||||
|
||||
cdef class OnDiskWrapper(object):
    """Python handle on an OnDiskPt::OnDiskWrapper (on-disk phrase table).

    The underlying C++ object is created and loaded in __cinit__ and freed
    in __dealloc__.

    :param path: location of the table, passed to OnDiskWrapper::BeginLoad
    :param delimiters: characters used to split a query line into tokens
    :raises IOError: if BeginLoad reports failure
    """

    cdef condiskpt.OnDiskWrapper* wrapper
    cdef readonly bytes delimiters

    def __cinit__(self, bytes path, delimiters = ' \t'):
        self.delimiters = delimiters
        self.wrapper = new condiskpt.OnDiskWrapper()
        # BeginLoad returns false on failure; surface that instead of
        # handing back an object with no table behind it.
        if not self.wrapper.BeginLoad(string(path)):
            raise IOError('could not load on-disk phrase table: %s' % path)

    def __dealloc__(self):
        # Free the C++ object allocated in __cinit__, as PhraseDictionaryTree
        # does for its tree. Deleting a NULL pointer is a no-op.
        del self.wrapper

    def query(self, line):
        """Tokenizes `line` with the configured delimiters.

        The lookup itself is not wired up yet (see the disabled call below),
        so nothing is returned.
        """
        cdef bytes text = as_str(line)
        cdef vector[string] ftokens = cdictree.Tokenize(text, self.delimiters)
        #print 'query:', ftokens
        #cdef condiskpt.SourcePhrase fphrase = condiskpt.Tokenize(ftokens, self.wrapper[0])
|
||||
|
||||
|
@ -4,7 +4,7 @@ import sys
|
||||
import os
|
||||
|
||||
mosesdir = os.path.abspath('../../')
|
||||
includes = [mosesdir, os.path.join(mosesdir, 'moses')]
|
||||
includes = [mosesdir, os.path.join(mosesdir, 'moses'), os.path.join(mosesdir, 'OnDiskPt')]
|
||||
libdir = os.path.join(mosesdir, 'lib')
|
||||
|
||||
# options
|
||||
|
Loading…
Reference in New Issue
Block a user