This commit is contained in:
alvations 2015-04-26 20:23:39 +02:00
commit dda3ddd80b
495 changed files with 8669 additions and 5118 deletions

65
Jamroot
View File

@ -72,48 +72,42 @@
#--clean to clean
#--debug-build to build with Og. Only available with gcc 4.8+
import os ;
import option ;
import modules ;
import path ;
path-constant TOP : . ;
include $(TOP)/jam-files/sanity.jam ;
include $(TOP)/jam-files/server.jam ;
# exit : 0 ;
if [ build_server ] != no
{
xmlrpc-c-prefix = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --prefix" ] ;
echo "XMLRPC-C: BUILDING MOSES WITH XMLRPC_C LIBRARY VERSION $(xmlrpc-c-version) FROM $(xmlrpc-c-prefix)" ;
xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --cflags" ] ;
requirements += <define>HAVE_XMLRPC_C ;
requirements += <cxxflags>$(xmlrpc-cxxflags) ;
xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-c-config-cmd) c++2 abyss-server --libs" ] ;
for local i in [ SPLIT_BY_CHARACTERS $(xmlrpc-linkflags) : " " ]
{
local libname = [ MATCH "-l(xmlrpc.*)" : $(i) ] ;
if $(libname)
{
external-lib $(libname)
: : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
requirements += <library>$(libname) ;
}
local pathname = [ MATCH "-L(.*)" : $(i) ] ;
if $(pathname)
{
requirements += <library-path>$(pathname) ;
}
}
home = [ os.environ "HOME" ] ;
if [ path.exists $(home)/moses-environment.jam ]
{
# for those of us who don't like typing in command line bjam options all day long
include $(home)/moses-environment.jam ;
}
# echo $(requirements) ;
# exit 0 ;
include $(TOP)/jam-files/check-environment.jam ; # get resource locations
# from environment variables
include $(TOP)/jam-files/xmlrpc-c.jam ; # xmlrpc-c stuff for the server
include $(TOP)/jam-files/curlpp.jam ; # curlpp stuff for bias lookup (MMT only)
# exit "done" : 0 ;
max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
if ! [ option.get "max-kenlm-order" ]
{
# some classes in Moses pull in header files from KenLM, so this needs to be
# defined here, not in moses/lm/Jamfile
option.set "max-kenlm-order" : 6 ;
requirements += <define>KENLM_MAX_ORDER=$(max-order) ;
}
# exit "all done" : 0 ;
boost 104400 ;
external-lib z ;
lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
requirements += <library>dl ;
#lib dl : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
#requirements += <library>dl ;
if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] {
@ -139,6 +133,7 @@ if [ option.get "filter-warnings" : : "yes" ] {
requirements += <cxxflags>-Wno-unused-but-set-variable ;
requirements += <cxxflags>-Wno-unused-result ;
requirements += <cxxflags>-Wno-unused-variable ;
requirements += <cxxflags>-Wcomment ;
}
if [ option.get "debug-build" : : "yes" ] {
@ -228,10 +223,11 @@ build-projects lm util phrase-extract phrase-extract/syntax-common search moses
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
moses/TranslationModel/UG//bitext-find
moses/TranslationModel/UG//ptable-describe-features
moses/TranslationModel/UG//count-ptable-features
moses/TranslationModel/UG//ptable-lookup
moses/TranslationModel/UG//spe-check-coverage
# moses/TranslationModel/UG//spe-check-coverage
moses/TranslationModel/UG/mm//mtt-demo1
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
@ -307,6 +303,3 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
local temp = [ _shell "mkdir -p $(TOP)/bin" ] ;
local temp = [ _shell "rm $(TOP)/bin/moses_chart" ] ;
local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ;

View File

@ -51,7 +51,7 @@ void OnDiskWrapper::BeginLoad(const std::string &filePath)
if (!m_vocab.Load(*this))
UTIL_THROW(util::FileOpenException, "Couldn't load vocab");
UINT64 rootFilePos = GetMisc("RootNodeOffset");
uint64_t rootFilePos = GetMisc("RootNodeOffset");
m_rootSourceNode = new PhraseNode(rootFilePos, *this);
}
@ -102,7 +102,7 @@ bool OnDiskWrapper::LoadMisc()
const string &key = tokens[0];
m_miscInfo[key] = Moses::Scan<UINT64>(tokens[1]);
m_miscInfo[key] = Moses::Scan<uint64_t>(tokens[1]);
}
return true;
@ -199,17 +199,17 @@ void OnDiskWrapper::SaveMisc()
size_t OnDiskWrapper::GetSourceWordSize() const
{
return sizeof(UINT64) + sizeof(char);
return sizeof(uint64_t) + sizeof(char);
}
size_t OnDiskWrapper::GetTargetWordSize() const
{
return sizeof(UINT64) + sizeof(char);
return sizeof(uint64_t) + sizeof(char);
}
UINT64 OnDiskWrapper::GetMisc(const std::string &key) const
uint64_t OnDiskWrapper::GetMisc(const std::string &key) const
{
std::map<std::string, UINT64>::const_iterator iter;
std::map<std::string, uint64_t>::const_iterator iter;
iter = m_miscInfo.find(key);
UTIL_THROW_IF2(iter == m_miscInfo.end()
, "Couldn't find value for key " << key
@ -243,7 +243,7 @@ Word *OnDiskWrapper::ConvertFromMoses(const std::vector<Moses::FactorType> &fact
} // for (size_t factorType
bool found;
UINT64 vocabId = m_vocab.GetVocabId(strme.str(), found);
uint64_t vocabId = m_vocab.GetVocabId(strme.str(), found);
if (!found) {
// factor not in phrase table -> phrase definitely not in. exit
delete newWord;

View File

@ -43,7 +43,7 @@ protected:
size_t m_defaultNodeSize;
PhraseNode *m_rootSourceNode;
std::map<std::string, UINT64> m_miscInfo;
std::map<std::string, uint64_t> m_miscInfo;
void SaveMisc();
bool OpenForLoad(const std::string &filePath);
@ -105,7 +105,7 @@ public:
return *m_rootSourceNode;
}
UINT64 GetMisc(const std::string &key) const;
uint64_t GetMisc(const std::string &key) const;
Word *ConvertFromMoses(const std::vector<Moses::FactorType> &factorsVec
, const Moses::Word &origWord) const;

View File

@ -31,8 +31,8 @@ namespace OnDiskPt
size_t PhraseNode::GetNodeSize(size_t numChildren, size_t wordSize, size_t countSize)
{
size_t ret = sizeof(UINT64) * 2 // num children, value
+ (wordSize + sizeof(UINT64)) * numChildren // word + ptr to next source node
size_t ret = sizeof(uint64_t) * 2 // num children, value
+ (wordSize + sizeof(uint64_t)) * numChildren // word + ptr to next source node
+ sizeof(float) * countSize; // count info
return ret;
}
@ -45,7 +45,7 @@ PhraseNode::PhraseNode()
{
}
PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
PhraseNode::PhraseNode(uint64_t filePos, OnDiskWrapper &onDiskWrapper)
:m_counts(onDiskWrapper.GetNumCounts())
{
// load saved node
@ -55,26 +55,26 @@ PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
std::fstream &file = onDiskWrapper.GetFileSource();
file.seekg(filePos);
assert(filePos == (UINT64)file.tellg());
assert(filePos == (uint64_t)file.tellg());
file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
file.read((char*) &m_numChildrenLoad, sizeof(uint64_t));
size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
m_memLoad = (char*) malloc(memAlloc);
// go to start of node again
file.seekg(filePos);
assert(filePos == (UINT64)file.tellg());
assert(filePos == (uint64_t)file.tellg());
// read everything into memory
file.read(m_memLoad, memAlloc);
assert(filePos + memAlloc == (UINT64)file.tellg());
assert(filePos + memAlloc == (uint64_t)file.tellg());
// get value
m_value = ((UINT64*)m_memLoad)[1];
m_value = ((uint64_t*)m_memLoad)[1];
// get counts
float *memFloat = (float*) (m_memLoad + sizeof(UINT64) * 2);
float *memFloat = (float*) (m_memLoad + sizeof(uint64_t) * 2);
assert(countSize == 1);
m_counts[0] = memFloat[0];
@ -108,10 +108,10 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi
//memset(mem, 0xfe, memAlloc);
size_t memUsed = 0;
UINT64 *memArray = (UINT64*) mem;
uint64_t *memArray = (uint64_t*) mem;
memArray[0] = GetSize(); // num of children
memArray[1] = m_value; // file pos of corresponding target phrases
memUsed += 2 * sizeof(UINT64);
memUsed += 2 * sizeof(uint64_t);
// count info
float *memFloat = (float*) (mem + memUsed);
@ -133,9 +133,9 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi
size_t wordMemUsed = childWord.WriteToMemory(currMem);
memUsed += wordMemUsed;
UINT64 *memArray = (UINT64*) (mem + memUsed);
uint64_t *memArray = (uint64_t*) (mem + memUsed);
memArray[0] = childNode.GetFilePos();
memUsed += sizeof(UINT64);
memUsed += sizeof(uint64_t);
}
@ -148,7 +148,7 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi
file.seekp(0, ios::end);
file.write(mem, memUsed);
UINT64 endPos = file.tellp();
uint64_t endPos = file.tellp();
assert(m_filePos + memUsed == endPos);
free(mem);
@ -206,7 +206,7 @@ const PhraseNode *PhraseNode::GetChild(const Word &wordSought, OnDiskWrapper &on
x = (l + r) / 2;
Word wordFound;
UINT64 childFilePos;
uint64_t childFilePos;
GetChild(wordFound, childFilePos, x, onDiskWrapper);
if (wordSought == wordFound) {
@ -222,14 +222,14 @@ const PhraseNode *PhraseNode::GetChild(const Word &wordSought, OnDiskWrapper &on
return ret;
}
void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
void PhraseNode::GetChild(Word &wordFound, uint64_t &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
{
size_t wordSize = onDiskWrapper.GetSourceWordSize();
size_t childSize = wordSize + sizeof(UINT64);
size_t childSize = wordSize + sizeof(uint64_t);
char *currMem = m_memLoad
+ sizeof(UINT64) * 2 // size & file pos of target phrase coll
+ sizeof(uint64_t) * 2 // size & file pos of target phrase coll
+ sizeof(float) * onDiskWrapper.GetNumCounts() // count info
+ childSize * ind;
@ -237,15 +237,15 @@ void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnD
assert(memRead == childSize);
}
size_t PhraseNode::ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const
size_t PhraseNode::ReadChild(Word &wordFound, uint64_t &childFilePos, const char *mem) const
{
size_t memRead = wordFound.ReadFromMemory(mem);
const char *currMem = mem + memRead;
UINT64 *memArray = (UINT64*) (currMem);
uint64_t *memArray = (uint64_t*) (currMem);
childFilePos = memArray[0];
memRead += sizeof(UINT64);
memRead += sizeof(uint64_t);
return memRead;
}

View File

@ -36,7 +36,7 @@ class PhraseNode
{
friend std::ostream& operator<<(std::ostream&, const PhraseNode&);
protected:
UINT64 m_filePos, m_value;
uint64_t m_filePos, m_value;
typedef std::map<Word, PhraseNode> ChildColl;
ChildColl m_children;
@ -48,35 +48,35 @@ protected:
TargetPhraseCollection m_targetPhraseColl;
char *m_memLoad, *m_memLoadLast;
UINT64 m_numChildrenLoad;
uint64_t m_numChildrenLoad;
void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
size_t ReadChild(Word &wordFound, uint64_t &childFilePos, const char *mem) const;
void GetChild(Word &wordFound, uint64_t &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
public:
static size_t GetNodeSize(size_t numChildren, size_t wordSize, size_t countSize);
PhraseNode(); // unsaved node
PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper); // load saved node
PhraseNode(uint64_t filePos, OnDiskWrapper &onDiskWrapper); // load saved node
~PhraseNode();
void Add(const Word &word, UINT64 nextFilePos, size_t wordSize);
void Add(const Word &word, uint64_t nextFilePos, size_t wordSize);
void Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit);
void AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
, OnDiskWrapper &onDiskWrapper, size_t tableLimit
, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
UINT64 GetFilePos() const {
uint64_t GetFilePos() const {
return m_filePos;
}
UINT64 GetValue() const {
uint64_t GetValue() const {
return m_value;
}
void SetValue(UINT64 value) {
void SetValue(uint64_t value) {
m_value = value;
}
size_t GetSize() const {

View File

@ -103,17 +103,17 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
size_t spSize = sp->GetSize();
size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
size_t memNeeded = sizeof(UINT64) // num of words
size_t memNeeded = sizeof(uint64_t) // num of words
+ targetWordSize * phraseSize // actual words. lhs as last words
+ sizeof(UINT64) // num source words
+ sizeof(uint64_t) // num source words
+ sourceWordSize * spSize; // actual source words
memUsed = 0;
UINT64 *mem = (UINT64*) malloc(memNeeded);
uint64_t *mem = (uint64_t*) malloc(memNeeded);
// write size
mem[0] = phraseSize;
memUsed += sizeof(UINT64);
memUsed += sizeof(uint64_t);
// write each word
for (size_t pos = 0; pos < phraseSize; ++pos) {
@ -124,9 +124,9 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
// write size of source phrase and all source words
char *currPtr = (char*)mem + memUsed;
UINT64 *memTmp = (UINT64*) currPtr;
uint64_t *memTmp = (uint64_t*) currPtr;
memTmp[0] = spSize;
memUsed += sizeof(UINT64);
memUsed += sizeof(uint64_t);
for (size_t pos = 0; pos < spSize; ++pos) {
const Word &word = sp->GetWord(pos);
char *currPtr = (char*)mem + memUsed;
@ -145,13 +145,13 @@ void TargetPhrase::Save(OnDiskWrapper &onDiskWrapper)
std::fstream &file = onDiskWrapper.GetFileTargetInd();
UINT64 startPos = file.tellp();
uint64_t startPos = file.tellp();
file.seekp(0, ios::end);
file.write(mem, memUsed);
#ifndef NDEBUG
UINT64 endPos = file.tellp();
uint64_t endPos = file.tellp();
assert(startPos + memUsed == endPos);
#endif
@ -167,11 +167,11 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
size_t sparseFeatureSize = m_sparseFeatures.size();
size_t propSize = m_property.size();
size_t memNeeded = sizeof(UINT64) // file pos (phrase id)
+ sizeof(UINT64) + 2 * sizeof(UINT64) * numAlign // align
size_t memNeeded = sizeof(uint64_t) // file pos (phrase id)
+ sizeof(uint64_t) + 2 * sizeof(uint64_t) * numAlign // align
+ sizeof(float) * numScores // scores
+ sizeof(UINT64) + sparseFeatureSize // sparse features string
+ sizeof(UINT64) + propSize; // property string
+ sizeof(uint64_t) + sparseFeatureSize // sparse features string
+ sizeof(uint64_t) + propSize; // property string
char *mem = (char*) malloc(memNeeded);
//memset(mem, 0, memNeeded);
@ -179,8 +179,8 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
memUsed = 0;
// phrase id
memcpy(mem, &m_filePos, sizeof(UINT64));
memUsed += sizeof(UINT64);
memcpy(mem, &m_filePos, sizeof(uint64_t));
memUsed += sizeof(uint64_t);
// align
size_t tmp = WriteAlignToMemory(mem + memUsed);
@ -203,11 +203,11 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
size_t TargetPhrase::WriteStringToMemory(char *mem, const std::string &str) const
{
size_t memUsed = 0;
UINT64 *memTmp = (UINT64*) mem;
uint64_t *memTmp = (uint64_t*) mem;
size_t strSize = str.size();
memTmp[0] = strSize;
memUsed += sizeof(UINT64);
memUsed += sizeof(uint64_t);
const char *charStr = str.c_str();
memcpy(mem + memUsed, charStr, strSize);
@ -221,7 +221,7 @@ size_t TargetPhrase::WriteAlignToMemory(char *mem) const
size_t memUsed = 0;
// num of alignments
UINT64 numAlign = m_align.size();
uint64_t numAlign = m_align.size();
memcpy(mem, &numAlign, sizeof(numAlign));
memUsed += sizeof(numAlign);
@ -319,20 +319,20 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
return ret;
}
UINT64 TargetPhrase::ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl)
uint64_t TargetPhrase::ReadOtherInfoFromFile(uint64_t filePos, std::fstream &fileTPColl)
{
assert(filePos == (UINT64)fileTPColl.tellg());
assert(filePos == (uint64_t)fileTPColl.tellg());
UINT64 memUsed = 0;
fileTPColl.read((char*) &m_filePos, sizeof(UINT64));
memUsed += sizeof(UINT64);
uint64_t memUsed = 0;
fileTPColl.read((char*) &m_filePos, sizeof(uint64_t));
memUsed += sizeof(uint64_t);
assert(m_filePos != 0);
memUsed += ReadAlignFromFile(fileTPColl);
assert((memUsed + filePos) == (UINT64)fileTPColl.tellg());
assert((memUsed + filePos) == (uint64_t)fileTPColl.tellg());
memUsed += ReadScoresFromFile(fileTPColl);
assert((memUsed + filePos) == (UINT64)fileTPColl.tellg());
assert((memUsed + filePos) == (uint64_t)fileTPColl.tellg());
// sparse features
memUsed += ReadStringFromFile(fileTPColl, m_sparseFeatures);
@ -343,13 +343,13 @@ UINT64 TargetPhrase::ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPC
return memUsed;
}
UINT64 TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr)
uint64_t TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr)
{
UINT64 bytesRead = 0;
uint64_t bytesRead = 0;
UINT64 strSize;
fileTPColl.read((char*) &strSize, sizeof(UINT64));
bytesRead += sizeof(UINT64);
uint64_t strSize;
fileTPColl.read((char*) &strSize, sizeof(uint64_t));
bytesRead += sizeof(uint64_t);
if (strSize) {
char *mem = (char*) malloc(strSize + 1);
@ -364,15 +364,15 @@ UINT64 TargetPhrase::ReadStringFromFile(std::fstream &fileTPColl, std::string &o
return bytesRead;
}
UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
uint64_t TargetPhrase::ReadFromFile(std::fstream &fileTP)
{
UINT64 bytesRead = 0;
uint64_t bytesRead = 0;
fileTP.seekg(m_filePos);
UINT64 numWords;
fileTP.read((char*) &numWords, sizeof(UINT64));
bytesRead += sizeof(UINT64);
uint64_t numWords;
fileTP.read((char*) &numWords, sizeof(uint64_t));
bytesRead += sizeof(uint64_t);
for (size_t ind = 0; ind < numWords; ++ind) {
WordPtr word(new Word());
@ -381,9 +381,9 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
}
// read source words
UINT64 numSourceWords;
fileTP.read((char*) &numSourceWords, sizeof(UINT64));
bytesRead += sizeof(UINT64);
uint64_t numSourceWords;
fileTP.read((char*) &numSourceWords, sizeof(uint64_t));
bytesRead += sizeof(uint64_t);
PhrasePtr sp(new SourcePhrase());
for (size_t ind = 0; ind < numSourceWords; ++ind) {
@ -396,31 +396,31 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
return bytesRead;
}
UINT64 TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
uint64_t TargetPhrase::ReadAlignFromFile(std::fstream &fileTPColl)
{
UINT64 bytesRead = 0;
uint64_t bytesRead = 0;
UINT64 numAlign;
fileTPColl.read((char*) &numAlign, sizeof(UINT64));
bytesRead += sizeof(UINT64);
uint64_t numAlign;
fileTPColl.read((char*) &numAlign, sizeof(uint64_t));
bytesRead += sizeof(uint64_t);
for (size_t ind = 0; ind < numAlign; ++ind) {
AlignPair alignPair;
fileTPColl.read((char*) &alignPair.first, sizeof(UINT64));
fileTPColl.read((char*) &alignPair.second, sizeof(UINT64));
fileTPColl.read((char*) &alignPair.first, sizeof(uint64_t));
fileTPColl.read((char*) &alignPair.second, sizeof(uint64_t));
m_align.push_back(alignPair);
bytesRead += sizeof(UINT64) * 2;
bytesRead += sizeof(uint64_t) * 2;
}
return bytesRead;
}
UINT64 TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
uint64_t TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
{
UTIL_THROW_IF2(m_scores.size() == 0, "Translation rules must must have some scores");
UINT64 bytesRead = 0;
uint64_t bytesRead = 0;
for (size_t ind = 0; ind < m_scores.size(); ++ind) {
fileTPColl.read((char*) &m_scores[ind], sizeof(float));

View File

@ -36,7 +36,7 @@ class Phrase;
namespace OnDiskPt
{
typedef std::pair<UINT64, UINT64> AlignPair;
typedef std::pair<uint64_t, uint64_t> AlignPair;
typedef std::vector<AlignPair> AlignType;
class Vocab;
@ -53,15 +53,15 @@ protected:
std::string m_sparseFeatures, m_property;
std::vector<float> m_scores;
UINT64 m_filePos;
uint64_t m_filePos;
size_t WriteAlignToMemory(char *mem) const;
size_t WriteScoresToMemory(char *mem) const;
size_t WriteStringToMemory(char *mem, const std::string &str) const;
UINT64 ReadAlignFromFile(std::fstream &fileTPColl);
UINT64 ReadScoresFromFile(std::fstream &fileTPColl);
UINT64 ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr);
uint64_t ReadAlignFromFile(std::fstream &fileTPColl);
uint64_t ReadScoresFromFile(std::fstream &fileTPColl);
uint64_t ReadStringFromFile(std::fstream &fileTPColl, std::string &outStr);
public:
TargetPhrase() {
@ -95,7 +95,7 @@ public:
char *WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed) const;
void Save(OnDiskWrapper &onDiskWrapper);
UINT64 GetFilePos() const {
uint64_t GetFilePos() const {
return m_filePos;
}
float GetScore(size_t ind) const {
@ -108,8 +108,8 @@ public:
, const Moses::PhraseDictionary &phraseDict
, const std::vector<float> &weightT
, bool isSyntax) const;
UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
UINT64 ReadFromFile(std::fstream &fileTP);
uint64_t ReadOtherInfoFromFile(uint64_t filePos, std::fstream &fileTPColl);
uint64_t ReadFromFile(std::fstream &fileTP);
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;

View File

@ -71,12 +71,12 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
{
std::fstream &file = onDiskWrapper.GetFileTargetColl();
size_t memUsed = sizeof(UINT64);
size_t memUsed = sizeof(uint64_t);
char *mem = (char*) malloc(memUsed);
// size of coll
UINT64 numPhrases = GetSize();
((UINT64*)mem)[0] = numPhrases;
uint64_t numPhrases = GetSize();
((uint64_t*)mem)[0] = numPhrases;
// MAIN LOOP
CollType::iterator iter;
@ -98,16 +98,16 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
}
// total number of bytes
//((UINT64*)mem)[0] = (UINT64) memUsed;
//((uint64_t*)mem)[0] = (uint64_t) memUsed;
UINT64 startPos = file.tellp();
uint64_t startPos = file.tellp();
file.seekp(0, ios::end);
file.write((char*) mem, memUsed);
free(mem);
#ifndef NDEBUG
UINT64 endPos = file.tellp();
uint64_t endPos = file.tellp();
assert(startPos + memUsed == endPos);
#endif
m_filePos = startPos;
@ -148,7 +148,7 @@ Moses::TargetPhraseCollection *TargetPhraseCollection::ConvertToMoses(const std:
}
void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnDiskWrapper &onDiskWrapper)
void TargetPhraseCollection::ReadFromFile(size_t tableLimit, uint64_t filePos, OnDiskWrapper &onDiskWrapper)
{
fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
fstream &fileTP = onDiskWrapper.GetFileTargetInd();
@ -156,23 +156,23 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
size_t numScores = onDiskWrapper.GetNumScores();
UINT64 numPhrases;
uint64_t numPhrases;
UINT64 currFilePos = filePos;
uint64_t currFilePos = filePos;
fileTPColl.seekg(filePos);
fileTPColl.read((char*) &numPhrases, sizeof(UINT64));
fileTPColl.read((char*) &numPhrases, sizeof(uint64_t));
// table limit
if (tableLimit) {
numPhrases = std::min(numPhrases, (UINT64) tableLimit);
numPhrases = std::min(numPhrases, (uint64_t) tableLimit);
}
currFilePos += sizeof(UINT64);
currFilePos += sizeof(uint64_t);
for (size_t ind = 0; ind < numPhrases; ++ind) {
TargetPhrase *tp = new TargetPhrase(numScores);
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
uint64_t sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
tp->ReadFromFile(fileTP);
currFilePos += sizeOtherInfo;
@ -181,7 +181,7 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
}
}
UINT64 TargetPhraseCollection::GetFilePos() const
uint64_t TargetPhraseCollection::GetFilePos() const
{
return m_filePos;
}

View File

@ -46,7 +46,7 @@ class TargetPhraseCollection
protected:
typedef std::vector<TargetPhrase*> CollType;
CollType m_coll;
UINT64 m_filePos;
uint64_t m_filePos;
std::string m_debugStr;
public:
@ -67,7 +67,7 @@ public:
const TargetPhrase &GetTargetPhrase(size_t ind) const;
UINT64 GetFilePos() const;
uint64_t GetFilePos() const;
Moses::TargetPhraseCollection *ConvertToMoses(const std::vector<Moses::FactorType> &inputFactors
, const std::vector<Moses::FactorType> &outputFactors
@ -75,7 +75,7 @@ public:
, const std::vector<float> &weightT
, Vocab &vocab
, bool isSyntax) const;
void ReadFromFile(size_t tableLimit, UINT64 filePos, OnDiskWrapper &onDiskWrapper);
void ReadFromFile(size_t tableLimit, uint64_t filePos, OnDiskWrapper &onDiskWrapper);
const std::string GetDebugStr() const;
void SetDebugStr(const std::string &str);

View File

@ -38,7 +38,7 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
Moses::Tokenize(tokens, line);
UTIL_THROW_IF2(tokens.size() != 2, "Vocab file corrupted");
const string &key = tokens[0];
m_vocabColl[key] = Moses::Scan<UINT64>(tokens[1]);
m_vocabColl[key] = Moses::Scan<uint64_t>(tokens[1]);
}
// create lookup
@ -48,7 +48,7 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
CollType::const_iterator iter;
for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
UINT32 vocabId = iter->second;
uint32_t vocabId = iter->second;
const std::string &word = iter->first;
m_lookup[vocabId] = word;
@ -63,13 +63,13 @@ void Vocab::Save(OnDiskWrapper &onDiskWrapper)
CollType::const_iterator iterVocab;
for (iterVocab = m_vocabColl.begin(); iterVocab != m_vocabColl.end(); ++iterVocab) {
const string &word = iterVocab->first;
UINT32 vocabId = iterVocab->second;
uint32_t vocabId = iterVocab->second;
file << word << " " << vocabId << endl;
}
}
UINT64 Vocab::AddVocabId(const std::string &str)
uint64_t Vocab::AddVocabId(const std::string &str)
{
// find string id
CollType::const_iterator iter = m_vocabColl.find(str);
@ -83,7 +83,7 @@ UINT64 Vocab::AddVocabId(const std::string &str)
}
}
UINT64 Vocab::GetVocabId(const std::string &str, bool &found) const
uint64_t Vocab::GetVocabId(const std::string &str, bool &found) const
{
// find string id
CollType::const_iterator iter = m_vocabColl.find(str);

View File

@ -34,19 +34,19 @@ class OnDiskWrapper;
class Vocab
{
protected:
typedef std::map<std::string, UINT64> CollType;
typedef std::map<std::string, uint64_t> CollType;
CollType m_vocabColl;
std::vector<std::string> m_lookup; // opposite of m_vocabColl
UINT64 m_nextId; // starts @ 1
uint64_t m_nextId; // starts @ 1
public:
Vocab()
:m_nextId(1) {
}
UINT64 AddVocabId(const std::string &str);
UINT64 GetVocabId(const std::string &str, bool &found) const;
const std::string &GetString(UINT64 vocabId) const {
uint64_t AddVocabId(const std::string &str);
uint64_t GetVocabId(const std::string &str, bool &found) const;
const std::string &GetString(uint64_t vocabId) const {
return m_lookup[vocabId];
}

View File

@ -57,10 +57,10 @@ void Word::CreateFromString(const std::string &inString, Vocab &vocab)
size_t Word::WriteToMemory(char *mem) const
{
UINT64 *vocabMem = (UINT64*) mem;
uint64_t *vocabMem = (uint64_t*) mem;
vocabMem[0] = m_vocabId;
size_t size = sizeof(UINT64);
size_t size = sizeof(uint64_t);
// is non-term
char bNonTerm = (char) m_isNonTerminal;
@ -72,10 +72,10 @@ size_t Word::WriteToMemory(char *mem) const
size_t Word::ReadFromMemory(const char *mem)
{
UINT64 *vocabMem = (UINT64*) mem;
uint64_t *vocabMem = (uint64_t*) mem;
m_vocabId = vocabMem[0];
size_t memUsed = sizeof(UINT64);
size_t memUsed = sizeof(uint64_t);
// is non-term
char bNonTerm;
@ -88,8 +88,8 @@ size_t Word::ReadFromMemory(const char *mem)
size_t Word::ReadFromFile(std::fstream &file)
{
const size_t memAlloc = sizeof(UINT64) + sizeof(char);
char mem[sizeof(UINT64) + sizeof(char)];
const size_t memAlloc = sizeof(uint64_t) + sizeof(char);
char mem[sizeof(uint64_t) + sizeof(char)];
file.read(mem, memAlloc);
size_t memUsed = ReadFromMemory(mem);

View File

@ -43,7 +43,7 @@ class Word
private:
bool m_isNonTerminal;
UINT64 m_vocabId;
uint64_t m_vocabId;
public:
explicit Word() {
@ -67,7 +67,7 @@ public:
size_t ReadFromMemory(const char *mem);
size_t ReadFromFile(std::fstream &file);
void SetVocabId(UINT32 vocabId) {
void SetVocabId(uint32_t vocabId) {
m_vocabId = vocabId;
}

View File

@ -2,7 +2,7 @@
#include <fstream>
#include <string>
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
namespace

View File

@ -4,7 +4,7 @@
#include <iostream>
#include <cstring>
#include <string>
#include <stdlib.h>
#include <cstdlib>
#include "SuffixArray.h"
#include "TargetCorpus.h"

View File

@ -1,6 +1,6 @@
#include "PhrasePairCollection.h"
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
#include <algorithm>

View File

@ -2,7 +2,7 @@
#include <fstream>
#include <string>
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
namespace

View File

@ -2,7 +2,7 @@
#include <fstream>
#include <string>
#include <stdlib.h>
#include <cstdlib>
#include <cstring>
namespace

View File

@ -109,14 +109,17 @@ size_t lookup( string query )
return suffixArray.Count( queryString );
}
vector<string> tokenize( const char input[] )
// Duplicate of definition in util/tokenize.hh.
// TODO: Can we de-duplicate this? At the time of writing biconcor does not
// use util at all.
vector<string> tokenize(const char input[])
{
vector< string > token;
bool betweenWords = true;
int start=0;
int i=0;
for(; input[i] != '\0'; i++) {
bool isSpace = (input[i] == ' ' || input[i] == '\t');
int i;
for(i = 0; input[i] != '\0'; i++) {
const bool isSpace = (input[i] == ' ' || input[i] == '\t');
if (!isSpace && betweenWords) {
start = i;

View File

@ -5,11 +5,14 @@ namespace TOKENIZER_NAMESPACE {
#endif
Parameters::Parameters()
: cfg_path(0)
: nthreads(0)
, chunksize(2000)
, cfg_path(0)
, verbose_p(false)
, detag_p(false)
, alltag_p(false)
, escape_p(true)
, entities_p(false)
, escape_p(false)
, aggro_p(false)
, supersub_p(false)
, url_p(true)
@ -23,6 +26,10 @@ Parameters::Parameters()
, refined_p(false)
, unescape_p(false)
, drop_bad_p(false)
, split_p(false)
, notokenization_p(false)
, para_marks_p(false)
, split_breaks_p(false)
{
}

View File

@ -12,10 +12,13 @@ struct Parameters
std::string lang_iso;
std::vector<std::string> args;
std::string out_path;
int nthreads;
int chunksize;
const char *cfg_path;
bool verbose_p;
bool detag_p;
bool alltag_p;
bool entities_p;
bool escape_p;
bool aggro_p;
bool supersub_p;
@ -30,6 +33,10 @@ struct Parameters
bool refined_p;
bool unescape_p;
bool drop_bad_p;
bool split_p;
bool notokenization_p;
bool para_marks_p;
bool split_breaks_p;
Parameters();

File diff suppressed because it is too large Load Diff

View File

@ -26,12 +26,37 @@ class Tokenizer {
private:
static std::string cfg_dir;
typedef enum {
empty = 0,
blank,
upper, // upper case
letta, // extended word class (includes number, hyphen)
numba,
hyphn,
stops, // blank to stops are "extended word class" variants
quote, // init & fini = {',"}
pinit, // init (includes INVERT_*)
pfini, // fini
pfpct, // fini + pct
marks,
limit
} charclass_t;
std::size_t nthreads;
std::size_t chunksize;
std::string cfg_dir;
// non-breaking prefixes (numeric) utf8
std::set<std::string> nbpre_num_set;
// non-breaking prefixes (other) utf8
std::set<std::string> nbpre_gen_set;
// non-breaking prefixes (numeric) ucs4
std::set<std::wstring> nbpre_num_ucs4;
// non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4;
// compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec;
protected:
@ -42,6 +67,7 @@ protected:
bool latin_p; // is lang_iso "fr" or "it"
bool skip_xml_p;
bool skip_alltags_p;
bool entities_p;
bool escape_p;
bool unescape_p;
bool aggressive_hyphen_p;
@ -54,20 +80,44 @@ protected:
bool narrow_kana_p;
bool refined_p;
bool drop_bad_p;
bool splits_p;
bool verbose_p;
bool para_marks_p;
bool split_breaks_p;
// return counts of general and numeric prefixes loaded
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
// escapes specials into entities from the set &|"'[] (after tokenization, when enabled)
bool escape(std::string& inplace);
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
void protected_tokenize(std::string& inplace);
public:
// used for boost::thread
struct VectorTokenizerCallable {
Tokenizer *tokenizer;
std::vector<std::string>& in;
std::vector<std::string>& out;
VectorTokenizerCallable(Tokenizer *_tokenizer,
std::vector<std::string>& _in,
std::vector<std::string>& _out)
: tokenizer(_tokenizer)
, in(_in)
, out(_out) {
};
// cfg_dir is assumed shared by all languages
static void set_config_dir(const std::string& _cfg_dir);
void operator()() {
out.resize(in.size());
for (std::size_t ii = 0; ii < in.size(); ++ii)
if (in[ii].empty())
out[ii] = in[ii];
else if (tokenizer->penn_p)
out[ii] = tokenizer->penn_tokenize(in[ii]);
else
out[ii] = tokenizer->quik_tokenize(in[ii]);
};
};
public:
Tokenizer(); // UNIMPL
@ -78,21 +128,46 @@ public:
~Tokenizer();
// required before other methods, may throw
void init();
void init(const char *cfg_dir_path = 0);
// streaming tokenizer reads from is, writes to os, preserving line breaks
void set_config_dir(const std::string& _cfg_dir);
// required after processing a contiguous sequence of lines when sentence splitting is on
void reset();
// simultaneous sentence splitting not yet implemented
bool splitting() const { return splits_p; }
// escapes chars the set &|"'<> after tokenization (moses special characters)
bool escape(std::string& inplace);
// used in detokenizer, converts entities into characters
// if escape_p is set, does not unescape moses special tokens, thus
// escape_p and unescape_p can be used together usefully
bool unescape(std::string& inplace);
// streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
std::size_t tokenize(std::istream& is, std::ostream& os);
// tokenize padded line buffer to return string
std::string tokenize(const std::string& buf);
// quik-tokenize padded line buffer to return string
std::string quik_tokenize(const std::string& buf);
// penn-tokenize padded line buffer to return string // untested
std::string penn_tokenize(const std::string& buf);
// select-tokenize padded line buffer to return string
std::string tokenize(const std::string& buf) {
return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
}
// tokenize with output argument
void tokenize(const std::string& buf, std::string& outs) {
outs = tokenize(buf);
}
// tokenize to a vector
std::vector<std::string> tokens(const std::string& in) {
std::istringstream tokss(tokenize(in));
std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in));
std::vector<std::string> outv;
std::copy(std::istream_iterator<std::string>(tokss),
std::istream_iterator<std::string>(),
@ -117,6 +192,12 @@ public:
return detokenize(oss.str());
}
// split a string on sentence boundaries (approximately)
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
// split sentences from input stream and write one per line on output stream
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
}; // end class Tokenizer
#ifdef TOKENIZER_NAMESPACE

View File

@ -16,10 +16,12 @@ usage(const char *path)
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl;
std::cerr << " -b -- drop bad bytes" << std::endl;
std::cerr << " -B -- splitter will split on linebreak" << std::endl;
std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
std::cerr << " -d -- downcase" << std::endl;
std::cerr << " -D -- detokenize" << std::endl;
std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
std::cerr << " -E -- preserve entities during tokenization" << std::endl;
std::cerr << " -k -- narrow kana" << std::endl;
std::cerr << " -n -- narrow latin" << std::endl;
std::cerr << " -N -- normalize" << std::endl;
@ -27,12 +29,16 @@ usage(const char *path)
std::cerr << " -p -- penn treebank style" << std::endl;
std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
std::cerr << " -t N[,C] -- use N threads (1), chunksize C lines" << std::endl;
std::cerr << " -u -- disable url handling" << std::endl;
std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
std::cerr << " -v -- verbose" << std::endl;
std::cerr << " -w -- word filter" << std::endl;
std::cerr << " -x -- skip xml tag lines" << std::endl;
std::cerr << " -y -- skip all xml tags" << std::endl;
std::cerr << " -X -- split only, with <P> marks" << std::endl;
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
std::cerr << "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file" << std::endl;
std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
@ -83,15 +89,35 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
if (line.empty()) continue;
if (line.empty())
continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
bool was_break = false;
for (auto& token: tokens) {
if (token.empty()) {
if (count || was_break) {
ofs << std::endl;
count = 0;
nlines++;
was_break = true;
continue;
}
}
was_break = false;
std::string word(token_word(token));
if (word.empty()) continue;
ofs << word << ' ';
count++;
if (word.empty()) {
continue;
}
if (count++) {
ofs << ' ';
}
ofs << word;
}
if (count) {
ofs << std::endl;
nlines++;
@ -104,13 +130,16 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int main(int ac, char **av)
{
int rc = 0;
Parameters params;
Parameters params;
const char *prog = av[0];
bool next_cfg_p = false;
bool next_output_p = false;
bool next_threads_p = false;
bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0;
while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
@ -120,6 +149,9 @@ int main(int ac, char **av)
case 'b':
params.drop_bad_p = true;
break;
case 'B':
params.split_breaks_p = true;
break;
case 'c':
next_cfg_p = true;
break;
@ -127,10 +159,13 @@ int main(int ac, char **av)
params.downcase_p = true;
break;
case 'D':
detokenize_p = true;
detokenize_p = !detokenize_p;
break;
case 'e':
params.escape_p = false;
params.escape_p = !params.escape_p;
break;
case 'E':
params.entities_p = true;
break;
case 'h':
usage(prog);
@ -156,6 +191,16 @@ int main(int ac, char **av)
case 's':
params.supersub_p = true;
break;
case 'S':
params.split_p = !params.split_p;
break;
case 'T':
params.notokenization_p = true;
params.para_marks_p = false;
break;
case 't':
next_threads_p = true;
break;
case 'U':
params.unescape_p = true;
break;
@ -171,6 +216,10 @@ int main(int ac, char **av)
case 'x':
params.detag_p = true;
break;
case 'X':
params.notokenization_p = true;
params.para_marks_p = true;
break;
case 'y':
params.alltag_p = true;
break;
@ -181,7 +230,7 @@ int main(int ac, char **av)
std::cerr << "Unknown option: " << *av << std::endl;
::exit(1);
}
} else if (params.lang_iso.empty() && strlen(*av) == 2) {
} else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
params.lang_iso = *av;
} else if (next_output_p) {
next_output_p = false;
@ -189,6 +238,14 @@ int main(int ac, char **av)
} else if (next_cfg_p) {
next_cfg_p = false;
params.cfg_path = *av;
} else if (next_threads_p) {
next_threads_p = false;
char *comma = strchr(*av,',');
if (comma) {
*comma++ = 0;
params.chunksize = std::strtoul(comma,0,0);
}
params.nthreads = std::strtoul(*av,0,0);
} else {
params.args.push_back(std::string(*av));
}
@ -230,7 +287,6 @@ int main(int ac, char **av)
if (params.verbose_p) {
std::cerr << "config path: " << params.cfg_path << std::endl;
}
Tokenizer::set_config_dir(std::string(params.cfg_path));
}
std::unique_ptr<std::ofstream> pofs = 0;
@ -244,16 +300,16 @@ int main(int ac, char **av)
Tokenizer tize(params);
tize.init();
size_t nlines = 0;
std::pair<std::size_t,std::size_t> plines = { 0, 0 };
if (params.words_p) {
if (params.args.empty()) {
nlines += copy_words(tize,std::cin,ofs);
plines.first += copy_words(tize,std::cin,ofs);
} else {
for (std::string& arg : params.args) {
try {
std::ifstream ifs(arg.c_str());
nlines += copy_words(tize,ifs,ofs);
plines.first += copy_words(tize,ifs,ofs);
} catch (...) {
std::cerr << "Exception extracting words from path " << arg << std::endl;
}
@ -261,18 +317,22 @@ int main(int ac, char **av)
}
} else if (params.args.empty()) {
if (detokenize_p) {
nlines = tize.detokenize(std::cin,ofs);
plines.first = tize.detokenize(std::cin,ofs);
} else if (params.notokenization_p) {
plines = tize.splitter(std::cin,ofs);
} else {
nlines = tize.tokenize(std::cin,ofs);
plines.first = tize.tokenize(std::cin,ofs);
}
} else {
for (std::string& arg : params.args) {
try {
std::ifstream ifs(arg.c_str());
if (detokenize_p) {
nlines = tize.detokenize(ifs,ofs);
plines.first = tize.detokenize(ifs,ofs);
} else if (params.notokenization_p) {
plines = tize.splitter(ifs,ofs);
} else {
nlines = tize.tokenize(ifs,ofs);
plines.first = tize.tokenize(ifs,ofs);
}
} catch (...) {
std::cerr << "Exception tokenizing from path " << arg << std::endl;
@ -280,9 +340,12 @@ int main(int ac, char **av)
}
}
if (params.verbose_p)
std::cerr << "%%% " << nlines << " lines." << std::endl;
if (params.verbose_p) {
std::cerr << "%%% " << plines.first << " lines." << std::endl;
if (plines.second) {
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
}
}
return rc;
}

View File

@ -13,8 +13,8 @@
#ifndef LOSSYCOUNTER_H
#define LOSSYCOUNTER_H
#include <stddef.h>
#include <math.h>
#include <cstddef>
#include <cmath>
#ifdef USE_UNORDERED_MAP
#include <tr1/unordered_map>
#else

View File

@ -13,7 +13,7 @@
#include <string>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <cstdlib>
#include <iomanip>
#include "phrase-extract.h"

View File

@ -5,13 +5,13 @@
#include <sstream>
#include <string>
#include <iostream>
#include <stdio.h>
#include <cstdio>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <string.h>
#include <cstring>
#include <map>
struct Cache {
@ -45,8 +45,8 @@ struct LMClient {
exit(1);
}
bzero((char *)&server, sizeof(server));
bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
memset(&server, '\0', sizeof(server));
memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
server.sin_family = hp->h_addrtype;
server.sin_port = htons(port);

46
contrib/mada/qsub-madamira.perl Executable file
View File

@ -0,0 +1,46 @@
#!/usr/bin/env perl
use warnings;
use strict;
use File::Slurp;
use File::Basename;
use Cwd 'abs_path';
my $splitDir = $ARGV[0];
$splitDir = abs_path($splitDir);
my @files = read_dir $splitDir;
my $qsubDir=dirname($splitDir) ."/qsub";
print STDERR "qsubDir=$qsubDir\n";
`mkdir -p $qsubDir`;
my $out2Dir=dirname($splitDir) ."/out2";
print STDERR "out2Dir=$out2Dir\n";
`mkdir -p $out2Dir`;
for my $file ( @files ) {
print STDERR "$file ";
my $qsubFile = "$qsubDir/$file.sh";
open(RUN_FILE, ">$qsubFile");
print RUN_FILE "#!/usr/bin/env bash\n"
."#PBS -d/scratch/hh65/workspace/experiment/ar-en \n"
."#PBS -l mem=5gb \n\n"
."export PATH=\"/scratch/statmt/bin:/share/apps/NYUAD/perl/gcc_4.9.1/5.20.1/bin:/share/apps/NYUAD/jdk/1.8.0_31/bin:/share/apps/NYUAD/zlib/gcc_4.9.1/1.2.8/bin:/share/apps/NYUAD/cmake/gcc_4.9.1/3.1.0-rc3/bin:/share/apps/NYUAD/boost/gcc_4.9.1/openmpi_1.8.3/1.57.0/bin:/share/apps/NYUAD/openmpi/gcc_4.9.1/1.8.3/bin:/share/apps/NYUAD/python/gcc_4.9.1/2.7.9/bin:/share/apps/NYUAD/gcc/binutils/2.21/el6/bin:/share/apps/NYUAD/gcc/gcc/4.9.1/el6/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/opt/bio/ncbi/bin:/opt/bio/mpiblast/bin:/opt/bio/EMBOSS/bin:/opt/bio/clustalw/bin:/opt/bio/tcoffee/bin:/opt/bio/hmmer/bin:/opt/bio/phylip/exe:/opt/bio/mrbayes:/opt/bio/fasta:/opt/bio/glimmer/bin:/opt/bio/glimmer/scripts:/opt/bio/gromacs/bin:/opt/bio/gmap/bin:/opt/bio/tigr/bin:/opt/bio/autodocksuite/bin:/opt/bio/wgs/bin:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/bin:/usr/java/latest/bin:/opt/pdsh/bin:/opt/rocks/bin:/opt/rocks/sbin:/opt/torque/bin:/opt/torque/sbin:/home/hh65/bin:/home/hh65/bin\" \n"
."module load NYUAD/2.0 \n"
."module load gcc python/2.7.9 openmpi/1.8.3 boost cmake zlib jdk perl expat \n"
."cd /scratch/statmt/MADAMIRA-release-20140709-1.0 \n";
print RUN_FILE "java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar /scratch/statmt/MADAMIRA-release-20140709-1.0/MADAMIRA.jar "
."-rawinput $splitDir/$file -rawoutdir $out2Dir -rawconfig /scratch/statmt/MADAMIRA-release-20140709-1.0/samples/sampleConfigFile.xml \n";
close(RUN_FILE);
my $cmd = "qsub $qsubFile";
`$cmd`;
}

View File

@ -46,6 +46,7 @@ namespace mpi = boost::mpi;
#include "moses/FF/PhrasePairFeature.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "moses/LM/Base.h"
#include "util/random.hh"
using namespace Mira;
using namespace std;
@ -54,6 +55,7 @@ namespace po = boost::program_options;
int main(int argc, char** argv)
{
util::rand_init();
size_t rank = 0;
size_t size = 1;
#ifdef MPI_ENABLE

View File

@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/Word.h"
#include "moses/FF/FeatureFunction.h"
#include "Decoder.h"
#include "util/random.hh"
typedef std::map<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightMap;
typedef std::pair<const Moses::FeatureFunction*, std::vector< float > > ProducerWeightPair;
@ -37,8 +38,7 @@ template <class T> bool from_string(T& t, const std::string& s, std::ios_base& (
struct RandomIndex {
ptrdiff_t operator()(ptrdiff_t max) {
srand(time(0)); // Initialize random number generator with current time.
return static_cast<ptrdiff_t> (rand() % max);
return util::rand_excl(max);
}
};

View File

@ -7,8 +7,8 @@
<Project Name="lm" Path="lm/lm.project" Active="No"/>
<Project Name="OnDiskPt" Path="OnDiskPt/OnDiskPt.project" Active="No"/>
<Project Name="search" Path="search/search.project" Active="No"/>
<Project Name="moses" Path="moses/moses.project" Active="No"/>
<Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="Yes"/>
<Project Name="moses" Path="moses/moses.project" Active="Yes"/>
<Project Name="moses-cmd" Path="moses-cmd/moses-cmd.project" Active="No"/>
<Project Name="score" Path="score/score.project" Active="No"/>
<Project Name="consolidate" Path="consolidate/consolidate.project" Active="No"/>
<BuildMatrix>

View File

@ -4,11 +4,12 @@
* Created on: 28 Feb 2014
* Author: hieu
*/
#include <stdlib.h>
#include <stdio.h>
#include <cstdlib>
#include <cstdio>
#include <algorithm>
#include <fstream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include "EnOpenNLPChunker.h"
#include "moses/Util.h"
@ -28,10 +29,11 @@ EnOpenNLPChunker::~EnOpenNLPChunker() {
void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector<string> &filterList)
{
const boost::filesystem::path
inPath = boost::filesystem::unique_path(),
outPath = boost::filesystem::unique_path();
// read all input to a temp file
char *ptr = tmpnam(NULL);
string inStr(ptr);
ofstream inFile(ptr);
ofstream inFile(inPath.c_str());
string line;
while (getline(in, line)) {
@ -40,21 +42,18 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
}
inFile.close();
ptr = tmpnam(NULL);
string outStr(ptr);
// execute chunker
string cmd = "cat " + inStr + " | "
string cmd = "cat " + inPath.native() + " | "
+ m_openNLPPath + "/bin/opennlp POSTagger "
+ m_openNLPPath + "/models/en-pos-maxent.bin | "
+ m_openNLPPath + "/bin/opennlp ChunkerME "
+ m_openNLPPath + "/models/en-chunker.bin > "
+ outStr;
+ outPath.native();
//g << "Executing:" << cmd << endl;
int ret = system(cmd.c_str());
// read result of chunker and output as Moses xml trees
ifstream outFile(outStr.c_str());
ifstream outFile(outPath.c_str());
size_t lineNum = 0;
while (getline(outFile, line)) {
@ -66,8 +65,8 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
outFile.close();
// clean up temporary files
remove(inStr.c_str());
remove(outStr.c_str());
remove(inPath.c_str());
remove(outPath.c_str());
}
void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector<string> &filterList)

View File

@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="manual-label" InternalType="Console">
<Plugins>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="manual-label">
@ -14,6 +31,8 @@
<File Name="Main.cpp"/>
<File Name="Main.h"/>
</VirtualDirectory>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -33,6 +52,8 @@
<Linker Options="" Required="yes">
<LibraryPath Value="/Users/hieu/workspace/github/mosesdecoder/boost/lib64"/>
<Library Value="boost_program_options"/>
<Library Value="boost_filesystem"/>
<Library Value="boost_system"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
@ -107,6 +128,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Debug"/>
<Dependencies Name="Release"/>
</CodeLite_Project>

View File

@ -1,5 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<CodeLite_Project Name="moses-cmd" InternalType="Console">
<Plugins>
<Plugin Name="CMakePlugin">
<![CDATA[[{
"name": "Debug",
"enabled": false,
"buildDirectory": "build",
"sourceDirectory": "$(ProjectPath)",
"generator": "",
"buildType": "",
"arguments": [],
"parentProject": ""
}]]]>
</Plugin>
<Plugin Name="qmake">
<![CDATA[00010001N0005Debug000000000000]]>
</Plugin>
</Plugins>
<Description/>
<Dependencies/>
<VirtualDirectory Name="src"/>
@ -9,6 +26,14 @@
<File Name="../../../moses-cmd/MainVW.cpp" ExcludeProjConfig="Debug"/>
<File Name="../../../moses-cmd/MainVW.h" ExcludeProjConfig="Debug"/>
</VirtualDirectory>
<Dependencies Name="Release"/>
<Dependencies Name="Debug">
<Project Name="OnDiskPt"/>
<Project Name="lm"/>
<Project Name="moses"/>
<Project Name="search"/>
<Project Name="util"/>
</Dependencies>
<Settings Type="Executable">
<GlobalSettings>
<Compiler Options="" C_Options="" Assembler="">
@ -53,7 +78,7 @@
<Library Value="rt"/>
</Linker>
<ResourceCompiler Options="" Required="no"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="" UseSeparateDebugArgs="no" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<General OutputFile="$(IntermediateDirectory)/$(ProjectName)" IntermediateDirectory="./Debug" Command="./$(ProjectName)" CommandArguments="-f /var/folders/c4/2p48fcwx611dmkdqq44mbblm0000gn/T/ZVd8xvuJAR.ini -i /Users/hieu/workspace/github/moses-regression-tests/tests/phrase.basic-surface-binptable.oldformat/to-translate.txt" UseSeparateDebugArgs="yes" DebugArguments="" WorkingDirectory="$(IntermediateDirectory)" PauseExecWhenProcTerminates="yes" IsGUIProgram="no" IsEnabled="yes"/>
<Environment EnvVarSetName="&lt;Use Defaults&gt;" DbgSetName="&lt;Use Defaults&gt;">
<![CDATA[]]>
</Environment>
@ -125,12 +150,4 @@
</Completion>
</Configuration>
</Settings>
<Dependencies Name="Release"/>
<Dependencies Name="Debug">
<Project Name="OnDiskPt"/>
<Project Name="lm"/>
<Project Name="moses"/>
<Project Name="search"/>
<Project Name="util"/>
</Dependencies>
</CodeLite_Project>

View File

@ -474,8 +474,6 @@
<File Name="../../../moses/FF/DistortionScoreProducer.h"/>
<File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.cpp"/>
<File Name="../../../moses/FF/DynamicCacheBasedLanguageModel.h"/>
<File Name="../../../moses/FF/ExternalFeature.cpp"/>
<File Name="../../../moses/FF/ExternalFeature.h"/>
<File Name="../../../moses/FF/Factory.cpp"/>
<File Name="../../../moses/FF/Factory.h"/>
<File Name="../../../moses/FF/FeatureFunction.cpp"/>

View File

@ -20,7 +20,7 @@
#error Cython requires Python 2.4+.
#else
#define CYTHON_ABI "0_20_1post0"
#include <stddef.h> /* For offsetof */
#include <cstddef> /* For offsetof */
#ifndef offsetof
#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
#endif
@ -343,7 +343,7 @@ void __Pyx_call_destructor(T* x) {
#if defined(WIN32) || defined(MS_WINDOWS)
#define _USE_MATH_DEFINES
#endif
#include <math.h>
#include <cmath>
#define __PYX_HAVE__moses__dictree
#define __PYX_HAVE_API__moses__dictree
#include "string.h"
@ -1131,7 +1131,7 @@ bad:
static CYTHON_INLINE int __Pyx_PyObject_Append(PyObject* L, PyObject* x); /*proto*/
#include <string.h>
#include <cstring>
static int __Pyx_SetVtable(PyObject *dict, void *vtable); /*proto*/

View File

@ -31,14 +31,14 @@
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <stdio.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"
@ -228,4 +228,4 @@ double lgamma(int x)
sum += coefs[j]/++y;
}
return -tmp+log(2.5066282746310005*sum/(double)x);
}
}

View File

@ -42,6 +42,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "RelativeEntropyCalc.h"
#include "LexicalReordering.h"
#include "LexicalReorderingState.h"
#include "util/random.hh"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
@ -205,7 +206,7 @@ int main(int argc, char** argv)
//initialise random numbers
srand(time(NULL));
rand_init();
// set up read/writing class
IOWrapper* ioWrapper = GetIOWrapper(staticData);

View File

@ -4,10 +4,10 @@
#include <iomanip>
#include <vector>
#include <map>
#include <stdlib.h>
#include <math.h>
#include <cstdlib>
#include <cmath>
#include <algorithm>
#include <stdio.h>
#include <cstdio>
#include "TrellisPathList.h"
#include "TrellisPath.h"
#include "StaticData.h"

View File

@ -11,7 +11,7 @@ else
{
with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ;
if $(with-xmlrpc-c) {
echo Bulding mosesserver. ;
echo While building mosesserver ... ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "!!! You are linking the XMLRPC-C library; Do NOT use v.1.25.29 !!!" ;
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;

View File

@ -37,6 +37,7 @@ int main(int argc, char** argv)
#include "moses/Manager.h"
#include "moses/StaticData.h"
#include "moses/ThreadPool.h"
#include "moses/TranslationTask.h"
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#if PT_UG
@ -232,8 +233,8 @@ public:
/**
* Required so that translations can be sent to a thread pool.
**/
class TranslationTask : public virtual Moses::Task {
public:
class TranslationTask : public virtual Moses::TranslationTask {
protected:
TranslationTask(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut)
: m_paramList(paramList),
@ -242,23 +243,33 @@ public:
m_done(false)
{}
public:
static boost::shared_ptr<TranslationTask>
create(xmlrpc_c::paramList const& paramList,
boost::condition_variable& cond, boost::mutex& mut)
{
boost::shared_ptr<TranslationTask> ret(new TranslationTask(paramList, cond, mut));
ret->m_self = ret;
return ret;
}
virtual bool DeleteAfterExecution() {return false;}
bool IsDone() const {return m_done;}
const map<string, xmlrpc_c::value>& GetRetData() { return m_retData;}
virtual void Run() {
virtual void
Run()
{
using namespace xmlrpc_c;
const params_t params = m_paramList.getStruct(0);
m_paramList.verifyEnd(1);
params_t::const_iterator si = params.find("text");
if (si == params.end()) {
throw xmlrpc_c::fault(
"Missing source text",
xmlrpc_c::fault::CODE_PARSE);
throw fault("Missing source text", fault::CODE_PARSE);
}
const string source((xmlrpc_c::value_string(si->second)));
const string source = value_string(si->second);
XVERBOSE(1,"Input: " << source << endl);
si = params.find("align");
@ -272,7 +283,7 @@ public:
si = params.find("report-all-factors");
bool reportAllFactors = (si != params.end());
si = params.find("nbest");
int nbest_size = (si == params.end()) ? 0 : int(xmlrpc_c::value_int(si->second));
int nbest_size = (si == params.end()) ? 0 : int(value_int(si->second));
si = params.find("nbest-distinct");
bool nbest_distinct = (si != params.end());
@ -281,21 +292,25 @@ public:
vector<float> multiModelWeights;
si = params.find("lambda");
if (si != params.end()) {
xmlrpc_c::value_array multiModelArray = xmlrpc_c::value_array(si->second);
vector<xmlrpc_c::value> multiModelValueVector(multiModelArray.vectorValueValue());
for (size_t i=0;i < multiModelValueVector.size();i++) {
multiModelWeights.push_back(xmlrpc_c::value_double(multiModelValueVector[i]));
}
}
if (si != params.end())
{
value_array multiModelArray = value_array(si->second);
vector<value> multiModelValueVector(multiModelArray.vectorValueValue());
for (size_t i=0;i < multiModelValueVector.size();i++)
{
multiModelWeights.push_back(value_double(multiModelValueVector[i]));
}
}
si = params.find("model_name");
if (si != params.end() && multiModelWeights.size() > 0) {
const string model_name = xmlrpc_c::value_string(si->second);
PhraseDictionaryMultiModel* pdmm = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
if (si != params.end() && multiModelWeights.size() > 0)
{
const string model_name = value_string(si->second);
PhraseDictionaryMultiModel* pdmm
= (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
pdmm->SetTemporaryMultiModelWeightsVector(multiModelWeights);
}
}
const StaticData &staticData = StaticData::Instance();
//Make sure alternative paths are retained, if necessary
@ -306,13 +321,14 @@ public:
stringstream out, graphInfo, transCollOpts;
if (staticData.IsSyntax()) {
TreeInput tinput;
const vector<FactorType>&
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
tinput.Read(in,inputFactorOrder);
ChartManager manager(tinput);
if (staticData.IsSyntax())
{
boost::shared_ptr<TreeInput> tinput(new TreeInput);
const vector<FactorType>& IFO = staticData.GetInputFactorOrder();
istringstream in(source + "\n");
tinput->Read(in,IFO);
ttasksptr task = Moses::TranslationTask::create(tinput);
ChartManager manager(task);
manager.Decode();
const ChartHypothesis *hypo = manager.GetBestHypothesis();
outputChartHypo(out,hypo);
@ -320,57 +336,50 @@ public:
// const size_t translationId = tinput.GetTranslationId();
std::ostringstream sgstream;
manager.OutputSearchGraphMoses(sgstream);
m_retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_string(sgstream.str())));
m_retData["sg"] = value_string(sgstream.str());
}
} else {
size_t lineNumber = 0; // TODO: Include sentence request number here?
Sentence sentence;
sentence.SetTranslationId(lineNumber);
const vector<FactorType> &
inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
sentence.Read(in,inputFactorOrder);
Manager manager(sentence);
manager.Decode();
}
else
{
// size_t lineNumber = 0; // TODO: Include sentence request number here?
boost::shared_ptr<Sentence> sentence(new Sentence(0,source));
ttasksptr task = Moses::TranslationTask::create(sentence);
Manager manager(task);
manager.Decode();
const Hypothesis* hypo = manager.GetBestHypothesis();
vector<xmlrpc_c::value> alignInfo;
outputHypo(out,hypo,addAlignInfo,alignInfo,reportAllFactors);
if (addAlignInfo) {
m_retData.insert(pair<string, xmlrpc_c::value>("align", xmlrpc_c::value_array(alignInfo)));
}
if (addWordAlignInfo) {
stringstream wordAlignment;
hypo->OutputAlignment(wordAlignment);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair) {
if (addAlignInfo) m_retData["align"] = value_array(alignInfo);
if (addWordAlignInfo)
{
stringstream wordAlignment;
hypo->OutputAlignment(wordAlignment);
vector<xmlrpc_c::value> alignments;
string alignmentPair;
while (wordAlignment >> alignmentPair)
{
int pos = alignmentPair.find('-');
map<string, xmlrpc_c::value> wordAlignInfo;
wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
}
m_retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
}
if (addGraphInfo) {
insertGraphInfo(manager,m_retData);
}
if (addTopts) {
insertTranslationOptions(manager,m_retData);
}
if (nbest_size>0) {
outputNBest(manager, m_retData, nbest_size, nbest_distinct,
reportAllFactors, addAlignInfo, addScoreBreakdown);
}
wordAlignInfo["source-word"]
= value_int(atoi(alignmentPair.substr(0, pos).c_str()));
wordAlignInfo["target-word"]
= value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
alignments.push_back(value_struct(wordAlignInfo));
}
m_retData["word-align"] = value_array(alignments);
}
if (addGraphInfo) insertGraphInfo(manager,m_retData);
if (addTopts) insertTranslationOptions(manager,m_retData);
if (nbest_size > 0)
{
outputNBest(manager, m_retData, nbest_size, nbest_distinct,
reportAllFactors, addAlignInfo, addScoreBreakdown);
}
(const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
}
pair<string, xmlrpc_c::value>
text("text", xmlrpc_c::value_string(out.str()));
m_retData.insert(text);
}
m_retData["text"] = value_string(out.str());
XVERBOSE(1,"Output: " << out.str() << endl);
{
boost::lock_guard<boost::mutex> lock(m_mut);
@ -380,9 +389,12 @@ public:
}
void outputHypo(ostream& out, const Hypothesis* hypo, bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo, bool reportAllFactors = false) {
void outputHypo(ostream& out, const Hypothesis* hypo,
bool addAlignmentInfo, vector<xmlrpc_c::value>& alignInfo,
bool reportAllFactors = false) {
if (hypo->GetPrevHypo() != NULL) {
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo, alignInfo, reportAllFactors);
outputHypo(out,hypo->GetPrevHypo(),addAlignmentInfo,
alignInfo, reportAllFactors);
Phrase p = hypo->GetCurrTargetPhrase();
if(reportAllFactors) {
out << p << " ";
@ -524,7 +536,7 @@ public:
{
// should the score breakdown be reported in a more structured manner?
ostringstream buf;
path.GetScoreBreakdown().OutputAllFeatureScores(buf);
path.GetScoreBreakdown()->OutputAllFeatureScores(buf);
nBestXMLItem["fvals"] = xmlrpc_c::value_string(buf.str());
}
@ -595,7 +607,7 @@ public:
boost::condition_variable cond;
boost::mutex mut;
typedef ::TranslationTask TTask;
boost::shared_ptr<TTask> task(new TTask(paramList,cond,mut));
boost::shared_ptr<TTask> task = TTask::create(paramList,cond,mut);
m_threadPool.Submit(task);
boost::unique_lock<boost::mutex> lock(mut);
while (!task->IsDone()) {

View File

@ -31,14 +31,14 @@
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <stdio.h>
//#include <cstdio>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cstdio>
#include <cstring>
#include <cmath>
#include "WIN32_functions.h"
@ -228,4 +228,4 @@ double lgamma(int x)
sum += coefs[j]/++y;
}
return -tmp+log(2.5066282746310005*sum/(double)x);
}
}

View File

@ -24,7 +24,7 @@
#ifndef _NL_CPT__
#define _NL_CPT__
//#include <stdlib.h>
//#include <cstdlib>
//#include <vector>
//#include <string>
//#include <cassert>

View File

@ -24,7 +24,7 @@
#ifndef _NL_LIST_ //////////////////////////////////////////////////////////////
#define _NL_LIST_ //////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <cstdlib>
#define Listed(x) ListedObject<x>

View File

@ -26,10 +26,10 @@
#include "nl-array.h"
#include <stdio.h>
#include <cstdio>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
using namespace std;

View File

@ -1,6 +1,6 @@
#include "ExternalFeature.h"
#include <dlfcn.h>
#include <stdlib.h>
#include <cstdlib>
#include <iostream>
#include "util/exception.hh"

View File

@ -0,0 +1,42 @@
# get stuff from environment variables if not set on the command line
# unless blocked explicitly
for local what in cmph irstlm
{
if ! [ option.get "with-$(what)" ] && ! [ option.get "no-$(what)" : : no ]
{
local where = [ os.environ "$(what:U)_ROOT" ] ;
if $(where)
{
echo "setting option with-$(what) from environment variable "
"$(what:U)_ROOT ." ;
option.set "with-$(what)" : $(where) ;
}
}
local where = [ option.get "with-$(what)" ] ;
}
# if --with-moses-regtest is specified without a directory
local regtest = [ option.get "with-moses-regtest" : no : yes ] ;
if $(regtest) = yes
{ # regtests requested but no path given
echo "Regression tests requested but no path given." ;
local $(where) = [ os.environ "MOSES_REGTEST_ROOT" ] ;
if ! $(where)
{
local HOME = [ os.environ "HOME" ] ;
if [ path.exists $(HOME)/moses-regression-tests ]
{
echo "Using ~/moses-regression-tests as the default." ;
option.set "with-moses-regtest" : "~/moses-regression-tests" ;
}
}
else
{
if [ path.exists $(where) ]
{
echo "Using $(where) from environment variable MOSES_REGTEST_ROOT."
option.set "with-regtest" : $(where) ;
}
}
}

123
jam-files/curlpp.jam Normal file
View File

@ -0,0 +1,123 @@
# -*- jam -*-
# configuration for curlpp
# I haven't been able to wrap my mind around bjam yet, so chances are
# there's a much better way to do things.
module curlppvars { } # this stores the variables we want to keep
if [ option.get "no-curlpp" : : yes ]
{
rule curlpp ( what ? ) { } # never return anything
}
else
{
local version ;
local prefix ;
# check if a non-standard location for curl is given
local curlpp = [ option.get "with-curlpp" ] ;
if ! $(curlpp) # maybe via environment variable CURLPP_ROOT ?
{
local where = [ os.environ "CURLPP_ROOT" ] ;
if $(where)
{
option.set "with-curlpp" : $(where) ;
local msg = "CURLPP: setting --with-curlpp=$(where) via environment" ;
echo "$(msg) variable CURLPP_ROOT" ;
}
curlpp = [ option.get "with-curlpp" ] ;
}
local config ;
if $(curlpp)
{
config = $(curlpp)/bin/curlpp-config ;
}
else # is curlpp-config in the path ?
{
local curlpp-check = [ _shell "curlpp-config 2>/dev/null" : exit-status ] ;
if $(curlpp-check[2]) = 0 { config = curlpp-config ; }
}
if $(config)
{
prefix = [ shell_or_die "$(config) --prefix" ] ;
version = [ shell_or_die "$(config) --version" ] ;
version = [ SPLIT_BY_CHARACTERS $(version) : " " ] ;
version = [ trim-nl $(version[2]) ] ;
modules.poke curlppvars : prefix : $(prefix) ;
modules.poke curlppvars : version : $(version) ;
requirements += <define>HAVE_CURLPP ;
local cpp-cflags = [ shell_or_die "$(config) --cflags" ] ;
for local i in [ SPLIT_BY_CHARACTERS $(cpp-cflags) : " " ]
{
local incpath = [ MATCH "-I(.*)" : $(i) ] ;
if $(incpath)
{
# echo "CURLPP: $(i)" ;
requirements += <cxxflags>"-isystem $(incpath)" ;
# requirements += <include>$(incpath) ;
}
}
local cpp-libs = [ shell_or_die "$(config) --libs" ] ;
local cpp-prefix = [ shell_or_die "$(config) --prefix" ] ;
for local i in [ SPLIT_BY_CHARACTERS $(cpp-libs) : " " ]
{
local libpath = [ MATCH "^-L(.*)" : $(i) ] ;
if $(libpath) { requirements += <library-path>$(libpath) ; }
local libname = [ MATCH "^-l(.*)" : $(i) ] ;
if $(libname)
{
# local curl = [ MATCH "^-l(.*pp)" : $(i) ] ;
# if [ path.exists $(cpp-prefix)/lib/lib$(libname).a ]
# {
# echo "CURLPP: STATIC LINKING FOR LIBRARY: $(libname)" ;
# lib $(libname) : : <link>static ;
# }
# else
# {
external-lib $(libname) : $(cpp-prefix)/lib ;
# }
requirements += <library>$(libname)/<link>shared ;
# requirements += <library>$(libname) ;
}
else
{
requirements += <linkflags>$(i) ;
}
# requirements += <library-path>/usr/lib/x86_64-linux-gnu ;
# for local xtra in idn rtmp ssl crypto ssl crypto ldap rt
# {
# external-lib $(xtra) : /usr/lib/x86_64-linux-gnu ;
# requirements += <library>$(xtra) ;
# }
}
# for local e in idn rtmp ssl crypto ldap rt
# {
# external-lib $(e) ; # : /usr/lib/x86_64-linux-gnu /usr/lib32 ;
# requirements += <library>$(e) ;
# }
# the rule curlpp provides access to all the variables defined in this file
# if none argument is given, it returns $(version), which should only be
# defined if curl is available
rule curlpp ( what ? )
{
if $(what)
{
retval = [ modules.peek curlppvars : $(what) ] ;
if $(retval) { return $(retval) ; }
}
else { return "yes" ; }
}
}
else { rule curlpp { } }
}
if [ curlpp ]
{
local prefix = [ curlpp prefix ] ;
local version = [ curlpp version ] ;
echo "CULRPP: USING VERSION $(version) FROM $(prefix)" ;
}

View File

@ -134,10 +134,15 @@ void file_dirscan_( file_info_t * const d, scanback func, void * closure )
int file_mkdir( char const * const path )
{
#if defined(__MINGW32__)
/* MinGW's mkdir() takes only one argument: the path. */
mkdir(path);
#else
/* Explicit cast to remove const modifiers and avoid related compiler
* warnings displayed when using the intel compiler.
*/
return mkdir( (char *)path, 0777 );
#endif
}

View File

@ -74,7 +74,7 @@
* Windows MingW32
*/
#ifdef MINGW
#ifdef __MINGW32__
#include <fcntl.h>
#include <stdlib.h>

View File

@ -22,6 +22,14 @@ rule shell_or_fail ( cmd ) {
}
}
rule shell_or_die ( cmd ) {
local ret = [ SHELL $(cmd) : exit-status ] ;
if $(ret[2]) != 0 {
exit $(cmd) failed : 1 ;
}
return [ trim-nl $(ret[1]) ] ;
}
cxxflags = [ os.environ "CXXFLAGS" ] ;
cflags = [ os.environ "CFLAGS" ] ;
ldflags = [ os.environ "LDFLAGS" ] ;

View File

@ -1,86 +0,0 @@
# import path ;
import option ;
# Is the XMLRPC-C server available?
rule shell_or_die ( cmd ) {
local ret = [ _shell $(cmd) : exit-status ] ;
if $(ret[2]) != 0 {
exit "Failed to run $(cmd)" : 1 ;
}
return $(ret[1]) ;
}
build-server = [ option.get "no-xmlrpc-c" : "yes" : "no" ] ;
if $(build-server) = yes
{
# by default, we try to build server capabilities into the server
xmlrpc-c-path = [ option.get "with-xmlrpc-c" ] ;
if $(xmlrpc-c-path) = ""
{
xmlrpc-c-config-cmd = "xmlrpc-c-config" ;
}
else
{
xmlrpc-c-config-cmd = "$(xmlrpc-c-path)/bin/xmlrpc-c-config" ;
}
# check if xmlrpc-config is available
xmlrpc-check = [ _shell "$(xmlrpc-c-config-cmd) --features 2>/dev/null" : exit-status ] ;
if $(xmlrpc-check[2]) = 0
{
# xmlrpc-c-config was found. Now check if abyss server is available
if [ MATCH "(abyss-server)" : $(xmlrpc-check[1]) ]
{
# Yes, abyss server is available. Is it the right xmlrpc-c version
# Version 1.25.29 does not work.
xmlrpc-check = [ _shell "$(xmlrpc-c-config-cmd) --version 2>/dev/null" : exit-status ] ;
xmlrpc-c-version = $(xmlrpc-check[1]) ;
if [ MATCH "(1.25.29)" : $(xmlrpc-c-version) ]
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Moses is not compatible with xmlrpc-c version $(xmlrpc-c-version). " ;
echo "XMLRPC-C: Use another one or compile without server functionality (--no-xmlrpc-c)." ;
echo "XMLRPC-C: Build aborted." ;
exit : 1 ;
}
else
{
# echo "XMLRPC-C: Found abyss server." ;
}
}
else
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Found xmlrpc-c but it does not provide the abyss server." ;
echo "XMLRPC-C: Use another xmlrpc-c installation that provides one " ;
echo "XMLRPC-C: or compile without server functionality (--no-xmlrpc-c)." ;
exit : 1 ;
}
}
else if [ option.get "with-xmlrpc-c" ]
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Could not find $(xmlrpc-c-config-cmd). Build aborted. " ;
exit : 1 ;
}
else
{
build-server = no ;
rule build_server { return no ; }
}
}
if $(build-server) = yes
{
xmlrpc-path = [ _shell "$(xmlrpc-c-config-cmd) --prefix 2>/dev/null" : exit-status ] ;
rule build_server { return $(xmlrpc-c-config-cmd) ; }
rule xmlrpc_path { return $(xmlrpc-path[1]) ; }
}
else
{
rule build_server { return no ; }
}

100
jam-files/xmlrpc-c.jam Normal file
View File

@ -0,0 +1,100 @@
# This module handles the use (or non-use) of the externall
# xmlrpc-c library (including the abyss server) that is needed for
# moses server functionality
if [ option.get "no-xmlrpc-c" ]
{
rule xmlrpc ( what ? ) { } # never return anything
}
else
{
local xmlrpc = [ option.get "with-xmlrpc-c" ] ;
if ! $(xmlrpc) # check for environment variable
{
local where = [ os.environ "XMLRPC_C_ROOT" ] ;
if $(where)
{
option.set "with-xmlrpc-c" : $(where) ;
local msg = "setting --with-xmlrpc-c=$(where) via environment " ;
echo "$(msg) variable XMLRPC_C_ROOT" ;
}
xmlrpc = [ option.get "with-xmlrpc-c" ] ;
}
local config ;
if ! $(xmlrpc) { config = "xmlrpc-c-config" ; }
else { config = "$(xmlrpc)/bin/xmlrpc-c-config" ; }
# check if xmlrpc-config can be executed
xmlrpc-check = [ _shell "$(config) --features 2>/dev/null" : exit-status ] ;
if $(xmlrpc-check[2]) = 0 # yes it can
{
# is the abyss server is available ?
if [ MATCH "(abyss-server)" : $(xmlrpc-check[1]) ]
{
# Yes, abyss server is available. Is it the right xmlrpc-c version ?
# Note: Version 1.25.29 does not work.
xmlrpc-check = [ _shell "$(config) --version 2>/dev/null" : exit-status ] ;
xmlrpc-c-version = $(xmlrpc-check[1]) ;
if [ MATCH "(1.25.29)" : $(xmlrpc-c-version) ]
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Moses is not compatible with xmlrpc-c version $(xmlrpc-c-version). " ;
echo "XMLRPC-C: Use another one or compile without server functionality (--no-xmlrpc-c)." ;
echo "XMLRPC-C: Build aborted." ;
exit : 1 ;
}
}
else
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Found xmlrpc-c but it does not provide the abyss server." ;
echo "XMLRPC-C: Use another xmlrpc-c installation that provides one " ;
echo "XMLRPC-C: or compile without server functionality (--no-xmlrpc-c)." ;
exit : 1 ;
}
local prefix = [ shell_or_die "$(config) --prefix" ] ;
local version = [ shell_or_die "$(config) --version" ] ;
echo "XMLRPC-C: USING VERSION $(version) FROM $(prefix)" ;
# now add stuff to the requirements
local xmlrpc-cxxflags = [ shell_or_die "$(config) c++2 abyss-server --cflags" ] ;
requirements += <define>HAVE_XMLRPC_C ;
requirements += <cxxflags>$(xmlrpc-cxxflags) ;
local libs = [ shell_or_die "$(config) c++2 abyss-server --libs" ] ;
for local i in [ SPLIT_BY_CHARACTERS $(libs) : " " ]
{
local libname = [ MATCH "-l(xmlrpc.*)" : $(i) ] ;
if $(libname)
{
external-lib $(libname) : $(prefix)/lib ;
# : : <runtime-link>static:<link>static <runtime-link>shared:<link>shared ;
requirements += <library>$(libname) ;
}
local pathname = [ MATCH "-L(.*)" : $(i) ] ;
if $(pathname)
{
requirements += <library-path>$(pathname) ;
}
}
rule xmlrpc { return yes ; }
}
else if [ option.get "with-xmlrpc-c" ]
{
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
echo "XMLRPC-C: Could not find $(config). " ;
echo "Build aborted. " ;
echo "Use --no-xmlrpc-c to compile moses without server functionality. " ;
exit : 1 ;
}
else
{
rule xmlrpc ( what ? ) { } # never return anything
}
}

View File

@ -14,12 +14,12 @@ update-if-changed $(ORDER-LOG) $(max-order) ;
max-order += <dependency>$(ORDER-LOG) ;
wrappers = ;
local with-nplm = [ option.get "with-nplm-0.1" ] ;
local with-nplm = [ option.get "with-nplm" ] ;
if $(with-nplm) {
lib neuralLM : : <search>$(with-nplm)/src ;
lib nplm : : <search>$(with-nplm)/src ;
obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ;
alias nplm : nplm.o neuralLM ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
wrappers += nplm ;
alias nplm-all : nplm.o nplm ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
wrappers += nplm-all ;
}
fakelib kenlm : $(wrappers) [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ;

View File

@ -19,9 +19,8 @@
#include "util/sorted_uniform.hh"
#include <algorithm>
#include <stdint.h>
#include <assert.h>
#include <cassert>
namespace lm {
namespace ngram {

View File

@ -2,9 +2,8 @@
#define LM_BLANK_H
#include <limits>
#include <stdint.h>
#include <math.h>
#include <cmath>
namespace lm {
namespace ngram {

View File

@ -9,9 +9,8 @@
#include <iostream>
#include <iomanip>
#include <limits>
#include <math.h>
#include <stdlib.h>
#include <cmath>
#include <cstdlib>
#ifdef WIN32
#include "util/getopt.hh"

View File

@ -48,7 +48,8 @@ class StatCollector {
// TODO: Specialize error message for j == 3, meaning 3+
UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for "
<< (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any "
<< (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?");
<< (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?\n"
<< "Try deduplicating the input. To override this error for e.g. a class-based model, rerun with --discount_fallback\n");
}
// See equation (26) in Chen and Goodman.
@ -63,7 +64,7 @@ class StatCollector {
case THROW_UP:
throw;
case COMPLAIN:
std::cerr << e.what() << " Substituting fallback discounts D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl;
std::cerr << "Substituting fallback discounts for order " << i << ": D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl;
case SILENT:
break;
}

View File

@ -78,7 +78,7 @@ BOOST_AUTO_TEST_CASE(Simple) {
DiscountConfig discount_config;
discount_config.fallback = Discount();
discount_config.bad_action = THROW_UP;
BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, discount_config, discount).Run(for_adjust), BadDiscountException);
BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, std::vector<bool>(), discount_config, discount).Run(for_adjust), BadDiscountException);
}
BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]);

View File

@ -45,7 +45,8 @@ BOOST_AUTO_TEST_CASE(Short) {
NGramStream stream;
uint64_t token_count;
WordIndex type_count = 10;
CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize(), SILENT);
std::vector<bool> prune_words;
CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT);
chain >> boost::ref(counter) >> stream >> util::stream::kRecycle;
const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"};

View File

@ -8,8 +8,8 @@
#include "util/fixed_array.hh"
#include "util/murmur_hash.hh"
#include <assert.h>
#include <math.h>
#include <cassert>
#include <cmath>
namespace lm { namespace builder {
namespace {

View File

@ -9,7 +9,7 @@
#include <iostream>
#endif
#include <string.h>
#include <cstring>
namespace lm { namespace builder {

View File

@ -202,6 +202,7 @@ int main(int argc, char *argv[]) {
initial.adder_out.block_count = 2;
pipeline.read_backoffs = initial.adder_out;
// Read from stdin, write to stdout by default
util::scoped_fd in(0), out(1);
if (vm.count("text")) {
in.reset(util::OpenReadOrThrow(text.c_str()));
@ -210,7 +211,6 @@ int main(int argc, char *argv[]) {
out.reset(util::CreateOrThrow(arpa.c_str()));
}
// Read from stdin
try {
lm::builder::Output output;
output.Add(new lm::builder::PrintARPA(out.release(), verbose_header));

View File

@ -5,10 +5,9 @@
#include "lm/word_index.hh"
#include <cstddef>
#include <assert.h>
#include <cassert>
#include <stdint.h>
#include <string.h>
#include <cstring>
namespace lm {
namespace builder {

View File

@ -7,8 +7,7 @@
#include "util/stream/timer.hh"
#include <sstream>
#include <string.h>
#include <cstring>
namespace lm { namespace builder {

View File

@ -10,8 +10,7 @@
#include "util/string_piece.hh"
#include <ostream>
#include <assert.h>
#include <cassert>
// Warning: print routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to

View File

@ -30,9 +30,10 @@ struct Config {
return show_progress ? messages : 0;
}
// This will be called with every string in the vocabulary. See
// enumerate_vocab.hh for more detail. Config does not take ownership; you
// are still responsible for deleting it (or stack allocating).
// This will be called with every string in the vocabulary by the
// constructor; it need only exist for the lifetime of the constructor.
// See enumerate_vocab.hh for more detail. Config does not take ownership;
// just delete/let it go out of scope after the constructor exits.
EnumerateVocab *enumerate_vocab;

View File

@ -6,9 +6,9 @@
#include <string>
#include <vector>
#include <ctype.h>
#include <errno.h>
#include <string.h>
#include <cctype>
#include <cerrno>
#include <cstring>
namespace lm {

View File

@ -14,7 +14,7 @@
#include <string>
#include <vector>
#include <string.h>
#include <cstring>
#include <stdint.h>
namespace util { class FilePiece; }

View File

@ -9,7 +9,7 @@
#include <string>
#include <vector>
#include <ctype.h>
#include <cctype>
namespace lm {
namespace phrase {

View File

@ -3,7 +3,7 @@
#include <istream>
#include <iostream>
#include <ctype.h>
#include <cctype>
namespace lm {
namespace vocab {

View File

@ -1,7 +1,7 @@
#include "lm/lm_exception.hh"
#include<errno.h>
#include<stdio.h>
#include <cerrno>
#include <cstdio>
namespace lm {

View File

@ -17,8 +17,7 @@
#include <algorithm>
#include <vector>
#include <string.h>
#include <cstring>
namespace util { class FilePiece; }

View File

@ -1,7 +1,7 @@
#include "lm/model.hh"
#include <stdlib.h>
#include <string.h>
#include <cstdlib>
#include <cstring>
#define BOOST_TEST_MODULE ModelTest
#include <boost/test/unit_test.hpp>

View File

@ -11,8 +11,7 @@
#include <ostream>
#include <istream>
#include <string>
#include <math.h>
#include <cmath>
namespace lm {
namespace ngram {

View File

@ -5,8 +5,7 @@
#include "lm/state.hh"
#include <algorithm>
#include <assert.h>
#include <cassert>
namespace lm {
namespace ngram {

View File

@ -9,8 +9,8 @@
#include <sstream>
#include <vector>
#include <ctype.h>
#include <string.h>
#include <cctype>
#include <cstring>
#include <stdint.h>
#ifdef WIN32

View File

@ -12,8 +12,7 @@
#include <vector>
#include <cstdlib>
#include <assert.h>
#include <cassert>
namespace lm {
namespace ngram {

View File

@ -5,7 +5,7 @@
#include "lm/word_index.hh"
#include "util/murmur_hash.hh"
#include <string.h>
#include <cstring>
namespace lm {
namespace ngram {

View File

@ -5,7 +5,7 @@
#include "util/exception.hh"
#include "util/sorted_uniform.hh"
#include <assert.h>
#include <cassert>
namespace lm {
namespace ngram {

View File

@ -6,7 +6,7 @@
#include "util/string_piece.hh"
#include <string>
#include <string.h>
#include <cstring>
namespace lm {
namespace base {

View File

@ -12,8 +12,7 @@
#include "util/probing_hash_table.hh"
#include <string>
#include <string.h>
#include <cstring>
namespace lm {
namespace ngram {

View File

@ -2,7 +2,7 @@
#ifndef LM_WORD_INDEX_H
#define LM_WORD_INDEX_H
#include <limits.h>
#include <climits>
namespace lm {
typedef unsigned int WordIndex;

View File

@ -3,8 +3,7 @@
#include "util/file.hh"
#include <algorithm>
#include <string.h>
#include <cstring>
#include "neuralLM.h"
@ -21,6 +20,26 @@ WordIndex Vocabulary::Index(const std::string &str) const {
return vocab_.lookup_word(str);
}
class Backend {
public:
Backend(const nplm::neuralLM &from, const std::size_t cache_size) : lm_(from), ngram_(from.get_order()) {
lm_.set_cache(cache_size);
}
nplm::neuralLM &LM() { return lm_; }
const nplm::neuralLM &LM() const { return lm_; }
Eigen::Matrix<int,Eigen::Dynamic,1> &staging_ngram() { return ngram_; }
double lookup_from_staging() { return lm_.lookup_ngram(ngram_); }
int order() const { return lm_.get_order(); }
private:
nplm::neuralLM lm_;
Eigen::Matrix<int,Eigen::Dynamic,1> ngram_;
};
bool Model::Recognize(const std::string &name) {
try {
util::scoped_fd file(util::OpenReadOrThrow(name.c_str()));
@ -31,10 +50,18 @@ bool Model::Recognize(const std::string &name) {
} catch (const util::Exception &) {
return false;
}
}
}
namespace {
nplm::neuralLM *LoadNPLM(const std::string &file) {
util::scoped_ptr<nplm::neuralLM> ret(new nplm::neuralLM());
ret->read(file);
return ret.release();
}
} // namespace
Model::Model(const std::string &file, std::size_t cache)
: base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
: base_instance_(LoadNPLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the defintion of NPLM_MAX_ORDER and recompile.");
// log10 compatible with backoff models.
base_instance_->set_log_base(10.0);
@ -49,26 +76,25 @@ Model::Model(const std::string &file, std::size_t cache)
Model::~Model() {}
FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const {
nplm::neuralLM *lm = backend_.get();
if (!lm) {
lm = new nplm::neuralLM(*base_instance_);
backend_.reset(lm);
lm->set_cache(cache_size_);
Backend *backend = backend_.get();
if (!backend) {
backend = new Backend(*base_instance_, cache_size_);
backend_.reset(backend);
}
// State is in natural word order.
FullScoreReturn ret;
for (int i = 0; i < lm->get_order() - 1; ++i) {
lm->staging_ngram()(i) = from.words[i];
for (int i = 0; i < backend->order() - 1; ++i) {
backend->staging_ngram()(i) = from.words[i];
}
lm->staging_ngram()(lm->get_order() - 1) = new_word;
ret.prob = lm->lookup_from_staging();
backend->staging_ngram()(backend->order() - 1) = new_word;
ret.prob = backend->lookup_from_staging();
// Always say full order.
ret.ngram_length = lm->get_order();
ret.ngram_length = backend->order();
// Shift everything down by one.
memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2));
out_state.words[lm->get_order() - 2] = new_word;
memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (backend->order() - 2));
out_state.words[backend->order() - 2] = new_word;
// Fill in trailing words with zeros so state comparison works.
memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order()));
memset(out_state.words + backend->order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - backend->order()));
return ret;
}

View File

@ -49,6 +49,8 @@ struct State {
WordIndex words[NPLM_MAX_ORDER - 1];
};
class Backend;
class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
private:
typedef lm::base::ModelFacade<Model, State, Vocabulary> P;
@ -68,7 +70,7 @@ class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
private:
boost::scoped_ptr<nplm::neuralLM> base_instance_;
mutable boost::thread_specific_ptr<nplm::neuralLM> backend_;
mutable boost::thread_specific_ptr<Backend> backend_;
Vocabulary vocab_;

View File

@ -191,7 +191,7 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
float logbleu = 0.0;
for (int i = 0; i < kBleuNgramOrder; ++i) {
for (std::size_t i = 0; i < kBleuNgramOrder; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}
@ -249,7 +249,7 @@ float smoothedSentenceBleu
UTIL_THROW_IF(stats.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
float logbleu = 0.0;
for (int j = 0; j < kBleuNgramOrder; j++) {
for (std::size_t j = 0; j < kBleuNgramOrder; j++) {
logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing);
}
logbleu /= kBleuNgramOrder;
@ -275,7 +275,7 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
// Calculate BLEU
float logbleu = 0.0;
for (int j = 0; j < kBleuNgramOrder; j++) {
for (std::size_t j = 0; j < kBleuNgramOrder; j++) {
logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
}
logbleu /= kBleuNgramOrder;

View File

@ -17,6 +17,7 @@
#include "util/exception.hh"
#include "util/file_piece.hh"
#include "util/random.hh"
#include "util/tokenize_piece.hh"
#include "util/string_piece.hh"
#include "FeatureDataIterator.h"
@ -286,7 +287,7 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
} else {
//create shards by randomly sampling
for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
shard_contents.push_back(rand() % data_size);
shard_contents.push_back(util::rand_excl(data_size));
}
}

View File

@ -13,6 +13,8 @@
#include <iostream>
#include <string>
#include "util/unistd.hh"
#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
#include <ext/stdio_filebuf.h>

View File

@ -40,28 +40,3 @@ inputfilestream::~inputfilestream()
void inputfilestream::close()
{
}
outputfilestream::outputfilestream(const std::string &filePath)
: std::ostream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
if (IsGzipFile(filePath)) {
throw runtime_error("Output to a zipped file not supported!");
} else {
m_streambuf = fb;
}
this->init(m_streambuf);
}
outputfilestream::~outputfilestream()
{
delete m_streambuf;
m_streambuf = 0;
}
void outputfilestream::close()
{
}

View File

@ -22,20 +22,4 @@ public:
void close();
};
class outputfilestream : public std::ostream
{
protected:
std::streambuf *m_streambuf;
bool m_is_good;
public:
explicit outputfilestream(const std::string &filePath);
virtual ~outputfilestream();
bool good() const {
return m_is_good;
}
void close();
};
#endif // MERT_FILE_STREAM_H_

View File

@ -1,6 +1,9 @@
#include <iostream>
#include "util/tokenize_piece.hh"
#include "ForestRescore.h"
#include "MiraFeatureVector.h"
#define BOOST_TEST_MODULE MertForestRescore
#include <boost/test/unit_test.hpp>
@ -10,8 +13,7 @@
using namespace std;
using namespace MosesTuning;
BOOST_AUTO_TEST_CASE(viterbi_simple_lattice)
{
BOOST_AUTO_TEST_CASE(viterbi_simple_lattice) {
Vocab vocab;
WordVec words;
string wordStrings[] =
@ -242,5 +244,101 @@ BOOST_AUTO_TEST_CASE(viterbi_3branch_lattice)
BOOST_CHECK_EQUAL(6, hopeHypo.bleuStats[8]);
}
BOOST_AUTO_TEST_CASE(viterbi_full_hypergraph) {
Vocab vocab;
//References
ReferenceSet references;
references.AddLine(0,"in addition to EU support for businesses , also the administration of national business support will be concentrated in four Centres for Economic Development , Transport and Environment ( ELY Centres ) , starting from mid @-@ September .",vocab);
//Load the hypergraph
Graph graph(vocab);
util::scoped_fd fd(util::OpenReadOrThrow("mert/hgtest/0.gz"));
util::FilePiece file(fd.release());
ReadGraph(file,graph);
//prune
SparseVector weights;
weights.set("OpSequenceModel0_1",0.011187);
weights.set("OpSequenceModel0_2",-0.002797);
weights.set("OpSequenceModel0_3",0.002797);
weights.set("OpSequenceModel0_4",-0.000140);
weights.set("OpSequenceModel0_5",0.004195);
weights.set("Distortion0",0.041952);
weights.set("PhrasePenalty0",0.027968);
weights.set("WordPenalty0",-0.139841);
weights.set("UnknownWordPenalty0",1.000000);
weights.set("LM0",0.069920);
weights.set("LexicalReordering0_1",0.041952);
weights.set("LexicalReordering0_2",0.041952);
weights.set("LexicalReordering0_3",0.041952);
weights.set("LexicalReordering0_4",0.041952);
weights.set("LexicalReordering0_5",0.041952);
weights.set("LexicalReordering0_6",0.041952);
weights.set("LexicalReordering0_7",0.041952);
weights.set("LexicalReordering0_8",0.041952);
weights.set("TranslationModel0_1",0.027968);
weights.set("TranslationModel0_2",0.027968);
weights.set("TranslationModel0_3",0.027968);
weights.set("TranslationModel0_4",0.027968);
weights.set("TranslationModel0_5",0.027968);
weights.set("TranslationModel0_6",0.027968);
weights.set("TranslationModel0_7",0.027968);
weights.set("TranslationModel0_8",0.027968);
weights.set("TranslationModel0_9",0.027968);
weights.set("TranslationModel0_10",0.027968);
weights.set("TranslationModel0_11",0.027968);
weights.set("TranslationModel0_12",0.027968);
weights.set("TranslationModel0_13",0.027968);
size_t edgeCount = 500;
boost::shared_ptr<Graph> prunedGraph;
prunedGraph.reset(new Graph(vocab));
graph.Prune(prunedGraph.get(), weights, edgeCount);
vector<ValType> bg(9);
HgHypothesis bestHypo;
//best hypothesis
Viterbi(*prunedGraph, weights, 0, references, 0, bg, &bestHypo);
//check output as expected
string expectedStr = "<s> the EU matters , but also the national matters management focus since mid @-@ September four ely @-@ centre . </s>";
util::TokenIter<util::SingleCharacter, true> expected(expectedStr, util::SingleCharacter(' '));
for (size_t i = 0; i < bestHypo.text.size(); ++i) {
//cerr << bestHypo.text[i]->first << " ";
BOOST_CHECK_EQUAL(*expected,bestHypo.text[i]->first);
++expected;
}
BOOST_CHECK(!expected);
//cerr << endl;
//check scores
BOOST_CHECK_CLOSE(-80.062,bestHypo.featureVector.get("OpSequenceModel0_1"), 0.001);
BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_2"), 0.001);
BOOST_CHECK_CLOSE(2,bestHypo.featureVector.get("OpSequenceModel0_3"), 0.001);
BOOST_CHECK_CLOSE(3,bestHypo.featureVector.get("OpSequenceModel0_4"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("OpSequenceModel0_5"), 0.001);
BOOST_CHECK_CLOSE(-6,bestHypo.featureVector.get("Distortion0"), 0.001);
BOOST_CHECK_CLOSE(14,bestHypo.featureVector.get("PhrasePenalty0"), 0.001);
BOOST_CHECK_CLOSE(-20,bestHypo.featureVector.get("WordPenalty0"), 0.001);
BOOST_CHECK_CLOSE(-100,bestHypo.featureVector.get("UnknownWordPenalty0"), 0.001);
BOOST_CHECK_CLOSE(-126.616,bestHypo.featureVector.get("LM0"), 0.001);
BOOST_CHECK_CLOSE(-5.2238,bestHypo.featureVector.get("LexicalReordering0_1"), 0.001);
BOOST_CHECK_CLOSE(-0.29515,bestHypo.featureVector.get("LexicalReordering0_2"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_3"), 0.001);
BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_4"), 0.001);
BOOST_CHECK_CLOSE(-9.28267,bestHypo.featureVector.get("LexicalReordering0_5"), 0.001);
BOOST_CHECK_CLOSE(-0.470004,bestHypo.featureVector.get("LexicalReordering0_6"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("LexicalReordering0_7"), 0.001);
BOOST_CHECK_CLOSE(-0.402678,bestHypo.featureVector.get("LexicalReordering0_8"), 0.001);
BOOST_CHECK_CLOSE(-54.3119,bestHypo.featureVector.get("TranslationModel0_1"), 0.001);
BOOST_CHECK_CLOSE(-62.2619,bestHypo.featureVector.get("TranslationModel0_2"), 0.001);
BOOST_CHECK_CLOSE(-23.8782,bestHypo.featureVector.get("TranslationModel0_3"), 0.001);
BOOST_CHECK_CLOSE(-25.1626,bestHypo.featureVector.get("TranslationModel0_4"), 0.001);
BOOST_CHECK_CLOSE(12.9986,bestHypo.featureVector.get("TranslationModel0_5"), 0.001);
BOOST_CHECK_CLOSE(3.99959,bestHypo.featureVector.get("TranslationModel0_6"), 0.001);
BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_7"), 0.001);
BOOST_CHECK_CLOSE(1.99979,bestHypo.featureVector.get("TranslationModel0_8"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_9"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_10"), 0.001);
BOOST_CHECK_CLOSE(0,bestHypo.featureVector.get("TranslationModel0_11"), 0.001);
BOOST_CHECK_CLOSE(0.999896,bestHypo.featureVector.get("TranslationModel0_12"), 0.001);
BOOST_CHECK_CLOSE(7.99917,bestHypo.featureVector.get("TranslationModel0_13"), 0.001);
}

View File

@ -180,7 +180,7 @@ HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
references_.Load(referenceFiles, vocab_);
SparseVector weights;
wv.ToSparse(&weights);
wv.ToSparse(&weights,num_dense_);
scorer_ = scorer;
static const string kWeights = "weights";
@ -243,7 +243,7 @@ void HypergraphHopeFearDecoder::HopeFear(
{
size_t sentenceId = *sentenceIdIter_;
SparseVector weights;
wv.ToSparse(&weights);
wv.ToSparse(&weights, num_dense_);
const Graph& graph = *(graphs_[sentenceId]);
// ValType hope_scale = 1.0;
@ -338,7 +338,7 @@ void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValTy
HgHypothesis bestHypo;
size_t sentenceId = *sentenceIdIter_;
SparseVector weights;
wv.ToSparse(&weights);
wv.ToSparse(&weights, num_dense_);
vector<ValType> bg(scorer_->NumberOfScores());
//cerr << "Calculating bleu on " << sentenceId << endl;
Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo);

View File

@ -12,7 +12,7 @@
#include <string>
#include <vector>
#include <utility>
#include <stddef.h>
#include <cstddef>
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"

View File

@ -77,6 +77,7 @@ unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_f
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test forest_rescore_test : ForestRescoreTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test hypergraph_test : HypergraphTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test mira_feature_vector_test : MiraFeatureVectorTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test point_test : PointTest.cpp mert_lib ..//boost_unit_test_framework ;

View File

@ -6,7 +6,7 @@
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <stdio.h>
#include <cstdio>
#include <string>
#include <vector>
@ -18,6 +18,7 @@
#include "ScoreStats.h"
#include "Util.h"
#include "util/unistd.hh"
using namespace std;
@ -25,7 +26,7 @@ namespace MosesTuning
{
// Meteor supported
#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
#if (defined(__GLIBCXX__) || defined(__GLIBCPP__)) && !defined(_WIN32)
// for clarity
#define CHILD_STDIN_READ pipefds_input[0]

View File

@ -0,0 +1,49 @@
#include "MiraFeatureVector.h"
#include "MiraWeightVector.h"
#define BOOST_TEST_MODULE MiraFeatureVector
#include <boost/test/unit_test.hpp>
using namespace MosesTuning;
/* Note that the conversion to and from SparseVector needs to know
how many of the features are really "dense". This is because in hg mira
all features (sparse and dense) are to get rolled in to SparseVector
*/
BOOST_AUTO_TEST_CASE(from_sparse) {
  // Build a SparseVector with two dense features followed by three
  // sparse ones; the names only matter for the final lookups.
  SparseVector input;
  input.set("dense0", 0.2);
  input.set("dense1", 0.3);
  input.set("sparse0", 0.7);
  input.set("sparse1", 0.9);
  input.set("sparse2", 0.1);

  // Converting with denseSize == 2: dense ids are kept verbatim, sparse
  // ids are shifted up past the dense range (hence 4,5,6 rather than 2,3,4).
  MiraFeatureVector converted(input,2);
  BOOST_CHECK_EQUAL(converted.size(),5);
  BOOST_CHECK_EQUAL(converted.feat(0),0);
  BOOST_CHECK_EQUAL(converted.feat(1),1);
  BOOST_CHECK_EQUAL(converted.feat(2),4);
  BOOST_CHECK_EQUAL(converted.feat(3),5);
  BOOST_CHECK_EQUAL(converted.feat(4),6);
  BOOST_CHECK_CLOSE(converted.val(0), 0.2,1e-5);
  BOOST_CHECK_CLOSE(converted.val(1), 0.3,1e-5);
  BOOST_CHECK_CLOSE(converted.val(2), 0.7,1e-5);
  BOOST_CHECK_CLOSE(converted.val(3), 0.9,1e-5);
  BOOST_CHECK_CLOSE(converted.val(4), 0.1,1e-5);

  // Round-trip: push the feature vector into a weight vector, convert
  // back with the same dense count, and expect the original values.
  MiraWeightVector wv;
  wv.update(converted,1.0);
  SparseVector roundTrip;
  wv.ToSparse(&roundTrip,2);
  BOOST_CHECK_CLOSE(roundTrip.get("dense0"), 0.2,1e-5);
  BOOST_CHECK_CLOSE(roundTrip.get("dense1"), 0.3,1e-5);
  BOOST_CHECK_CLOSE(roundTrip.get("sparse0"), 0.7,1e-5);
  BOOST_CHECK_CLOSE(roundTrip.get("sparse1"), 0.9,1e-5);
  BOOST_CHECK_CLOSE(roundTrip.get("sparse2"), 0.1,1e-5);
}

View File

@ -93,11 +93,17 @@ void MiraWeightVector::update(size_t index, ValType delta)
m_lastUpdated[index] = m_numUpdates;
}
void MiraWeightVector::ToSparse(SparseVector* sparse) const
void MiraWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
{
  // Export every non-negligible weight into the SparseVector.
  for (size_t i = 0; i < m_weights.size(); ++i) {
    if (abs(m_weights[i]) <= 1e-8) {
      continue;   // skip effectively-zero weights
    }
    // Dense features keep their id; sparse-feature ids inside
    // MiraFeatureVector/MiraWeightVector are offset by denseSize and
    // must be shifted back when converting to SparseVector.
    const size_t id = (i < denseSize) ? i : i - denseSize;
    sparse->set(id, m_weights[i]);
  }
}
@ -172,12 +178,18 @@ size_t AvgWeightVector::size() const
return m_wv.m_weights.size();
}
void AvgWeightVector::ToSparse(SparseVector* sparse) const
void AvgWeightVector::ToSparse(SparseVector* sparse, size_t denseSize) const
{
  // Export every non-negligible averaged weight into the SparseVector.
  for (size_t i = 0; i < size(); ++i) {
    ValType w = weight(i);
    if (abs(w) <= 1e-8) {
      continue;   // skip effectively-zero weights
    }
    // Dense features keep their id; sparse-feature ids inside
    // MiraFeatureVector/MiraWeightVector are offset by denseSize and
    // must be shifted back when converting to SparseVector.
    const size_t id = (i < denseSize) ? i : i - denseSize;
    sparse->set(id, w);
  }
}

View File

@ -64,9 +64,9 @@ public:
AvgWeightVector avg();
/**
* Convert to sparse vector, interpreting all features as sparse.
* Convert to sparse vector, interpreting all features as sparse. Only used by hgmira.
**/
void ToSparse(SparseVector* sparse) const;
void ToSparse(SparseVector* sparse, size_t denseSize) const;
friend class AvgWeightVector;
@ -104,7 +104,7 @@ public:
ValType score(const MiraFeatureVector& fv) const;
ValType weight(std::size_t index) const;
std::size_t size() const;
void ToSparse(SparseVector* sparse) const;
void ToSparse(SparseVector* sparse, size_t num_dense) const;
private:
const MiraWeightVector& m_wv;
};

View File

@ -8,7 +8,7 @@
#include <fstream>
#include <sstream>
#include <math.h>
#include <cmath>
#include "Permutation.h"
#include "Util.h"

Some files were not shown because too many files have changed in this diff Show More