mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-03 23:07:14 +03:00
45 lines
814 B
C++
45 lines
814 B
C++
#include <iostream>
|
|
#include "hash.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace probingpt
|
|
{
|
|
|
|
/**
 * Hash a piece of text to a 64-bit key.
 *
 * @param text  Bytes to hash; text.data() and text.size() are handed to the
 *              hash function unchanged.
 * @return      The value util::MurmurHashNative produces for those bytes.
 */
uint64_t getHash(StringPiece text)
{
  return util::MurmurHashNative(text.data(), text.size());
}
|
|
|
|
std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
|
|
{
|
|
//Tokenize
|
|
std::vector<uint64_t> output;
|
|
|
|
util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
|
|
|
|
while (itWord) {
|
|
StringPiece word = *itWord;
|
|
uint64_t id = 0;
|
|
|
|
util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
|
|
while (itFactor) {
|
|
StringPiece factor = *itFactor;
|
|
//cerr << "factor=" << factor << endl;
|
|
|
|
id += getHash(factor);
|
|
itFactor++;
|
|
}
|
|
|
|
output.push_back(id);
|
|
itWord++;
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
}
|
|
|