mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-08 20:46:59 +03:00
151 lines
4.0 KiB
C++
151 lines
4.0 KiB
C++
#ifndef UTIL_TOKENIZE_PIECE_H
|
|
#define UTIL_TOKENIZE_PIECE_H
|
|
|
|
#include "util/exception.hh"
|
|
#include "util/string_piece.hh"
|
|
|
|
#include <boost/iterator/iterator_facade.hpp>
|
|
|
|
#include <algorithm>
|
|
#include <cstring>
|
|
|
|
namespace util {
|
|
|
|
// Thrown on dereference when out of tokens to parse
|
|
class OutOfTokens : public Exception {
|
|
public:
|
|
OutOfTokens() throw() {}
|
|
~OutOfTokens() throw() {}
|
|
};
|
|
|
|
class SingleCharacter {
|
|
public:
|
|
SingleCharacter() {}
|
|
explicit SingleCharacter(char delim) : delim_(delim) {}
|
|
|
|
StringPiece Find(const StringPiece &in) const {
|
|
return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1);
|
|
}
|
|
|
|
private:
|
|
char delim_;
|
|
};
|
|
|
|
class MultiCharacter {
|
|
public:
|
|
MultiCharacter() {}
|
|
|
|
explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {}
|
|
|
|
StringPiece Find(const StringPiece &in) const {
|
|
return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size());
|
|
}
|
|
|
|
private:
|
|
StringPiece delimiter_;
|
|
};
|
|
|
|
class AnyCharacter {
|
|
public:
|
|
AnyCharacter() {}
|
|
explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {}
|
|
|
|
StringPiece Find(const StringPiece &in) const {
|
|
return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
|
|
}
|
|
|
|
private:
|
|
StringPiece chars_;
|
|
};
|
|
|
|
class BoolCharacter {
|
|
public:
|
|
BoolCharacter() {}
|
|
|
|
explicit BoolCharacter(const bool *delimiter) { delimiter_ = delimiter; }
|
|
|
|
StringPiece Find(const StringPiece &in) const {
|
|
for (const char *i = in.data(); i != in.data() + in.size(); ++i) {
|
|
if (delimiter_[static_cast<unsigned char>(*i)]) return StringPiece(i, 1);
|
|
}
|
|
return StringPiece(in.data() + in.size(), 0);
|
|
}
|
|
|
|
template <unsigned Length> static void Build(const char (&characters)[Length], bool (&out)[256]) {
|
|
memset(out, 0, sizeof(out));
|
|
for (const char *i = characters; i != characters + Length; ++i) {
|
|
out[static_cast<unsigned char>(*i)] = true;
|
|
}
|
|
}
|
|
|
|
private:
|
|
const bool *delimiter_;
|
|
};
|
|
|
|
class AnyCharacterLast {
|
|
public:
|
|
AnyCharacterLast() {}
|
|
|
|
explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {}
|
|
|
|
StringPiece Find(const StringPiece &in) const {
|
|
return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1);
|
|
}
|
|
|
|
private:
|
|
StringPiece chars_;
|
|
};
|
|
|
|
// Forward iterator over the pieces of a StringPiece that lie between
// delimiters.
//   Find:      policy type; its Find(const StringPiece &) const returns the
//              next delimiter, and a result whose data() equals the end of
//              the input means "no delimiter left".
//   SkipEmpty: when true, zero-length tokens are stepped over.
// An iterator whose current token has null data() is exhausted.  end() is a
// default-constructed iterator, so this relies on a default-constructed
// StringPiece having null data().
template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> {
  public:
    TokenIter() {}

    // str:       text to tokenize.
    // construct: forwarded to the Find policy's constructor.
    // The first token is extracted immediately.
    template <class Construct> TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) {
      increment();
    }

    // True once the iterator has run out of tokens.
    bool operator!() const {
      return !current_.data();
    }

    // True while a token is available.
    operator bool() const {
      return current_.data() != NULL;
    }

    // Past-the-end iterator.
    static TokenIter<Find, SkipEmpty> end() {
      return TokenIter<Find, SkipEmpty>();
    }

  private:
    friend class boost::iterator_core_access;

    // Advance to the next token, consuming it and its trailing delimiter
    // from the unread remainder.
    void increment() {
      do {
        const char *const begin = after_.data();
        const char *const end = begin + after_.size();
        const StringPiece delim(finder_.Find(after_));
        // The token is everything in front of the delimiter.
        current_ = StringPiece(begin, delim.data() - begin);
        if (delim.data() == end) {
          // No delimiter: the token took the rest, so nothing remains.
          after_ = StringPiece(NULL, 0);
        } else {
          // Resume just past the delimiter.
          const char *const rest = delim.data() + delim.size();
          after_ = StringPiece(rest, end - rest);
        }
      } while (SkipEmpty && current_.data() != NULL && current_.empty()); // Constant-false when SkipEmpty is false, so the loop runs once.
    }

    // Iterators compare equal when their current tokens start at the same
    // address.
    bool equal(const TokenIter<Find, SkipEmpty> &other) const {
      return other.current_.data() == current_.data();
    }

    // Throws OutOfTokens when the iterator is exhausted.
    const StringPiece &dereference() const {
      UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens");
      return current_;
    }

    // Token currently exposed by the iterator.
    StringPiece current_;
    // Input not yet consumed.
    StringPiece after_;

    // Delimiter-finding policy object.
    Find finder_;
};
|
|
|
|
} // namespace util
|
|
|
|
#endif // UTIL_TOKENIZE_PIECE_H
|