mirror of
https://github.com/google/sentencepiece.git
synced 2024-09-11 10:55:42 +03:00
remove absl/random and absl/memory, add absl::btree_map
This commit is contained in:
parent
adf9e81b63
commit
f5c736302c
@ -17,11 +17,11 @@
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "sentencepiece_model.pb.h"
|
||||
#include "third_party/absl/container/btree_set.h"
|
||||
#include "third_party/absl/container/flat_hash_map.h"
|
||||
#include "trainer_interface.h"
|
||||
|
||||
@ -51,7 +51,7 @@ class Trainer : public TrainerInterface {
|
||||
|
||||
// Position list. Use set so that we can keep the order of occurrence.
|
||||
// See EncodePos/DecodePos.
|
||||
std::set<uint64_t> positions;
|
||||
absl::btree_set<uint64_t> positions;
|
||||
|
||||
bool IsBigram() const { return left != nullptr && right != nullptr; }
|
||||
std::string ToString() const;
|
||||
@ -72,8 +72,7 @@ class Trainer : public TrainerInterface {
|
||||
CHECK_LE(l, std::numeric_limits<uint16_t>::max());
|
||||
CHECK_LE(r, std::numeric_limits<uint16_t>::max());
|
||||
const uint64_t n = (static_cast<uint64_t>(sid) << 32) |
|
||||
(static_cast<uint64_t>(l) << 16) |
|
||||
r;
|
||||
(static_cast<uint64_t>(l) << 16) | r;
|
||||
return n;
|
||||
}
|
||||
|
||||
@ -118,7 +117,7 @@ class Trainer : public TrainerInterface {
|
||||
absl::flat_hash_map<uint64_t, Symbol *> symbols_cache_;
|
||||
|
||||
// Set of symbols from which we find the best symbol in each iteration.
|
||||
std::set<Symbol *> active_symbols_;
|
||||
absl::btree_set<Symbol *> active_symbols_;
|
||||
|
||||
// Stores symbols allocated in heap so that we can delete them at once.
|
||||
std::vector<Symbol *> allocated_;
|
||||
|
@ -18,7 +18,6 @@
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "util.h"
|
||||
|
||||
#if defined(OS_WIN) && defined(UNICODE) && defined(_UNICODE)
|
||||
@ -105,12 +104,12 @@ using DefaultWritableFile = PosixWritableFile;
|
||||
|
||||
std::unique_ptr<ReadableFile> NewReadableFile(absl::string_view filename,
|
||||
bool is_binary) {
|
||||
return absl::make_unique<DefaultReadableFile>(filename, is_binary);
|
||||
return std::make_unique<DefaultReadableFile>(filename, is_binary);
|
||||
}
|
||||
|
||||
std::unique_ptr<WritableFile> NewWritableFile(absl::string_view filename,
|
||||
bool is_binary) {
|
||||
return absl::make_unique<DefaultWritableFile>(filename, is_binary);
|
||||
return std::make_unique<DefaultWritableFile>(filename, is_binary);
|
||||
}
|
||||
|
||||
} // namespace filesystem
|
||||
|
@ -12,10 +12,10 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "model_factory.h"
|
||||
|
||||
#include "bpe_model.h"
|
||||
#include "char_model.h"
|
||||
#include "model_factory.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "unigram_model.h"
|
||||
#include "word_model.h"
|
||||
|
||||
@ -28,16 +28,16 @@ std::unique_ptr<ModelInterface> ModelFactory::Create(
|
||||
|
||||
switch (trainer_spec.model_type()) {
|
||||
case TrainerSpec::UNIGRAM:
|
||||
return absl::make_unique<unigram::Model>(model_proto);
|
||||
return std::make_unique<unigram::Model>(model_proto);
|
||||
break;
|
||||
case TrainerSpec::BPE:
|
||||
return absl::make_unique<bpe::Model>(model_proto);
|
||||
return std::make_unique<bpe::Model>(model_proto);
|
||||
break;
|
||||
case TrainerSpec::WORD:
|
||||
return absl::make_unique<word::Model>(model_proto);
|
||||
return std::make_unique<word::Model>(model_proto);
|
||||
break;
|
||||
case TrainerSpec::CHAR:
|
||||
return absl::make_unique<character::Model>(model_proto);
|
||||
return std::make_unique<character::Model>(model_proto);
|
||||
break;
|
||||
default:
|
||||
LOG(ERROR) << "Unknown model_type: " << trainer_spec.model_type();
|
||||
@ -45,6 +45,6 @@ std::unique_ptr<ModelInterface> ModelFactory::Create(
|
||||
break;
|
||||
}
|
||||
|
||||
return absl::make_unique<unigram::Model>(model_proto);
|
||||
return std::make_unique<unigram::Model>(model_proto);
|
||||
}
|
||||
} // namespace sentencepiece
|
||||
|
@ -17,7 +17,6 @@
|
||||
#include <algorithm>
|
||||
|
||||
#include "sentencepiece_model.pb.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "third_party/absl/strings/str_format.h"
|
||||
#include "util.h"
|
||||
|
||||
@ -148,7 +147,7 @@ void ModelInterface::InitializePieces() {
|
||||
}
|
||||
}
|
||||
|
||||
matcher_ = absl::make_unique<normalizer::PrefixMatcher>(user_defined_symbols);
|
||||
matcher_ = std::make_unique<normalizer::PrefixMatcher>(user_defined_symbols);
|
||||
}
|
||||
|
||||
std::vector<absl::string_view> SplitIntoWords(absl::string_view text,
|
||||
|
@ -18,7 +18,6 @@
|
||||
#include <vector>
|
||||
|
||||
#include "common.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "third_party/absl/strings/match.h"
|
||||
#include "third_party/absl/strings/string_view.h"
|
||||
#include "third_party/absl/strings/strip.h"
|
||||
@ -58,7 +57,7 @@ void Normalizer::Init() {
|
||||
if (!status_.ok()) return;
|
||||
|
||||
// Reads the body of double array.
|
||||
trie_ = absl::make_unique<Darts::DoubleArray>();
|
||||
trie_ = std::make_unique<Darts::DoubleArray>();
|
||||
|
||||
// The second arg of set_array is not the size of blob,
|
||||
// but the number of double array units.
|
||||
@ -314,7 +313,7 @@ PrefixMatcher::PrefixMatcher(const std::set<absl::string_view> &dic) {
|
||||
std::vector<const char *> key;
|
||||
key.reserve(dic.size());
|
||||
for (const auto &it : dic) key.push_back(it.data());
|
||||
trie_ = absl::make_unique<Darts::DoubleArray>();
|
||||
trie_ = std::make_unique<Darts::DoubleArray>();
|
||||
if (trie_->build(key.size(), const_cast<char **>(&key[0]), nullptr,
|
||||
nullptr) != 0) {
|
||||
LOG(ERROR) << "Failed to build the TRIE for PrefixMatcher";
|
||||
|
@ -30,7 +30,6 @@
|
||||
#include "model_interface.h"
|
||||
#include "normalizer.h"
|
||||
#include "sentencepiece.pb.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "third_party/absl/strings/numbers.h"
|
||||
#include "third_party/absl/strings/str_cat.h"
|
||||
#include "third_party/absl/strings/str_join.h"
|
||||
@ -217,7 +216,7 @@ SentencePieceProcessor::SentencePieceProcessor() {}
|
||||
SentencePieceProcessor::~SentencePieceProcessor() {}
|
||||
|
||||
util::Status SentencePieceProcessor::Load(absl::string_view filename) {
|
||||
auto model_proto = absl::make_unique<ModelProto>();
|
||||
auto model_proto = std::make_unique<ModelProto>();
|
||||
RETURN_IF_ERROR(io::LoadModelProto(filename, model_proto.get()));
|
||||
return Load(std::move(model_proto));
|
||||
}
|
||||
@ -227,14 +226,14 @@ void SentencePieceProcessor::LoadOrDie(absl::string_view filename) {
|
||||
}
|
||||
|
||||
util::Status SentencePieceProcessor::Load(const ModelProto &model_proto) {
|
||||
auto model_proto_copy = absl::make_unique<ModelProto>();
|
||||
auto model_proto_copy = std::make_unique<ModelProto>();
|
||||
*model_proto_copy = model_proto;
|
||||
return Load(std::move(model_proto_copy));
|
||||
}
|
||||
|
||||
util::Status SentencePieceProcessor::LoadFromSerializedProto(
|
||||
absl::string_view serialized) {
|
||||
auto model_proto = absl::make_unique<ModelProto>();
|
||||
auto model_proto = std::make_unique<ModelProto>();
|
||||
CHECK_OR_RETURN(
|
||||
model_proto->ParseFromArray(serialized.data(), serialized.size()));
|
||||
return Load(std::move(model_proto));
|
||||
@ -244,11 +243,11 @@ util::Status SentencePieceProcessor::Load(
|
||||
std::unique_ptr<ModelProto> model_proto) {
|
||||
model_proto_ = std::move(model_proto);
|
||||
model_ = ModelFactory::Create(*model_proto_);
|
||||
normalizer_ = absl::make_unique<normalizer::Normalizer>(
|
||||
normalizer_ = std::make_unique<normalizer::Normalizer>(
|
||||
model_proto_->normalizer_spec(), model_proto_->trainer_spec());
|
||||
if (model_proto_->has_denormalizer_spec() &&
|
||||
!model_proto_->denormalizer_spec().precompiled_charsmap().empty()) {
|
||||
denormalizer_ = absl::make_unique<normalizer::Normalizer>(
|
||||
denormalizer_ = std::make_unique<normalizer::Normalizer>(
|
||||
model_proto_->denormalizer_spec());
|
||||
}
|
||||
|
||||
|
@ -25,7 +25,6 @@
|
||||
#include "sentencepiece_trainer.h"
|
||||
#include "testharness.h"
|
||||
#include "third_party/absl/container/flat_hash_map.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "third_party/absl/strings/str_cat.h"
|
||||
#include "third_party/absl/strings/string_view.h"
|
||||
#include "util.h"
|
||||
@ -123,7 +122,7 @@ NormalizerSpec MakeDefaultNormalizerSpec() {
|
||||
TEST(SentencepieceProcessorTest, StatusTest) {
|
||||
SentencePieceProcessor sp;
|
||||
EXPECT_FALSE(sp.status().ok());
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
EXPECT_FALSE(sp.status().ok());
|
||||
}
|
||||
@ -135,7 +134,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
|
||||
{
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
|
||||
const EncodeResult result = {
|
||||
{WS "ABC", 3}, {WS "DE", 4}, {"F", 0}, {"</s>", 2}};
|
||||
@ -143,7 +142,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
|
||||
sp.SetModel(std::move(mock));
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
std::vector<std::string> output;
|
||||
EXPECT_TRUE(sp.Encode("ABC DEF", &output).ok());
|
||||
@ -186,7 +185,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
|
||||
// Unknown sequences.
|
||||
{
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
|
||||
const EncodeResult result = {
|
||||
{WS "ABC", 3}, {WS "D", 4}, {"E", 0}, {"F", 0}, {"</s>", 2}};
|
||||
@ -196,7 +195,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
mock->SetEncodeResult(kInput, result);
|
||||
sp.SetModel(std::move(mock));
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
std::vector<std::string> output;
|
||||
EXPECT_TRUE(sp.Encode("ABC DEF", &output).ok());
|
||||
@ -236,7 +235,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
// Byte-fallback.
|
||||
{
|
||||
const absl::string_view kInput2 = WS "ABC" WS "DEFあ";
|
||||
auto mock = absl::make_unique<ByteFallbackMockModel>();
|
||||
auto mock = std::make_unique<ByteFallbackMockModel>();
|
||||
|
||||
const EncodeResult result = {{WS "ABC", 3}, {WS "D", 4}, {"E", 0},
|
||||
{"F", 0}, {"あ", 0}, {"</s>", 2}};
|
||||
@ -250,7 +249,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
mock->SetEncodeResult(kInput2, result);
|
||||
sp.SetModel(std::move(mock));
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
std::vector<std::string> output;
|
||||
EXPECT_TRUE(sp.Encode("ABC DEFあ", &output).ok());
|
||||
@ -306,12 +305,12 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
// Crash if
|
||||
// ModelInterface::Encode() returns shorter results.
|
||||
{
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
const EncodeResult result = {{WS "ABC", 3}};
|
||||
mock->SetEncodeResult(kInput, result);
|
||||
sp.SetModel(std::move(mock));
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
SentencePieceText spt;
|
||||
// Expects crash.
|
||||
EXPECT_FALSE(sp.Encode("ABC DEF", &spt).ok());
|
||||
@ -320,13 +319,13 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
// Crash if
|
||||
// ModelInterface::Encode() returns longer results.
|
||||
{
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
const EncodeResult result = {
|
||||
{WS "ABC", 3}, {WS "DE", 4}, {"F", 5}, {"G", 6}};
|
||||
mock->SetEncodeResult(kInput, result);
|
||||
sp.SetModel(std::move(mock));
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
SentencePieceText spt;
|
||||
// Expects crash.
|
||||
EXPECT_FALSE(sp.Encode("ABC DEF", &spt).ok());
|
||||
@ -335,13 +334,13 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
// Crash if
|
||||
// ModelInterface::Encode() returns an empty piece.
|
||||
{
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
const EncodeResult result = {
|
||||
{WS "ABC", 3}, {WS "DE", 4}, {"", 5}, {"F", 6}};
|
||||
mock->SetEncodeResult(kInput, result);
|
||||
sp.SetModel(std::move(mock));
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
SentencePieceText spt;
|
||||
// Expects crash.
|
||||
EXPECT_FALSE(sp.Encode("ABC DEF", &spt).ok());
|
||||
@ -349,7 +348,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
|
||||
// Halfwidth to Fullwidth katakana normalization.
|
||||
{
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
const EncodeResult result = {{WS "グー", 3}, {"グル", 4}, {"</s>", 2}};
|
||||
const absl::string_view input = WS "グーグル";
|
||||
mock->SetEncodeResult(input, result);
|
||||
@ -383,7 +382,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
|
||||
|
||||
// One to many normalization.
|
||||
{
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
const EncodeResult result = {{WS "株式", 3}, {"会社", 4}, {"</s>", 2}};
|
||||
const absl::string_view input = WS "株式会社";
|
||||
mock->SetEncodeResult(input, result);
|
||||
@ -422,7 +421,7 @@ TEST(SentencepieceProcessorTest, NBestEncodeTest) {
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
|
||||
const NBestEncodeResult result = {
|
||||
{{{WS "ABC", 3}, {WS "DE", 4}, {"F", 0}, {"</s>", 2}},
|
||||
@ -433,7 +432,7 @@ TEST(SentencepieceProcessorTest, NBestEncodeTest) {
|
||||
mock->SetNBestEncodeResult(kInput, result);
|
||||
sp.SetModel(std::move(mock));
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
std::vector<std::vector<std::string>> output;
|
||||
EXPECT_TRUE(sp.NBestEncode("ABC DEF", 2, &output).ok());
|
||||
@ -464,7 +463,7 @@ TEST(SentencepieceProcessorTest, NBestEncodeTest) {
|
||||
spt2.ParseFromString(sp.NBestEncodeAsSerializedProto("ABC DEF", 2)));
|
||||
EXPECT_EQ(spt.SerializeAsString(), spt2.SerializeAsString());
|
||||
|
||||
auto mock_empty = absl::make_unique<MockModel>();
|
||||
auto mock_empty = std::make_unique<MockModel>();
|
||||
mock_empty->SetNBestEncodeResult(kInput, {});
|
||||
sp.SetModel(std::move(mock_empty));
|
||||
EXPECT_FALSE(sp.NBestEncode("ABC DEF", 2, &output).ok());
|
||||
@ -476,7 +475,7 @@ TEST(SentencepieceProcessorTest, SampleEncodeTest) {
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
|
||||
auto mock = absl::make_unique<MockModel>();
|
||||
auto mock = std::make_unique<MockModel>();
|
||||
|
||||
const EncodeResult result = {
|
||||
{WS "ABC", 3}, {WS "DE", 4}, {"F", 0}, {"</s>", 2}};
|
||||
@ -490,7 +489,7 @@ TEST(SentencepieceProcessorTest, SampleEncodeTest) {
|
||||
mock->SetEncodeResult(kInput, result);
|
||||
sp.SetModel(std::move(mock));
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
std::vector<std::string> output;
|
||||
EXPECT_TRUE(sp.SampleEncode("ABC DEF", -1, 0.5, &output).ok());
|
||||
@ -536,7 +535,7 @@ TEST(SentencepieceProcessorTest, SampleEncodeTest) {
|
||||
const float prob = 1.0 * freq[0] / (freq[0] + freq[1]);
|
||||
EXPECT_NEAR(prob, expected_prob, 0.05);
|
||||
|
||||
auto mock_empty = absl::make_unique<MockModel>();
|
||||
auto mock_empty = std::make_unique<MockModel>();
|
||||
mock_empty->SetNBestEncodeResult(kInput, {});
|
||||
sp.SetModel(std::move(mock_empty));
|
||||
EXPECT_FALSE(sp.SampleEncode("ABC DEF", 10, 0.5, &output).ok());
|
||||
@ -578,12 +577,12 @@ TEST(SentencepieceProcessorTest, DecodeTest) {
|
||||
|
||||
{
|
||||
SentencePieceProcessor sp;
|
||||
auto mock = absl::make_unique<DecodeMockModel>();
|
||||
auto mock = std::make_unique<DecodeMockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
SentencePieceText spt;
|
||||
|
||||
@ -629,15 +628,15 @@ TEST(SentencepieceProcessorTest, DecodeTest) {
|
||||
// unk_surface is not defined.
|
||||
{
|
||||
SentencePieceProcessor sp;
|
||||
auto proto = absl::make_unique<ModelProto>();
|
||||
auto proto = std::make_unique<ModelProto>();
|
||||
sp.Load(std::move(proto)).IgnoreError();
|
||||
|
||||
auto mock = absl::make_unique<DecodeMockModel>();
|
||||
auto mock = std::make_unique<DecodeMockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
SentencePieceText spt;
|
||||
|
||||
@ -648,16 +647,16 @@ TEST(SentencepieceProcessorTest, DecodeTest) {
|
||||
|
||||
{
|
||||
SentencePieceProcessor sp;
|
||||
auto proto = absl::make_unique<ModelProto>();
|
||||
auto proto = std::make_unique<ModelProto>();
|
||||
proto->mutable_trainer_spec()->set_unk_surface("");
|
||||
sp.Load(std::move(proto)).IgnoreError();
|
||||
|
||||
auto mock = absl::make_unique<DecodeMockModel>();
|
||||
auto mock = std::make_unique<DecodeMockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
SentencePieceText spt;
|
||||
|
||||
@ -668,16 +667,16 @@ TEST(SentencepieceProcessorTest, DecodeTest) {
|
||||
|
||||
{
|
||||
SentencePieceProcessor sp;
|
||||
auto proto = absl::make_unique<ModelProto>();
|
||||
auto proto = std::make_unique<ModelProto>();
|
||||
proto->mutable_trainer_spec()->set_unk_surface("<UNK>");
|
||||
sp.Load(std::move(proto)).IgnoreError();
|
||||
|
||||
auto mock = absl::make_unique<DecodeMockModel>();
|
||||
auto mock = std::make_unique<DecodeMockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
SentencePieceText spt;
|
||||
|
||||
@ -688,18 +687,18 @@ TEST(SentencepieceProcessorTest, DecodeTest) {
|
||||
|
||||
{
|
||||
SentencePieceProcessor sp;
|
||||
auto proto = absl::make_unique<ModelProto>();
|
||||
auto proto = std::make_unique<ModelProto>();
|
||||
proto->mutable_trainer_spec()->set_unk_surface("");
|
||||
proto->mutable_normalizer_spec()->set_add_dummy_prefix(false);
|
||||
proto->mutable_normalizer_spec()->set_remove_extra_whitespaces(false);
|
||||
sp.Load(std::move(proto)).IgnoreError();
|
||||
|
||||
auto mock = absl::make_unique<DecodeMockModel>();
|
||||
auto mock = std::make_unique<DecodeMockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
SentencePieceText spt;
|
||||
|
||||
@ -746,18 +745,18 @@ TEST(SentencepieceProcessorTest, DummyPrefixDecodeTest) {
|
||||
|
||||
{
|
||||
SentencePieceProcessor sp;
|
||||
auto proto = absl::make_unique<ModelProto>();
|
||||
auto proto = std::make_unique<ModelProto>();
|
||||
proto->mutable_trainer_spec()->set_unk_surface("");
|
||||
proto->mutable_normalizer_spec()->set_add_dummy_prefix(true);
|
||||
proto->mutable_normalizer_spec()->set_remove_extra_whitespaces(false);
|
||||
sp.Load(std::move(proto)).IgnoreError();
|
||||
|
||||
auto mock = absl::make_unique<DecodeMockModel>();
|
||||
auto mock = std::make_unique<DecodeMockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
SentencePieceText spt;
|
||||
|
||||
@ -768,18 +767,18 @@ TEST(SentencepieceProcessorTest, DummyPrefixDecodeTest) {
|
||||
|
||||
{
|
||||
SentencePieceProcessor sp;
|
||||
auto proto = absl::make_unique<ModelProto>();
|
||||
auto proto = std::make_unique<ModelProto>();
|
||||
proto->mutable_trainer_spec()->set_unk_surface("");
|
||||
proto->mutable_normalizer_spec()->set_add_dummy_prefix(true);
|
||||
proto->mutable_normalizer_spec()->set_remove_extra_whitespaces(true);
|
||||
sp.Load(std::move(proto)).IgnoreError();
|
||||
|
||||
auto mock = absl::make_unique<DecodeMockModel>();
|
||||
auto mock = std::make_unique<DecodeMockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
SentencePieceText spt;
|
||||
|
||||
@ -833,12 +832,12 @@ TEST(SentencepieceProcessorTest, ByteFallbackDecodeTest) {
|
||||
};
|
||||
|
||||
SentencePieceProcessor sp;
|
||||
auto mock = absl::make_unique<ByteFallbackDecodeMockModel>();
|
||||
auto mock = std::make_unique<ByteFallbackDecodeMockModel>();
|
||||
sp.SetModel(std::move(mock));
|
||||
|
||||
const auto normalization_spec = MakeDefaultNormalizerSpec();
|
||||
sp.SetNormalizer(
|
||||
absl::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
std::make_unique<normalizer::Normalizer>(normalization_spec));
|
||||
|
||||
{
|
||||
const std::vector<std::string> input = {
|
||||
@ -1347,7 +1346,7 @@ TEST(SentencePieceProcessorTest, EndToEndTest) {
|
||||
// Moves ModelProto.
|
||||
{
|
||||
SentencePieceProcessor sp;
|
||||
auto moved = absl::make_unique<ModelProto>();
|
||||
auto moved = std::make_unique<ModelProto>();
|
||||
const ModelProto *moved_ptr = moved.get();
|
||||
*moved = model_proto;
|
||||
EXPECT_TRUE(sp.Load(std::move(moved)).ok());
|
||||
|
@ -12,10 +12,10 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "trainer_factory.h"
|
||||
|
||||
#include "bpe_model_trainer.h"
|
||||
#include "char_model_trainer.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "trainer_factory.h"
|
||||
#include "unigram_model_trainer.h"
|
||||
#include "word_model_trainer.h"
|
||||
|
||||
@ -27,27 +27,27 @@ std::unique_ptr<TrainerInterface> TrainerFactory::Create(
|
||||
const NormalizerSpec &denormalizer_spec) {
|
||||
switch (trainer_spec.model_type()) {
|
||||
case TrainerSpec::UNIGRAM:
|
||||
return absl::make_unique<unigram::Trainer>(trainer_spec, normalizer_spec,
|
||||
return std::make_unique<unigram::Trainer>(trainer_spec, normalizer_spec,
|
||||
denormalizer_spec);
|
||||
break;
|
||||
case TrainerSpec::BPE:
|
||||
return absl::make_unique<bpe::Trainer>(trainer_spec, normalizer_spec,
|
||||
return std::make_unique<bpe::Trainer>(trainer_spec, normalizer_spec,
|
||||
denormalizer_spec);
|
||||
break;
|
||||
case TrainerSpec::WORD:
|
||||
return absl::make_unique<word::Trainer>(trainer_spec, normalizer_spec,
|
||||
return std::make_unique<word::Trainer>(trainer_spec, normalizer_spec,
|
||||
denormalizer_spec);
|
||||
break;
|
||||
case TrainerSpec::CHAR:
|
||||
return absl::make_unique<character::Trainer>(
|
||||
trainer_spec, normalizer_spec, denormalizer_spec);
|
||||
return std::make_unique<character::Trainer>(trainer_spec, normalizer_spec,
|
||||
denormalizer_spec);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown model_type: " << trainer_spec.model_type();
|
||||
break;
|
||||
}
|
||||
|
||||
return absl::make_unique<unigram::Trainer>(trainer_spec, normalizer_spec,
|
||||
return std::make_unique<unigram::Trainer>(trainer_spec, normalizer_spec,
|
||||
denormalizer_spec);
|
||||
}
|
||||
} // namespace sentencepiece
|
||||
|
@ -29,9 +29,6 @@
|
||||
#include "sentencepiece_processor.h"
|
||||
#include "sentencepiece_trainer.h"
|
||||
#include "third_party/absl/container/flat_hash_map.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "third_party/absl/random/distributions.h"
|
||||
#include "third_party/absl/random/random.h"
|
||||
#include "third_party/absl/strings/numbers.h"
|
||||
#include "third_party/absl/strings/str_cat.h"
|
||||
#include "third_party/absl/strings/str_format.h"
|
||||
@ -107,7 +104,7 @@ class SentenceSelector {
|
||||
if (spec_->input_sentence_size() > 0) {
|
||||
if (spec_->shuffle_input_sentence()) {
|
||||
constexpr size_t kSeed = 12345678;
|
||||
sampler_ = absl::make_unique<Sampler>(
|
||||
sampler_ = std::make_unique<Sampler>(
|
||||
sentences, spec_->input_sentence_size(), kSeed);
|
||||
} else {
|
||||
LOG(INFO)
|
||||
@ -303,12 +300,12 @@ bool TrainerInterface::IsValidSentencePiece(
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void AddDPNoise(const TrainerSpec &trainer_spec,
|
||||
random::SharedBitGen &generator, T *to_update) {
|
||||
void AddDPNoise(const TrainerSpec &trainer_spec, std::mt19937 *generator,
|
||||
T *to_update) {
|
||||
if (trainer_spec.differential_privacy_noise_level() > 0) {
|
||||
float random_num = absl::Gaussian<float>(
|
||||
generator, 0, trainer_spec.differential_privacy_noise_level());
|
||||
|
||||
std::normal_distribution<float> dist(
|
||||
0.0f, trainer_spec.differential_privacy_noise_level());
|
||||
const float random_num = dist(*generator);
|
||||
*to_update =
|
||||
std::round(std::max(0.f, random_num + static_cast<float>(*to_update)));
|
||||
}
|
||||
@ -351,7 +348,7 @@ util::Status TrainerInterface::LoadSentences() {
|
||||
LOG(INFO) << "SentenceIterator is not specified. Using "
|
||||
"MultiFileSentenceIterator.";
|
||||
sentence_iterator_impl =
|
||||
absl::make_unique<MultiFileSentenceIterator>(std::vector<std::string>(
|
||||
std::make_unique<MultiFileSentenceIterator>(std::vector<std::string>(
|
||||
trainer_spec_.input().begin(), trainer_spec_.input().end()));
|
||||
sentence_iterator_ = sentence_iterator_impl.get();
|
||||
}
|
||||
@ -428,7 +425,7 @@ END:
|
||||
LOG(INFO) << "Normalizing sentences...";
|
||||
CHECK_OR_RETURN(!sentences_.empty());
|
||||
{
|
||||
auto pool = absl::make_unique<ThreadPool>(trainer_spec_.num_threads());
|
||||
auto pool = std::make_unique<ThreadPool>(trainer_spec_.num_threads());
|
||||
pool->StartWorkers();
|
||||
for (int n = 0; n < trainer_spec_.num_threads(); ++n) {
|
||||
pool->Schedule([&, n]() {
|
||||
@ -475,12 +472,12 @@ END:
|
||||
std::min<uint64>(trainer_spec_.num_threads(), sentences_.size() - 1);
|
||||
|
||||
{
|
||||
auto pool = absl::make_unique<ThreadPool>(num_workers);
|
||||
auto pool = std::make_unique<ThreadPool>(num_workers);
|
||||
pool->StartWorkers();
|
||||
for (int n = 0; n < num_workers; ++n) {
|
||||
pool->Schedule([&, n]() {
|
||||
// One per thread generator.
|
||||
random::SharedBitGen generator;
|
||||
auto *generator = random::GetRandomGenerator();
|
||||
for (size_t i = n; i < sentences_.size(); i += num_workers) {
|
||||
AddDPNoise<int64>(trainer_spec_, generator,
|
||||
&(sentences_[i].second));
|
||||
|
@ -25,7 +25,6 @@
|
||||
#include <vector>
|
||||
|
||||
#include "third_party/absl/container/flat_hash_map.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "third_party/absl/strings/str_split.h"
|
||||
#include "third_party/absl/strings/string_view.h"
|
||||
#include "util.h"
|
||||
@ -626,7 +625,7 @@ void Model::BuildTrie(std::vector<std::pair<absl::string_view, int>> *pieces) {
|
||||
value[i] = (*pieces)[i].second; // vocab_id
|
||||
}
|
||||
|
||||
trie_ = absl::make_unique<Darts::DoubleArray>();
|
||||
trie_ = std::make_unique<Darts::DoubleArray>();
|
||||
if (trie_->build(key.size(), const_cast<char **>(&key[0]), nullptr,
|
||||
&value[0]) != 0) {
|
||||
status_ = util::InternalError("cannot build double-array.");
|
||||
|
@ -28,7 +28,6 @@
|
||||
#include "pretokenizer_for_training.h"
|
||||
#include "sentencepiece_trainer.h"
|
||||
#include "third_party/absl/container/flat_hash_map.h"
|
||||
#include "third_party/absl/memory/memory.h"
|
||||
#include "third_party/absl/strings/str_replace.h"
|
||||
#include "third_party/absl/strings/str_split.h"
|
||||
#include "third_party/esaxx/esa.hxx" // Suffix array library.
|
||||
@ -283,7 +282,7 @@ std::vector<float> Trainer::RunEStep(const TrainerModel &model, float *obj,
|
||||
std::vector<float> objs(trainer_spec_.num_threads(), 0.0);
|
||||
std::vector<int64> ntokens(trainer_spec_.num_threads(), 0.0);
|
||||
|
||||
auto pool = absl::make_unique<ThreadPool>(trainer_spec_.num_threads());
|
||||
auto pool = std::make_unique<ThreadPool>(trainer_spec_.num_threads());
|
||||
pool->StartWorkers();
|
||||
|
||||
int64 all_sentence_freq = 0;
|
||||
@ -405,7 +404,7 @@ TrainerModel::SentencePieces Trainer::PruneSentencePieces(
|
||||
std::vector<std::vector<std::vector<int>>> inverteds(
|
||||
trainer_spec_.num_threads());
|
||||
|
||||
auto pool = absl::make_unique<ThreadPool>(trainer_spec_.num_threads());
|
||||
auto pool = std::make_unique<ThreadPool>(trainer_spec_.num_threads());
|
||||
pool->StartWorkers();
|
||||
for (int n = 0; n < trainer_spec_.num_threads(); ++n) {
|
||||
freqs[n].resize(sentencepieces.size(), 0.0);
|
||||
|
@ -288,11 +288,6 @@ namespace random {
|
||||
|
||||
std::mt19937 *GetRandomGenerator();
|
||||
|
||||
class SharedBitGen {
|
||||
public:
|
||||
std::mt19937 *engine() { return GetRandomGenerator(); }
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class ReservoirSampler {
|
||||
public:
|
||||
|
@ -12,7 +12,17 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef ABSL_CONTAINER_RANDOM_H_
|
||||
#define ABSL_CONTAINER_RANDOM_H_
|
||||
#ifndef ABSL_CONTAINER_BTREE_SET_
|
||||
#define ABSL_CONTAINER_BTREE_SET_
|
||||
|
||||
#endif // ABSL_CONTAINER_RANDOM_H_
|
||||
#include <set>
|
||||
|
||||
namespace absl {
|
||||
|
||||
template <typename T, typename Compare = std::less<T>,
|
||||
typename Allocator = std::allocator<T>>
|
||||
using btree_set = std::set<T, Compare, Allocator>;
|
||||
|
||||
}
|
||||
|
||||
#endif // ABSL_CONTAINER_BTREE_SET_
|
71
third_party/absl/memory/memory.h
vendored
71
third_party/absl/memory/memory.h
vendored
@ -1,71 +0,0 @@
|
||||
//
|
||||
// Copyright 2017 The Abseil Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// -----------------------------------------------------------------------------
|
||||
// File: memory.h
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// This file contains `absl::make_unique`, an early implementation of the
|
||||
// C++14 `std::make_unique` helper. It is designed to be compatible with
|
||||
// `std::make_unique` so that the eventual switchover is a simple renaming.
|
||||
#ifndef ABSL_MEMORY_MEMORY_H_
|
||||
#define ABSL_MEMORY_MEMORY_H_
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace absl {
|
||||
|
||||
// Trait to select overloads and return types for MakeUnique.
// Each form of T exposes exactly one member alias (`scalar`, `array`, or
// `invalid`). The make_unique overloads below name one of these aliases as
// their return type, so an overload whose alias does not exist for T is
// removed from overload resolution.
|
||||
// Primary template (T is not an array): only `scalar` is defined.
template <typename T>
|
||||
struct MakeUniqueResult {
|
||||
using scalar = std::unique_ptr<T>;
|
||||
};
|
||||
// Array of unknown bound (T[]): only `array` is defined.
template <typename T>
|
||||
struct MakeUniqueResult<T[]> {
|
||||
using array = std::unique_ptr<T[]>;
|
||||
};
|
||||
// Array of known bound (T[N]): only `invalid` is defined.
template <typename T, size_t N>
|
||||
struct MakeUniqueResult<T[N]> {
|
||||
using invalid = void;
|
||||
};
|
||||
|
||||
// MakeUnique<T>(...) is an early implementation of C++14 std::make_unique.
|
||||
// It is designed to be 100% compatible with std::make_unique so that the
|
||||
// eventual switchover will be a simple renaming operation.
//
// Scalar overload: the return type uses MakeUniqueResult<T>::scalar, which is
// only defined by the primary (non-array) MakeUniqueResult template.
// `args` are perfectly forwarded to T's constructor and the new object is
// returned wrapped in a std::unique_ptr<T>.
|
||||
template <typename T, typename... Args>
|
||||
typename MakeUniqueResult<T>::scalar make_unique(Args &&... args) { // NOLINT
|
||||
return std::unique_ptr<T>(
|
||||
new T(std::forward<Args>(args)...)); // NOLINT(build/c++11)
|
||||
}
|
||||
|
||||
// Overload for array of unknown bound.
|
||||
// The allocation of arrays needs to use the array form of new,
|
||||
// and cannot take element constructor arguments.
//
// Here T is the array type itself (e.g. U[]); std::remove_extent<T>::type
// recovers the element type. `n` is the number of elements to allocate, and
// the trailing `()` value-initializes each of them.
|
||||
template <typename T>
|
||||
typename MakeUniqueResult<T>::array make_unique(size_t n) {
|
||||
return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
|
||||
}
|
||||
|
||||
// Reject arrays of known bound.
// The return type uses MakeUniqueResult<T>::invalid, which is only defined by
// the T[N] specialization; the overload is declared deleted, so calling
// make_unique with a known-bound array type is a compile-time error.
|
||||
template <typename T, typename... Args>
|
||||
typename MakeUniqueResult<T>::invalid make_unique(Args &&... /* args */) =
|
||||
delete; // NOLINT
|
||||
|
||||
} // namespace absl
|
||||
#endif // ABSL_MEMORY_MEMORY_H_
|
31
third_party/absl/random/distributions.h
vendored
31
third_party/absl/random/distributions.h
vendored
@ -1,31 +0,0 @@
|
||||
// Copyright 2016 Google Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef ABSL_CONTAINER_DISTRIBUTIONS_H_
|
||||
#define ABSL_CONTAINER_DISTRIBUTIONS_H_
|
||||
|
||||
#include <random>
|
||||
|
||||
#include "random.h"
|
||||
|
||||
namespace absl {
|
||||
|
||||
// Draws one sample from a normal distribution parameterized by `mean` and
// `stddev`.
//
// `generator` must provide an engine() member whose result can be
// dereferenced to obtain the random engine passed to the distribution.
//
// NOTE: the distribution is std::normal_distribution<> (default result type
// double) regardless of T, so the sample is produced as a double and then
// converted to T by the return statement.
template <typename T, typename G>
|
||||
T Gaussian(G &generator, T mean, T stddev) {
|
||||
std::normal_distribution<> dist(mean, stddev);
|
||||
return dist(*generator.engine());
|
||||
}
|
||||
} // namespace absl
|
||||
|
||||
#endif // ABSL_CONTAINER_DISTRIBUTIONS_H_
|
Loading…
Reference in New Issue
Block a user