mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-17 09:47:34 +03:00
Merged PR 19761: Expose SPM Interface from Marian
This PR adds interfaces in Marian to allow it to handle segmentation duties. Related work items: #121418
This commit is contained in:
parent
b653db0a9b
commit
4ff2ef189e
@ -81,11 +81,16 @@ if(MSVC)
|
||||
# These are used in src/CMakeLists.txt on a per-target basis
|
||||
list(APPEND ALL_WARNINGS /WX; /W4;)
|
||||
|
||||
# Disabled bogus warnings for CPU intrinsics:
|
||||
# Disabled bogus warnings for CPU intrinsics and Protobuf:
|
||||
# C4100: 'identifier' : unreferenced formal parameter
|
||||
# C4310: cast truncates constant value
|
||||
# C4324: 'marian::cpu::int16::`anonymous-namespace'::ScatterPut': structure was padded due to alignment specifier
|
||||
# C4702: unreachable code; note it is also disabled globally in the VS project file
|
||||
set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\"")
|
||||
if(USE_SENTENCEPIECE)
|
||||
set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4100\"")
|
||||
else()
|
||||
set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\"")
|
||||
endif()
|
||||
|
||||
# set(INTRINSICS "/arch:AVX")
|
||||
add_definitions(-DUSE_SSE2=1)
|
||||
|
17
src/3rd_party/CMakeLists.txt
vendored
17
src/3rd_party/CMakeLists.txt
vendored
@ -71,9 +71,7 @@ if(USE_SENTENCEPIECE)
|
||||
endif()
|
||||
|
||||
# regardless of -DUSE_STATIC_LIBS setting always build sentencepiece statically
|
||||
if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
|
||||
set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE)
|
||||
endif()
|
||||
set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE)
|
||||
set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.")
|
||||
|
||||
if(USE_STATIC_LIBS)
|
||||
@ -111,16 +109,11 @@ if(USE_SENTENCEPIECE)
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
|
||||
endif()
|
||||
|
||||
# regardless of -DUSE_STATIC_LIBS setting always build sentencepiece statically
|
||||
if(GENERATE_MARIAN_INSTALL_TARGETS)
|
||||
if(USE_STATIC_LIBS)
|
||||
install(TARGETS sentencepiece-static sentencepiece_train-static
|
||||
EXPORT marian-targets
|
||||
DESTINATION sentencepiece)
|
||||
else()
|
||||
install(TARGETS sentencepiece sentencepiece_train
|
||||
EXPORT marian-targets
|
||||
DESTINATION sentencepiece)
|
||||
endif()
|
||||
install(TARGETS sentencepiece-static sentencepiece_train-static
|
||||
EXPORT marian-targets
|
||||
DESTINATION sentencepiece)
|
||||
endif(GENERATE_MARIAN_INSTALL_TARGETS)
|
||||
endif(USE_SENTENCEPIECE)
|
||||
|
||||
|
@ -4,6 +4,9 @@ include_directories(.)
|
||||
include_directories(3rd_party)
|
||||
include_directories(3rd_party/SQLiteCpp/include)
|
||||
include_directories(3rd_party/sentencepiece)
|
||||
if(USE_SENTENCEPIECE)
|
||||
include_directories(3rd_party/sentencepiece/third_party/protobuf-lite)
|
||||
endif(USE_SENTENCEPIECE)
|
||||
include_directories(3rd_party/fbgemm/include)
|
||||
include_directories(3rd_party/intgemm)
|
||||
include_directories(${CMAKE_BINARY_DIR}/src/3rd_party/intgemm) # running cmake on the intgemm submodule triggers config file generation in this directory.
|
||||
@ -110,6 +113,7 @@ set(MARIAN_SOURCES
|
||||
|
||||
# this is only compiled to catch build errors
|
||||
microsoft/quicksand.cpp
|
||||
microsoft/sentencepiece.cpp
|
||||
microsoft/cosmos.cpp
|
||||
|
||||
# copied from quicksand to be able to read binary shortlist
|
||||
|
169
src/microsoft/sentencepiece.cpp
Normal file
169
src/microsoft/sentencepiece.cpp
Normal file
@ -0,0 +1,169 @@
|
||||
#include <cstring>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
|
||||
|
||||
#ifdef USE_SENTENCEPIECE
|
||||
#include "sentencepiece.h"
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wsuggest-override"
|
||||
#endif
|
||||
|
||||
#include "sentencepiece/src/builtin_pb/sentencepiece.pb.h"
|
||||
|
||||
#ifdef __GNUC__
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
#include "sentencepiece/src/sentencepiece_processor.h"
|
||||
#include "sentencepiece/src/sentencepiece_trainer.h"
|
||||
#include "unicode_conversions.h"
|
||||
|
||||
namespace marian {
|
||||
namespace spm {
|
||||
class SentencePieceInternal {
|
||||
std::unique_ptr<sentencepiece::SentencePieceProcessor> m_processor;
|
||||
|
||||
void checkStatus(sentencepiece::util::Status status, const char* what) {
|
||||
if(status.ok())
|
||||
return;
|
||||
std::string err = status.ToString();
|
||||
std::cerr << err << std::endl;
|
||||
throw std::runtime_error(std::string("SentencePiece error ") + what + ": " + err);
|
||||
}
|
||||
|
||||
int createNativeSentencePieceText(sentencepiece::SentencePieceText& spt, Native_SentencePieceText** outSpt) {
|
||||
Native_SentencePieceText* spt_ret = new Native_SentencePieceText();
|
||||
|
||||
spt_ret->text = new char[spt.text().size() + 1];
|
||||
::strcpy(spt_ret->text, spt.text().c_str());
|
||||
|
||||
spt_ret->num_pieces = spt.pieces().size();
|
||||
spt_ret->pieces = new Native_SentencePiecePiece*[spt_ret->num_pieces];
|
||||
|
||||
int counter = 0;
|
||||
for(auto& piece : spt.pieces()) {
|
||||
spt_ret->pieces[counter] = new Native_SentencePiecePiece();
|
||||
spt_ret->pieces[counter]->id = piece.id();
|
||||
spt_ret->pieces[counter]->begin = piece.begin();
|
||||
spt_ret->pieces[counter]->end = piece.end();
|
||||
spt_ret->pieces[counter]->surface = new char[piece.surface().size() + 1];
|
||||
::strcpy((spt_ret->pieces)[counter]->surface, (char*)piece.surface().c_str());
|
||||
spt_ret->pieces[counter]->piece = new char[piece.piece().size() + 1];
|
||||
::strcpy((spt_ret->pieces)[counter]->piece, (char*)piece.piece().c_str());
|
||||
counter++;
|
||||
}
|
||||
*outSpt = spt_ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
SentencePieceInternal(const uint16_t* modelPath, const uint16_t** vocab, size_t vocabSize) {
|
||||
m_processor.reset(new sentencepiece::SentencePieceProcessor());
|
||||
// load the model file
|
||||
const auto status = m_processor->Load(utf16_to_utf8(utf16string(modelPath)));
|
||||
// implant the restricted vocabulary, if given
|
||||
if(vocab && vocabSize > 0) {
|
||||
std::vector<std::string> vocab_str;
|
||||
for(size_t i = 0; i < vocabSize; i++)
|
||||
vocab_str.push_back(utf16_to_utf8(utf16string(vocab[i])));
|
||||
|
||||
m_processor->SetVocabulary(vocab_str);
|
||||
}
|
||||
checkStatus(status, "loading");
|
||||
}
|
||||
|
||||
int getPieceID(char* sentence) {
|
||||
std::string sentInUtf8(sentence);
|
||||
return m_processor->PieceToId(absl::string_view(sentInUtf8));
|
||||
}
|
||||
|
||||
int encodeAligned(char* sentence, Native_SentencePieceText** nSpt) {
|
||||
sentencepiece::SentencePieceText spt;
|
||||
std::string sentInUtf8(sentence);
|
||||
m_processor->Encode(absl::string_view(sentInUtf8), &spt);
|
||||
|
||||
return createNativeSentencePieceText(spt, nSpt);
|
||||
}
|
||||
|
||||
int decodeAligned(int num_tokens, char** inp_tokens, Native_SentencePieceText** nSpt) {
|
||||
sentencepiece::SentencePieceText spt;
|
||||
std::vector<std::string> tokens;
|
||||
for(int i = 0; i < num_tokens; i++) {
|
||||
std::string tok((char*)inp_tokens[i]);
|
||||
tokens.push_back(tok);
|
||||
}
|
||||
m_processor->Decode(tokens, &spt);
|
||||
return createNativeSentencePieceText(spt, nSpt);
|
||||
}
|
||||
};
|
||||
|
||||
// Releases a Native_SentencePieceText produced by the encode/decode entry points, together
// with every piece and string it owns. Passing null is a no-op. Always returns 0.
// The caller's pointer is passed by value, so it is dangling after this call.
int SentencePieceInteropFreeNativeSentencePieceText(Native_SentencePieceText* spt) {
  if(!spt)
    return 0;

  if(spt->pieces) {
    for(int i = 0; i < spt->num_pieces; i++) {
      Native_SentencePiecePiece* piece = spt->pieces[i];
      if(!piece)
        continue;
      // both strings are allocated with new char[], so they need the array form of delete
      delete[] piece->surface;
      delete[] piece->piece;
      delete piece;
    }
    delete[] spt->pieces;
  }
  delete[] spt->text;
  delete spt;
  return 0;
}
|
||||
|
||||
intptr_t SentencePieceInteropLoadModel(const uint16_t* modelPath,
|
||||
const uint16_t** vocab,
|
||||
size_t vocabSize) {
|
||||
try {
|
||||
return (intptr_t) new SentencePieceInternal(modelPath, vocab, vocabSize);
|
||||
}
|
||||
catch(...) { return (intptr_t) nullptr; }
|
||||
}
|
||||
|
||||
int SentencePieceInteropDecodeAligned(intptr_t object,
|
||||
int num_tokens,
|
||||
char** tokens,
|
||||
Native_SentencePieceText** nSpt) {
|
||||
try {
|
||||
return ((SentencePieceInternal*)object)->decodeAligned(num_tokens, tokens, nSpt);
|
||||
}
|
||||
catch(...) { return -1; }
|
||||
}
|
||||
|
||||
int SentencePieceInteropEncodeAligned(intptr_t object,
|
||||
char* word,
|
||||
Native_SentencePieceText** nSpt) {
|
||||
try {
|
||||
return ((SentencePieceInternal*)object)->encodeAligned(word, nSpt);
|
||||
}
|
||||
catch(...) { return -1; }
|
||||
}
|
||||
|
||||
int SentencePieceInteropGetPieceID(intptr_t object, char* word) {
|
||||
try {
|
||||
return ((SentencePieceInternal*)object)->getPieceID(word);
|
||||
}
|
||||
catch(...) { return -1; }
|
||||
}
|
||||
|
||||
int SentencePieceInteropUnloadModel(intptr_t object) {
|
||||
delete(SentencePieceInternal*)object;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int SentencepieceInteropTrainModel(char* args) {
|
||||
std::stringstream command;
|
||||
command << std::string(args);
|
||||
auto status = sentencepiece::SentencePieceTrainer::Train(command.str());
|
||||
return (int)status.code();
|
||||
}
|
||||
|
||||
} // namespace spm
|
||||
} // namespace marian
|
||||
|
||||
#endif
|
38
src/microsoft/sentencepiece.h
Normal file
38
src/microsoft/sentencepiece.h
Normal file
@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
|
||||
namespace marian {
|
||||
namespace spm {
|
||||
|
||||
// Describes an individual token in a sentencepiece encoding
|
||||
// Plain-data copy of one piece of a SentencePieceText result. Instances are filled by
// createNativeSentencePieceText() in sentencepiece.cpp and released by
// SentencePieceInteropFreeNativeSentencePieceText().
struct Native_SentencePiecePiece {
|
||||
// value of the protobuf piece's id()
int id;
|
||||
// value of the protobuf piece's begin() -- presumably an offset into the text; confirm against SPM docs
int begin;
|
||||
// value of the protobuf piece's end()
int end;
|
||||
// heap-allocated (new char[]), NUL-terminated copy of the piece's surface() string
char* surface;
|
||||
// heap-allocated (new char[]), NUL-terminated copy of the piece's piece() string
char* piece;
|
||||
};
|
||||
|
||||
// Mirrors the SentencePieceText protobuf struct returned by SPM
|
||||
// and provides individual piece and corresponding surface details
|
||||
// Result container returned through the Native_SentencePieceText** out-parameters of the
// encode/decode entry points; release it with SentencePieceInteropFreeNativeSentencePieceText().
struct Native_SentencePieceText {
|
||||
// heap-allocated (new char[]), NUL-terminated copy of the protobuf text() string
char* text;
|
||||
// number of entries in 'pieces'
int num_pieces;
|
||||
// heap-allocated array of 'num_pieces' pointers, each to an individually allocated piece
Native_SentencePiecePiece** pieces;
|
||||
};
|
||||
|
||||
// Frees a result structure (its pieces, their strings, the piece array and the text). Returns 0.
int SentencePieceInteropFreeNativeSentencePieceText(Native_SentencePieceText* spt);
|
||||
// Loads a model and returns an opaque handle to it, or a null handle if loading throws.
// 'modelPath' and the 'vocabSize' entries of 'vocab' are UTF-16 strings; the vocabulary is
// only applied when 'vocab' is non-null and 'vocabSize' > 0.
intptr_t SentencePieceInteropLoadModel(const uint16_t* modelPath,
|
||||
const uint16_t** vocab,
|
||||
size_t vocabSize);
|
||||
// Decodes 'num_tokens' piece strings using the model handle 'object' and stores a newly
// allocated result in '*nSpt'. Returns 0 on success, -1 if an exception is thrown.
int SentencePieceInteropDecodeAligned(intptr_t object,
|
||||
int num_tokens,
|
||||
char** tokens,
|
||||
Native_SentencePieceText** nSpt);
|
||||
// Encodes the string 'word' using the model handle 'object' and stores a newly allocated
// result in '*nSpt'. Returns 0 on success, -1 if an exception is thrown.
int SentencePieceInteropEncodeAligned(intptr_t object, char* word, Native_SentencePieceText** nSpt);
|
||||
// Returns the id the model assigns to the piece string 'word', or -1 if an exception is thrown.
int SentencePieceInteropGetPieceID(intptr_t object, char* word);
|
||||
// Destroys the model behind the handle 'object'. Returns 0.
int SentencePieceInteropUnloadModel(intptr_t object);
|
||||
// Runs the SentencePiece trainer with the argument string 'args' and returns its status code.
int SentencepieceInteropTrainModel(char* args);
|
||||
|
||||
} // namespace spm
|
||||
} // namespace marian
|
282
src/microsoft/unicode_conversions.h
Normal file
282
src/microsoft/unicode_conversions.h
Normal file
@ -0,0 +1,282 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
// This was extracted from https://github.com/microsoft/cpprestsdk/blob/cdae258bfb22f948c7b768b4dc56f5f4a2d9b2ce/Release/src/utilities/asyncrt_utils.cpp#L305
|
||||
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
|
||||
// String of UTF-16 code units as exchanged with the interop callers.
// NOTE(review): this relies on the standard library providing std::char_traits<uint16_t>,
// which is not a required specialization -- confirm it builds on every targeted toolchain.
typedef std::basic_string<uint16_t> utf16string;
|
||||
|
||||
// Masks selecting the low N payload bits of a UTF-8 byte.
#define LOW_3BITS 0x7
|
||||
#define LOW_4BITS 0xF
|
||||
#define LOW_5BITS 0x1F
|
||||
#define LOW_6BITS 0x3F
|
||||
// Single-bit masks, numbered from 1 at the least significant bit (BIT8 is the top bit of a byte).
#define BIT4 0x8
|
||||
#define BIT5 0x10
|
||||
#define BIT6 0x20
|
||||
#define BIT7 0x40
|
||||
#define BIT8 0x80
|
||||
// Inclusive UTF-16 code unit ranges of low (trailing) and high (leading) surrogates.
#define L_SURROGATE_START 0xDC00
|
||||
#define L_SURROGATE_END 0xDFFF
|
||||
#define H_SURROGATE_START 0xD800
|
||||
#define H_SURROGATE_END 0xDBFF
|
||||
// First code point that has to be written as a surrogate pair in UTF-16.
#define SURROGATE_PAIR_START 0x10000
|
||||
|
||||
// Create a dedicated type for characters to avoid the issue
|
||||
// of different platforms defaulting char to be either signed
|
||||
// or unsigned.
|
||||
using UtilCharInternal_t = signed char;
|
||||
|
||||
// Returns the number of UTF-16 code units needed to represent the UTF-8 string 's'.
// While counting, the structure of every multi-byte sequence is validated: the lead byte
// must announce 2, 3 or 4 bytes, all announced bytes must be present, and each trailing
// byte must be a continuation byte (10xxxxxx).
// Throws std::range_error on any violation.
inline size_t count_utf8_to_utf16(const std::string& s)
{
    const size_t sSize = s.size();
    // work on unsigned bytes so the bit tests do not depend on the signedness of char
    const unsigned char* const sData = reinterpret_cast<const unsigned char*>(s.data());

    // Each continuation byte is tested on its own. OR-ing several bytes together before
    // masking would let a byte in 0x00..0x3F pass when paired with a real continuation byte.
    const auto isContinuation = [](unsigned char b) { return (b & 0xC0) == 0x80; };

    // start from one code unit per byte, then subtract for every multi-byte sequence
    size_t result = sSize;

    size_t index = 0;
    while (index < sSize)
    {
        const unsigned char c = sData[index++];

        // single byte code point (expected to be the most frequent): one code unit
        if (c < 0x80)
        {
            continue;
        }

        if ((c & 0x40) == 0)
        {
            throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
        }

        if ((c & 0x20) == 0) // 110xxxxx: 2 byte character, 0x80 to 0x7FF
        {
            if (index == sSize)
            {
                throw std::range_error("UTF-8 string is missing bytes in character");
            }

            if (!isContinuation(sData[index++]))
            {
                throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
            }

            // can't require surrogates for 7FF
            --result;
            continue;
        }

        if ((c & 0x10) == 0) // 1110xxxx: 3 byte character, 0x800 to 0xFFFF
        {
            if (sSize - index < 2)
            {
                throw std::range_error("UTF-8 string is missing bytes in character");
            }

            const unsigned char c2 = sData[index++];
            const unsigned char c3 = sData[index++];
            if (!isContinuation(c2) || !isContinuation(c3))
            {
                throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
            }

            result -= 2;
            continue;
        }

        if ((c & 0x08) == 0) // 11110xxx: 4 byte character, 0x10000 to 0x10FFFF
        {
            if (sSize - index < 3)
            {
                throw std::range_error("UTF-8 string is missing bytes in character");
            }

            const unsigned char c2 = sData[index++];
            const unsigned char c3 = sData[index++];
            const unsigned char c4 = sData[index++];
            if (!isContinuation(c2) || !isContinuation(c3) || !isContinuation(c4))
            {
                throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
            }

            const uint32_t codePoint = (static_cast<uint32_t>(c & 0x07) << 18)
                                       | (static_cast<uint32_t>(c2 & 0x3F) << 12)
                                       | (static_cast<uint32_t>(c3 & 0x3F) << 6)
                                       | static_cast<uint32_t>(c4 & 0x3F);
            // code points from 0x10000 upwards take a surrogate pair (two code units)
            result -= (codePoint >= 0x10000) ? 2 : 3;
            continue;
        }

        throw std::range_error("UTF-8 string has invalid Unicode code point");
    }

    return result;
}
|
||||
|
||||
// Converts the UTF-8 string 's' to UTF-16.
// The destination is sized up front by count_utf8_to_utf16(), which also validates 's' and
// throws std::range_error for malformed input; the unchecked reads of trailing bytes below
// rely on that validation having succeeded.
// Declared inline because it is defined in a header: without it, including this header from
// more than one translation unit produces multiple-definition link errors.
inline utf16string utf8_to_utf16(const std::string& s)
{
    // Save repeated heap allocations, use the length of resulting sequence.
    const size_t srcSize = s.size();
    auto const srcData = reinterpret_cast<const UtilCharInternal_t*>(s.data());
    utf16string dest(count_utf8_to_utf16(s), static_cast<utf16string::value_type>(0));
    utf16string::value_type* const destData = &dest[0];
    size_t destIndex = 0;

    for (size_t index = 0; index < srcSize; ++index)
    {
        const UtilCharInternal_t src = srcData[index];
        // the high nibble of the lead byte determines the length of the sequence
        switch (src & 0xF0)
        {
            case 0xF0: // 4 byte character, 0x10000 to 0x10FFFF
            {
                const UtilCharInternal_t c2 {srcData[++index]};
                const UtilCharInternal_t c3 {srcData[++index]};
                const UtilCharInternal_t c4 {srcData[++index]};
                uint32_t codePoint =
                    ((src & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
                if (codePoint >= SURROGATE_PAIR_START)
                {
                    // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
                    //  - 0x10000 is subtracted from the code point
                    //  - high surrogate is 0xD800 added to the top ten bits
                    //  - low surrogate is 0xDC00 added to the low ten bits
                    codePoint -= SURROGATE_PAIR_START;
                    destData[destIndex++] = static_cast<utf16string::value_type>((codePoint >> 10) | H_SURROGATE_START);
                    destData[destIndex++] =
                        static_cast<utf16string::value_type>((codePoint & 0x3FF) | L_SURROGATE_START);
                }
                else
                {
                    // In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point
                    // value. U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present
                    // but will encode them if encountered.
                    destData[destIndex++] = static_cast<utf16string::value_type>(codePoint);
                }
            }
            break;
            case 0xE0: // 3 byte character, 0x800 to 0xFFFF
            {
                const UtilCharInternal_t c2 {srcData[++index]};
                const UtilCharInternal_t c3 {srcData[++index]};
                destData[destIndex++] = static_cast<utf16string::value_type>(
                    ((src & LOW_4BITS) << 12) | ((c2 & LOW_6BITS) << 6) | (c3 & LOW_6BITS));
            }
            break;
            case 0xD0: // 2 byte character, 0x80 to 0x7FF
            case 0xC0:
            {
                const UtilCharInternal_t c2 {srcData[++index]};
                destData[destIndex++] =
                    static_cast<utf16string::value_type>(((src & LOW_5BITS) << 6) | (c2 & LOW_6BITS));
            }
            break;
            default: // single byte character, 0x0 to 0x7F
                // try to use a fast inner loop for following single byte characters,
                // since they are quite probable
                do
                {
                    destData[destIndex++] = static_cast<utf16string::value_type>(srcData[index++]);
                } while (index < srcSize && srcData[index] > 0);
                // adjust index since it will be incremented by the for loop
                --index;
        }
    }
    return dest;
}
|
||||
|
||||
inline size_t count_utf16_to_utf8(const utf16string& w)
|
||||
{
|
||||
const utf16string::value_type* const srcData = &w[0];
|
||||
const size_t srcSize = w.size();
|
||||
size_t destSize(srcSize);
|
||||
for (size_t index = 0; index < srcSize; ++index)
|
||||
{
|
||||
const utf16string::value_type ch(srcData[index]);
|
||||
if (ch <= 0x7FF)
|
||||
{
|
||||
if (ch > 0x7F) // 2 bytes needed (11 bits used)
|
||||
{
|
||||
++destSize;
|
||||
}
|
||||
}
|
||||
// Check for high surrogate.
|
||||
else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // 4 bytes needed (21 bits used)
|
||||
{
|
||||
++index;
|
||||
if (index == srcSize)
|
||||
{
|
||||
throw std::range_error("UTF-16 string is missing low surrogate");
|
||||
}
|
||||
|
||||
const auto lowSurrogate = srcData[index];
|
||||
if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
|
||||
{
|
||||
throw std::range_error("UTF-16 string has invalid low surrogate");
|
||||
}
|
||||
|
||||
destSize += 2;
|
||||
}
|
||||
else // 3 bytes needed (16 bits used)
|
||||
{
|
||||
destSize += 2;
|
||||
}
|
||||
}
|
||||
|
||||
return destSize;
|
||||
}
|
||||
|
||||
std::string /*__cdecl conversions::*/utf16_to_utf8(const utf16string& w)
|
||||
{
|
||||
const size_t srcSize = w.size();
|
||||
const utf16string::value_type* const srcData = &w[0];
|
||||
std::string dest(count_utf16_to_utf8(w), '\0');
|
||||
std::string::value_type* const destData = &dest[0];
|
||||
size_t destIndex(0);
|
||||
|
||||
for (size_t index = 0; index < srcSize; ++index)
|
||||
{
|
||||
const utf16string::value_type src = srcData[index];
|
||||
if (src <= 0x7FF)
|
||||
{
|
||||
if (src <= 0x7F) // single byte character
|
||||
{
|
||||
destData[destIndex++] = static_cast<char>(src);
|
||||
}
|
||||
else // 2 bytes needed (11 bits used)
|
||||
{
|
||||
destData[destIndex++] = static_cast<char>(char((src >> 6) | 0xC0)); // leading 5 bits
|
||||
destData[destIndex++] = static_cast<char>(char((src & LOW_6BITS) | BIT8)); // trailing 6 bits
|
||||
}
|
||||
}
|
||||
// Check for high surrogate.
|
||||
else if (src >= H_SURROGATE_START && src <= H_SURROGATE_END)
|
||||
{
|
||||
const auto highSurrogate = src;
|
||||
const auto lowSurrogate = srcData[++index];
|
||||
|
||||
// To get from surrogate pair to Unicode code point:
|
||||
// - subtract 0xD800 from high surrogate, this forms top ten bits
|
||||
// - subtract 0xDC00 from low surrogate, this forms low ten bits
|
||||
// - add 0x10000
|
||||
// Leaves a code point in U+10000 to U+10FFFF range.
|
||||
uint32_t codePoint = highSurrogate - H_SURROGATE_START;
|
||||
codePoint <<= 10;
|
||||
codePoint |= lowSurrogate - L_SURROGATE_START;
|
||||
codePoint += SURROGATE_PAIR_START;
|
||||
|
||||
// 4 bytes needed (21 bits used)
|
||||
destData[destIndex++] = static_cast<char>((codePoint >> 18) | 0xF0); // leading 3 bits
|
||||
destData[destIndex++] = static_cast<char>(((codePoint >> 12) & LOW_6BITS) | BIT8); // next 6 bits
|
||||
destData[destIndex++] = static_cast<char>(((codePoint >> 6) & LOW_6BITS) | BIT8); // next 6 bits
|
||||
destData[destIndex++] = static_cast<char>((codePoint & LOW_6BITS) | BIT8); // trailing 6 bits
|
||||
}
|
||||
else // 3 bytes needed (16 bits used)
|
||||
{
|
||||
destData[destIndex++] = static_cast<char>((src >> 12) | 0xE0); // leading 4 bits
|
||||
destData[destIndex++] = static_cast<char>(((src >> 6) & LOW_6BITS) | BIT8); // middle 6 bits
|
||||
destData[destIndex++] = static_cast<char>((src & LOW_6BITS) | BIT8); // trailing 6 bits
|
||||
}
|
||||
}
|
||||
|
||||
return dest;
|
||||
}
|
Loading…
Reference in New Issue
Block a user