2002-08-02 23:57:46 +04:00
|
|
|
/*
|
|
|
|
* synergy -- mouse and keyboard sharing utility
|
|
|
|
* Copyright (C) 2002 Chris Schoeneman
|
|
|
|
*
|
|
|
|
* This package is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* found in the file COPYING that should have accompanied this file.
|
|
|
|
*
|
|
|
|
* This package is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*/
|
|
|
|
|
2002-07-22 21:32:51 +04:00
|
|
|
#include "CUnicode.h"
|
2003-01-05 01:01:32 +03:00
|
|
|
#include "CArch.h"
|
2002-07-22 21:32:51 +04:00
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
//
|
|
|
|
// local utility functions
|
|
|
|
//
|
|
|
|
|
|
|
|
inline
|
|
|
|
static
|
|
|
|
UInt16
|
|
|
|
decode16(const UInt8* n)
|
|
|
|
{
|
|
|
|
union x16 {
|
|
|
|
UInt8 n8[2];
|
|
|
|
UInt16 n16;
|
|
|
|
} c;
|
|
|
|
c.n8[0] = n[0];
|
|
|
|
c.n8[1] = n[1];
|
|
|
|
return c.n16;
|
|
|
|
}
|
|
|
|
|
|
|
|
inline
|
|
|
|
static
|
|
|
|
UInt32
|
|
|
|
decode32(const UInt8* n)
|
|
|
|
{
|
|
|
|
union x32 {
|
|
|
|
UInt8 n8[4];
|
|
|
|
UInt32 n32;
|
|
|
|
} c;
|
|
|
|
c.n8[0] = n[0];
|
|
|
|
c.n8[1] = n[1];
|
|
|
|
c.n8[2] = n[2];
|
|
|
|
c.n8[3] = n[3];
|
|
|
|
return c.n32;
|
|
|
|
}
|
|
|
|
|
2002-07-23 15:36:18 +04:00
|
|
|
// Clear the caller's error flag.  errors may be NULL, in which case
// the caller is not interested in errors and nothing happens.
inline
static
void
resetError(bool* errors)
{
	if (errors == NULL) {
		return;
	}
	*errors = false;
}
|
|
|
|
|
|
|
|
// Raise the caller's error flag.  errors may be NULL, in which case
// the caller is not interested in errors and nothing happens.
inline
static
void
setError(bool* errors)
{
	if (errors == NULL) {
		return;
	}
	*errors = true;
}
|
|
|
|
|
2002-10-23 02:35:13 +04:00
|
|
|
|
2002-07-22 21:32:51 +04:00
|
|
|
//
|
|
|
|
// CUnicode
|
|
|
|
//
|
|
|
|
|
2002-07-23 13:33:50 +04:00
|
|
|
// value fromUTF8() returns to signal a byte sequence it could not decode
UInt32 CUnicode::s_invalid = 0x0000ffff;
|
|
|
|
// character the converters substitute for input they cannot represent
UInt32 CUnicode::s_replacement = 0x0000fffd;
|
2002-07-22 21:32:51 +04:00
|
|
|
|
2002-07-23 15:36:18 +04:00
|
|
|
bool
|
|
|
|
CUnicode::isUTF8(const CString& src)
|
|
|
|
{
|
|
|
|
// convert and test each character
|
|
|
|
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
|
|
|
for (UInt32 n = src.size(); n > 0; ) {
|
|
|
|
if (fromUTF8(data, n) == s_invalid) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2002-07-22 21:32:51 +04:00
|
|
|
// Convert UTF-8 text to a string holding 16-bit characters in host
// byte order (two bytes appended per character).  Undecodable input
// is replaced with s_replacement without flagging an error;
// characters at or above 0x10000 are replaced and, if errors is not
// NULL, *errors is set to true.
CString
CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// size the output for the worst case of one character per byte
	UInt32 remaining = src.size();
	CString result;
	result.reserve(2 * remaining);

	// decode one character at a time
	const UInt8* cursor = reinterpret_cast<const UInt8*>(src.c_str());
	while (remaining > 0) {
		UInt32 code = fromUTF8(cursor, remaining);

		const bool undecodable = (code == s_invalid);
		const bool tooLarge    = (!undecodable && code >= 0x00010000);
		if (tooLarge) {
			// decoded fine but doesn't fit in 16 bits
			setError(errors);
		}
		if (undecodable || tooLarge) {
			code = s_replacement;
		}

		const UInt16 unit = static_cast<UInt16>(code);
		result.append(reinterpret_cast<const char*>(&unit), 2);
	}

	return result;
}
|
|
|
|
|
|
|
|
// Convert UTF-8 text to a string holding 32-bit characters in host
// byte order (four bytes appended per character).  Undecodable input
// is replaced with s_replacement.  Apart from the initial reset this
// function never modifies *errors.
CString
CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// size the output for the worst case of one character per byte
	UInt32 remaining = src.size();
	CString result;
	result.reserve(4 * remaining);

	// decode one character at a time
	const UInt8* cursor = reinterpret_cast<const UInt8*>(src.c_str());
	while (remaining > 0) {
		const UInt32 decoded = fromUTF8(cursor, remaining);
		const UInt32 ucs4    = (decoded == s_invalid) ? s_replacement : decoded;
		result.append(reinterpret_cast<const char*>(&ucs4), 4);
	}

	return result;
}
|
|
|
|
|
|
|
|
// Convert UTF-8 text to UTF-16 code units in host byte order.
// Characters below 0x10000 become one 16-bit unit; larger ones become
// a high/low surrogate pair.  Undecodable input is replaced with
// s_replacement without flagging an error; characters at or above
// 0x110000 are replaced and, if errors is not NULL, *errors is set to
// true.
CString
CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// reserve a first guess at the output size
	UInt32 remaining = src.size();
	CString result;
	result.reserve(2 * remaining);

	// decode one character at a time
	const UInt8* cursor = reinterpret_cast<const UInt8*>(src.c_str());
	while (remaining > 0) {
		UInt32 code = fromUTF8(cursor, remaining);

		const bool undecodable = (code == s_invalid);
		const bool outOfRange  = (!undecodable && code >= 0x00110000);
		if (outOfRange) {
			// decoded fine but beyond what UTF-16 can express
			setError(errors);
		}
		if (undecodable || outOfRange) {
			code = s_replacement;
		}

		if (code >= 0x00010000) {
			// split the 20 bit offset into a surrogate pair
			const UInt32 offset = code - 0x00010000;
			const UInt16 high   = static_cast<UInt16>((offset >> 10) + 0xd800);
			const UInt16 low    = static_cast<UInt16>((offset & 0x03ff) + 0xdc00);
			result.append(reinterpret_cast<const char*>(&high), 2);
			result.append(reinterpret_cast<const char*>(&low), 2);
		}
		else {
			const UInt16 unit = static_cast<UInt16>(code);
			result.append(reinterpret_cast<const char*>(&unit), 2);
		}
	}

	return result;
}
|
|
|
|
|
|
|
|
// Convert UTF-8 text to a string holding 32-bit characters in host
// byte order (four bytes appended per character).  Undecodable input
// is replaced with s_replacement without flagging an error;
// characters at or above 0x110000 are replaced and, if errors is not
// NULL, *errors is set to true.
CString
CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// size the output for the worst case of one character per byte
	UInt32 remaining = src.size();
	CString result;
	result.reserve(4 * remaining);

	// decode one character at a time
	const UInt8* cursor = reinterpret_cast<const UInt8*>(src.c_str());
	while (remaining > 0) {
		UInt32 code = fromUTF8(cursor, remaining);

		const bool undecodable = (code == s_invalid);
		const bool outOfRange  = (!undecodable && code >= 0x00110000);
		if (outOfRange) {
			setError(errors);
		}
		if (undecodable || outOfRange) {
			code = s_replacement;
		}

		result.append(reinterpret_cast<const char*>(&code), 4);
	}

	return result;
}
|
|
|
|
|
|
|
|
// Convert UTF-8 text to a multibyte string using ARCH->convWCToMB().
// The text is first converted to wide characters, then converted in
// two passes:  one to measure the multibyte length and one to fill
// the buffer.  A wide character that cannot be converted becomes '?'
// and, if errors is not NULL, sets *errors to true.
CString
CUnicode::UTF8ToText(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// go through the wide character representation
	UInt32 wideCount;
	wchar_t* wide = UTF8ToWideChar(src, wideCount, errors);

	// pass 1:  measure the multibyte string
	int converted;
	CArchMBState state = ARCH->newMBState();
	size_t needed = 0;
	for (UInt32 i = 0; i < wideCount; ++i) {
		converted = ARCH->convWCToMB(NULL, wide[i], state);
		if (converted != -1) {
			needed += converted;
		}
		else {
			// can't convert;  one byte will hold the placeholder
			setError(errors);
			needed += 1;
		}
	}

	// account for the terminating nul
	converted = ARCH->convWCToMB(NULL, L'\0', state);
	if (converted != -1) {
		needed += converted;
	}
	assert(ARCH->isInitMBState(state) != 0);

	// get a buffer for the multibyte string
	char* buffer = new char[needed];

	// pass 2:  do the conversion for real
	char* out = buffer;
	for (UInt32 i = 0; i < wideCount; ++i) {
		converted = ARCH->convWCToMB(out, wide[i], state);
		if (converted != -1) {
			out += converted;
		}
		else {
			// can't convert;  emit the placeholder
			*out++ = '?';
		}
	}
	converted = ARCH->convWCToMB(out, L'\0', state);
	if (converted != -1) {
		// keep whatever precedes the nul but not the nul itself
		out += converted - 1;
	}
	CString text(buffer, out - buffer);

	// release temporaries
	delete[] buffer;
	delete[] wide;
	ARCH->closeMBState(state);

	return text;
}
|
|
|
|
|
|
|
|
// Convert a string of 16-bit characters (two bytes each, host byte
// order) to UTF-8.  A trailing odd byte is ignored.  *errors, if not
// NULL, is reset first and then updated by doUCS2ToUTF8().
CString
CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// two bytes per character
	const UInt8* bytes = reinterpret_cast<const UInt8*>(src.data());
	UInt32 count       = src.size() / 2;
	return doUCS2ToUTF8(bytes, count, errors);
}
|
|
|
|
|
|
|
|
// Convert a string of 32-bit characters (four bytes each, host byte
// order) to UTF-8.  Trailing bytes that don't make up a whole
// character are ignored.  *errors, if not NULL, is reset first and
// then updated by doUCS4ToUTF8().
CString
CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// four bytes per character
	const UInt8* bytes = reinterpret_cast<const UInt8*>(src.data());
	UInt32 count       = src.size() / 4;
	return doUCS4ToUTF8(bytes, count, errors);
}
|
|
|
|
|
|
|
|
// Convert a string of UTF-16 code units (two bytes each, host byte
// order) to UTF-8.  A trailing odd byte is ignored.  *errors, if not
// NULL, is reset first and then updated by doUTF16ToUTF8().
CString
CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// two bytes per code unit
	const UInt8* bytes = reinterpret_cast<const UInt8*>(src.data());
	UInt32 count       = src.size() / 2;
	return doUTF16ToUTF8(bytes, count, errors);
}
|
|
|
|
|
|
|
|
// Convert a string of UTF-32 characters (four bytes each, host byte
// order) to UTF-8.  Trailing bytes that don't make up a whole
// character are ignored.  *errors, if not NULL, is reset first and
// then updated by doUTF32ToUTF8().
CString
CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// four bytes per character
	const UInt8* bytes = reinterpret_cast<const UInt8*>(src.data());
	UInt32 count       = src.size() / 4;
	return doUTF32ToUTF8(bytes, count, errors);
}
|
|
|
|
|
|
|
|
// Convert a multibyte string to UTF-8 using ARCH->convMBToWC().  The
// input is scanned twice:  once to count the wide characters it holds
// and once to produce them.  An invalid byte or an incomplete final
// character becomes 0xfffd and, if errors is not NULL, sets *errors
// to true.  The wide characters are then handed to wideCharToUTF8().
CString
CUnicode::textToUTF8(const CString& src, bool* errors)
{
	// start out assuming success
	resetError(errors);

	// pass 1:  count wide characters.  every trip through the loop
	// accounts for exactly one output character.
	UInt32 remaining = src.size();
	size_t wideCount = 0;
	CArchMBState state = ARCH->newMBState();
	const char* cursor = src.c_str();
	while (remaining > 0) {
		const int consumed = ARCH->convMBToWC(NULL, cursor, remaining, state);
		if (consumed == -2) {
			// incomplete last character.  it becomes one unknown
			// character and ends the scan.
			setError(errors);
			wideCount += 1;
			remaining  = 0;
		}
		else if (consumed == -1) {
			// invalid character.  count one unknown character and
			// resume at the next byte.
			setError(errors);
			wideCount += 1;
			cursor    += 1;
			remaining -= 1;
		}
		else if (consumed == 0) {
			// a result of zero is treated as a one byte character
			wideCount += 1;
			cursor    += 1;
			remaining -= 1;
		}
		else {
			// normal character
			wideCount += 1;
			cursor    += consumed;
			remaining -= consumed;
		}
	}
	ARCH->initMBState(state);

	// get a buffer for the wide characters
	wchar_t* wide = new wchar_t[wideCount];

	// pass 2:  convert multibyte to wide char
	remaining    = src.size();
	cursor       = src.c_str();
	wchar_t* out = wide;
	while (remaining > 0) {
		const int consumed = ARCH->convMBToWC(out, cursor, remaining, state);
		if (consumed == -2) {
			// incomplete character.  convert to unknown character.
			*out      = static_cast<wchar_t>(0xfffd);
			remaining = 0;
		}
		else if (consumed == -1) {
			// invalid character.  emit one unknown character and
			// resume at the next byte.
			*out       = static_cast<wchar_t>(0xfffd);
			cursor    += 1;
			remaining -= 1;
		}
		else if (consumed == 0) {
			*out       = static_cast<wchar_t>(0x0000);
			cursor    += 1;
			remaining -= 1;
		}
		else {
			// normal character;  convMBToWC() was given out as its
			// destination
			cursor    += consumed;
			remaining -= consumed;
		}
		++out;
	}

	// wide char to UTF-8
	CString utf8 = wideCharToUTF8(wide, wideCount, errors);

	// release temporaries
	delete[] wide;
	ARCH->closeMBState(state);

	return utf8;
}
|
|
|
|
|
|
|
|
wchar_t*
|
2002-07-23 19:26:40 +04:00
|
|
|
CUnicode::UTF8ToWideChar(const CString& src, UInt32& size, bool* errors)
|
2002-07-22 21:32:51 +04:00
|
|
|
{
|
2002-07-24 21:22:01 +04:00
|
|
|
// convert to platform's wide character encoding
|
2003-01-06 00:48:54 +03:00
|
|
|
CString tmp;
|
|
|
|
switch (ARCH->getWideCharEncoding()) {
|
|
|
|
case IArchString::kUCS2:
|
|
|
|
tmp = UTF8ToUCS2(src, errors);
|
|
|
|
size = tmp.size() >> 1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case IArchString::kUCS4:
|
|
|
|
tmp = UTF8ToUCS4(src, errors);
|
|
|
|
size = tmp.size() >> 2;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case IArchString::kUTF16:
|
|
|
|
tmp = UTF8ToUTF16(src, errors);
|
|
|
|
size = tmp.size() >> 1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case IArchString::kUTF32:
|
|
|
|
tmp = UTF8ToUTF32(src, errors);
|
|
|
|
size = tmp.size() >> 2;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
assert(0 && "unknown wide character encoding");
|
|
|
|
}
|
2002-07-22 21:32:51 +04:00
|
|
|
|
|
|
|
// copy to a wchar_t array
|
|
|
|
wchar_t* dst = new wchar_t[size];
|
2002-07-22 22:46:57 +04:00
|
|
|
::memcpy(dst, tmp.data(), sizeof(wchar_t) * size);
|
2002-07-22 21:32:51 +04:00
|
|
|
return dst;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Convert size wide characters at src to UTF-8, interpreting them in
// the encoding reported by ARCH->getWideCharEncoding().  Returns an
// empty string if the encoding isn't recognized.
CString
CUnicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors)
{
	// the converters all work on raw bytes
	const UInt8* bytes = reinterpret_cast<const UInt8*>(src);

	// convert from platform's wide character encoding.
	// note -- this must include a wide nul character (independent of
	// the CString's nul character).
	switch (ARCH->getWideCharEncoding()) {
	case IArchString::kUCS2:
		return doUCS2ToUTF8(bytes, size, errors);

	case IArchString::kUCS4:
		return doUCS4ToUTF8(bytes, size, errors);

	case IArchString::kUTF16:
		return doUTF16ToUTF8(bytes, size, errors);

	case IArchString::kUTF32:
		return doUTF32ToUTF8(bytes, size, errors);

	default:
		break;
	}

	assert(0 && "unknown wide character encoding");
	return CString();
}
|
|
|
|
|
|
|
|
// Convert n 16-bit characters stored at data (host byte order) to
// UTF-8.  Does not reset *errors;  toUTF8() sets it for any character
// it has to replace.
CString
CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
{
	// reserve a first guess at the output size
	CString utf8;
	utf8.reserve(n);

	// encode one character at a time
	while (n > 0) {
		const UInt32 code = decode16(data);
		toUTF8(utf8, code, errors);
		data += 2;
		--n;
	}

	return utf8;
}
|
|
|
|
|
|
|
|
// Convert n 32-bit characters stored at data (host byte order) to
// UTF-8.  Does not reset *errors;  toUTF8() sets it for any character
// it has to replace.
CString
CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
{
	// reserve a first guess at the output size
	CString utf8;
	utf8.reserve(n);

	// encode one character at a time
	while (n > 0) {
		const UInt32 code = decode32(data);
		toUTF8(utf8, code, errors);
		data += 4;
		--n;
	}

	return utf8;
}
|
|
|
|
|
|
|
|
// Convert n UTF-16 code units stored at data (host byte order) to
// UTF-8.  A high surrogate followed by a low surrogate is combined
// into one character.  A surrogate in the final position, a high
// surrogate not followed by a low surrogate, or a low surrogate with
// no preceding high surrogate is replaced with s_replacement and, if
// errors is not NULL, sets *errors to true.  Does not reset *errors.
CString
CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
{
	// make some space
	CString dst;
	dst.reserve(n);

	// convert each character
	for (; n > 0; data += 2, --n) {
		UInt32 c = decode16(data);
		if (c < 0x0000d800 || c > 0x0000dfff) {
			// not a surrogate;  encode directly
			toUTF8(dst, c, errors);
		}
		else if (n == 1) {
			// error -- missing second word
			setError(errors);
			toUTF8(dst, s_replacement, NULL);
		}
		else if (c >= 0x0000d800 && c <= 0x0000dbff) {
			// step to the following code unit before decoding it.
			// decoding before advancing would re-read the high
			// surrogate and reject every pair.  n >= 2 here so the
			// unit exists;  the loop increment then skips past it.
			data += 2;
			--n;
			UInt32 c2 = decode16(data);
			if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
				// error -- [d800,dbff] not followed by [dc00,dfff]
				setError(errors);
				toUTF8(dst, s_replacement, NULL);
			}
			else {
				// combine the two 10 bit halves
				c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
				toUTF8(dst, c, errors);
			}
		}
		else {
			// error -- [dc00,dfff] without leading [d800,dbff]
			setError(errors);
			toUTF8(dst, s_replacement, NULL);
		}
	}

	return dst;
}
|
|
|
|
|
|
|
|
// Convert n UTF-32 characters stored at data (host byte order) to
// UTF-8.  Characters at or above 0x110000 are replaced with
// s_replacement and, if errors is not NULL, set *errors to true.
// Does not reset *errors.
CString
CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
{
	// reserve a first guess at the output size
	CString utf8;
	utf8.reserve(n);

	// encode one character at a time
	while (n > 0) {
		UInt32 code = decode32(data);
		if (code >= 0x00110000) {
			setError(errors);
			code = s_replacement;
		}
		toUTF8(utf8, code, errors);
		data += 4;
		--n;
	}

	return utf8;
}
|
|
|
|
|
|
|
|
// Decode one UTF-8 encoded character.  data points at the first byte
// and n is the number of bytes available;  both are passed by
// reference and are updated to step over the bytes consumed, so a
// caller can loop until n reaches zero.  Returns the decoded
// character, or s_invalid if the bytes are malformed, cut short,
// overlong, or encode a surrogate or 0xfffe/0xffff.  data must not be
// NULL and n must not be zero.
UInt32
CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
{
	assert(data != NULL);
	assert(n != 0);

	// payload bits of the lead byte, indexed by sequence length
	static const UInt8 s_leadMask[] = {
		0x00,
		0x7f,
		0x1f,
		0x0f,
		0x07,
		0x03,
		0x01
	};

	// smallest character that needs each sequence length, indexed by
	// sequence length.  anything smaller is an overlong encoding.
	static const UInt32 s_minChar[] = {
		0,
		0x00000000,
		0x00000080,
		0x00000800,
		0x00010000,
		0x00200000,
		0x04000000
	};

	// compute character encoding length from the lead byte
	const UInt8 lead = data[0];
	UInt32 size;
	if (lead < 0x80) {
		// 0xxxxxxx
		size = 1;
	}
	else if (lead < 0xc0) {
		// 10xxxxxx -- in the middle of a multibyte character.  counts
		// as one invalid character.
		--n;
		++data;
		return s_invalid;
	}
	else if (lead < 0xe0) {
		// 110xxxxx
		size = 2;
	}
	else if (lead < 0xf0) {
		// 1110xxxx
		size = 3;
	}
	else if (lead < 0xf8) {
		// 11110xxx
		size = 4;
	}
	else if (lead < 0xfc) {
		// 111110xx
		size = 5;
	}
	else if (lead < 0xfe) {
		// 1111110x
		size = 6;
	}
	else {
		// invalid sequence.  dunno how many bytes to skip so skip one.
		--n;
		++data;
		return s_invalid;
	}

	// make sure we have enough data.  if not then the rest of the
	// input is consumed as one invalid character.
	if (size > n) {
		data += n;
		n     = 0;
		return s_invalid;
	}

	// extract the character, taking six payload bits from each
	// successive continuation byte (data[1], data[2], ...).  every
	// byte after the first must have the pattern 10xxxxxx;  if one
	// doesn't then the sequence is truncated and only the bytes before
	// the offending one are consumed, so that byte is examined again
	// as the start of the next character.
	UInt32 c       = static_cast<UInt32>(lead) & s_leadMask[size];
	bool truncated = false;
	for (UInt32 i = 1; i < size; ++i) {
		if ((data[i] & 0xc0) != 0x80) {
			truncated = true;
			size      = i;
			break;
		}
		c = (c << 6) | (static_cast<UInt32>(data[i]) & 0x3f);
	}

	// update parameters
	data += size;
	n    -= size;

	// invalid if sequence was truncated
	if (truncated) {
		return s_invalid;
	}

	// check for characters that didn't use the smallest possible encoding
	if (c < s_minChar[size]) {
		return s_invalid;
	}

	// check for characters not in ISO-10646
	if (c >= 0x0000d800 && c <= 0x0000dfff) {
		return s_invalid;
	}
	if (c >= 0x0000fffe && c <= 0x0000ffff) {
		return s_invalid;
	}

	return c;
}
|
|
|
|
|
|
|
|
void
|
2002-07-23 15:36:18 +04:00
|
|
|
CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
|
2002-07-22 21:32:51 +04:00
|
|
|
{
|
|
|
|
UInt8 data[6];
|
|
|
|
|
2002-07-23 13:33:50 +04:00
|
|
|
// handle characters outside the valid range
|
2002-07-23 15:36:18 +04:00
|
|
|
if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
|
|
|
|
setError(errors);
|
2002-07-23 13:33:50 +04:00
|
|
|
c = s_replacement;
|
|
|
|
}
|
|
|
|
|
|
|
|
// convert to UTF-8
|
2002-07-22 21:32:51 +04:00
|
|
|
if (c < 0x00000080) {
|
|
|
|
data[0] = static_cast<UInt8>(c);
|
|
|
|
dst.append(reinterpret_cast<char*>(data), 1);
|
|
|
|
}
|
|
|
|
else if (c < 0x00000800) {
|
2002-07-23 19:26:40 +04:00
|
|
|
data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0);
|
|
|
|
data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
2002-07-22 21:32:51 +04:00
|
|
|
dst.append(reinterpret_cast<char*>(data), 2);
|
|
|
|
}
|
|
|
|
else if (c < 0x00010000) {
|
2002-07-23 19:26:40 +04:00
|
|
|
data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0);
|
|
|
|
data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
|
|
|
data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
2002-07-22 21:32:51 +04:00
|
|
|
dst.append(reinterpret_cast<char*>(data), 3);
|
|
|
|
}
|
|
|
|
else if (c < 0x00200000) {
|
2002-07-23 19:26:40 +04:00
|
|
|
data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0);
|
|
|
|
data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
|
|
|
data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
|
|
|
data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
2002-07-22 21:32:51 +04:00
|
|
|
dst.append(reinterpret_cast<char*>(data), 4);
|
|
|
|
}
|
|
|
|
else if (c < 0x04000000) {
|
2002-07-23 19:26:40 +04:00
|
|
|
data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8);
|
|
|
|
data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
|
|
|
|
data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
|
|
|
data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
|
|
|
data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
2002-07-22 21:32:51 +04:00
|
|
|
dst.append(reinterpret_cast<char*>(data), 5);
|
|
|
|
}
|
|
|
|
else if (c < 0x80000000) {
|
2002-07-23 19:26:40 +04:00
|
|
|
data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc);
|
|
|
|
data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80);
|
|
|
|
data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
|
|
|
|
data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
|
|
|
data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
|
|
|
data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
2002-07-22 21:32:51 +04:00
|
|
|
dst.append(reinterpret_cast<char*>(data), 6);
|
|
|
|
}
|
|
|
|
else {
|
2002-07-23 13:33:50 +04:00
|
|
|
assert(0 && "character out of range");
|
2002-07-22 21:32:51 +04:00
|
|
|
}
|
|
|
|
}
|