sapling/eden/fs/utils/Utf8.h
Xavier Deguillard a3115dacd8 utils: add a constexpr utf8 checker
Summary:
In a future diff, paths will be validated to make sure they are valid utf8. The
path sanity checker needs to be constexpr to construct global paths, but the
utf8 functions aren't, so let's write one that is.

Reviewed By: chadaustin

Differential Revision: D25562681

fbshipit-source-id: e48ec835c2cc9dc01090918cc7ee8f61b6c05a20
2020-12-16 01:03:32 -08:00

156 lines
3.8 KiB
C++

/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This software may be used and distributed according to the terms of the
* GNU General Public License version 2.
*/
#pragma once
#include <folly/Range.h>
#include <folly/Utility.h>
namespace facebook {
namespace eden {
namespace detail {
/**
* Test if the most significant bit is set.
*/
constexpr bool isBitSet(char c, size_t bit) {
return (folly::to_unsigned(c) & (1u << bit)) == (1u << bit);
}
/**
* Test if the character is a valid UTF-8 continuation byte.
*
* Continuation bytes are of the form 10xxxxxx
*/
constexpr bool isValidContinuation(char c) {
return isBitSet(c, 7) && !isBitSet(c, 6);
}
constexpr unsigned char clearContinuationBit(char c) {
return folly::to_unsigned(c) & ~(1u << 7);
}
/**
* Test if the next num characters are continuation bytes.
*
* Set codepoint with the unicode codepoint.
*/
constexpr bool isValidContinuation(
const char*& begin,
const char* const end,
size_t num,
uint32_t& codepoint) {
if (begin + num - 1 > end) {
return false;
}
for (size_t i = 0; i < num; i++) {
auto c = *begin++;
if (!isValidContinuation(c)) {
return false;
}
codepoint = (codepoint << 6) | clearContinuationBit(c);
}
return true;
}
} // namespace detail
/**
* Returns whether the given string is correctly-encoded UTF-8.
*
* This doesn't verify whether the codepoints are actually valid unicode
* characters.
*/
constexpr bool isValidUtf8(folly::StringPiece str) {
const char* begin = str.begin();
const char* const end = str.end();
while (begin != end) {
char first = *begin++;
if (!detail::isBitSet(first, 7)) {
// ASCII character, nothing to do.
} else if (!detail::isBitSet(first, 6)) {
// 10xxxxxx isn't a valid for the first byte.
return false;
} else if (!detail::isBitSet(first, 5)) {
// 110xxxxx: 2 bytes
uint32_t codepoint = folly::to_unsigned(first) & 0x1F;
if (!detail::isValidContinuation(begin, end, 1, codepoint)) {
return false;
}
// Is this an overlong encoding?
if (codepoint < 0x80) {
return false;
}
} else if (!detail::isBitSet(first, 4)) {
// 1110xxxx: 3 bytes
uint32_t codepoint = folly::to_unsigned(first) & 0xF;
if (!detail::isValidContinuation(begin, end, 2, codepoint)) {
return false;
}
// Is this an overlong encoding?
if (codepoint < 0x800) {
return false;
}
} else if (!detail::isBitSet(first, 3)) {
// 11110xxx: 4 bytes
uint32_t codepoint = folly::to_unsigned(first) & 0x7;
if (!detail::isValidContinuation(begin, end, 3, codepoint)) {
return false;
}
// Is this an overlong encoding?
if (codepoint < 0x10000) {
return false;
}
} else {
// 11111xxx isn't ever valid.
return false;
}
}
return true;
}
std::string ensureValidUtf8(folly::ByteRange str);
/**
* Returns a valid UTF-8 encoding of str, with all invalid code points replaced
* with FFFD, the Unicode replacement character.
*/
inline std::string ensureValidUtf8(folly::StringPiece str) {
return ensureValidUtf8(folly::ByteRange{str});
}
/**
* Returns a valid UTF-8 encoding of str, with all invalid code points replaced
* with FFFD, the Unicode replacement character.
*
* This overload avoids a copy in the common case that the given std::string is
* already valid UTF-8.
*/
inline std::string ensureValidUtf8(std::string&& str) {
// Avoid a copy in the common case by checking for validity before attempting
// to re-encode.
if (isValidUtf8(str)) {
return std::move(str);
} else {
return ensureValidUtf8(str);
}
}
template <size_t N>
inline std::string ensureValidUtf8(const char (&str)[N]) {
return ensureValidUtf8(std::string{str, N - 1});
}
} // namespace eden
} // namespace facebook