/* * Copyright 2016 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include <stdexcept> #include <iterator> #ifndef FOLLY_STRING_H_ #error This file may only be included from String.h #endif namespace folly { namespace detail { // Map from character code to value of one-character escape sequence // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as // an octal escape sequence, or 'P' if the character is printable and // should be printed as is. extern const char cEscapeTable[]; } // namespace detail template <class String> void cEscape(StringPiece str, String& out) { char esc[4]; esc[0] = '\\'; out.reserve(out.size() + str.size()); auto p = str.begin(); auto last = p; // last regular character // We advance over runs of regular characters (printable, not double-quote or // backslash) and copy them in one go; this is faster than calling push_back // repeatedly. while (p != str.end()) { char c = *p; unsigned char v = static_cast<unsigned char>(c); char e = detail::cEscapeTable[v]; if (e == 'P') { // printable ++p; } else if (e == 'O') { // octal out.append(&*last, p - last); esc[1] = '0' + ((v >> 6) & 7); esc[2] = '0' + ((v >> 3) & 7); esc[3] = '0' + (v & 7); out.append(esc, 4); ++p; last = p; } else { // special 1-character escape out.append(&*last, p - last); esc[1] = e; out.append(esc, 2); ++p; last = p; } } out.append(&*last, p - last); } namespace detail { // Map from the character code of the character following a backslash to // the unescaped character if a valid one-character escape sequence // ('n' maps to 10 = '\n'), 'O' if this is the first character of an // octal escape sequence, 'X' if this is the first character of a // hexadecimal escape sequence, or 'I' if this escape sequence is invalid. extern const char cUnescapeTable[]; // Map from the character code to the hex value, or 16 if invalid hex char. extern const unsigned char hexTable[]; } // namespace detail template <class String> void cUnescape(StringPiece str, String& out, bool strict) { out.reserve(out.size() + str.size()); auto p = str.begin(); auto last = p; // last regular character (not part of an escape sequence) // We advance over runs of regular characters (not backslash) and copy them // in one go; this is faster than calling push_back repeatedly. while (p != str.end()) { char c = *p; if (c != '\\') { // normal case ++p; continue; } out.append(&*last, p - last); if (p == str.end()) { // backslash at end of string if (strict) { throw std::invalid_argument("incomplete escape sequence"); } out.push_back('\\'); last = p; continue; } ++p; char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)]; if (e == 'O') { // octal unsigned char val = 0; for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7'; ++i, ++p) { val = (val << 3) | (*p - '0'); } out.push_back(val); last = p; } else if (e == 'X') { // hex ++p; if (p == str.end()) { // \x at end of string if (strict) { throw std::invalid_argument("incomplete hex escape sequence"); } out.append("\\x"); last = p; continue; } unsigned char val = 0; unsigned char h; for (; (p != str.end() && (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16); ++p) { val = (val << 4) | h; } out.push_back(val); last = p; } else if (e == 'I') { // invalid if (strict) { throw std::invalid_argument("invalid escape sequence"); } out.push_back('\\'); out.push_back(*p); ++p; last = p; } else { // standard escape sequence, \' etc out.push_back(e); ++p; last = p; } } out.append(&*last, p - last); } namespace detail { // Map from character code to escape mode: // 0 = pass through // 1 = unused // 2 = pass through in PATH mode // 3 = space, replace with '+' in QUERY mode // 4 = percent-encode extern const unsigned char uriEscapeTable[]; } // namespace detail template <class String> void uriEscape(StringPiece str, String& out, UriEscapeMode mode) { static const char hexValues[] = "0123456789abcdef"; char esc[3]; esc[0] = '%'; // Preallocate assuming that 25% of the input string will be escaped out.reserve(out.size() + str.size() + 3 * (str.size() / 4)); auto p = str.begin(); auto last = p; // last regular character // We advance over runs of passthrough characters and copy them in one go; // this is faster than calling push_back repeatedly. unsigned char minEncode = static_cast<unsigned char>(mode); while (p != str.end()) { char c = *p; unsigned char v = static_cast<unsigned char>(c); unsigned char discriminator = detail::uriEscapeTable[v]; if (LIKELY(discriminator <= minEncode)) { ++p; } else if (mode == UriEscapeMode::QUERY && discriminator == 3) { out.append(&*last, p - last); out.push_back('+'); ++p; last = p; } else { out.append(&*last, p - last); esc[1] = hexValues[v >> 4]; esc[2] = hexValues[v & 0x0f]; out.append(esc, 3); ++p; last = p; } } out.append(&*last, p - last); } template <class String> void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) { out.reserve(out.size() + str.size()); auto p = str.begin(); auto last = p; // We advance over runs of passthrough characters and copy them in one go; // this is faster than calling push_back repeatedly. while (p != str.end()) { char c = *p; switch (c) { case '%': { if (UNLIKELY(std::distance(p, str.end()) < 3)) { throw std::invalid_argument("incomplete percent encode sequence"); } auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])]; auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])]; if (UNLIKELY(h1 == 16 || h2 == 16)) { throw std::invalid_argument("invalid percent encode sequence"); } out.append(&*last, p - last); out.push_back((h1 << 4) | h2); p += 3; last = p; break; } case '+': if (mode == UriEscapeMode::QUERY) { out.append(&*last, p - last); out.push_back(' '); ++p; last = p; break; } // else fallthrough default: ++p; break; } } out.append(&*last, p - last); } namespace detail { /* * The following functions are type-overloaded helpers for * internalSplit(). */ inline size_t delimSize(char) { return 1; } inline size_t delimSize(StringPiece s) { return s.size(); } inline bool atDelim(const char* s, char c) { return *s == c; } inline bool atDelim(const char* s, StringPiece sp) { return !std::memcmp(s, sp.start(), sp.size()); } // These are used to short-circuit internalSplit() in the case of // 1-character strings. inline char delimFront(char c) { // This one exists only for compile-time; it should never be called. std::abort(); return c; } inline char delimFront(StringPiece s) { assert(!s.empty() && s.start() != nullptr); return *s.start(); } /* * Shared implementation for all the split() overloads. * * This uses some external helpers that are overloaded to let this * algorithm be more performant if the deliminator is a single * character instead of a whole string. * * @param ignoreEmpty iff true, don't copy empty segments to output */ template<class OutStringT, class DelimT, class OutputIterator> void internalSplit(DelimT delim, StringPiece sp, OutputIterator out, bool ignoreEmpty) { assert(sp.empty() || sp.start() != nullptr); const char* s = sp.start(); const size_t strSize = sp.size(); const size_t dSize = delimSize(delim); if (dSize > strSize || dSize == 0) { if (!ignoreEmpty || strSize > 0) { *out++ = to<OutStringT>(sp); } return; } if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) { // Call the char version because it is significantly faster. return internalSplit<OutStringT>(delimFront(delim), sp, out, ignoreEmpty); } size_t tokenStartPos = 0; size_t tokenSize = 0; for (size_t i = 0; i <= strSize - dSize; ++i) { if (atDelim(&s[i], delim)) { if (!ignoreEmpty || tokenSize > 0) { *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize)); } tokenStartPos = i + dSize; tokenSize = 0; i += dSize - 1; } else { ++tokenSize; } } tokenSize = strSize - tokenStartPos; if (!ignoreEmpty || tokenSize > 0) { *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize)); } } template<class String> StringPiece prepareDelim(const String& s) { return StringPiece(s); } inline char prepareDelim(char c) { return c; } template <bool exact, class Delim, class OutputType> bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) { static_assert( exact || std::is_same<OutputType, StringPiece>::value || IsSomeString<OutputType>::value, "split<false>() requires that the last argument be a string type"); if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) { return false; } output = folly::to<OutputType>(input); return true; } template <bool exact, class Delim, class OutputType, class... OutputTypes> bool splitFixed( const Delim& delimiter, StringPiece input, OutputType& outHead, OutputTypes&... outTail) { size_t cut = input.find(delimiter); if (UNLIKELY(cut == std::string::npos)) { return false; } StringPiece head(input.begin(), input.begin() + cut); StringPiece tail(input.begin() + cut + detail::delimSize(delimiter), input.end()); if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) { outHead = folly::to<OutputType>(head); return true; } return false; } } ////////////////////////////////////////////////////////////////////// template<class Delim, class String, class OutputType> void split(const Delim& delimiter, const String& input, std::vector<OutputType>& out, bool ignoreEmpty) { detail::internalSplit<OutputType>( detail::prepareDelim(delimiter), StringPiece(input), std::back_inserter(out), ignoreEmpty); } template<class Delim, class String, class OutputType> void split(const Delim& delimiter, const String& input, fbvector<OutputType>& out, bool ignoreEmpty) { detail::internalSplit<OutputType>( detail::prepareDelim(delimiter), StringPiece(input), std::back_inserter(out), ignoreEmpty); } template<class OutputValueType, class Delim, class String, class OutputIterator> void splitTo(const Delim& delimiter, const String& input, OutputIterator out, bool ignoreEmpty) { detail::internalSplit<OutputValueType>( detail::prepareDelim(delimiter), StringPiece(input), out, ignoreEmpty); } template <bool exact, class Delim, class... OutputTypes> typename std::enable_if< AllConvertible<OutputTypes...>::value && sizeof...(OutputTypes) >= 1, bool>::type split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) { return detail::splitFixed<exact>( detail::prepareDelim(delimiter), input, outputs...); } namespace detail { /* * If a type can have its string size determined cheaply, we can more * efficiently append it in a loop (see internalJoinAppend). Note that the * struct need not conform to the std::string api completely (ex. does not need * to implement append()). */ template <class T> struct IsSizableString { enum { value = IsSomeString<T>::value || std::is_same<T, StringPiece>::value }; }; template <class Iterator> struct IsSizableStringContainerIterator : IsSizableString<typename std::iterator_traits<Iterator>::value_type> { }; template <class Delim, class Iterator, class String> void internalJoinAppend(Delim delimiter, Iterator begin, Iterator end, String& output) { assert(begin != end); if (std::is_same<Delim, StringPiece>::value && delimSize(delimiter) == 1) { internalJoinAppend(delimFront(delimiter), begin, end, output); return; } toAppend(*begin, &output); while (++begin != end) { toAppend(delimiter, *begin, &output); } } template <class Delim, class Iterator, class String> typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) { output.clear(); if (begin == end) { return; } const size_t dsize = delimSize(delimiter); Iterator it = begin; size_t size = it->size(); while (++it != end) { size += dsize + it->size(); } output.reserve(size); internalJoinAppend(delimiter, begin, end, output); } template <class Delim, class Iterator, class String> typename std::enable_if<!IsSizableStringContainerIterator<Iterator>::value>::type internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) { output.clear(); if (begin == end) { return; } internalJoinAppend(delimiter, begin, end, output); } } // namespace detail template <class Delim, class Iterator, class String> void join(const Delim& delimiter, Iterator begin, Iterator end, String& output) { detail::internalJoin( detail::prepareDelim(delimiter), begin, end, output); } template <class String1, class String2> void backslashify(const String1& input, String2& output, bool hex_style) { static const char hexValues[] = "0123456789abcdef"; output.clear(); output.reserve(3 * input.size()); for (unsigned char c : input) { // less than space or greater than '~' are considered unprintable if (c < 0x20 || c > 0x7e || c == '\\') { bool hex_append = false; output.push_back('\\'); if (hex_style) { hex_append = true; } else { if (c == '\r') output += 'r'; else if (c == '\n') output += 'n'; else if (c == '\t') output += 't'; else if (c == '\a') output += 'a'; else if (c == '\b') output += 'b'; else if (c == '\0') output += '0'; else if (c == '\\') output += '\\'; else { hex_append = true; } } if (hex_append) { output.push_back('x'); output.push_back(hexValues[(c >> 4) & 0xf]); output.push_back(hexValues[c & 0xf]); } } else { output += c; } } } template <class String1, class String2> void humanify(const String1& input, String2& output) { size_t numUnprintable = 0; size_t numPrintablePrefix = 0; for (unsigned char c : input) { if (c < 0x20 || c > 0x7e || c == '\\') { ++numUnprintable; } if (numUnprintable == 0) { ++numPrintablePrefix; } } // hexlify doubles a string's size; backslashify can potentially // explode it by 4x. Now, the printable range of the ascii // "spectrum" is around 95 out of 256 values, so a "random" binary // string should be around 60% unprintable. We use a 50% hueristic // here, so if a string is 60% unprintable, then we just use hex // output. Otherwise we backslash. // // UTF8 is completely ignored; as a result, utf8 characters will // likely be \x escaped (since most common glyphs fit in two bytes). // This is a tradeoff of complexity/speed instead of a convenience // that likely would rarely matter. Moreover, this function is more // about displaying underlying bytes, not about displaying glyphs // from languages. if (numUnprintable == 0) { output = input; } else if (5 * numUnprintable >= 3 * input.size()) { // However! If we have a "meaningful" prefix of printable // characters, say 20% of the string, we backslashify under the // assumption viewing the prefix as ascii is worth blowing the // output size up a bit. if (5 * numPrintablePrefix >= input.size()) { backslashify(input, output); } else { output = "0x"; hexlify(input, output, true /* append output */); } } else { backslashify(input, output); } } template<class InputString, class OutputString> bool hexlify(const InputString& input, OutputString& output, bool append_output) { if (!append_output) output.clear(); static char hexValues[] = "0123456789abcdef"; auto j = output.size(); output.resize(2 * input.size() + output.size()); for (size_t i = 0; i < input.size(); ++i) { int ch = input[i]; output[j++] = hexValues[(ch >> 4) & 0xf]; output[j++] = hexValues[ch & 0xf]; } return true; } template<class InputString, class OutputString> bool unhexlify(const InputString& input, OutputString& output) { if (input.size() % 2 != 0) { return false; } output.resize(input.size() / 2); int j = 0; auto unhex = [](char c) -> int { return c >= '0' && c <= '9' ? c - '0' : c >= 'A' && c <= 'F' ? c - 'A' + 10 : c >= 'a' && c <= 'f' ? c - 'a' + 10 : -1; }; for (size_t i = 0; i < input.size(); i += 2) { int highBits = unhex(input[i]); int lowBits = unhex(input[i + 1]); if (highBits < 0 || lowBits < 0) { return false; } output[j++] = (highBits << 4) + lowBits; } return true; } namespace detail { /** * Hex-dump at most 16 bytes starting at offset from a memory area of size * bytes. Return the number of bytes actually dumped. */ size_t hexDumpLine(const void* ptr, size_t offset, size_t size, std::string& line); } // namespace detail template <class OutIt> void hexDump(const void* ptr, size_t size, OutIt out) { size_t offset = 0; std::string line; while (offset < size) { offset += detail::hexDumpLine(ptr, offset, size, line); *out++ = line; } } } // namespace folly