third_party/protobuf/src/google/protobuf/stubs/strutil.cc - Issue 1291903002: Pull new version of protobuf sources.

Unified Diff: third_party/protobuf/src/google/protobuf/stubs/strutil.cc

Issue 1291903002: Pull new version of protobuf sources. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Build fix attempts Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/protobuf/src/google/protobuf/stubs/strutil.h ('k') | third_party/protobuf/src/google/protobuf/stubs/strutil_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/protobuf/src/google/protobuf/stubs/strutil.cc

diff --git a/third_party/protobuf/src/google/protobuf/stubs/strutil.cc b/third_party/protobuf/src/google/protobuf/stubs/strutil.cc

index 00d1bc633d58d0952afa5a984062bae5f5873d64..8442f2cecbf4c552e43bfe36a104147269ae384e 100644

--- a/third_party/protobuf/src/google/protobuf/stubs/strutil.cc

+++ b/third_party/protobuf/src/google/protobuf/stubs/strutil.cc

@@ -1,6 +1,6 @@

// Protocol Buffers - Google's data interchange format

-// http://code.google.com/p/protobuf/

+// https://developers.google.com/protocol-buffers/

// Redistribution and use in source and binary forms, with or without

// modification, are permitted provided that the following conditions are

@@ -31,6 +31,8 @@

// from google3/strings/strutil.cc

#include <google/protobuf/stubs/strutil.h>

+#include <google/protobuf/stubs/mathlimits.h>

#include <errno.h>

#include <float.h> // FLT_DIG and DBL_DIG

#include <limits>

@@ -38,6 +40,8 @@

#include <stdio.h>

#include <iterator>

+#include <google/protobuf/stubs/stl_util.h>

#ifdef _WIN32

// MSVC has only _snprintf, not snprintf.

@@ -55,11 +59,6 @@

namespace google {

namespace protobuf {

-inline bool IsNaN(double value) {

- // NaN is never equal to anything, even itself.

- return value != value;

// These are defined as macros on some platforms. #undef them so that we can

// redefine them.

#undef isxdigit

@@ -94,6 +93,34 @@ void StripString(string* s, const char* remove, char replacewith) {

}

+void StripWhitespace(string* str) {  // Trims leading and trailing ascii_isspace() characters from *str in place.

+ int str_length = str->length();  // NOTE(review): the length is narrowed to int; all index math below is signed.

+ // Strip off leading whitespace: advance `first` to the first character that is not whitespace.

+ int first = 0;

+ while (first < str_length && ascii_isspace(str->at(first))) {

+ ++first;

+ }

+ // If the entire string is whitespace (this also covers an empty string), clear it and stop.

+ if (first == str_length) {

+ str->clear();

+ return;

+ }

+ if (first > 0) {

+ str->erase(0, first);

+ str_length -= first;  // Keep the cached length in step with the erase above.

+ }

+ // Strip off trailing whitespace: walk `last` back to the last character that is not whitespace.

+ int last = str_length - 1;

+ while (last >= 0 && ascii_isspace(str->at(last))) {

+ --last;

+ }

+ if (last != (str_length - 1) && last >= 0) {  // Erase only when at least one trailing character was skipped.

+ str->erase(last + 1, string::npos);

+ }

// ----------------------------------------------------------------------

// StringReplace()

// Replace the "old" pattern with the "new" pattern in a string,

@@ -281,17 +308,6 @@ void JoinStrings(const vector<string>& components,

#define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))

-inline int hex_digit_to_int(char c) {

- /* Assume ASCII. */

- assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61);

- assert(isxdigit(c));

- int x = static_cast<unsigned char>(c);

- if (x > '9') {

- x += 9;

- }

- return x & 0xf;

// Protocol buffers doesn't ever care about errors, but I don't want to remove

// the code.

#define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)

@@ -596,6 +612,133 @@ uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {

return static_cast<uint32>(result);

}

+inline bool safe_parse_sign(string* text /*inout*/,  // Trims spaces and an optional sign from *text; returns false if nothing is left to parse.

+ bool* negative_ptr /*output*/) {  // Receives true only when the first non-space character is '-'.

+ const char* start = text->data();

+ const char* end = start + text->size();

+ // Consume whitespace. Only the ' ' character is skipped here, at both ends.

+ while (start < end && (start[0] == ' ')) {

+ ++start;

+ }

+ while (start < end && (end[-1] == ' ')) {

+ --end;

+ }

+ if (start >= end) {  // Empty or all-space input: fail without writing *negative_ptr.

+ return false;

+ }

+ // Consume sign. A leading '-' or '+' is accepted; at least one character must follow it.

+ *negative_ptr = (start[0] == '-');

+ if (*negative_ptr || start[0] == '+') {

+ ++start;

+ if (start >= end) {

+ return false;

+ }

+ *text = text->substr(start - text->data(), end - start);  // Replace *text with the trimmed remainder after the sign.

+ return true;

+template<typename IntType>  // Parses the base-10 digits in `text` into *value_p without overflowing IntType.

+bool safe_parse_positive_int(

+ string text, IntType* value_p) {  // NOTE(review): `text` is taken by value here, while safe_parse_negative_int takes const string&.

+ int base = 10;

+ IntType value = 0;

+ const IntType vmax = std::numeric_limits<IntType>::max();

+ assert(vmax > 0);

+ assert(vmax >= base);

+ const IntType vmax_over_base = vmax / base;  // Largest accumulator that can still be multiplied by base.

+ const char* start = text.data();

+ const char* end = start + text.size();

+ // loop over digits; an empty `text` skips the loop and yields 0 with a true return.

+ for (; start < end; ++start) {

+ unsigned char c = static_cast<unsigned char>(start[0]);

+ int digit = c - '0';

+ if (digit >= base || digit < 0) {  // Not a decimal digit: report the value parsed so far and fail.

+ *value_p = value;

+ return false;

+ }

+ if (value > vmax_over_base) {  // value * base would overflow: store vmax and fail.

+ *value_p = vmax;

+ return false;

+ }

+ value *= base;

+ if (value > vmax - digit) {  // value + digit would overflow: store vmax and fail.

+ *value_p = vmax;

+ return false;

+ }

+ value += digit;

+ }

+ *value_p = value;

+ return true;

+template<typename IntType>  // Parses the base-10 digits in `text` as a negative number into *value_p without underflowing IntType.

+bool safe_parse_negative_int(

+ const string& text, IntType* value_p) {  // `text` holds digits only; the sign was already consumed by the caller.

+ int base = 10;

+ IntType value = 0;

+ const IntType vmin = std::numeric_limits<IntType>::min();

+ assert(vmin < 0);

+ assert(vmin <= 0 - base);

+ IntType vmin_over_base = vmin / base;  // Smallest accumulator that can still be multiplied by base (adjusted below).

+ // 2003 c++ standard [expr.mul]

+ // "... the sign of the remainder is implementation-defined."

+ // Although (vmin/base)*base + vmin%base is always vmin.

+ // 2011 c++ standard tightens the spec but we cannot rely on it.

+ if (vmin % base > 0) {

+ vmin_over_base += 1;

+ }

+ const char* start = text.data();

+ const char* end = start + text.size();

+ // loop over digits, accumulating downward from 0 by subtracting each digit.

+ for (; start < end; ++start) {

+ unsigned char c = static_cast<unsigned char>(start[0]);

+ int digit = c - '0';

+ if (digit >= base || digit < 0) {  // Not a decimal digit: report the value parsed so far and fail.

+ *value_p = value;

+ return false;

+ }

+ if (value < vmin_over_base) {  // value * base would underflow: store vmin and fail.

+ *value_p = vmin;

+ return false;

+ }

+ value *= base;

+ if (value < vmin + digit) {  // value - digit would underflow: store vmin and fail.

+ *value_p = vmin;

+ return false;

+ }

+ value -= digit;

+ }

+ *value_p = value;

+ return true;

+template<typename IntType>  // Parses an optionally signed decimal integer from `text` into *value_p.

+bool safe_int_internal(string text, IntType* value_p) {  // `text` is a private copy because safe_parse_sign rewrites it.

+ *value_p = 0;  // Output is 0 if sign parsing fails below.

+ bool negative;

+ if (!safe_parse_sign(&text, &negative)) {

+ return false;

+ }

+ if (!negative) {  // Dispatch on the sign so each helper only has to guard one limit.

+ return safe_parse_positive_int(text, value_p);

+ } else {

+ return safe_parse_negative_int(text, value_p);

+ }

+template<typename IntType>  // Parses a non-negative decimal integer from `text` into *value_p.

+bool safe_uint_internal(string text, IntType* value_p) {  // `text` is a private copy because safe_parse_sign rewrites it.

+ *value_p = 0;  // Output is 0 if the sign check below rejects the input.

+ bool negative;

+ if (!safe_parse_sign(&text, &negative) || negative) {  // A leading '-' is rejected outright; a leading '+' is allowed.

+ return false;

+ }

+ return safe_parse_positive_int(text, value_p);

// ----------------------------------------------------------------------

// FastIntToBuffer()

// FastInt64ToBuffer()

@@ -1056,7 +1199,7 @@ char* DoubleToBuffer(double value, char* buffer) {

} else if (value == -numeric_limits<double>::infinity()) {

strcpy(buffer, "-inf");

return buffer;

- } else if (IsNaN(value)) {

+ } else if (MathLimits<double>::IsNaN(value)) {

strcpy(buffer, "nan");

return buffer;

}

@@ -1087,6 +1230,41 @@ char* DoubleToBuffer(double value, char* buffer) {

return buffer;

}

+static int memcasecmp(const char *s1, const char *s2, size_t len) {  // Compares the first len bytes of s1 and s2 after ascii_tolower().

+ const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);

+ const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);

+ for ( int i = 0; i < len; i++ ) {  // NOTE(review): signed int index compared against size_t len.

+ const int diff =

+ static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -

+ static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));

+ if (diff != 0) return diff;  // First mismatch decides: negative if s1 sorts first, positive otherwise.

+ }

+ return 0;  // All len bytes matched after lowercasing.

+inline bool CaseEqual(StringPiece s1, StringPiece s2) {  // True when s1 and s2 have the same length and match under memcasecmp().

+ if (s1.size() != s2.size()) return false;  // Different lengths can never be equal; skip the byte comparison.

+ return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;

+bool safe_strtob(StringPiece str, bool* value) {  // Parses a boolean word from str into *value; returns false if str is not recognized.

+ GOOGLE_CHECK(value != NULL) << "NULL output boolean given.";

+ if (CaseEqual(str, "true") || CaseEqual(str, "t") ||  // Accepted spellings of true, compared via CaseEqual().

+ CaseEqual(str, "yes") || CaseEqual(str, "y") ||

+ CaseEqual(str, "1")) {

+ *value = true;

+ return true;

+ }

+ if (CaseEqual(str, "false") || CaseEqual(str, "f") ||  // Accepted spellings of false, compared via CaseEqual().

+ CaseEqual(str, "no") || CaseEqual(str, "n") ||

+ CaseEqual(str, "0")) {

+ *value = false;

+ return true;

+ }

+ return false;  // Unrecognized input: *value is left unmodified.

bool safe_strtof(const char* str, float* value) {

char* endptr;

errno = 0; // errno only gets set on errors

@@ -1098,6 +1276,34 @@ bool safe_strtof(const char* str, float* value) {

return *str != 0 && *endptr == 0 && errno == 0;

}

+bool safe_strtod(const char* str, double* value) {  // Parses str with strtod() into *value; true only if str is non-empty and nothing but whitespace follows the number.

+ char* endptr;

+ *value = strtod(str, &endptr);  // *value is written even when the function ends up returning false.

+ if (endptr != str) {  // Something was parsed: allow trailing whitespace after the number.

+ while (ascii_isspace(*endptr)) ++endptr;

+ }

+ // Ignore range errors from strtod. The values it

+ // returns on underflow and overflow are the right

+ // fallback in a robust setting.

+ return *str != '\0' && *endptr == '\0';

+bool safe_strto32(const string& str, int32* value) {  // Parses a signed decimal int32 from str; forwards to safe_int_internal().

+ return safe_int_internal(str, value);

+bool safe_strtou32(const string& str, uint32* value) {  // Parses a non-negative decimal uint32 from str; forwards to safe_uint_internal().

+ return safe_uint_internal(str, value);

+bool safe_strto64(const string& str, int64* value) {  // Parses a signed decimal int64 from str; forwards to safe_int_internal().

+ return safe_int_internal(str, value);

+bool safe_strtou64(const string& str, uint64* value) {  // Parses a non-negative decimal uint64 from str; forwards to safe_uint_internal().

+ return safe_uint_internal(str, value);

char* FloatToBuffer(float value, char* buffer) {

// FLT_DIG is 6 for IEEE-754 floats, which are used on almost all

// platforms these days. Just in case some system exists where FLT_DIG

@@ -1111,7 +1317,7 @@ char* FloatToBuffer(float value, char* buffer) {

} else if (value == -numeric_limits<double>::infinity()) {

strcpy(buffer, "-inf");

return buffer;

- } else if (IsNaN(value)) {

+ } else if (MathLimits<float>::IsNaN(value)) {

strcpy(buffer, "nan");

return buffer;

}

@@ -1136,68 +1342,893 @@ char* FloatToBuffer(float value, char* buffer) {

return buffer;

}

+namespace strings {

+AlphaNum::AlphaNum(strings::Hex hex) {  // Formats hex.value as lowercase hexadecimal into digits[], padded to at least hex.spec digits.

+ char *const end = &digits[kFastToBufferSize];  // One past the end of the scratch buffer; digits are written backwards from here.

+ char *writer = end;

+ uint64 value = hex.value;

+ uint64 width = hex.spec;  // NOTE(review): assumes hex.spec >= 1; `width - 1` below wraps if it is 0 — confirm against the Hex definition.

+ // We accomplish minimum width by OR'ing in 0x10000 to the user's value,

+ // where 0x10000 is the smallest hex number that is as wide as the user

+ // asked for.

+ uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;

+ static const char hexdigits[] = "0123456789abcdef";

+ do {  // Emit one digit per iteration until both the value and the width marker are shifted out.

+ *--writer = hexdigits[value & 0xF];

+ value >>= 4;

+ mask >>= 4;

+ } while (mask != 0);

+ piece_data_ = writer;  // The piece points into digits[], at the first digit written.

+ piece_size_ = end - writer;

+} // namespace strings

// ----------------------------------------------------------------------

-// NoLocaleStrtod()

-// This code will make you cry.

+// StrCat()

+// This merges the given strings or integers, with no delimiter. This

+// is designed to be the fastest possible way to construct a string out

+// of a mix of raw C strings, C++ strings, and integer values.

// ----------------------------------------------------------------------

-// Returns a string identical to *input except that the character pointed to

-// by radix_pos (which should be '.') is replaced with the locale-specific

-// radix character.

-string LocalizeRadix(const char* input, const char* radix_pos) {

- // Determine the locale-specific radix character by calling sprintf() to

- // print the number 1.5, then stripping off the digits. As far as I can

- // tell, this is the only portable, thread-safe way to get the C library

- // to divuldge the locale's radix character. No, localeconv() is NOT

- // thread-safe.

- char temp[16];

- int size = sprintf(temp, "%.1f", 1.5);

- GOOGLE_CHECK_EQ(temp[0], '1');

- GOOGLE_CHECK_EQ(temp[size-1], '5');

- GOOGLE_CHECK_LE(size, 6);

- // Now replace the '.' in the input with it.

+// Append is merely a version of memcpy that returns the address of the byte

+// after the area just overwritten. It comes in multiple flavors to minimize

+// call overhead.

+static char *Append1(char *out, const AlphaNum &x) {  // Copies x's bytes to out and returns the address just past them.

+ memcpy(out, x.data(), x.size());

+ return out + x.size();

+static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {  // Copies x1 then x2 to out and returns the address just past the last byte written.

+ memcpy(out, x1.data(), x1.size());

+ out += x1.size();

+ memcpy(out, x2.data(), x2.size());

+ return out + x2.size();

+static char *Append4(char *out,  // Copies x1..x4 back to back starting at out; returns the address just past the last byte written.

+ const AlphaNum &x1, const AlphaNum &x2,

+ const AlphaNum &x3, const AlphaNum &x4) {

+ memcpy(out, x1.data(), x1.size());

+ out += x1.size();

+ memcpy(out, x2.data(), x2.size());

+ out += x2.size();

+ memcpy(out, x3.data(), x3.size());

+ out += x3.size();

+ memcpy(out, x4.data(), x4.size());

+ return out + x4.size();

+string StrCat(const AlphaNum &a, const AlphaNum &b) {  // Returns the bytes of a followed by the bytes of b.

+ string result;

+ result.resize(a.size() + b.size());  // Size the output once; Append2 then writes straight into it.

+ char *const begin = &*result.begin();  // NOTE(review): begin() is dereferenced even when both pieces are empty — confirm this is acceptable.

+ char *out = Append2(begin, a, b);

+ GOOGLE_DCHECK_EQ(out, begin + result.size());  // The write cursor must end exactly at the end of the buffer.

+ return result;

+string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {  // Returns a, b and c concatenated in that order.

+ string result;

+ result.resize(a.size() + b.size() + c.size());  // Size the output once; the Append helpers write straight into it.

+ char *const begin = &*result.begin();

+ char *out = Append2(begin, a, b);

+ out = Append1(out, c);

+ GOOGLE_DCHECK_EQ(out, begin + result.size());  // The write cursor must end exactly at the end of the buffer.

+ return result;

+string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,  // Returns a, b, c and d concatenated in that order.

+ const AlphaNum &d) {

+ string result;

+ result.resize(a.size() + b.size() + c.size() + d.size());  // Size the output once; Append4 writes straight into it.

+ char *const begin = &*result.begin();

+ char *out = Append4(begin, a, b, c, d);

+ GOOGLE_DCHECK_EQ(out, begin + result.size());  // The write cursor must end exactly at the end of the buffer.

+ return result;

+string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,  // Returns a through e concatenated in that order.

+ const AlphaNum &d, const AlphaNum &e) {

+ string result;

+ result.resize(a.size() + b.size() + c.size() + d.size() + e.size());  // Size the output once; the Append helpers write straight into it.

+ char *const begin = &*result.begin();

+ char *out = Append4(begin, a, b, c, d);

+ out = Append1(out, e);

+ GOOGLE_DCHECK_EQ(out, begin + result.size());  // The write cursor must end exactly at the end of the buffer.

+ return result;

+string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,  // Returns a through f concatenated in that order.

+ const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {

+ string result;

+ result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +  // Size the output once; the Append helpers write straight into it.

+ f.size());

+ char *const begin = &*result.begin();

+ char *out = Append4(begin, a, b, c, d);

+ out = Append2(out, e, f);

+ GOOGLE_DCHECK_EQ(out, begin + result.size());  // The write cursor must end exactly at the end of the buffer.

+ return result;

+string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,  // Returns a through g concatenated in that order.

+ const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,

+ const AlphaNum &g) {

string result;

- result.reserve(strlen(input) + size - 3);

- result.append(input, radix_pos);

- result.append(temp + 1, size - 2);

- result.append(radix_pos + 1);

+ result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +  // Size the output once; the Append helpers write straight into it.

+ f.size() + g.size());

+ char *const begin = &*result.begin();

+ char *out = Append4(begin, a, b, c, d);

+ out = Append2(out, e, f);

+ out = Append1(out, g);

+ GOOGLE_DCHECK_EQ(out, begin + result.size());  // The write cursor must end exactly at the end of the buffer.

return result;

}

-double NoLocaleStrtod(const char* text, char** original_endptr) {

- // We cannot simply set the locale to "C" temporarily with setlocale()

- // as this is not thread-safe. Instead, we try to parse in the current

- // locale first. If parsing stops at a '.' character, then this is a

- // pretty good hint that we're actually in some other locale in which

- // '.' is not the radix character.

- char* temp_endptr;

- double result = strtod(text, &temp_endptr);

- if (original_endptr != NULL) *original_endptr = temp_endptr;

- if (*temp_endptr != '.') return result;

- // Parsing halted on a '.'. Perhaps we're in a different locale? Let's

- // try to replace the '.' with a locale-specific radix character and

- // try again.

- string localized = LocalizeRadix(text, temp_endptr);

- const char* localized_cstr = localized.c_str();

- char* localized_endptr;

- result = strtod(localized_cstr, &localized_endptr);

- if ((localized_endptr - localized_cstr) >

- (temp_endptr - text)) {

- // This attempt got further, so replacing the decimal must have helped.

- // Update original_endptr to point at the right location.

- if (original_endptr != NULL) {

- // size_diff is non-zero if the localized radix has multiple bytes.

- int size_diff = localized.size() - strlen(text);

- // const_cast is necessary to match the strtod() interface.

- *original_endptr = const_cast<char*>(

- text + (localized_endptr - localized_cstr - size_diff));

+string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,  // Returns a through h concatenated in that order.

+ const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,

+ const AlphaNum &g, const AlphaNum &h) {

+ string result;

+ result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +  // Size the output once; the Append helpers write straight into it.

+ f.size() + g.size() + h.size());

+ char *const begin = &*result.begin();

+ char *out = Append4(begin, a, b, c, d);

+ out = Append4(out, e, f, g, h);

+ GOOGLE_DCHECK_EQ(out, begin + result.size());  // The write cursor must end exactly at the end of the buffer.

+ return result;

+string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,  // Returns a through i concatenated in that order.

+ const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,

+ const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {

+ string result;

+ result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +  // Size the output once; the Append helpers write straight into it.

+ f.size() + g.size() + h.size() + i.size());

+ char *const begin = &*result.begin();

+ char *out = Append4(begin, a, b, c, d);

+ out = Append4(out, e, f, g, h);

+ out = Append1(out, i);

+ GOOGLE_DCHECK_EQ(out, begin + result.size());  // The write cursor must end exactly at the end of the buffer.

+ return result;

+// It's possible to call StrAppend with a char * pointer that is partway into

+// the string we're appending to. However the results of this are random.

+// Therefore, check for this in debug mode. Use unsigned math so we only have

+// to do one comparison.

+#define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \

+ GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \

+ uintptr_t((dest).size()))

+void StrAppend(string *result, const AlphaNum &a) {  // Appends the bytes of a to *result.

+ GOOGLE_DCHECK_NO_OVERLAP(*result, a);  // a must not point into *result's own storage.

+ result->append(a.data(), a.size());

+void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {  // Appends a then b to *result.

+ GOOGLE_DCHECK_NO_OVERLAP(*result, a);  // Neither piece may point into *result's own storage.

+ GOOGLE_DCHECK_NO_OVERLAP(*result, b);

+ string::size_type old_size = result->size();  // Remember where the existing content ends before growing.

+ result->resize(old_size + a.size() + b.size());

+ char *const begin = &*result->begin();

+ char *out = Append2(begin + old_size, a, b);  // Write the new pieces directly after the old content.

+ GOOGLE_DCHECK_EQ(out, begin + result->size());

+void StrAppend(string *result,  // Appends a, b and c to *result in that order.

+ const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {

+ GOOGLE_DCHECK_NO_OVERLAP(*result, a);  // No piece may point into *result's own storage.

+ GOOGLE_DCHECK_NO_OVERLAP(*result, b);

+ GOOGLE_DCHECK_NO_OVERLAP(*result, c);

+ string::size_type old_size = result->size();  // Remember where the existing content ends before growing.

+ result->resize(old_size + a.size() + b.size() + c.size());

+ char *const begin = &*result->begin();

+ char *out = Append2(begin + old_size, a, b);  // Write the new pieces directly after the old content.

+ out = Append1(out, c);

+ GOOGLE_DCHECK_EQ(out, begin + result->size());

+void StrAppend(string *result,  // Appends a, b, c and d to *result in that order.

+ const AlphaNum &a, const AlphaNum &b,

+ const AlphaNum &c, const AlphaNum &d) {

+ GOOGLE_DCHECK_NO_OVERLAP(*result, a);  // No piece may point into *result's own storage.

+ GOOGLE_DCHECK_NO_OVERLAP(*result, b);

+ GOOGLE_DCHECK_NO_OVERLAP(*result, c);

+ GOOGLE_DCHECK_NO_OVERLAP(*result, d);

+ string::size_type old_size = result->size();  // Remember where the existing content ends before growing.

+ result->resize(old_size + a.size() + b.size() + c.size() + d.size());

+ char *const begin = &*result->begin();

+ char *out = Append4(begin + old_size, a, b, c, d);  // Write the new pieces directly after the old content.

+ GOOGLE_DCHECK_EQ(out, begin + result->size());

+int GlobalReplaceSubstring(const string& substring,  // Replaces every non-overlapping occurrence of `substring` in *s with `replacement`.

+ const string& replacement,

+ string* s) {  // Returns the number of replacements made; *s is modified only if that number is > 0.

+ GOOGLE_CHECK(s != NULL);

+ if (s->empty() || substring.empty())  // Nothing to search in, or nothing to search for.

+ return 0;

+ string tmp;  // The rewritten string is assembled here and swapped into *s at the end.

+ int num_replacements = 0;

+ int pos = 0;  // Start of the not-yet-copied tail of *s.

+ for (int match_pos = s->find(substring.data(), pos, substring.length());  // NOTE(review): find() returns size_type but is stored in int and then compared with string::npos.

+ match_pos != string::npos;

+ pos = match_pos + substring.length(),  // Resume searching just past the match, so matches never overlap.

+ match_pos = s->find(substring.data(), pos, substring.length())) {

+ ++num_replacements;

+ // Append the original content before the match.

+ tmp.append(*s, pos, match_pos - pos);

+ // Append the replacement for the match.

+ tmp.append(replacement.begin(), replacement.end());

+ }

+ // Append the content after the last match. If no replacements were made, the

+ // original string is left untouched.

+ if (num_replacements > 0) {

+ tmp.append(*s, pos, s->length() - pos);

+ s->swap(tmp);

+ }

+ return num_replacements;

+int CalculateBase64EscapedLen(int input_len, bool do_padding) {  // Returns the number of output characters needed to base64-encode input_len bytes.

+ // Base64 encodes three bytes of input at a time. If the input is not

+ // divisible by three, we pad as appropriate.

+ //

+ // (from http://tools.ietf.org/html/rfc3548)

+ // Special processing is performed if fewer than 24 bits are available

+ // at the end of the data being encoded. A full encoding quantum is

+ // always completed at the end of a quantity. When fewer than 24 input

+ // bits are available in an input group, zero bits are added (on the

+ // right) to form an integral number of 6-bit groups. Padding at the

+ // end of the data is performed using the '=' character. Since all base

+ // 64 input is an integral number of octets, only the following cases

+ // can arise:

+ // Base64 encodes each three bytes of input into four bytes of output.

+ int len = (input_len / 3) * 4;  // Characters for the complete 3-byte groups; the remainder is handled below.

+ if (input_len % 3 == 0) {

+ // (from http://tools.ietf.org/html/rfc3548)

+ // (1) the final quantum of encoding input is an integral multiple of 24

+ // bits; here, the final unit of encoded output will be an integral

+ // multiple of 4 characters with no "=" padding,

+ } else if (input_len % 3 == 1) {

+ // (from http://tools.ietf.org/html/rfc3548)

+ // (2) the final quantum of encoding input is exactly 8 bits; here, the

+ // final unit of encoded output will be two characters followed by two

+ // "=" padding characters, or

+ len += 2;

+ if (do_padding) {

+ len += 2;

+ }

+ } else { // (input_len % 3 == 2)

+ // (from http://tools.ietf.org/html/rfc3548)

+ // (3) the final quantum of encoding input is exactly 16 bits; here, the

+ // final unit of encoded output will be three characters followed by one

+ // "=" padding character.

+ len += 3;

+ if (do_padding) {

+ len += 1;

}

- return result;

+ assert(len >= input_len); // make sure we didn't overflow (only checked when assert() is enabled)

+ return len;

+// Base64Escape does padding, so this calculation includes padding.

+int CalculateBase64EscapedLen(int input_len) {  // Padded length: forwards to the two-argument overload with do_padding = true.

+ return CalculateBase64EscapedLen(input_len, true);

+// ----------------------------------------------------------------------

+// int Base64Unescape() - base64 decoder

+// int Base64Escape() - base64 encoder

+// int WebSafeBase64Unescape() - Google's variation of base64 decoder

+// int WebSafeBase64Escape() - Google's variation of base64 encoder

+//

+// Check out

+// http://tools.ietf.org/html/rfc2045 for formal description, but what we

+// care about is that...

+// Take the encoded stuff in groups of 4 characters and turn each

+// character into a code 0 to 63 thus:

+// A-Z map to 0 to 25

+// a-z map to 26 to 51

+// 0-9 map to 52 to 61

+// +(- for WebSafe) maps to 62

+// /(_ for WebSafe) maps to 63

+// There will be four numbers, all less than 64 which can be represented

+// by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).

+// Arrange the 6 digit binary numbers into three bytes as such:

+// aaaaaabb bbbbcccc ccdddddd

+// Equals signs (one or two) are used at the end of the encoded block to

+// indicate that the text was not an integer multiple of three bytes long.

+// ----------------------------------------------------------------------

+int Base64UnescapeInternal(const char *src_param, int szsrc,  // Decodes up to szsrc characters of base64 text; returns the decoded byte count, or -1 on bad input or when szdest is too small.

+ char *dest, int szdest,  // dest may be null, in which case the input is only validated and measured.

+ const signed char* unbase64) {  // Lookup table indexed by input byte: the 6-bit value, or a negative number for non-data characters.

+ static const char kPad64Equals = '=';

+ static const char kPad64Dot = '.';

+ int decode = 0;  // Table value of the most recently read character.

+ int destidx = 0;  // Output bytes produced so far (or that would be produced when dest is null).

+ int state = 0;  // Number of data characters of the current 4-character group already folded into temp.

+ unsigned int ch = 0;  // Most recently read input character.

+ unsigned int temp = 0;  // Bit accumulator, 6 bits per data character.

+ // If "char" is signed by default, using *src as an array index results in

+ // accessing negative array elements. Treat the input as a pointer to

+ // unsigned char to avoid this.

+ const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);

+ // The GET_INPUT macro gets the next input character, skipping

+ // over any whitespace, and stopping when we reach the end of the

+ // string or when we read any non-data character. The arguments are

+ // an arbitrary identifier (used as a label for goto) and the number

+ // of data bytes that must remain in the input to avoid aborting the

+ // loop.

+#define GET_INPUT(label, remain) \

+ label: \

+ --szsrc; \

+ ch = *src++; \

+ decode = unbase64[ch]; \

+ if (decode < 0) { \

+ if (ascii_isspace(ch) && szsrc >= remain) \

+ goto label; \

+ state = 4 - remain; \

+ break; \

+ }

+ // if dest is null, we're just checking to see if it's legal input

+ // rather than producing output. (I suspect this could just be done

+ // with a regexp...). We duplicate the loop so this test can be

+ // outside it instead of in every iteration.

+ if (dest) {

+ // This loop consumes 4 input bytes and produces 3 output bytes

+ // per iteration. We can't know at the start that there is enough

+ // data left in the string for a full iteration, so the loop may

+ // break out in the middle; if so 'state' will be set to the

+ // number of input bytes read.

+ while (szsrc >= 4) {

+ // We'll start by optimistically assuming that the next four

+ // bytes of the string (src[0..3]) are four good data bytes

+ // (that is, no nulls, whitespace, padding chars, or illegal

+ // chars). We need to test src[0..2] for nulls individually

+ // before constructing temp to preserve the property that we

+ // never read past a null in the string (no matter how long

+ // szsrc claims the string is).

+ if (!src[0] || !src[1] || !src[2] ||

+ (temp = ((unsigned(unbase64[src[0]]) << 18) |

+ (unsigned(unbase64[src[1]]) << 12) |

+ (unsigned(unbase64[src[2]]) << 6) |

+ (unsigned(unbase64[src[3]])))) & 0x80000000) {

+ // Iff any of those four characters was bad (null, illegal,

+ // whitespace, padding), then temp's high bit will be set

+ // (because unbase64[] is -1 for all bad characters).

+ //

+ // We'll back up and resort to the slower decoder, which knows

+ // how to handle those cases.

+ GET_INPUT(first, 4);

+ temp = decode;

+ GET_INPUT(second, 3);

+ temp = (temp << 6) | decode;

+ GET_INPUT(third, 2);

+ temp = (temp << 6) | decode;

+ GET_INPUT(fourth, 1);

+ temp = (temp << 6) | decode;

+ } else {

+ // We really did have four good data bytes, so advance four

+ // characters in the string.

+ szsrc -= 4;

+ src += 4;

+ decode = -1;

+ ch = '\0';

+ }

+ // temp has 24 bits of input, so write that out as three bytes.

+ if (destidx+3 > szdest) return -1;  // Output buffer too small for the next three bytes.

+ dest[destidx+2] = temp;

+ temp >>= 8;

+ dest[destidx+1] = temp;

+ temp >>= 8;

+ dest[destidx] = temp;

+ destidx += 3;

+ }

+ } else {

+ while (szsrc >= 4) {  // Same loop as above, but only counting: nothing is written because dest is null.

+ if (!src[0] || !src[1] || !src[2] ||

+ (temp = ((unsigned(unbase64[src[0]]) << 18) |

+ (unsigned(unbase64[src[1]]) << 12) |

+ (unsigned(unbase64[src[2]]) << 6) |

+ (unsigned(unbase64[src[3]])))) & 0x80000000) {

+ GET_INPUT(first_no_dest, 4);

+ GET_INPUT(second_no_dest, 3);

+ GET_INPUT(third_no_dest, 2);

+ GET_INPUT(fourth_no_dest, 1);

+ } else {

+ szsrc -= 4;

+ src += 4;

+ decode = -1;

+ ch = '\0';

+ }

+ destidx += 3;

+ }

+#undef GET_INPUT

+ // if the loop terminated because we read a bad character, return

+ // now.

+ if (decode < 0 && ch != '\0' &&

+ ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))

+ return -1;

+ if (ch == kPad64Equals || ch == kPad64Dot) {

+ // if we stopped by hitting an '=' or '.', un-read that character -- we'll

+ // look at it again when we count to check for the proper number of

+ // equals signs at the end.

+ ++szsrc;

+ --src;

+ } else {

+ // This loop consumes 1 input byte per iteration. It's used to

+ // clean up the 0-3 input bytes remaining when the first, faster

+ // loop finishes. 'temp' contains the data from 'state' input

+ // characters read by the first loop.

+ while (szsrc > 0) {

+ --szsrc;

+ ch = *src++;

+ decode = unbase64[ch];

+ if (decode < 0) {

+ if (ascii_isspace(ch)) {

+ continue;

+ } else if (ch == '\0') {

+ break;

+ } else if (ch == kPad64Equals || ch == kPad64Dot) {

+ // back up one character; we'll read it again when we check

+ // for the correct number of pad characters at the end.

+ ++szsrc;

+ --src;

+ break;

+ } else {

+ return -1;

+ }

+ // Each input character gives us six bits of output.

+ temp = (temp << 6) | decode;

+ ++state;

+ if (state == 4) {

+ // If we've accumulated 24 bits of output, write that out as

+ // three bytes.

+ if (dest) {

+ if (destidx+3 > szdest) return -1;

+ dest[destidx+2] = temp;

+ temp >>= 8;

+ dest[destidx+1] = temp;

+ temp >>= 8;

+ dest[destidx] = temp;

+ }

+ destidx += 3;

+ state = 0;

+ temp = 0;

+ }

+ // Process the leftover data contained in 'temp' at the end of the input.

+ int expected_equals = 0;  // Number of pad characters that the leftover state calls for.

+ switch (state) {

+ case 0:

+ // Nothing left over; output is a multiple of 3 bytes.

+ break;

+ case 1:

+ // Bad input; we have 6 bits left over.

+ return -1;

+ case 2:

+ // Produce one more output byte from the 12 input bits we have left.

+ if (dest) {

+ if (destidx+1 > szdest) return -1;

+ temp >>= 4;

+ dest[destidx] = temp;

+ }

+ ++destidx;

+ expected_equals = 2;

+ break;

+ case 3:

+ // Produce two more output bytes from the 18 input bits we have left.

+ if (dest) {

+ if (destidx+2 > szdest) return -1;

+ temp >>= 2;

+ dest[destidx+1] = temp;

+ temp >>= 8;

+ dest[destidx] = temp;

+ }

+ destidx += 2;

+ expected_equals = 1;

+ break;

+ default:

+ // state should have no other values at this point.

+ GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;

+ }

+ // The remainder of the string should be all whitespace, mixed with

+ // exactly 0 equals signs, or exactly 'expected_equals' equals

+ // signs. (Always accepting 0 equals signs is a google extension

+ // not covered in the RFC, as is accepting dot as the pad character.)

+ int equals = 0;

+ while (szsrc > 0 && *src) {

+ if (*src == kPad64Equals || *src == kPad64Dot)

+ ++equals;

+ else if (!ascii_isspace(*src))

+ return -1;

+ --szsrc;

+ ++src;

+ }

+ return (equals == 0 || equals == expected_equals) ? destidx : -1;  // Accept no padding at all, or exactly the amount the leftover state calls for.

+// The arrays below were generated by the following code

+// #include <sys/time.h>

+// #include <stdlib.h>

+// #include <string.h>

+// main()

+// {

+// static const char Base64[] =

+// "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

+// char *pos;

+// int idx, i, j;

+// printf(" ");

+// for (i = 0; i < 255; i += 8) {

+// for (j = i; j < i + 8; j++) {

+// pos = strchr(Base64, j);

+// if ((pos == NULL) || (j == 0))

+// idx = -1;

+// else

+// idx = pos - Base64;

+// if (idx == -1)

+// printf(" %2d, ", idx);

+// else

+// printf(" %2d/*%c*/,", idx, j);

+// }

+// printf("\n ");

+// }

+//

+// where the value of "Base64[]" was replaced by one of the base-64 conversion

+// tables from the functions below.

// Reverse lookup table for the standard base64 alphabet ("A-Z a-z 0-9 + /").
// It is indexed by the unsigned value of an input byte; the entry is that
// character's 6-bit value, or -1 if the byte is not in the alphabet. All 256
// byte values are present so that any input byte is a valid index.
static const signed char kUnBase64[] = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x00
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x08
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x10
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x18
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x20
  -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */, // 0x28
  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,  // 0x30
  60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,       // 0x38
  -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,  // 0x40
   7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,  // 0x48
  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,  // 0x50
  23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,       // 0x58
  -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,  // 0x60
  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,  // 0x68
  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,  // 0x70
  49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,       // 0x78
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x80
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x88
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x90
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x98
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xA0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xA8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xB0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xB8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xC0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xC8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xD0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xD8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xE0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xE8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xF0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1        // 0xF8
};

// Reverse lookup table for the web-safe base64 alphabet ("A-Z a-z 0-9 - _").
// It is indexed by the unsigned value of an input byte; the entry is that
// character's 6-bit value, or -1 if the byte is not in the alphabet. All 256
// byte values are present so that any input byte is a valid index.
static const signed char kUnWebSafeBase64[] = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x00
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x08
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x10
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x18
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x20
  -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,       // 0x28
  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,  // 0x30
  60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,       // 0x38
  -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,  // 0x40
   7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,  // 0x48
  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,  // 0x50
  23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,  // 0x58
  -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,  // 0x60
  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,  // 0x68
  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,  // 0x70
  49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,       // 0x78
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x80
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x88
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x90
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0x98
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xA0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xA8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xB0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xB8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xC0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xC8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xD0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xD8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xE0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xE8
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,       // 0xF0
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1        // 0xF8
};

+int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {

+ return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);

+static bool Base64UnescapeInternal(const char* src, int slen, string* dest,

+ const signed char* unbase64) {

+ // Determine the size of the output string. Base64 encodes every 3 bytes into

+ // 4 characters. any leftover chars are added directly for good measure.

+ // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548

+ const int dest_len = 3 * (slen / 4) + (slen % 4);

+ dest->resize(dest_len);

+ // We are getting the destination buffer by getting the beginning of the

+ // string and converting it into a char *.

+ const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),

+ dest_len, unbase64);

+ if (len < 0) {

+ dest->clear();

+ return false;

+ }

+ // could be shorter if there was padding

+ GOOGLE_DCHECK_LE(len, dest_len);

+ dest->erase(len);

+ return true;

+bool Base64Unescape(StringPiece src, string* dest) {

+ return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);

+bool WebSafeBase64Unescape(StringPiece src, string* dest) {

+ return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);

// Encodes the szsrc bytes at src as base64 text in dest.
//
//   src, szsrc   input bytes and their count
//   dest, szdest output buffer and its capacity in characters
//   base64       64-character alphabet; position i encodes the 6-bit value i
//   do_padding   when true, '=' is appended so the output length is a
//                multiple of four
//
// Returns the number of characters written. Returns 0 when szsrc <= 0 or when
// dest is too small; in the latter case part of dest may already have been
// written. No terminating NUL is written.
int Base64EscapeInternal(const unsigned char *src, int szsrc,
                         char *dest, int szdest, const char *base64,
                         bool do_padding) {
  static const char kPad64 = '=';

  if (szsrc <= 0) return 0;

  // Reject inputs whose unpadded encoding cannot fit: szsrc * 4 > szdest * 3.
  // The bound is computed as floor(szdest * 3 / 4) piecewise so that neither
  // side can overflow int for large sizes.
  if (szdest <= 0) return 0;
  const int max_src = (szdest / 4) * 3 + ((szdest % 4) * 3) / 4;
  if (szsrc > max_src) return 0;

  char *cur_dest = dest;
  const unsigned char *cur_src = src;
  int remaining = szsrc;

  // Three bytes of data encode to four characters. The capacity check above
  // guarantees room for every complete group, so no per-group check is needed.
  // Bytes are read one at a time, so nothing past the input is ever touched.
  while (remaining >= 3) {
    cur_dest[0] = base64[cur_src[0] >> 2];
    cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) | (cur_src[1] >> 4)];
    cur_dest[2] = base64[((cur_src[1] & 0x0F) << 2) | (cur_src[2] >> 6)];
    cur_dest[3] = base64[cur_src[2] & 0x3F];
    cur_dest += 4;
    cur_src += 3;
    remaining -= 3;
  }

  int dest_left = szdest - static_cast<int>(cur_dest - dest);

  if (remaining == 1) {
    // One byte left: two characters, plus (optionally) two pad characters.
    if ((dest_left -= 2) < 0) return 0;
    cur_dest[0] = base64[cur_src[0] >> 2];
    cur_dest[1] = base64[(cur_src[0] & 0x03) << 4];
    cur_dest += 2;
    if (do_padding) {
      if ((dest_left -= 2) < 0) return 0;
      cur_dest[0] = kPad64;
      cur_dest[1] = kPad64;
      cur_dest += 2;
    }
  } else if (remaining == 2) {
    // Two bytes left: three characters, plus (optionally) one pad character.
    if ((dest_left -= 3) < 0) return 0;
    cur_dest[0] = base64[cur_src[0] >> 2];
    cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) | (cur_src[1] >> 4)];
    cur_dest[2] = base64[(cur_src[1] & 0x0F) << 2];
    cur_dest += 3;
    if (do_padding) {
      if ((dest_left -= 1) < 0) return 0;
      cur_dest[0] = kPad64;
      cur_dest += 1;
    }
  }

  return static_cast<int>(cur_dest - dest);
}

+static const char kBase64Chars[] =  // Standard base64 alphabet; a character's position in the string is the 6-bit value it encodes.

+"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

+static const char kWebSafeBase64Chars[] =  // Web-safe alphabet: same order, but '-' and '_' take the last two positions instead of '+' and '/'.

+"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

+int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {

+ return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);

+int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,

+ int szdest, bool do_padding) {

+ return Base64EscapeInternal(src, szsrc, dest, szdest,

+ kWebSafeBase64Chars, do_padding);

+void Base64EscapeInternal(const unsigned char* src, int szsrc,

+ string* dest, bool do_padding,

+ const char* base64_chars) {

+ const int calc_escaped_size =

+ CalculateBase64EscapedLen(szsrc, do_padding);

+ dest->resize(calc_escaped_size);

+ const int escaped_len = Base64EscapeInternal(src, szsrc,

+ string_as_array(dest),

+ dest->size(),

+ base64_chars,

+ do_padding);

+ GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);

+ dest->erase(escaped_len);

+void Base64Escape(const unsigned char *src, int szsrc,

+ string* dest, bool do_padding) {

+ Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);

+void WebSafeBase64Escape(const unsigned char *src, int szsrc,

+ string *dest, bool do_padding) {

+ Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);

+void Base64Escape(StringPiece src, string* dest) {

+ Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),

+ src.size(), dest, true);

+void WebSafeBase64Escape(StringPiece src, string* dest) {

+ WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),

+ src.size(), dest, false);

+void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {

+ WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),

+ src.size(), dest, true);

+// Helper to append a Unicode code point to a string as UTF8, without bringing

+// in any external dependencies.

+int EncodeAsUTF8Char(uint32 code_point, char* output) {

+ uint32 tmp = 0;

+ int len = 0;

+ if (code_point <= 0x7f) {

+ tmp = code_point;

+ len = 1;

+ } else if (code_point <= 0x07ff) {

+ tmp = 0x0000c080 |

+ ((code_point & 0x07c0) << 2) |

+ (code_point & 0x003f);

+ len = 2;

+ } else if (code_point <= 0xffff) {

+ tmp = 0x00e08080 |

+ ((code_point & 0xf000) << 4) |

+ ((code_point & 0x0fc0) << 2) |

+ (code_point & 0x003f);

+ len = 3;

+ } else {

+ // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is

+ // normally only defined up to there as well.

+ tmp = 0xf0808080 |

+ ((code_point & 0x1c0000) << 6) |

+ ((code_point & 0x03f000) << 4) |

+ ((code_point & 0x000fc0) << 2) |

+ (code_point & 0x003f);

+ len = 4;

+ }

+ tmp = ghtonl(tmp);

+ memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);

+ return len;

// Table of UTF-8 character lengths, based on first byte.
//
//   0x00-0xBF -> 1   (ASCII; 0x80-0xBF are continuation bytes, which are also
//                     given length 1 -- presumably so that a scanner landing
//                     on one still advances)
//   0xC0-0xDF -> 2
//   0xE0-0xEF -> 3
//   0xF0-0xFF -> 4
static const unsigned char kUTF8LenTbl[256] = {
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x00
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x60

  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4   // 0xE0
};

+// Return length of a single UTF-8 source character

+int UTF8FirstLetterNumBytes(const char* src, int len) {

+ if (len == 0) {

+ return 0;

+ }

+ return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];

}

} // namespace protobuf