Index: src/unicode-decoder.cc |
diff --git a/src/unicode-decoder.cc b/src/unicode-decoder.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..88eff3ad2660d3b230bfe04922b5c9dfede8a2f9 |
--- /dev/null |
+++ b/src/unicode-decoder.cc |
@@ -0,0 +1,78 @@ |
+// Copyright 2014 the V8 project authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+ |
+#include "src/unicode-inl.h" |
+#include "src/unicode-decoder.h" |
+#include <stdio.h> |
+#include <stdlib.h> |
+ |
+namespace unibrow { |
+ |
+void Utf8DecoderBase::Reset(uint16_t* buffer, unsigned buffer_length, |
+ const uint8_t* stream, unsigned stream_length) { |
+ // Decode the UTF-8 stream to UTF-16, assuming at first that it all fits. |
+ last_byte_of_buffer_unused_ = false; |
+ unbuffered_start_ = NULL; |
+ bool writing_to_buffer = true; |
+ // Consume the whole stream; write to buffer only while it has space. |
+ unsigned utf16_length = 0; |
+ while (stream_length != 0) { |
+ unsigned cursor = 0; |
+ uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor); |
+ DCHECK(cursor > 0 && cursor <= stream_length); |
+ stream += cursor; |
+ stream_length -= cursor; |
+ bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode; |
+ utf16_length += is_two_characters ? 2 : 1; |
+ // Buffering has ended; keep looping only to finish counting utf16_length. |
+ if (!writing_to_buffer) continue; |
+ // Still buffering: emit this character if its code unit(s) fit. |
+ // utf16_length already counts this character, so compare with <=, not <. |
+ if (utf16_length <= buffer_length) { |
+ if (is_two_characters) { |
+ *buffer++ = Utf16::LeadSurrogate(character); |
+ *buffer++ = Utf16::TrailSurrogate(character); |
+ } else { |
+ *buffer++ = character; |
+ } |
+ if (utf16_length == buffer_length) { |
+ // Buffer is exactly full; unbuffered input starts at the next byte. |
+ writing_to_buffer = false; |
+ unbuffered_start_ = stream; |
+ } |
+ continue; |
+ } |
+ // Overflow: this character's code units exceed the space that is left. |
+ // Expect a surrogate pair; mark last slot unused, restart at this char. |
+ DCHECK(is_two_characters); |
+ writing_to_buffer = false; |
+ last_byte_of_buffer_unused_ = true; |
+ unbuffered_start_ = stream - cursor; |
+ } |
+ utf16_length_ = utf16_length; |
+} |
+ |
+ |
+void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream, uint16_t* data, |
+                                     unsigned data_length) { |
+  // Decode code points from stream until data_length units are written. |
+  while (data_length > 0) { |
+    unsigned used = 0; |
+    // stream is not bounds-checked here; that is assumed to be done in Reset. |
+    uint32_t code_point = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &used); |
+    stream += used; |
+    if (code_point <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
+      *data++ = code_point; |
+      --data_length; |
+      continue; |
+    } |
+    *data++ = Utf16::LeadSurrogate(code_point); |
+    *data++ = Utf16::TrailSurrogate(code_point); |
+    DCHECK(data_length > 1); |
+    data_length -= 2; |
+  } |
+} |
+ |
+} // namespace unibrow |