Index: src/unicode-decoder.h |
diff --git a/src/unicode-decoder.h b/src/unicode-decoder.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..35ea30cf1a5ad63ef02a41a9cbc377264edc2904 |
--- /dev/null |
+++ b/src/unicode-decoder.h |
@@ -0,0 +1,121 @@ |
+// Copyright 2014 the V8 project authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#ifndef V8_UNICODE_DECODER_H_ |
+#define V8_UNICODE_DECODER_H_ |
+ |
+#include <sys/types.h> |
+#include "src/globals.h" |
+ |
+namespace unibrow { |
+ |
+// Non-template core of the UTF-8 decoder. It tracks how many UTF-16 code |
+// units a UTF-8 stream produces; the cache buffer itself is supplied by the |
+// subclass. |
+class Utf8DecoderBase { |
+ public: |
+  // Starts out empty: no stream attached and a UTF-16 length of zero. |
+  inline Utf8DecoderBase(); |
+  // Forwards all four arguments to Reset(). |
+  inline Utf8DecoderBase(uint16_t* buffer, unsigned buffer_length, |
+                         const uint8_t* stream, unsigned stream_length); |
+  // Returns utf16_length_, the UTF-16 length that Reset() computes. |
+  unsigned Utf16Length() const { return utf16_length_; } |
+ |
+ protected: |
+  // Reads all characters of the stream and sets utf16_length_. The first |
+  // buffer_length UTF-16 code units are cached in buffer. |
+  void Reset(uint16_t* buffer, unsigned buffer_length, const uint8_t* stream, |
+             unsigned stream_length); |
+  // Slow path that Utf8Decoder::WriteUtf16 falls back to for the code units |
+  // that are not available from the cache. |
+  static void WriteUtf16Slow(const uint8_t* stream, uint16_t* data, |
+                             unsigned length); |
+  // Handed to WriteUtf16Slow as the stream argument for the uncached part. |
+  const uint8_t* unbuffered_start_; |
+  unsigned utf16_length_; |
+  // When set, WriteUtf16 treats the cache as one element shorter. |
+  bool last_byte_of_buffer_unused_; |
+ |
+ private: |
+  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); |
+}; |
+ |
+// Decoder that embeds its own cache of kBufferSize UTF-16 code units. |
+template <unsigned kBufferSize> |
+class Utf8Decoder : public Utf8DecoderBase { |
+ public: |
+  // Relies on the base-class default state; no stream is attached. |
+  Utf8Decoder() {} |
+  // Decodes stream (length bytes) through the base-class constructor. |
+  inline Utf8Decoder(const char* stream, unsigned length); |
+  // Points the decoder at a new stream, reusing the embedded cache. |
+  inline void Reset(const char* stream, unsigned length); |
+  // Writes at most length UTF-16 code units to data and returns the number |
+  // written. |
+  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const; |
+ |
+ private: |
+  uint16_t buffer_[kBufferSize]; |
+}; |
+ |
+ |
+// Empty decoder: no unbuffered input, a UTF-16 length of zero, and the |
+// unused-last-slot flag cleared. |
+Utf8DecoderBase::Utf8DecoderBase() |
+    : unbuffered_start_(NULL), utf16_length_(0), |
+      last_byte_of_buffer_unused_(false) { |
+} |
+ |
+ |
+// Constructs the decoder by running Reset() on the supplied buffer and |
+// stream. The members get no initial values before that call. |
+Utf8DecoderBase::Utf8DecoderBase( |
+    uint16_t* buffer, unsigned buffer_length, const uint8_t* stream, |
+    unsigned stream_length) { |
+  this->Reset(buffer, buffer_length, stream, stream_length); |
+} |
+ |
+ |
+// Hands the embedded cache and the byte view of stream to the base class, |
+// whose constructor performs the decoding. |
+template <unsigned kBufferSize> |
+Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length) |
+    : Utf8DecoderBase( |
+          buffer_, kBufferSize, reinterpret_cast<const uint8_t*>(stream), |
+          length) { |
+} |
+ |
+ |
+// Restarts decoding on a new stream; the base-class Reset() is given the |
+// embedded cache to fill. |
+template <unsigned kBufferSize> |
+void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) { |
+  // The base class works on unsigned bytes, so reinterpret the char data. |
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(stream); |
+  Utf8DecoderBase::Reset(buffer_, kBufferSize, bytes, length); |
+} |
+ |
+ |
+// Copies up to length UTF-16 code units of the decoded stream into data and |
+// returns the number of code units written. The request is clamped to |
+// utf16_length_. Code units held in the cache are copied with MemCopy; any |
+// remainder is produced by WriteUtf16Slow starting at unbuffered_start_. |
+template <unsigned kBufferSize> |
+unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data, |
+                                              unsigned length) const { |
+  DCHECK(length > 0); |
+  // Never hand out more code units than the stream decoded to. |
+  if (utf16_length_ < length) length = utf16_length_; |
+  // Number of code units the cache can serve; its last slot may be unused. |
+  unsigned cached = kBufferSize; |
+  if (last_byte_of_buffer_unused_) cached = kBufferSize - 1; |
+  if (length <= cached) { |
+    // The whole request is served from the cache. |
+    v8::internal::MemCopy(data, buffer_, length * sizeof(uint16_t)); |
+    return length; |
+  } |
+  // Serve what the cache holds, then decode the remainder the slow way. |
+  v8::internal::MemCopy(data, buffer_, cached * sizeof(uint16_t)); |
+  DCHECK(unbuffered_start_ != NULL); |
+  WriteUtf16Slow(unbuffered_start_, data + cached, length - cached); |
+  return length; |
+} |
+ |
+// Helpers for the Latin-1 range (values up to kMaxChar). |
+class Latin1 { |
+ public: |
+  static const unsigned kMaxChar = 0xff; |
+  // Maps a character above kMaxChar to a Latin-1 character. Returns 0 if |
+  // the character does not convert to a single Latin-1 character, or if it |
+  // does not convert back to Latin-1 via the inverse operation (upper to |
+  // lower, etc). |
+  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t c); |
+}; |
+ |
+ |
+// Returns the Latin-1 counterpart of c, or 0 when there is none. The caller |
+// must pass a character above Latin1::kMaxChar. |
+uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { |
+  DCHECK(c > Latin1::kMaxChar); |
+  // U+039C (Greek capital mu) and U+03BC (Greek small mu) are equivalent to |
+  // U+00B5 (micro sign) in Unicode. |
+  if (c == 0x39c || c == 0x3bc) return 0xb5; |
+  // U+0178 lies outside Latin-1 but is the uppercase of the Latin-1 |
+  // character U+00FF. |
+  if (c == 0x178) return 0xff; |
+  return 0; |
+} |
+ |
+ |
+} // namespace unibrow |
+ |
+#endif // V8_UNICODE_DECODER_H_ |