Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(105)

Unified Diff: third_party/WebKit/Source/platform/fonts/Character.cpp

Issue 1541393003: Improve performance of Character::isCJKIdeographOrSymbol by using trie tree (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Cleanup, remove isCJKIdeograph, remove perf test code Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/platform/fonts/Character.cpp
diff --git a/third_party/WebKit/Source/platform/fonts/Character.cpp b/third_party/WebKit/Source/platform/fonts/Character.cpp
index b0dc240d950286ad8b231606f65e639ec169f338..5b527cd234b7cb2870733908a7c8d0ae8da6a3a8 100644
--- a/third_party/WebKit/Source/platform/fonts/Character.cpp
+++ b/third_party/WebKit/Source/platform/fonts/Character.cpp
@@ -33,7 +33,10 @@
#include "wtf/StdLibExtras.h"
#include "wtf/text/StringBuilder.h"
#include <algorithm>
+#include <unicode/uobject.h>
#include <unicode/uscript.h>
+#define MUTEX_H
drott 2016/01/14 12:10:53 Why is this needed?
kojii 2016/01/14 15:34:33 ICU fails to compile without this, at least on Win
+#include <utrie2.h>
using namespace WTF;
using namespace Unicode;
@@ -61,6 +64,273 @@ static const UChar32 cjkIsolatedSymbolsArray[] = {
0x1F100
};
+static const UChar32 cjkIdeographRanges[] = { // Flattened list of inclusive [first, last] code point pairs; createTrie() passes it to setRanges().
+ // CJK Radicals Supplement and Kangxi Radicals.
+ 0x2E80, 0x2FDF,
+ // CJK Strokes.
+ 0x31C0, 0x31EF,
+ // CJK Unified Ideographs Extension A.
+ 0x3400, 0x4DBF,
+ // The basic CJK Unified Ideographs block.
+ 0x4E00, 0x9FFF,
+ // CJK Compatibility Ideographs.
+ 0xF900, 0xFAFF,
+ // CJK Unified Ideographs Extension B.
+ 0x20000, 0x2A6DF,
+ // CJK Unified Ideographs Extension C.
+ // CJK Unified Ideographs Extension D.
+ 0x2A700, 0x2B81F,
+ // CJK Compatibility Ideographs Supplement.
+ 0x2F800, 0x2FA1F
+};
+
+static const UChar32 cjkSymbolRanges[] = { // Flattened list of inclusive [first, last] code point pairs; createTrie() passes it to setRanges().
+ 0x2156, 0x215A,
+ 0x2160, 0x216B,
+ 0x2170, 0x217B,
+ 0x23BE, 0x23CC,
+ 0x2460, 0x2492,
+ 0x249C, 0x24FF,
+ 0x25CE, 0x25D3,
+ 0x25E2, 0x25E6,
+ 0x2600, 0x2603,
+ 0x2660, 0x266F,
+ 0x2672, 0x267D,
drott 2016/01/14 12:10:53 Could you perhaps include // Emoji HEAVY HEART EX
kojii 2016/01/14 15:34:33 Done.
+ 0x2776, 0x277F,
+ // Ideographic Description Characters, with CJK Symbols and Punctuation, excluding 0x3030.
drott 2016/01/14 12:10:53 I know that this is previous code in a way. Howeve
kojii 2016/01/14 15:34:33 Wasn't aware that 80 column limit is also coming.
+ // Then Hiragana 0x3040 .. 0x309F, Katakana 0x30A0 .. 0x30FF, Bopomofo 0x3100 .. 0x312F
+ 0x2FF0, 0x302F,
+ 0x3031, 0x312F,
+ // Kanbun 0x3190 .. 0x319F and Bopomofo Extended 0x31A0 .. 0x31BF
+ 0x3190, 0x31BF,
+ // Enclosed CJK Letters and Months (0x3200 .. 0x32FF).
+ // CJK Compatibility (0x3300 .. 0x33FF).
+ 0x3200, 0x33FF,
+ 0xF860, 0xF862,
+ // CJK Compatibility Forms.
+ 0xFE30, 0xFE4F,
+ // Halfwidth and Fullwidth Forms
+ // Usually only used in CJK
+ 0xFF00, 0xFF0C,
+ 0xFF0E, 0xFF1A,
+ 0xFF1F, 0xFFEF,
+ // Emoji.
+ 0x1F110, 0x1F129,
+ 0x1F130, 0x1F149,
+ 0x1F150, 0x1F169,
+ 0x1F170, 0x1F189,
+ 0x1F200, 0x1F6FF
+};
+
+// Unicode Vertical Text Layout: http://www.unicode.org/reports/tr50/
drott 2016/01/14 12:10:53 I wouldn't uppercase the comment here. Perhaps: /
kojii 2016/01/14 15:34:33 Done (actually data is still rev 13, I need to upd
+static const UChar32 isUprightInMixedVerticalArray[] = { // Individual code points (not ranges); createTrie() passes it to setValues().
+ 0x000A7,
+ 0x000A9,
+ 0x000AE,
+ 0x000B1,
+ 0x000D7,
+ 0x000F7
+};
+
+static const UChar32 isUprightInMixedVerticalRanges[] = { // Flattened list of inclusive [first, last] code point pairs; createTrie() passes it to setRanges().
+ 0x000BC, 0x000BE,
+ // Spacing Modifier Letters (Part of)
+ 0x002EA, 0x002EB,
+ // Hangul Jamo
+ 0x01100, 0x011FF,
+ // Unified Canadian Aboriginal Syllabics
+ 0x01401, 0x0167F,
+ // Unified Canadian Aboriginal Syllabics Extended
+ 0x018B0, 0x018FF,
+ // General Punctuation (Part of)
+ 0x02016, 0x02016,
+ 0x02020, 0x02021,
+ 0x02030, 0x02031,
+ 0x0203B, 0x0203C,
+ 0x02042, 0x02042,
+ 0x02047, 0x02049,
+ 0x02051, 0x02051,
+ 0x02065, 0x02069,
+ // Combining Diacritical Marks for Symbols (Part of)
+ 0x020DD, 0x020E0,
+ 0x020E2, 0x020E4,
+ // Letterlike Symbols (Part of)/Number Forms
+ 0x02100, 0x02101,
+ 0x02103, 0x02109,
+ 0x0210F, 0x0210F,
+ 0x02113, 0x02114,
+ 0x02116, 0x02117,
+ 0x0211E, 0x02123,
+ 0x02125, 0x02125,
+ 0x02127, 0x02127,
+ 0x02129, 0x02129,
+ 0x0212E, 0x0212E,
+ 0x02135, 0x0213F,
+ 0x02145, 0x0214A,
+ 0x0214C, 0x0214D,
+ 0x0214F, 0x0218F,
+ // Mathematical Operators (Part of)
+ 0x0221E, 0x0221E,
+ 0x02234, 0x02235,
+ // Miscellaneous Technical (Part of)
+ 0x02300, 0x02307,
+ 0x0230C, 0x0231F,
+ 0x02324, 0x0232B,
+ 0x0237D, 0x0239A,
+ 0x023BE, 0x023CD,
+ 0x023CF, 0x023CF,
+ 0x023D1, 0x023DB,
+ 0x023E2, 0x02422,
+ // Control Pictures (Part of)/Optical Character Recognition/Enclosed Alphanumerics
+ 0x02424, 0x024FF,
+ // Geometric Shapes/Miscellaneous Symbols (Part of)
+ 0x025A0, 0x02619,
+ 0x02620, 0x02767,
+ 0x02776, 0x02793,
+ // Miscellaneous Symbols and Arrows (Part of)
+ 0x02B12, 0x02B2F,
+ 0x02B50, 0x02B59,
+ 0x02BB8, 0x02BFF,
+ // Common CJK
+ 0x02E80, 0x0A4CF,
+ // Hangul Jamo Extended-A
+ 0x0A960, 0x0A97F,
+ // Hangul Syllables/Hangul Jamo Extended-B
+ 0x0AC00, 0x0D7FF,
+ // Private Use Area/CJK Compatibility Ideographs
+ 0x0E000, 0x0FAFF,
+ // Vertical Forms
+ 0x0FE10, 0x0FE1F,
+ // CJK Compatibility Forms (Part of)
+ 0x0FE30, 0x0FE48,
+ // Small Form Variants (Part of)
+ 0x0FE50, 0x0FE57,
+ 0x0FE59, 0x0FE62,
+ 0x0FE67, 0x0FE6F,
+ // Halfwidth and Fullwidth Forms
+ 0x0FF01, 0x0FF0C,
+ 0x0FF0E, 0x0FF1B,
+ 0x0FF1F, 0x0FF60,
+ 0x0FFE0, 0x0FFE7,
+ // Specials (Part of)
+ 0x0FFF0, 0x0FFF8,
+ 0x0FFFC, 0x0FFFD,
+ // Meroitic Hieroglyphs
+ 0x10980, 0x1099F,
+ // Siddham
+ 0x11580, 0x115FF,
+ // Egyptian Hieroglyphs
+ 0x13000, 0x1342F,
+ // Kana Supplement
+ 0x1B000, 0x1B0FF,
+ // Byzantine Musical Symbols/Musical Symbols
+ 0x1D000, 0x1D1FF,
+ // Tai Xuan Jing Symbols/Counting Rod Numerals
+ 0x1D300, 0x1D37F,
+ // Mahjong Tiles/Domino Tiles/Playing Cards/Enclosed Alphanumeric Supplement
+ // Enclosed Ideographic Supplement/Miscellaneous Symbols and Pictographs
+ // Emoticons/Ornamental Dingbats/Transport and Map Symbols/Alchemical Symbols
+ // Geometric Shapes Extended
+ 0x1F000, 0x1F7FF,
+ // CJK Unified Ideographs Extension B/C/D
+ // CJK Compatibility Ideographs Supplement
+ 0x20000, 0x2FFFD,
+ 0x30000, 0x3FFFD,
+ // Supplementary Private Use Area-A
+ 0xF0000, 0xFFFFD,
+ // Supplementary Private Use Area-B
+ 0x100000, 0x10FFFD,
+};
+
+using CharacterPropertiesType = uint8_t; // Underlying integer type of CharacterProperties.
+
+enum class CharacterProperties : CharacterPropertiesType { // Bit flags: each property owns one bit so a single value can carry several.
+ isCJKIdeographOrSymbol = 0x0001, // Set by createTrie() from cjkIdeographRanges, cjkSymbolRanges and cjkIsolatedSymbolsArray.
+ isUprightInMixedVertical = 0x0002, // Set by createTrie() from isUprightInMixedVerticalRanges and isUprightInMixedVerticalArray.
+};
+
+// Bitwise operators for CharacterProperties. The enum is scoped, so the
+// built-in operators do not apply; these cast to the underlying integer type,
+// combine the bits, and cast back.
+
+// Returns the union of the flag bits in |a| and |b|.
+inline CharacterProperties operator | (CharacterProperties a, CharacterProperties b)
+{
+    return static_cast<CharacterProperties>(static_cast<CharacterPropertiesType>(a) | static_cast<CharacterPropertiesType>(b));
+}
+
+// Returns the flag bits that are set in both |a| and |b|.
+inline CharacterProperties operator & (CharacterProperties a, CharacterProperties b)
+{
+    return static_cast<CharacterProperties>(static_cast<CharacterPropertiesType>(a) & static_cast<CharacterPropertiesType>(b));
+}
+
+// Adds the flag bits of |b| to |a| in place. Returns a reference to |a|, as
+// the built-in compound assignment operators do, instead of a copy.
+inline CharacterProperties& operator |= (CharacterProperties& a, CharacterProperties b)
+{
+    a = a | b;
+    return a;
+}
+
+const UChar32 maxCodePointForPropertyValues = 0x10FFFD; // Largest code point the property table holds; createTrie() sizes its table as this + 1, and setRanges()/setValues() assert against it.
drott 2016/01/14 12:10:53 As far as I can see this is only used for the asse
kojii 2016/01/14 15:34:33 Couldn't find that definition, thanks!
+
+// ORs |value| into values[c] for every code point c covered by |ranges|, a
+// flattened list of |length| / 2 inclusive [first, last] pairs. |values| is
+// indexed directly by code point.
+static void setRanges(CharacterProperties* values, const UChar32* ranges, size_t length, CharacterProperties value)
drott 2016/01/14 12:10:53 If I understand correctly, the CharacterProperties
kojii 2016/01/14 15:34:33 It's 1.1MB because CharacterProperties is uint8_t.
+{
+    // Pairs only: an odd entry count means a range is missing its end point.
+    ASSERT(length % 2 == 0);
+    const UChar32* const rangesEnd = ranges + length;
+    for (const UChar32* pair = ranges; pair != rangesEnd; pair += 2) {
+        const UChar32 first = pair[0];
+        const UChar32 last = pair[1];
+        ASSERT(first <= last && last <= maxCodePointForPropertyValues);
+        for (UChar32 codePoint = first; codePoint <= last; ++codePoint)
+            values[codePoint] |= value;
+    }
+}
+
+// ORs |value| into values[c] for each of the |length| individual code points
+// listed at |begin|. |values| is indexed directly by code point.
+static void setValues(CharacterProperties* values, const UChar32* begin, size_t length, CharacterProperties value)
+{
+    for (size_t i = 0; i < length; ++i) {
+        const UChar32 codePoint = begin[i];
+        ASSERT(codePoint <= maxCodePointForPropertyValues);
+        values[codePoint] |= value;
+    }
+}
+
+// Builds the frozen UTrie2 that maps each code point to its
+// CharacterProperties bits. The caller receives the raw trie pointer;
+// nothing in this function releases it.
+static UTrie2* createTrie()
+{
+    // Create a value array of all possible code points. The source tables
+    // overlap (e.g. 0x2E80 is in both cjkIdeographRanges and
+    // isUprightInMixedVerticalRanges), so the flags are OR-ed together here
+    // before being copied into the trie.
+    const UChar32 size = maxCodePointForPropertyValues + 1;
+    OwnPtr<CharacterProperties[]> values = adoptArrayPtr(new CharacterProperties[size]);
+    memset(values.get(), 0, sizeof(CharacterProperties) * size);
+    setRanges(values.get(), cjkIdeographRanges, WTF_ARRAY_LENGTH(cjkIdeographRanges),
+        CharacterProperties::isCJKIdeographOrSymbol);
+    setRanges(values.get(), cjkSymbolRanges, WTF_ARRAY_LENGTH(cjkSymbolRanges),
+        CharacterProperties::isCJKIdeographOrSymbol);
+    setValues(values.get(), cjkIsolatedSymbolsArray, WTF_ARRAY_LENGTH(cjkIsolatedSymbolsArray),
+        CharacterProperties::isCJKIdeographOrSymbol);
+    setRanges(values.get(), isUprightInMixedVerticalRanges, WTF_ARRAY_LENGTH(isUprightInMixedVerticalRanges),
+        CharacterProperties::isUprightInMixedVertical);
+    setValues(values.get(), isUprightInMixedVerticalArray, WTF_ARRAY_LENGTH(isUprightInMixedVerticalArray),
+        CharacterProperties::isUprightInMixedVertical);
+
+    // Create a Trie from the value array. Both the initial value and the
+    // error value of the trie are 0, i.e. "no properties".
+    UErrorCode error = U_ZERO_ERROR;
+    UTrie2* trie = utrie2_open(0, 0, &error);
+    ASSERT(U_SUCCESS(error));
+
+    // Scan for runs of equal values and store each non-zero run with a single
+    // range call. Zero runs are skipped because 0 is already the initial
+    // value. The loop deliberately steps one past the end (c == size) so that
+    // the final run is flushed too.
+    UChar32 start = 0;
+    CharacterProperties value = values[0];
+    for (UChar32 c = 1; ; c++) {
+        if (c < size && values[c] == value)
+            continue;
+        if (static_cast<uint32_t>(value)) {
+            utrie2_setRange32(trie, start, c - 1, static_cast<uint32_t>(value), TRUE, &error);
+            ASSERT(U_SUCCESS(error));
+        }
+        if (c >= size)
+            break;
+        start = c;
+        value = values[c];
+    }
+
+    // Freeze for lookup with 16-bit values, which is what UTRIE2_GET16 needs.
+    utrie2_freeze(trie, UTrie2ValueBits::UTRIE2_16_VALUE_BITS, &error);
+    ASSERT(U_SUCCESS(error));
+    return trie;
+}
+
+// Returns true if the trie entry for |c| has the |property| bit set. The trie
+// is built by createTrie() on the first call and reused by later calls.
+// NOTE(review): the lazy initialization is unsynchronized; confirm that all
+// callers run on a single thread.
+static bool hasCharacterProperty(UChar32 c, CharacterProperties property)
+{
+    static UTrie2* trie = nullptr;
+    if (!trie)
+        trie = createTrie();
+    const CharacterPropertiesType mask = static_cast<CharacterPropertiesType>(property);
+    return (UTRIE2_GET16(trie, c) & mask) != 0;
+}
+
// Takes a flattened list of closed intervals
template <class T, size_t size>
bool valueInIntervalList(const T (&intervalList)[size], const T& value)
@@ -177,163 +447,7 @@ CodePath Character::characterRangeCodePath(const UChar* characters, unsigned len
bool Character::isUprightInMixedVertical(UChar32 character)
{
- // Fast path for common non-CJK
- if (character < 0x000A7)
- return false;
-
- // Fast path for common CJK
- if (isInRange(character, 0x02E80, 0x0A4CF))
- return true;
-
- if (isInRange(character, 0x0FF01, 0x0FFE7)) {
- if (character <= 0x0FF0C || isInRange(character, 0x0FF0E, 0x0FF1B)
- || isInRange(character, 0x0FF1F, 0x0FF60) || character >= 0x0FFE0)
- return true;
- return false;
- }
-
- // Fast path for medium-common non-CJK
- if (character == 0x000A7 || character == 0x000A9 || character == 0x000AE)
- return true;
- if (character == 0x000B1 || character == 0x000BC || character == 0x000BD || character == 0x000BE)
- return true;
- if (character == 0x000D7 || character == 0x000F7)
- return true;
- if (character < 0x002EA)
- return false;
-
- static const UChar32 uprightRanges[] = {
- // Spacing Modifier Letters (Part of)
- 0x002EA, 0x002EB,
- // Hangul Jamo
- 0x01100, 0x011FF,
- // Unified Canadian Aboriginal Syllabics
- 0x01401, 0x0167F,
- // Unified Canadian Aboriginal Syllabics Extended
- 0x018B0, 0x018FF,
- // General Punctuation (Part of)
- 0x02016, 0x02016,
- 0x02020, 0x02021,
- 0x02030, 0x02031,
- 0x0203B, 0x0203C,
- 0x02042, 0x02042,
- 0x02047, 0x02049,
- 0x02051, 0x02051,
- 0x02065, 0x02069,
- // Combining Diacritical Marks for Symbols (Part of)
- 0x020DD, 0x020E0,
- 0x020E2, 0x020E4,
- // Letterlike Symbols (Part of)/Number Forms
- 0x02100, 0x02101,
- 0x02103, 0x02109,
- 0x0210F, 0x0210F,
- 0x02113, 0x02114,
- 0x02116, 0x02117,
- 0x0211E, 0x02123,
- 0x02125, 0x02125,
- 0x02127, 0x02127,
- 0x02129, 0x02129,
- 0x0212E, 0x0212E,
- 0x02135, 0x0213F,
- 0x02145, 0x0214A,
- 0x0214C, 0x0214D,
- 0x0214F, 0x0218F,
- // Mathematical Operators (Part of)
- 0x0221E, 0x0221E,
- 0x02234, 0x02235,
- // Miscellaneous Technical (Part of)
- 0x02300, 0x02307,
- 0x0230C, 0x0231F,
- 0x02324, 0x0232B,
- 0x0237D, 0x0239A,
- 0x023BE, 0x023CD,
- 0x023CF, 0x023CF,
- 0x023D1, 0x023DB,
- 0x023E2, 0x02422,
- // Control Pictures (Part of)/Optical Character Recognition/Enclosed Alphanumerics
- 0x02424, 0x024FF,
- // Geometric Shapes/Miscellaneous Symbols (Part of)
- 0x025A0, 0x02619,
- 0x02620, 0x02767,
- 0x02776, 0x02793,
- // Miscellaneous Symbols and Arrows (Part of)
- 0x02B12, 0x02B2F,
- 0x02B50, 0x02B59,
- 0x02BB8, 0x02BFF,
- // Hangul Jamo Extended-A
- 0x0A960, 0x0A97F,
- // Hangul Syllables/Hangul Jamo Extended-B
- 0x0AC00, 0x0D7FF,
- // Private Use Area/CJK Compatibility Ideographs
- 0x0E000, 0x0FAFF,
- // Vertical Forms
- 0x0FE10, 0x0FE1F,
- // CJK Compatibility Forms (Part of)
- 0x0FE30, 0x0FE48,
- // Small Form Variants (Part of)
- 0x0FE50, 0x0FE57,
- 0x0FE59, 0x0FE62,
- 0x0FE67, 0x0FE6F,
- // Specials (Part of)
- 0x0FFF0, 0x0FFF8,
- 0x0FFFC, 0x0FFFD,
- // Meroitic Hieroglyphs
- 0x10980, 0x1099F,
- // Siddham
- 0x11580, 0x115FF,
- // Egyptian Hieroglyphs
- 0x13000, 0x1342F,
- // Kana Supplement
- 0x1B000, 0x1B0FF,
- // Byzantine Musical Symbols/Musical Symbols
- 0x1D000, 0x1D1FF,
- // Tai Xuan Jing Symbols/Counting Rod Numerals
- 0x1D300, 0x1D37F,
- // Mahjong Tiles/Domino Tiles/Playing Cards/Enclosed Alphanumeric Supplement
- // Enclosed Ideographic Supplement/Enclosed Ideographic Supplement
- // Emoticons/Ornamental Dingbats/Transport and Map Symbols/Alchemical Symbols
- // Alchemical Symbols
- 0x1F000, 0x1F7FF,
- // CJK Unified Ideographs Extension B/C/D
- // CJK Compatibility Ideographs Supplement
- 0x20000, 0x2FFFD,
- 0x30000, 0x3FFFD,
- // Supplementary Private Use Area-A
- 0xF0000, 0xFFFFD,
- // Supplementary Private Use Area-B
- 0x100000, 0x10FFFD,
- };
- return valueInIntervalList(uprightRanges, character);
-}
-
-bool Character::isCJKIdeograph(UChar32 c)
-{
- static const UChar32 cjkIdeographRanges[] = {
- // CJK Radicals Supplement and Kangxi Radicals.
- 0x2E80, 0x2FDF,
- // CJK Strokes.
- 0x31C0, 0x31EF,
- // CJK Unified Ideographs Extension A.
- 0x3400, 0x4DBF,
- // The basic CJK Unified Ideographs block.
- 0x4E00, 0x9FFF,
- // CJK Compatibility Ideographs.
- 0xF900, 0xFAFF,
- // CJK Unified Ideographs Extension B.
- 0x20000, 0x2A6DF,
- // CJK Unified Ideographs Extension C.
- // CJK Unified Ideographs Extension D.
- 0x2A700, 0x2B81F,
- // CJK Compatibility Ideographs Supplement.
- 0x2F800, 0x2FA1F
- };
- static size_t cjkIdeographRangesCount = WTF_ARRAY_LENGTH(cjkIdeographRanges);
-
- // Early out
- if (c < cjkIdeographRanges[0] || c > cjkIdeographRanges[cjkIdeographRangesCount - 1])
- return false;
-
- return valueInIntervalList(cjkIdeographRanges, c);
+ return hasCharacterProperty(character, CharacterProperties::isUprightInMixedVertical);
}
bool Character::isCJKIdeographOrSymbol(UChar32 c)
@@ -342,64 +456,7 @@ bool Character::isCJKIdeographOrSymbol(UChar32 c)
if (c < 0x2C7)
return false;
- if (isCJKIdeograph(c))
- return true;
-
- static const UChar32 cjkSymbolRanges[] = {
- 0x2156, 0x215A,
- 0x2160, 0x216B,
- 0x2170, 0x217B,
- 0x23BE, 0x23CC,
- 0x2460, 0x2492,
- 0x249C, 0x24FF,
- 0x25CE, 0x25D3,
- 0x25E2, 0x25E6,
- 0x2600, 0x2603,
- 0x2660, 0x266F,
- 0x2672, 0x267D,
- 0x2776, 0x277F,
- // Ideographic Description Characters, with CJK Symbols and Punctuation, excluding 0x3030.
- // Then Hiragana 0x3040 .. 0x309F, Katakana 0x30A0 .. 0x30FF, Bopomofo 0x3100 .. 0x312F
- 0x2FF0, 0x302F,
- 0x3031, 0x312F,
- // More Bopomofo and Bopomofo Extended 0x31A0 .. 0x31BF
- 0x3190, 0x31BF,
- // Enclosed CJK Letters and Months (0x3200 .. 0x32FF).
- // CJK Compatibility (0x3300 .. 0x33FF).
- 0x3200, 0x33FF,
- 0xF860, 0xF862,
- // CJK Compatibility Forms.
- 0xFE30, 0xFE4F,
- // Halfwidth and Fullwidth Forms
- // Usually only used in CJK
- 0xFF00, 0xFF0C,
- 0xFF0E, 0xFF1A,
- 0xFF1F, 0xFFEF,
- // Emoji.
- 0x1F110, 0x1F129,
- 0x1F130, 0x1F149,
- 0x1F150, 0x1F169,
- 0x1F170, 0x1F189,
- 0x1F200, 0x1F6FF
- };
-
- if (c >= cjkSymbolRanges[0]
- && c <= cjkSymbolRanges[WTF_ARRAY_LENGTH(cjkSymbolRanges) - 1]
- && valueInIntervalList(cjkSymbolRanges, c)) {
- return true;
- }
-
- if (c < 0x2020 && c > 0x2D9)
- return false;
-
- // Hash lookup for isolated symbols (those not part of a contiguous range)
- static HashSet<UChar32>* cjkIsolatedSymbols = 0;
- if (!cjkIsolatedSymbols) {
- cjkIsolatedSymbols = new HashSet<UChar32>();
- for (size_t i = 0; i < WTF_ARRAY_LENGTH(cjkIsolatedSymbolsArray); ++i)
- cjkIsolatedSymbols->add(cjkIsolatedSymbolsArray[i]);
- }
- return cjkIsolatedSymbols->contains(c);
+ return hasCharacterProperty(c, CharacterProperties::isCJKIdeographOrSymbol);
}
unsigned Character::expansionOpportunityCount(const LChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion, const TextJustify textJustify)
« no previous file with comments | « third_party/WebKit/Source/platform/fonts/Character.h ('k') | third_party/WebKit/Source/platform/fonts/FontTest.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698