Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(262)

Unified Diff: third_party/WebKit/Source/core/editing/EditingUtilities.cpp

Issue 1833413002: [All-in-one patch] Implement own grapheme boundary breaker for editing. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addressed comments Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/editing/EditingUtilities.cpp
diff --git a/third_party/WebKit/Source/core/editing/EditingUtilities.cpp b/third_party/WebKit/Source/core/editing/EditingUtilities.cpp
index a5748f859505025388e33c7a977601d7afb6f13f..3b50f461edc0b501293681dfbc84d596cd3360a5 100644
--- a/third_party/WebKit/Source/core/editing/EditingUtilities.cpp
+++ b/third_party/WebKit/Source/core/editing/EditingUtilities.cpp
@@ -52,10 +52,15 @@
#include "core/html/HTMLUListElement.h"
#include "core/layout/LayoutObject.h"
#include "core/layout/LayoutTableCell.h"
+#include "platform/fonts/Character.h"
+#include "third_party/icu/source/common/unicode/uchar.h"
+#include "third_party/icu/source/common/unicode/utf16.h"
#include "wtf/Assertions.h"
#include "wtf/StdLibExtras.h"
#include "wtf/text/StringBuilder.h"
+#include <algorithm>
+
namespace blink {
using namespace HTMLNames;
@@ -542,18 +547,155 @@ PositionInFlatTree lastEditablePositionBeforePositionInRoot(const PositionInFlat
return lastEditablePositionBeforePositionInRootAlgorithm<EditingInFlatTreeStrategy>(position, highestRoot);
}
+// Returns true if the code point has Glue_After_Zwj grapheme break property.
+// See http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakProperty-9.0.0d18.txt
+bool isGlueAfterZwj(uint32_t codePoint)
+{
+ return codePoint == 0x2764 // HEAVY BLACK HEART.
+ || codePoint == 0x1F48B // KISS MARK.
+ || codePoint == 0x1F5E8; // LEFT SPEECH BUBBLE.
+}
+
+// Returns true if the code point has E_Base_GAZ grapheme break property.
+// See http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakProperty-9.0.0d18.txt
+bool isEBaseGAZ(uint32_t codePoint)
+{
+ return codePoint >= 0x1F466 && codePoint <= 0x1F469;
+}
+
+// The list of code points which have the Indic_Syllabic_Category=Virama property.
+// Must be sorted.
+const uint32_t kIndicSyllabicCategoryViramaList[] = {
+ 0x094D, 0x09CD, 0x0A4D, 0x0ACD, 0x0B4D, 0x0BCD, 0x0C4D, 0x0CCD, 0x0D4D, 0x0DCA, 0x1B44, 0xA8C4,
+ 0xA9C0, 0x11046, 0x110B9, 0x111C0, 0x11235, 0x1134D, 0x11442, 0x114C2, 0x115BF, 0x1163F,
+ 0x116B6, 0x11C3F,
+};
+
+// Returns true if the code point has Indic_Syllabic_Category=Virama property.
+bool isIndicSyllabicCategoryVirama(uint32_t codePoint)
+{
+ const int length = WTF_ARRAY_LENGTH(kIndicSyllabicCategoryViramaList);
+ return std::binary_search(kIndicSyllabicCategoryViramaList,
+ kIndicSyllabicCategoryViramaList + length,
+ codePoint);
+}
+
+bool isGraphemeBreak(const uint16_t* buf, int length, int offset)
yosin_UTC9 2016/03/29 01:27:30 Can we make this in state machine as BackspaceStat
+{
+ DCHECK_GE(length, 0);
+ DCHECK_GE(offset, 0);
+ DCHECK_LE(offset, length);
+ // The following breaking rules come from Unicode Standard Annex #29 on Unicode Text Segmentation.
+ // See http://www.unicode.org/reports/tr29/
+ // Note that some of the rules are still proposed. Also see http://www.unicode.org/reports/tr29/proposed.html
+ const uint32_t ZWJ = 0x200D;
+
+ // Rule1 GB1 sot ÷
+ // Rule2 GB2 ÷ eot
+ if (offset <= 0 || offset >= length)
+ return true;
+ if (U16_IS_TRAIL(buf[offset])) {
+ // Don't break a surrogate pair, but break lonely trailing surrogate.
+ return !U16_IS_LEAD(buf[offset - 1]);
+ }
+ uint32_t prevCodePoint = 0;
+ uint32_t nextCodePoint = 0;
+ int offsetBack = offset;
+ U16_PREV(buf, 0, offsetBack, prevCodePoint);
+ U16_NEXT(buf, offset, length, nextCodePoint);
+ int prevProp = u_getIntPropertyValue(prevCodePoint, UCHAR_GRAPHEME_CLUSTER_BREAK);
+ int nextProp = u_getIntPropertyValue(nextCodePoint, UCHAR_GRAPHEME_CLUSTER_BREAK);
+
+ // Rule GB3, CR x LF
+ if (prevProp == U_GCB_CR && nextProp == U_GCB_LF)
+ return false;
+
+ // Rule GB4, (Control | CR | LF) ÷
+ if (prevProp == U_GCB_CONTROL || prevProp == U_GCB_CR || prevProp == U_GCB_LF)
+ return true;
+
+ // Rule GB5, ÷ (Control | CR | LF)
+ if (nextProp == U_GCB_CONTROL || nextProp == U_GCB_CR || nextProp == U_GCB_LF)
+ return true;
+
+ // Rule GB6, L x (L | V | LV | LVT)
+ if (prevProp == U_GCB_L
+ && (nextProp == U_GCB_L || nextProp == U_GCB_V || nextProp == U_GCB_LV || nextProp == U_GCB_LVT))
+ return false;
+
+ // Rule GB7, (LV | V) x (V | T)
+ if ((prevProp == U_GCB_LV || prevProp == U_GCB_V) && (nextProp == U_GCB_V || nextProp == U_GCB_T))
+ return false;
+
+ // Rule GB8, (LVT | T) x T
+ if ((prevProp == U_GCB_LVT || prevProp == U_GCB_T) && nextProp == U_GCB_T)
+ return false;
+
+ // Rule GB8a
+ //
+ // sot (RI RI)* RI x RI
+ // [^RI] (RI RI)* RI x RI
+ // RI ÷ RI
+ if (Character::isRegionalIndicator(prevCodePoint) && Character::isRegionalIndicator(nextCodePoint)) {
+ while (offsetBack > 0) {
+ U16_PREV(buf, 0, offsetBack, prevCodePoint);
+ if (!Character::isRegionalIndicator(prevCodePoint)) {
+ offsetBack += U16_LENGTH(prevCodePoint);
+ break;
+ }
+ }
+
+ // Note that the offset has moved forward 2 code units by U16_NEXT.
+ // The number 4 comes from the number of code units in a whole flag.
+ return (offset - 2 - offsetBack) % 4 == 0;
+ }
+
+ // Rule GB9, x (Extend | ZWJ)
+ // Rule GB9a, x SpacingMark
+ if (nextProp == U_GCB_EXTEND || nextCodePoint == ZWJ || nextProp == U_GCB_SPACING_MARK)
+ return false;
+
+ // Rule GB9b, Prepend x
+ if (prevProp == U_GCB_PREPEND)
+ return false;
+
+ // Cluster Indic syllables together.
+ if (isIndicSyllabicCategoryVirama(prevCodePoint)
+ && u_getIntPropertyValue(nextCodePoint, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER)
+ return false;
+
+ // Proposed Rule GB10, (E_Base | EBG) x E_Modifier
+ if ((Character::isEmojiModifierBase(prevCodePoint) || isEBaseGAZ(prevCodePoint))
+ && Character::isModifier(nextCodePoint))
+ return false;
+
+ // Proposed Rule GB11, ZWJ x (Glue_After_Zwj | EBG)
+ if (prevCodePoint == ZWJ && (isGlueAfterZwj(nextCodePoint) || isEBaseGAZ(nextCodePoint)))
+ return false;
+
+ // Rule GB999 any ÷ any
+ return true;
+}
+
int uncheckedPreviousOffset(const Node* node, int current)
{
+ DCHECK_GE(current, 0);
if (!node->isTextNode())
return current - 1;
+ if (current <= 1)
+ return current - 1; // It's fine to return -1;
const String& text = toText(node)->data();
- if (text.is8Bit())
- return current - 1; // TODO(nona): Good to support CR x LF.
- TextBreakIterator* iterator = cursorMovementIterator(text.characters16(), text.length());
- if (!iterator)
- return current - 1;
- const int result = iterator->preceding(current);
- return result == TextBreakDone ? current - 1 : result;
+ if (text.is8Bit()) {
+ const uint8_t* buf = text.characters8();
+ --current;
+ if (buf[current - 1] == 0x0D && buf[current] == 0x0A)
+ --current;
+ return current;
+ }
+ const uint16_t* buf = text.characters16();
+ const int length = text.length();
+ while (!isGraphemeBreak(buf, length, --current)) {}
+ return current;
}
static int uncheckedPreviousOffsetForBackwardDeletion(const Node* n, int current)
@@ -563,16 +705,23 @@ static int uncheckedPreviousOffsetForBackwardDeletion(const Node* n, int current
int uncheckedNextOffset(const Node* node, int current)
{
+ DCHECK_GE(current, 0);
if (!node->isTextNode())
return current + 1;
const String& text = toText(node)->data();
- if (text.is8Bit())
- return current + 1; // TODO(nona): Good to support CR x LF.
- TextBreakIterator* iterator = cursorMovementIterator(text.characters16(), text.length());
- if (!iterator)
- return current + 1;
- const int result = iterator->following(current);
- return result == TextBreakDone ? current + 1 : result;
+ const int length = text.length();
+ if (current >= (length - 1))
+ return current + 1; // It's fine to return length + 1;
+ if (text.is8Bit()) {
+ const uint8_t* buf = text.characters8();
+ ++current;
+ if (buf[current - 1] == 0x0D && buf[current] == 0x0A)
+ ++current;
+ return current;
+ }
+ const uint16_t* buf = text.characters16();
+ while (!isGraphemeBreak(buf, length, ++current)) {}
+ return current;
}
template <typename Strategy>
« no previous file with comments | « third_party/WebKit/Source/core/DEPS ('k') | third_party/WebKit/Source/core/editing/EditingUtilitiesTest.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698