|
16 | 16 | */
|
17 | 17 |
|
18 | 18 | import { randomBytes } from '../platform/random_bytes';
|
19 |
| -import { newTextEncoder } from '../platform/text_serializer'; |
20 | 19 |
|
21 | 20 | import { debugAssert } from './assert';
|
22 | 21 |
|
@@ -77,63 +76,50 @@ export interface Equatable<T> {
|
77 | 76 |
|
/**
 * Compares two strings in the byte order of their UTF-8 encodings, without
 * actually encoding either string.
 *
 * The strings are scanned for the first index at which their UTF-16 code units
 * differ, and that single pair decides the ordering of the strings as a whole:
 *
 * - Both code units are surrogates, or neither is: their order as UTF-16 code
 *   units is the same as the lexicographical order of the corresponding UTF-8
 *   byte sequences, so a direct comparison is sufficient.
 * - Exactly one code unit is a surrogate: the string containing the surrogate
 *   is ordered after the other. Surrogates represent code points greater than
 *   0xFFFF, whose 4-byte UTF-8 representations are lexicographically greater
 *   than the 1, 2, or 3-byte representations of code points less than or equal
 *   to 0xFFFF.
 *
 * If no differing code unit is found, the strings are either equal or one is a
 * prefix of the other, and their lengths decide the result.
 *
 * @param left - The first string to compare.
 * @param right - The second string to compare.
 * @returns `1` or `-1` when exactly one of the deciding code units is a
 *   surrogate; otherwise the result of `primitiveComparator` applied to the
 *   deciding pair of code units, or to the two lengths when no pair differs.
 */
export function compareUtf8Strings(left: string, right: string): number {
  const length = Math.min(left.length, right.length);
  for (let i = 0; i < length; i++) {
    const leftChar = left.charAt(i);
    const rightChar = right.charAt(i);
    if (leftChar === rightChar) {
      continue;
    }

    // First differing code unit found: it alone determines the result.
    const leftIsSurrogate = isSurrogate(leftChar);
    if (leftIsSurrogate === isSurrogate(rightChar)) {
      // Same class on both sides, so code unit order matches UTF-8 byte order.
      return primitiveComparator(leftChar, rightChar);
    }

    // Mixed classes: the side holding the surrogate is ordered last.
    return leftIsSurrogate ? 1 : -1;
  }

  // Use the lengths of the strings to determine the overall comparison result
  // since either the strings were equal or one is a prefix of the other.
  return primitiveComparator(left.length, right.length);
}
|
118 | 115 |
|
/** Smallest UTF-16 code unit value that `isSurrogate` treats as a surrogate. */
const MIN_SURROGATE = 0xd800;
/** Largest UTF-16 code unit value that `isSurrogate` treats as a surrogate. */
const MAX_SURROGATE = 0xdfff;

/**
 * Reports whether a single-character string is a UTF-16 surrogate code unit,
 * i.e. whether its char code lies in the inclusive range
 * [`MIN_SURROGATE`, `MAX_SURROGATE`].
 *
 * @param s - The string to test; `debugAssert` is used to check that it
 *   contains exactly one UTF-16 code unit.
 * @returns `true` if the code unit at index 0 is within the surrogate range,
 *   `false` otherwise.
 */
export function isSurrogate(s: string): boolean {
  debugAssert(s.length === 1, `s.length == ${s.length}, but expected 1`);
  const c = s.charCodeAt(0);
  return c >= MIN_SURROGATE && c <= MAX_SURROGATE;
}
|
138 | 124 |
|
139 | 125 | export interface Iterable<V> {
|
|
0 commit comments