Skip to content

Commit da7185b

Browse files
committed
misc.ts: re-implemented compareUtf8Strings() based on the greatly-improved algorithm from firebase/firebase-android-sdk#7098
1 parent ab5c2a0 commit da7185b

File tree

2 files changed

+42
-51
lines changed

2 files changed

+42
-51
lines changed

.changeset/twelve-walls-exist.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@firebase/firestore': patch
3+
---
4+
5+
Further improved the performance of the UTF-8 string ordering logic. Performance had degraded in v11.3.0; the logic was reverted in v11.3.1 and re-introduced with some improvements in v11.5.0.

packages/firestore/src/util/misc.ts

Lines changed: 37 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
*/
1717

1818
import { randomBytes } from '../platform/random_bytes';
19-
import { newTextEncoder } from '../platform/text_serializer';
2019

2120
import { debugAssert } from './assert';
2221

@@ -77,63 +76,50 @@ export interface Equatable<T> {
7776

7877
/** Compare strings in UTF-8 encoded byte order */
7978
export function compareUtf8Strings(left: string, right: string): number {
80-
let i = 0;
81-
while (i < left.length && i < right.length) {
82-
const leftCodePoint = left.codePointAt(i)!;
83-
const rightCodePoint = right.codePointAt(i)!;
84-
85-
if (leftCodePoint !== rightCodePoint) {
86-
if (leftCodePoint < 128 && rightCodePoint < 128) {
87-
// ASCII comparison
88-
return primitiveComparator(leftCodePoint, rightCodePoint);
89-
} else {
90-
// Lazy instantiate TextEncoder
91-
const encoder = newTextEncoder();
92-
93-
// UTF-8 encode the character at index i for byte comparison.
94-
const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i));
95-
const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i));
96-
97-
const comp = compareByteArrays(leftBytes, rightBytes);
98-
if (comp !== 0) {
99-
return comp;
100-
} else {
101-
// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte
102-
// representations are identical. This can happen with malformed input
103-
// (invalid surrogate pairs). The backend also actively prevents invalid
104-
// surrogates as INVALID_ARGUMENT errors, so we almost never receive
105-
// invalid strings from backend.
106-
// Fallback to code point comparison for graceful handling.
107-
return primitiveComparator(leftCodePoint, rightCodePoint);
108-
}
109-
}
79+
// Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
80+
// if found, use that character to determine the relative ordering of the two strings as a
81+
// whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
82+
// comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
83+
// and UTF-16 happen to represent Unicode code points.
84+
//
85+
// After finding the first pair of differing characters, there are two cases:
86+
//
87+
// Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
88+
// both are surrogates from a surrogate pair (that collectively represent code points greater
89+
// than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
90+
// lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
91+
// sufficient.
92+
//
93+
// Case 2: One character is a surrogate and the other is not. In this case the surrogate-
94+
// containing string is always ordered after the string containing the non-surrogate. This is because surrogates are
95+
// used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
96+
// and are lexicographically greater than the 1, 2, or 3-byte representations of code points
97+
// less than or equal to 0xFFFF.
98+
const length = Math.min(left.length, right.length);
99+
for (let i = 0; i < length; i++) {
100+
const leftChar = left.charAt(i);
101+
const rightChar = right.charAt(i);
102+
if (leftChar !== rightChar) {
103+
return isSurrogate(leftChar) === isSurrogate(rightChar)
104+
? primitiveComparator(leftChar, rightChar)
105+
: isSurrogate(leftChar)
106+
? 1
107+
: -1;
110108
}
111-
// Increment by 2 for surrogate pairs, 1 otherwise
112-
i += leftCodePoint > 0xffff ? 2 : 1;
113109
}
114110

115-
// Compare lengths if all characters are equal
111+
// Use the lengths of the strings to determine the overall comparison result since either the
112+
// strings were equal or one is a prefix of the other.
116113
return primitiveComparator(left.length, right.length);
117114
}
118115

119-
function getUtf8SafeSubstring(str: string, index: number): string {
120-
const firstCodePoint = str.codePointAt(index)!;
121-
if (firstCodePoint > 0xffff) {
122-
// It's a surrogate pair, return the whole pair
123-
return str.substring(index, index + 2);
124-
} else {
125-
// It's a single code point, return it
126-
return str.substring(index, index + 1);
127-
}
128-
}
116+
const MIN_SURROGATE = 0xd800;
117+
const MAX_SURROGATE = 0xdfff;
129118

130-
function compareByteArrays(left: Uint8Array, right: Uint8Array): number {
131-
for (let i = 0; i < left.length && i < right.length; ++i) {
132-
if (left[i] !== right[i]) {
133-
return primitiveComparator(left[i], right[i]);
134-
}
135-
}
136-
return primitiveComparator(left.length, right.length);
119+
export function isSurrogate(s: string): boolean {
120+
debugAssert(s.length === 1, `s.length == ${s.length}, but expected 1`);
121+
const c = s.charCodeAt(0);
122+
return c >= MIN_SURROGATE && c <= MAX_SURROGATE;
137123
}
138124

139125
export interface Iterable<V> {

0 commit comments

Comments
 (0)