Skip to content

Commit 76f3b89

Browse files
committed
Auto merge of rust-lang#99487 - bmacnaughton:is_whitespace_updates, r=thomcc
is_whitespace() performance improvements
This is my first Rust PR, so if I miss anything obvious please let me know and I'll do my best to fix it. This was a bit more of a challenge than I realized because, while I made working code locally and tested it against the native `is_whitespace()`, this PR required changing `src/tools/unicode-table-generator`, the code that generates the code. I have benchmarked this locally, using criterion, and have seen meaningful performance improvements. I can add those outputs to this if you'd like, but am guessing that the perf run that `@fmease` recommended is what's needed. I have run `./x.py test --stage 0 library/std` after building locally with `./x.py build library`. I didn't try to build the whole compiler, but maybe I should have - any guidance would be appreciated. If this general approach makes sense, I'll take a look at some other candidate categories, e.g., `Cc`, in the future. Oh, and I wasn't sure whether the generated code should be included in this PR or not. I did include it.
2 parents cfb5ae2 + 5d048eb commit 76f3b89

File tree

4 files changed

+113
-12
lines changed

4 files changed

+113
-12
lines changed

library/core/src/unicode/unicode_data.rs

+18-10
Original file line numberDiff line numberDiff line change
@@ -544,18 +544,26 @@ pub mod uppercase {
544544

545545
#[rustfmt::skip]
546546
// Lookup module for the `White_Space` property. In this diff view the lines
// preceded by an `NNN-` marker are the removed `skip_search` tables/call and
// the lines preceded by an `NNN+` marker are the added byte-map implementation.
pub mod white_space {
547-
static SHORT_OFFSET_RUNS: [u32; 4] = [
548-
5760, 18882560, 23080960, 40972289,
549-
];
550-
static OFFSETS: [u8; 21] = [
551-
9, 5, 18, 1, 100, 1, 26, 1, 0, 1, 0, 11, 29, 2, 5, 1, 47, 1, 0, 1, 0,
547+
// Table indexed by the low byte of the code point (`c & 0xff`).
// `lookup` tests bit value 1 when `c >> 8` is 0 and bit value 2 when
// `c >> 8` is 32 (0x20), so one table serves both 256-code-point rows.
static WHITESPACE_MAP: [u8; 256] = [
548+
2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549+
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551+
0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553+
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
554+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
555+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
556+
0, 0, 0, 0, 0, 0, 0, 0, 0,
552557
];
558+
#[inline]
553559
// Returns whether `c` is recorded as whitespace by the data in this module.
pub fn lookup(c: char) -> bool {
554-
super::skip_search(
555-
c as u32,
556-
&SHORT_OFFSET_RUNS,
557-
&OFFSETS,
558-
)
560+
// Dispatch on everything above the low byte. Rows 22 (0x16) and 48 (0x30)
// are each compared against a single code point; rows 0 and 32 consult the
// table; every other row yields `false`.
match c as u32 >> 8 {
561+
0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0,
562+
22 => c as u32 == 0x1680,
563+
32 => WHITESPACE_MAP[c as usize & 0xff] & 2 != 0,
564+
48 => c as u32 == 0x3000,
565+
_ => false,
566+
}
559567
}
560568
}
561569

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
use crate::fmt_list;
2+
use crate::raw_emitter::RawEmitter;
3+
use std::collections::HashMap;
4+
use std::fmt::Write as _;
5+
use std::ops::Range;
6+
7+
impl RawEmitter {
8+
pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
9+
let mut map: [u8; 256] = [
10+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
11+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
12+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
13+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
14+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
15+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
16+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
17+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
18+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19+
];
20+
21+
let points = ranges
22+
.iter()
23+
.flat_map(|r| (r.start..r.end).into_iter().collect::<Vec<u32>>())
24+
.collect::<Vec<u32>>();
25+
26+
println!("there are {} points", points.len());
27+
28+
// how many distinct ranges need to be counted?
29+
let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
30+
for point in points {
31+
// assert that there is no whitespace over the 0x3000 range.
32+
assert!(point <= 0x3000, "the highest unicode whitespace value has changed");
33+
let high_bytes = point as usize >> 8;
34+
let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new);
35+
codepoints.push(point);
36+
}
37+
38+
let mut bit_for_high_byte = 1u8;
39+
let mut arms = Vec::<String>::new();
40+
41+
let mut high_bytes: Vec<usize> =
42+
codepoints_by_high_bytes.keys().map(|k| k.clone()).collect();
43+
high_bytes.sort();
44+
for high_byte in high_bytes {
45+
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
46+
if codepoints.len() == 1 {
47+
let ch = codepoints.pop().unwrap();
48+
arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch));
49+
continue;
50+
}
51+
// more than 1 codepoint in this arm
52+
for codepoint in codepoints {
53+
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
54+
}
55+
arms.push(format!(
56+
"{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0",
57+
high_byte, bit_for_high_byte
58+
));
59+
bit_for_high_byte <<= 1;
60+
}
61+
62+
writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
63+
.unwrap();
64+
self.bytes_used += 256;
65+
66+
writeln!(&mut self.file, "#[inline]").unwrap();
67+
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
68+
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
69+
for arm in arms {
70+
writeln!(&mut self.file, " {},", arm).unwrap();
71+
}
72+
writeln!(&mut self.file, " _ => false,").unwrap();
73+
writeln!(&mut self.file, " }}").unwrap();
74+
writeln!(&mut self.file, "}}").unwrap();
75+
76+
true
77+
}
78+
}

src/tools/unicode-table-generator/src/main.rs

+8-2
Original file line numberDiff line numberDiff line change
@@ -75,12 +75,13 @@ use std::collections::{BTreeMap, HashMap};
7575
use std::ops::Range;
7676
use ucd_parse::Codepoints;
7777

78+
mod cascading_map;
7879
mod case_mapping;
7980
mod raw_emitter;
8081
mod skiplist;
8182
mod unicode_download;
8283

83-
use raw_emitter::{emit_codepoints, RawEmitter};
84+
use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter};
8485

8586
static PROPERTIES: &[&str] = &[
8687
"Alphabetic",
@@ -241,8 +242,13 @@ fn main() {
241242
let mut modules = Vec::new();
242243
for (property, ranges) in ranges_by_property {
243244
let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
245+
244246
let mut emitter = RawEmitter::new();
245-
emit_codepoints(&mut emitter, &ranges);
247+
if property == &"White_Space" {
248+
emit_whitespace(&mut emitter, &ranges);
249+
} else {
250+
emit_codepoints(&mut emitter, &ranges);
251+
}
246252

247253
modules.push((property.to_lowercase().to_string(), emitter.file));
248254
println!(

src/tools/unicode-table-generator/src/raw_emitter.rs

+9
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,15 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
170170
}
171171
}
172172

173+
pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
174+
emitter.blank_line();
175+
176+
let mut cascading = emitter.clone();
177+
cascading.emit_cascading_map(&ranges);
178+
*emitter = cascading;
179+
emitter.desc = String::from("cascading");
180+
}
181+
173182
struct Canonicalized {
174183
canonical_words: Vec<u64>,
175184
canonicalized_words: Vec<(u8, u8)>,

0 commit comments

Comments
 (0)