|
| 1 | +use crate::fmt_list; |
| 2 | +use crate::raw_emitter::RawEmitter; |
| 3 | +use std::collections::HashMap; |
| 4 | +use std::fmt::Write as _; |
| 5 | +use std::ops::Range; |
| 6 | + |
| 7 | +impl RawEmitter { |
| 8 | + pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool { |
| 9 | + let mut map: [u8; 256] = [ |
| 10 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 11 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 12 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 13 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 14 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 15 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 16 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 17 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 18 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 19 | + ]; |
| 20 | + |
| 21 | + let points = ranges |
| 22 | + .iter() |
| 23 | + .flat_map(|r| (r.start..r.end).into_iter().collect::<Vec<u32>>()) |
| 24 | + .collect::<Vec<u32>>(); |
| 25 | + |
| 26 | + println!("there are {} points", points.len()); |
| 27 | + |
| 28 | + // how many distinct ranges need to be counted? |
| 29 | + let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new(); |
| 30 | + for point in points { |
| 31 | + // assert that there is no whitespace over the 0x3000 range. |
| 32 | + assert!(point <= 0x3000, "the highest unicode whitespace value has changed"); |
| 33 | + let high_bytes = point as usize >> 8; |
| 34 | + let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new); |
| 35 | + codepoints.push(point); |
| 36 | + } |
| 37 | + |
| 38 | + let mut bit_for_high_byte = 1u8; |
| 39 | + let mut arms = Vec::<String>::new(); |
| 40 | + |
| 41 | + let mut high_bytes: Vec<usize> = |
| 42 | + codepoints_by_high_bytes.keys().map(|k| k.clone()).collect(); |
| 43 | + high_bytes.sort(); |
| 44 | + for high_byte in high_bytes { |
| 45 | + let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap(); |
| 46 | + if codepoints.len() == 1 { |
| 47 | + let ch = codepoints.pop().unwrap(); |
| 48 | + arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch)); |
| 49 | + continue; |
| 50 | + } |
| 51 | + // more than 1 codepoint in this arm |
| 52 | + for codepoint in codepoints { |
| 53 | + map[(*codepoint & 0xff) as usize] |= bit_for_high_byte; |
| 54 | + } |
| 55 | + arms.push(format!( |
| 56 | + "{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0", |
| 57 | + high_byte, bit_for_high_byte |
| 58 | + )); |
| 59 | + bit_for_high_byte <<= 1; |
| 60 | + } |
| 61 | + |
| 62 | + writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter())) |
| 63 | + .unwrap(); |
| 64 | + self.bytes_used += 256; |
| 65 | + |
| 66 | + writeln!(&mut self.file, "#[inline]").unwrap(); |
| 67 | + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); |
| 68 | + writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); |
| 69 | + for arm in arms { |
| 70 | + writeln!(&mut self.file, " {},", arm).unwrap(); |
| 71 | + } |
| 72 | + writeln!(&mut self.file, " _ => false,").unwrap(); |
| 73 | + writeln!(&mut self.file, " }}").unwrap(); |
| 74 | + writeln!(&mut self.file, "}}").unwrap(); |
| 75 | + |
| 76 | + true |
| 77 | + } |
| 78 | +} |
0 commit comments