-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild.rs
148 lines (131 loc) · 5.22 KB
/
build.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
//! Generates code from entities.json depending on enabled features.
//!
//! The canonical source is <https://html.spec.whatwg.org/entities.json> (see
//! <https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references>).
//!
//! The entities.json file looks like:
//!
//! {
//! "Æ": { "codepoints": [198], "characters": "\u00C6" },
//! . . .
//! }
fn main() {
#[cfg(any(feature = "unescape_fast", feature = "entities"))]
let entities = load_entities("entities.json");
#[cfg(feature = "unescape_fast")]
generate_matcher_rs(&entities);
#[cfg(feature = "entities")]
generate_entities_rs(&entities);
}
/// Generate entities.rs file containing all valid HTML entities in a
/// [`phf::Map`] along with a few useful constants. It also generates
/// documentation with all entities in a table.
#[cfg(feature = "entities")]
fn generate_entities_rs(entities: &[(String, String)]) {
use std::cmp::{max, min};
use std::env;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;
let out_path = Path::new(&env::var("OUT_DIR").unwrap()).join("entities.rs");
let mut out = BufWriter::new(File::create(out_path).unwrap());
writeln!(out, "\
#[allow(clippy::doc_markdown)] // Doesn’t work correctly here.\n\
/// A map of all valid HTML entities to their expansions.\n\
///\n\
/// The keys of the map are full entity byte strings, e.g. `b\"©\"`, and the\n\
/// values are their expansions, e.g. `b\"©\"`.\n\
///\n\
/// See the [WHATWG HTML spec][spec] for the canonical list of entities with\n\
/// their codepoints and glyphs. The [entities.json][] file linked there is\n\
/// used to generate this constant.\n\
///\n\
/// [spec]: https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references\n\
/// [entities.json]: https://html.spec.whatwg.org/entities.json\n\
///\n\
/// Entity | Codepoints | Glyph\n\
/// -------------------------------|--------------------|------").unwrap();
let mut map_builder = phf_codegen::Map::<&[u8]>::new();
let mut max_len: usize = 0;
let mut min_len: usize = usize::MAX;
for (name, glyph) in entities {
map_builder.entry(name.as_bytes(), &format!("&{:?}", glyph.as_bytes()));
max_len = max(max_len, name.len());
min_len = min(min_len, name.len());
// `{:28}` would pad the output inside the backticks.
let name = format!("`{name}`");
let codepoints = glyph
.chars()
.map(|c| format!("U+{:06X}", u32::from(c)))
.collect::<Vec<_>>()
.join(", ");
// Suppress a few inconvenient glyphs. Newline adds an extra line, and
// tab causes a clippy warning. Backticks are actually fine, but it’s
// correct to escape them.
let glyph = match glyph.as_str() {
"\n" | "\t" => "",
"`" => "\\`",
v => v,
};
writeln!(out, "/// {name:30} | {codepoints:18} | {glyph}",).unwrap();
}
let map = map_builder.build();
writeln!(
out,
"\
#[allow(clippy::unreadable_literal)]\n\
pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map};\n\
\n\
/// Length of longest entity including ‘&’ and possibly ‘;’.\n\
pub const ENTITY_MAX_LENGTH: usize = {max_len};\n\
\n\
/// Length of shortest entity including ‘&’ and possibly ‘;’.\n\
pub const ENTITY_MIN_LENGTH: usize = {min_len};"
)
.unwrap();
}
/// Generated matcher.rs file containing a function `entity_matcher()` that is
/// basically just a giant nested tree of `match` expressions to check if the
/// next bytes in an iterator are an HTML entity.
#[cfg(feature = "unescape_fast")]
fn generate_matcher_rs(entities: &[(String, String)]) {
use std::env;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;
let out_path = Path::new(&env::var("OUT_DIR").unwrap()).join("matcher.rs");
let mut out = BufWriter::new(File::create(out_path).unwrap());
let mut matcher = matchgen::TreeMatcher::new(
"fn entity_matcher",
"(bool, &'static [u8])",
);
for (name, glyph) in entities {
matcher.add(
name.as_bytes(),
format!("({:?}, &{:?})", name.ends_with(';'), glyph.as_bytes()),
);
}
matcher
.doc("Used in `match_entity()`.")
.disable_clippy(true)
.input_type(matchgen::Input::Iterator)
.render(&mut out)
.unwrap();
writeln!(out).unwrap();
}
/// Load HTML entities as `vec![...(">", ">")...]`.
#[cfg(any(feature = "unescape_fast", feature = "entities"))]
fn load_entities<P: AsRef<std::path::Path>>(path: P) -> Vec<(String, String)> {
let input = std::fs::read(path.as_ref()).unwrap();
let input: serde_json::Map<String, serde_json::Value> =
serde_json::from_slice(&input).unwrap();
input
.iter()
.map(|(name, info)| {
(
name.clone(),
info["characters"].as_str().unwrap().to_owned(),
)
})
.collect()
}