11
11
//! sequence of NodeInfos to the different arrays in SerializedDepGraph. Since the
12
12
//! node and edge count are stored at the end of the file, all the arrays can be
13
13
//! pre-allocated with the right length.
14
+ //!
15
+ //! The encoding of the dep-graph is generally designed around the fact that fixed-size
16
+ //! reads of encoded data are generally faster than variable-sized reads. Ergo we adopt
17
+ //! essentially the same varint encoding scheme used in the rmeta format; the edge lists
18
+ //! for each node on the graph store a 2-bit integer which is the number of bytes per edge
19
+ //! index in that node's edge list. We effectively ignore that an edge index of 0 could be
20
+ //! encoded with 0 bytes in order to not require 3 bits to store the byte width of the edges.
21
+ //! The overhead of calculating the correct byte width for each edge is mitigated by
22
+ //! building edge lists with [`EdgesVec`] which keeps a running max of the edges in a node.
23
+ //!
24
+ //! When we decode this data, we do not immediately create [`SerializedDepNodeIndex`] and
25
+ //! instead keep the data in its denser serialized form which lets us turn our on-disk size
26
+ //! efficiency directly into a peak memory reduction. When we convert these encoded-in-memory
27
+ //! values into their fully-deserialized type, we use a fixed-size read of the encoded array
28
+ //! then mask off any errant bytes we read. The array of edge index bytes is padded to permit this.
29
+ //!
30
+ //! We also encode and decode the entire rest of each node using [`SerializedNodeHeader`]
31
+ //! to let this encoding and decoding be done in one fixed-size operation. These headers contain
32
+ //! two [`Fingerprint`]s along with the serialized [`DepKind`], and the number of edge indices
33
+ //! in the node and the number of bytes used to encode the edge indices for this node. The
34
+ //! [`DepKind`], number of edges, and bytes per edge are all bit-packed together, if they fit.
35
+ //! If the number of edges in this node does not fit in the bits available in the header, we
36
+ //! store it directly after the header with leb128.
14
37
15
38
use super :: query:: DepGraphQuery ;
16
39
use super :: { DepKind , DepNode , DepNodeIndex } ;
@@ -37,7 +60,7 @@ const DEP_NODE_SIZE: usize = std::mem::size_of::<SerializedDepNodeIndex>();
37
60
/// Amount of padding we need to add to the edge list data so that we can retrieve every
38
61
/// SerializedDepNodeIndex with a fixed-size read then mask.
39
62
// A fixed-size read covers DEP_NODE_SIZE bytes, but an encoded edge index may occupy as little as
// one byte, so a read that starts at the last encoded edge can extend up to DEP_NODE_SIZE - 1
// bytes past the end of the real data; that is the amount of padding reserved here.
const DEP_NODE_PAD : usize = DEP_NODE_SIZE - 1 ;
40
- /// Amount of bits we need to store the number of used bytes in a SerializedDepNodeIndex.
63
+ /// Number of bits we need to store the number of used bytes in a SerializedDepNodeIndex.
41
64
/// Note that wherever we encode byte widths like this we actually store the number of bytes used
42
65
/// minus 1; for a 4-byte value we technically would have 5 widths to store, but using one byte to
43
66
/// store zeroes (which are relatively rare) is a decent tradeoff to save a bit in our bitfields.
@@ -181,8 +204,15 @@ impl<'a, K: DepKind + Decodable<MemDecoder<'a>>> Decodable<MemDecoder<'a>>
181
204
let mut nodes = IndexVec :: with_capacity ( node_count) ;
182
205
let mut fingerprints = IndexVec :: with_capacity ( node_count) ;
183
206
let mut edge_list_indices = IndexVec :: with_capacity ( node_count) ;
184
- // This slightly over-estimates the amount of bytes used for all the edge data but never by
185
- // more than ~6%, because over-estimation only occurs for large nodes.
207
+ // This estimation assumes that all of the encoded bytes are for the edge lists or for the
208
+ // fixed-size node headers. But that's not necessarily true; if any edge list has a length
209
+ // that spills out of the size we can bit-pack into SerializedNodeHeader then some of the
210
+ // total serialized size is also used by leb128-encoded edge list lengths. Neglecting that
211
+ // contribution to graph_bytes means our estimation of the bytes needed for edge_list_data
212
+ // slightly overshoots. But it cannot overshoot by much; consider that the worst case is
213
+ // for a node with length 64, which means the spilled 1-byte leb128 length is 1 byte of at
214
+ // least (34 byte header + 1 byte len + 64 bytes edge data), which is ~1%. A 2-byte leb128
215
+ // length is about the same fractional overhead and it amortizes for yet greater lengths.
186
216
let mut edge_list_data = Vec :: with_capacity (
187
217
graph_bytes - node_count * std:: mem:: size_of :: < SerializedNodeHeader < K > > ( ) ,
188
218
) ;
@@ -254,10 +284,13 @@ struct Unpacked<K> {
254
284
fingerprint : Fingerprint ,
255
285
}
256
286
257
- // Bit fields are
258
- // 0..? length of the edge
259
- // ?..?+2 bytes per index
260
- // ?+2..16 kind
287
+ // Bit fields, where
288
+ // M: bits used to store the length of a node's edge list
289
+ // N: bits used to store the byte width of elements of the edge list
290
+ // are
291
+ // 0..M length of the edge list
292
+ // M..M+N bytes per index
293
+ // M+N..16 kind
261
294
impl < K : DepKind > SerializedNodeHeader < K > {
262
295
const TOTAL_BITS : usize = std:: mem:: size_of :: < K > ( ) * 8 ;
263
296
const LEN_BITS : usize = Self :: TOTAL_BITS - Self :: KIND_BITS - Self :: WIDTH_BITS ;
0 commit comments