From 1a423e0c958bb1027a8f72b8a9648e12af0462a3 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 19 Apr 2026 04:00:37 -0400 Subject: [PATCH 01/12] feat(docs): v2 proposal Work in progress! --- docs/migrating-to-v2.md | 1047 +++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 1048 insertions(+) create mode 100644 docs/migrating-to-v2.md diff --git a/docs/migrating-to-v2.md b/docs/migrating-to-v2.md new file mode 100644 index 000000000..cdb897871 --- /dev/null +++ b/docs/migrating-to-v2.md @@ -0,0 +1,1047 @@ +# MLT v2 Wire Format Migration Guide + +This document provides a detailed, side-by-side comparison of every structural element +between MLT v1 and v2. It is the authoritative reference for implementing v2 encoders +and decoders. + +--- + +## 1. Guiding Principles + +| Principle | v1 | v2 | +|---|---|---| +| Feature count | Not stored; implied by geometry stream `num_values` | `feature_count` varint in every layer header | +| Column layout | Separate metadata section then data section | Column-by-column: type + config + data together | +| Stream type identification | Explicit `stream_type` byte on every stream | Eliminated; role implied by column type and stream position | +| Count per stream | Always stored as varint | Omitted when equal to `feature_count` or `popcount(presence_bitfield)`; required only when neither applies | +| Byte length per stream | Always stored as varint | Encoded in `physical` field: `None-noLen` omits it (derivable as count × width); `None-withLen`, `VarInt`, `FastPFor128` always carry it | +| Presence streams | Bool-RLE stream with 4-field header | Raw packed bitfield, no header; sharable across columns via index reference | +| RLE auxiliary fields | `runs` + `num_rle_values` as varints | Eliminated; both derivable at decode time | +| String encoding variant | `Str`/`OptStr` type + runtime `stream_count` | Encoded in column type code; 8 flat types | +| Dict null encoding | Optional presence stream controls which features are present | Null-at-index-0 in the indices stream; no presence bitfield | +| FastPFor byte order | Big-endian u32 | Little-endian u32 | +| FastPFor block size | 256 | 128 | + +> **Note on lazy / skip-ahead parsing.** A reader that only needs a subset of +> columns must be able to advance past unwanted streams without decoding them. +> `physical=None-noLen` streams are always skippable: `byte_length = num_values +> × element_width`, where both are known at header-parse time. `physical=VarInt` +> and `physical=FastPFor128` streams always carry an explicit `byte_length` (it +> is part of their physical encoding value), so they are also skippable. +> `has_explicit_count` is **not** needed for skipping: `popcount` of a presence +> bitfield is a fast bit-counting operation. `has_explicit_count = 1` is only +> required when the count is genuinely not derivable from `feature_count` or +> `popcount(presence_bitfield)` — for example, geometry vertex counts and string +> character/offset counts. + +--- + +## 2. Layer Envelope + +### v1 + +``` +[varint body_len+1] +[u8 tag = 1] +[varint name_len] [name bytes] +[varint extent] +[varint column_count] +── metadata section ────────────────────────────── +[u8 col_type₀] [optional: varint name_len + bytes] +[u8 col_type₁] [optional: varint name_len + bytes] +... +[u8 col_typeN] [optional: varint name_len + bytes] +── data section ────────────────────────────────── +[streams for column 0] +[streams for column 1] +... +[streams for column N] +``` + +`feature_count` is not stored. It is implied by the length of the geometry types stream. +Metadata and data are completely separated; the metadata section must be fully parsed +before any column data can be located. + +### v2 + +``` +[varint body_len+1] +[u8 tag = 1] +[varint name_len] [name bytes] +[varint extent] +[varint feature_count] ← NEW +[varint column_count] +── columns (merged meta + data) ────────────────── +[column 0: type + name? + presence? + data] +[column 1: type + name? + presence? + data] +... +[column N: type + name? + presence? + data] +``` + +**What changes:** +- `feature_count` is inserted after `extent`. +- The metadata section is eliminated. Each column is fully self-describing; its type + byte, optional name, optional presence section, and stream data appear contiguously. + +**Derived invariants unlocked by `feature_count`:** + +> **Popcount** means "count the number of bits set to `1` in the presence bitfield." +> If a presence bitfield has `feature_count` bits and K of them are `1`, then K features +> have a value and the optional data stream that follows contains exactly K encoded values. +> This document uses `popcount(bitfield)` as shorthand for that count. + +| Stream | Count in v1 | Count in v2 | +|---|---|---| +| Geometry types | `num_values` varint | = `feature_count` | +| Non-optional scalar data | `num_values` varint | = `feature_count` | +| Non-optional ID data | `num_values` varint | = `feature_count` | +| Presence bitfield size | stored in header | = `ceil(feature_count / 8)` bytes | +| Optional data stream | `num_values` varint | = `popcount(presence_bitfield)` — number of `1` bits in the bitfield | + +--- + +## 3. Column Layout (Meta + Data Merge) + +Every column — whether top-level or a SharedDict child — follows one of three templates +depending on its column type: + +**Non-optional** (e.g. `I32`): +``` +[u8 column_type] +[varint name_len] [name bytes] ← only when column_type.has_name() == true +[data section] +``` + +**With own presence** (e.g. `OptI32`): own bitfield is declared inline, no prefix varint: +``` +[u8 column_type] +[varint name_len] [name bytes] +[ceil(feature_count/8) bytes: raw packed bitfield, LSB-first per byte] + ← this bitfield is registered as the next presence group (0-based, sequential) +[data section] +``` + +**With shared presence** (e.g. `I32SharedPresence`): references a previously declared bitfield: +``` +[u8 column_type] +[varint name_len] [name bytes] +[varint presence_group] ← 0-based index of a prior column's bitfield +[data section] +``` + +`has_name()` is false for `Id`, `OptId`, `LongId`, `OptLongId`, and `Geometry` (same +rule as v1). + +### v1 split vs. v2 merged — concrete example + +**v1** (two-pass layout, 3 columns: Geometry, Id, OptI32): + +``` +[varint 3] ← column_count +[0x04] ← Geometry type +[0x00] ← Id type +[0x11] ← OptI32 type +[varint name_len] "val" ← Id has no name; OptI32 name +[geometry streams...] +[id streams...] +[presence stream for OptI32] +[i32 streams...] +``` + +**v2** (one-pass layout): + +``` +[varint feature_count] +[varint 3] ← column_count +[0x04] ← Geometry column_type +[geometry_flags] +[geometry streams...] +[0x00] ← Id column_type +[enc_byte] [data...] +[0x11] ← OptI32 column_type (own presence — no prefix varint) +[varint name_len] "val" +[bitfield bytes...] ← ceil(feature_count/8) bytes; registered as presence group 0 +[enc_byte] [data...] +``` + +--- + +## 4. Encoding Byte + +Every integer stream in v2 is prefixed by exactly one **encoding byte**, whose layout +replaces the two v1 bytes (stream_type + encoding) for streams where the type is implied. + +### v1 stream header (4–14 bytes per stream) + +``` +[u8] stream_type + high nibble: category (0=Present, 1=Data, 2=Offset, 3=Length) + low nibble: subtype (DictionaryType / OffsetType / LengthType) +[u8] encoding + bits 7-5: logical1 (0=None, 1=Delta, 2=CwDelta, 3=Rle, 4=Morton, 5=PseudoDecimal) + bits 4-2: logical2 (secondary; 0=None, 1=Delta, 3=Rle — used only with Morton) + bits 1-0: physical (0=None, 1=FastPFor256, 2=VarInt) +[varint] num_values ← always present +[varint] byte_length ← always present +[varint] runs ← RLE streams only +[varint] num_rle_values ← RLE streams only +[varint] bits ← Morton streams only +[varint] shift ← Morton streams only +``` + +### v2 encoding byte (1–6 bytes per stream) + +The byte has no separate flag bits for byte_length or extension presence — both are +implied by the `logical` and `physical` fields, eliminating invalid bit combinations. + +``` +[u8] encoding_byte + bit 7: has_explicit_count (0 = use context count, 1 = varint follows) + bits 6-4: logical + 0 = None + 1 = Delta + 2 = CwDelta (ComponentwiseDelta) + 3 = Rle → physical implied = VarInt; byte_length always follows; + bits 3-2 reserved (must be 00) + 4 = DeltaRle → physical implied = VarInt; byte_length always follows; + bits 3-2 reserved (must be 00) + 5 = Morton → extension byte always follows; + bits 3-2 are physical (all four values valid) + 6 = PseudoDecimal + 7 = reserved + bits 3-2: physical (meaning when logical ∈ {None, Delta, CwDelta, Morton, PseudoDecimal}) + 0 = None-noLen raw fixed-width, byte_length omitted + (derivable: byte_length = num_values × element_width) + 1 = None-withLen raw fixed-width, varint byte_length follows + 2 = VarInt zigzag/varint encoded, varint byte_length follows + 3 = FastPFor128 block-SIMD compressed, varint byte_length follows + bits 1-0: reserved (must be 0) + +[optional: varint num_values] ← present when has_explicit_count = 1 +[optional: varint byte_length] ← present when physical ∈ {None-withLen, VarInt, FastPFor128}, + or when logical ∈ {Rle, DeltaRle} +[optional: extension_byte] ← present when logical = Morton (always) + bits 7-6: Morton sub-variant (00=None, 01=Rle, 10=Delta, 11=reserved) + bits 5-0: reserved (must be 0) +[optional: varint bits] ← present when logical = Morton +[optional: varint shift] ← present when logical = Morton +``` + +### Byte-length rules + +byte_length presence is encoded directly in the `physical` field — there is no +separate flag bit: + +| `physical` value | byte_length | Rationale | +|---|---|---| +| `None-noLen` (0) | omitted | `byte_length = num_values × element_width`; both known at header time | +| `None-withLen` (1) | varint follows | Use when explicit length is preferred (e.g. compression later) | +| `VarInt` (2) | varint follows | Cannot skip a VarInt stream without knowing its byte span | +| `FastPFor128` (3) | varint follows | Not self-delimiting | +| `Rle` / `DeltaRle` logical | varint follows | VarInt physical implied; same skippability requirement | + +### Count context rules + +`has_explicit_count = 0` is valid when the count equals `feature_count` or equals +`popcount(presence_bitfield)`. `popcount` is a fast bit-counting operation, so +recomputing it on the fly is cheaper than storing an extra varint. `has_explicit_count += 1` is required only when neither source applies. + +| Stream position | Implicit count (when `has_explicit_count = 0`) | +|---|---| +| Geometry types (first geometry stream) | `feature_count` | +| Non-optional scalar / ID data | `feature_count` | +| Dict index stream (`StrDict`, `StrFsstDict`, child refs) | `feature_count` (all features have an index) | +| Data stream after own presence bitfield | `popcount(presence_bitfield)` | +| Data stream with `presence_ref > 0` | `popcount(referenced_presence_bitfield)` | +| Geometry vertex / aux streams | must use `has_explicit_count = 1` | +| String character / offset / dict-size streams | must use `has_explicit_count = 1` | + +### Changes from v1 + +| Field | v1 | v2 | +|---|---|---| +| `stream_type` byte | Always present (1 byte) | Eliminated (role implied by position) | +| `encoding` byte | Always present (1 byte) | Replaced by `encoding_byte` | +| `has_byte_length` flag | Part of encoding byte | Eliminated; byte_length presence encoded in `physical` field | +| `has_extension` flag | Part of encoding byte | Eliminated; extension byte implied by `logical=Morton` | +| `num_values` | Always present (1–5 bytes) | Conditional: omitted when derivable | +| `byte_length` | Always present (1–5 bytes) | Omitted for `physical=None-noLen`; always present otherwise | +| `runs` (RLE) | Present for RLE (1–5 bytes) | Eliminated | +| `num_rle_values` (RLE) | Present for RLE (1–5 bytes) | Eliminated | +| `bits`/`shift` (Morton) | Present for Morton (2–10 bytes) | In extension byte + varints; extension always present when `logical=Morton` | + +--- + +## 5. Presence Streams + +### v1 presence stream + +Optional columns prepend a bool-RLE stream with a full stream header: + +``` +[u8 stream_type = 0x00] ← Present category, subtype 0 +[u8 encoding] + bits 7-5: logical1 = Rle (3) + bits 4-2: logical2 = None (0) + bits 1-0: physical = None (0) ← or VarInt +[varint num_values] ← = feature_count +[varint byte_length] +[varint runs] ← RLE pair count +[varint num_rle_values] ← = feature_count +[RLE-packed bits...] +``` + +The stream uses bool-RLE encoding: pairs of `(run_length, value)` where value is 0 +(absent) or 1 (present). Header overhead: 4–14 bytes before the data. + +### v2 presence section + +Presence is encoded in the **column type** itself, not in a prefix varint, giving three +distinct variants for any nullable type: + +**`Opt*` (own presence)** — bitfield immediately follows the optional name; no leading varint: +``` +[ceil(feature_count/8) bytes: raw packed bits, LSB-first within each byte] +← registered as the next presence group (0-based sequential index) +← num_values for the data stream = popcount(this bitfield) +``` + +**`*SharedPresence`** — references a prior column's already-declared bitfield: +``` +[varint presence_group] ← 0-based index of the group to reuse +← no bitfield bytes follow +← num_values for the data stream = popcount(referenced bitfield) +``` + +**What changes:** + +| Aspect | v1 | v2 | +|---|---|---| +| Encoding | Bool-RLE pairs | Raw packed bitfield (1 bit per feature) | +| Header overhead | 4–14 bytes | **0 bytes** for `Opt*` (bitfield directly follows column type); 1 byte (varint) for `*SharedPresence` | +| Size known in advance | No (stored in `byte_length`) | Yes: always `ceil(feature_count / 8)` bytes | +| Sharing identical bitfields | Not possible | `*SharedPresence` type with group index; bitfield stored once | +| Null count for data stream | Stored in `num_rle_values` | `popcount(bitfield)`, computed at decode time | +| "Own bitfield" marker | n/a | Encoded in column type — no `presence_ref = 0` byte needed | + +**Presence group numbering** is layer-wide and sequential. Every `Opt*` column +(top-level or SharedDict child) increments the counter when its bitfield is parsed. + +**Dict variants** (`OptStrDict`, `OptStrFsstDict`, `OptSharedDictChildRef`) do **not** +have a presence section; they encode null via index 0 in the indices stream and therefore +have no corresponding `*SharedPresence` variant either. + +--- + +## 6. Scalar and ID Columns + +Covers: `Bool`, `I8`, `U8`, `I32`, `U32`, `I64`, `U64`, `F32`, `F64`, `Id`, `LongId` +and their `Opt*` counterparts. + +### v1 layout + +``` +── metadata section (written before all column data) ── +[u8 column_type] +[optional: varint name_len + bytes] + +── data section ── +[optional presence stream] ← full stream header + bool-RLE data +[stream_type byte] ← always 0x10 (Data, DictionaryType::None) +[encoding byte] +[varint num_values] ← = feature_count (non-optional) or number of 1-bits in presence (optional) +[varint byte_length] +[encoded data bytes] +``` + +### v2 layout + +Non-optional: +``` +[u8 column_type] ← e.g. I32 +[optional: varint name_len + bytes] +[encoding_byte] +[optional: varint num_values] ← only when has_explicit_count = 1 +[optional: varint byte_length] ← present when physical ∈ {None-withLen, VarInt, FastPFor128} +[encoded data bytes] +``` + +Optional — own presence (`Opt*`): +``` +[u8 column_type] ← e.g. OptI32 +[optional: varint name_len + bytes] +[ceil(feature_count/8) bytes: bitfield] ← no prefix varint +[encoding_byte] [optional count/size] [data bytes] +``` + +Optional — shared presence (`*SharedPresence`): +``` +[u8 column_type] ← e.g. I32SharedPresence +[optional: varint name_len + bytes] +[varint presence_group] ← 0-based group index +[encoding_byte] [optional count/size] [data bytes] +``` + +**What changes per column:** + +| | v1 bytes | v2 bytes | +|---|---|---| +| Column type in metadata section | 1 (in meta pass) | 1 (inline) | +| Name in metadata section | 0–33 (in meta pass) | 0–33 (inline) | +| Presence stream header | 4–14 | 0 (`Opt*` — bitfield directly follows) / 1 (`*SharedPresence` — group ref varint) | +| Presence data | RLE packed | Raw packed bits | +| `stream_type` byte | 1 | 0 (eliminated) | +| `num_values` | 1–5 | 0 (derivable from `feature_count` or presence popcount) | +| `byte_length` | 1–5 | 0 for `None-noLen`; 1–5 for `None-withLen`, VarInt, FastPFor128 | + +**Element widths** used for `physical=None-noLen` byte-length derivation: + +| Column types | Element width | +|---|---| +| `Bool`, `I8`, `U8` | 1 byte | +| `I32`, `U32`, `F32` | 4 bytes | +| `I64`, `U64`, `F64`, `Id`, `LongId` | 8 bytes | + +--- + +## 7. Geometry Column + +### v1 layout + +``` +── metadata section ── +[u8 column_type = 0x04] ← Geometry; no name written + +── data section ── +[varint stream_count] ← number of geometry streams that follow +[stream 0: types] + [u8 stream_type = 0x10] ← Data(None) + [u8 encoding] + [varint num_values] ← = feature_count + [varint byte_length] + [geometry type data] +[stream 1..N: auxiliary streams, each with full header] + [u8 stream_type] ← identifies role: Data(Vertex/Morton), + Offset(Vertex/Index), Length(Geometries/Parts/Rings/Triangles) + [u8 encoding] + [varint num_values] + [varint byte_length] + [stream data] +``` + +Stream presence is implicit: `stream_count` streams follow, each identified by its +`stream_type` byte. + +### v2 layout + +The `geometry_flags` byte is eliminated entirely. The column type byte is one of +the named `Geo*` types below, which unambiguously specifies which streams are +present and in what order. No name is written (same rule as v1). + +``` +[u8 column_type] ← one of the Geo* column types; no name bytes follow +[streams in fixed order determined by column_type; see table below] +``` + +Every stream: `[encoding_byte] [optional: varint num_values] [optional: varint byte_length] [data]` + +| Column type | Streams (in order, all `has_explicit_count = 1` except Types) | +|---|---| +| `GeoPoints` | Types¹, Vertices | +| `GeoPointsDict` | Types¹, VertexData (dict), VertexOffsets | +| `GeoMultiPoints` | Types¹, GeoLengths, Vertices | +| `GeoMultiPointsDict` | Types¹, GeoLengths, VertexData (dict), VertexOffsets | +| `GeoLines` | Types¹, PartLengths, Vertices | +| `GeoLinesDict` | Types¹, PartLengths, VertexData (dict), VertexOffsets | +| `GeoMultiLines` | Types¹, GeoLengths, PartLengths, Vertices | +| `GeoMultiLinesDict` | Types¹, GeoLengths, PartLengths, VertexData (dict), VertexOffsets | +| `GeoPolygons` | Types¹, PartLengths, RingLengths, Vertices | +| `GeoPolygonsDict` | Types¹, PartLengths, RingLengths, VertexData (dict), VertexOffsets | +| `GeoMultiPolygons` | Types¹, GeoLengths, PartLengths, RingLengths, Vertices | +| `GeoMultiPolygonsDict` | Types¹, GeoLengths, PartLengths, RingLengths, VertexData (dict), VertexOffsets | +| `GeoTessPolygons` | Types¹, TriLengths, IndexBuffer, Vertices | +| `GeoTessPolygonsWithOutlines` | Types¹, GeoLengths, PartLengths, RingLengths, TriLengths, IndexBuffer, Vertices | + +¹ Types stream: `has_explicit_count = 0` (count = `feature_count`) + +**Dict columns** store a deduplicated vertex dictionary in VertexData and per-vertex +indices in VertexOffsets. The vertex encoding (plain delta, CwDelta, Morton) is +specified by the `logical` field in the VertexData encoding byte. + +**Mixed-type columns** use the type whose stream set is a superset of all geometry +types present: +- Point + LineString → `GeoLines` (Points consume no PartLengths entry) +- LineString + Polygon → `GeoPolygons` (LineString vertex counts stored in RingLengths) +- Polygon + MultiPolygon → `GeoMultiPolygons` + +**What changes:** + +| Aspect | v1 | v2 | +|---|---|---| +| Stream presence declaration | `varint stream_count` + `stream_type` per stream | Encoded in column type | +| `stream_count` varint | 1–5 bytes | Eliminated | +| `geometry_flags` byte | n/a (v1 has no flags) | Eliminated (merged into column type) | +| `stream_type` per stream | 1 byte × N streams | Eliminated | +| `num_values` for types stream | varint (= feature_count) | Omitted | +| Vertex encoding style | Read from `stream_type` subtype | Read from `logical` field in encoding byte | + +--- + +## 8. String Columns + +### Overview of changes + +| | v1 | v2 | +|---|---|---| +| Column types | `Str (28)`, `OptStr (29)` | 8 explicit types (28–35) | +| Encoding variant declaration | `varint stream_count` (2–5 determines variant) | Encoded in column type code | +| `stream_count` varint | 1–5 bytes | Eliminated | +| `stream_type` per stream | 1 byte × N streams | Eliminated (position-implied) | +| Null encoding for dict variants | Presence bitfield (separate stream) | Null-at-index-0 in indices stream | +| Presence sharing | Not possible | Dict variants: none; plain variants: shared presence group | + +--- + +### 8.1 StrPlain / OptStrPlain + +A plain string column stores per-feature string lengths and the concatenated string data. + +**v1 layout (`Str`, non-optional):** + +``` +── metadata ── [u8 = 28] [varint name_len + bytes] +── data ── +[varint stream_count = 2] +[stream 0: lengths] + [u8 stream_type = 0x31] ← Length(VarBinary) + [u8 encoding] + [varint num_values] ← = feature_count + [varint byte_length] + [length data] +[stream 1: string data] + [u8 stream_type = 0x10] ← Data(None) + [u8 encoding = raw/None] + [varint num_values] ← = total byte count of all strings + [varint byte_length] ← = same + [raw UTF-8 bytes] +``` + +**v1 layout (`OptStr`, optional):** + +``` +── metadata ── [u8 = 29] [varint name_len + bytes] +── data ── +[varint stream_count = 3] +[presence stream] ← full bool-RLE header + data +[stream 1: lengths] ← num_values = number of 1-bits in presence bitfield +[stream 2: string data] +``` + +**v2 layout (`StrPlain`, non-optional):** + +``` +[u8 = 28] [varint name_len + bytes] +[encoding_byte for lengths] ← has_explicit_count=0 (= feature_count) +[lengths data] +[encoding_byte for string data] + has_explicit_count=1 ← total byte count ≠ feature_count + logical=None, physical=None-noLen ← byte_length = num_values × 1 (trivial) +[varint num_values] ← total UTF-8 byte count +[raw UTF-8 bytes] +``` + +**v2 layout (`OptStrPlain`, optional — own presence):** + +``` +[u8 = 29] [varint name_len + bytes] +[ceil(feature_count/8) bytes: bitfield] ← no prefix varint; registered as next group +[encoding_byte for lengths] ← has_explicit_count=0; count = popcount(bitfield) +[lengths data] +[encoding_byte for string data] + has_explicit_count=1 + logical=None, physical=None-noLen +[varint num_values] +[raw UTF-8 bytes] +``` + +**v2 layout (`StrPlainSharedPresence`, optional — shared presence):** + +``` +[u8 = StrPlainSharedPresence] [varint name_len + bytes] +[varint presence_group] ← 0-based group index +[encoding_byte for lengths] ← has_explicit_count=0; count = popcount(referenced group) +[lengths data] +[encoding_byte for string data] has_explicit_count=1, logical=None, physical=None-noLen +[varint num_values] +[raw UTF-8 bytes] +``` + +--- + +### 8.2 StrDict / OptStrDict + +A dictionary string column stores unique string values in a dictionary and per-feature +indices into that dictionary. + +**Null encoding change — the most significant difference for dict columns:** + +| | v1 | v2 | +|---|---|---| +| Optional dict column | Presence bitfield controls which features have values; index stream only covers present features | No presence bitfield; **index 0 = null**; index stream covers all `feature_count` features | +| Non-optional dict column | Presence absent; index stream covers all features | Same; index 0 is a valid dict entry (decoder assumes non-null) | +| Index count | = number of `1` bits in presence, or `feature_count` | Always `feature_count` | + +**v1 layout (`OptStr` + dictionary variant, stream_count = 4):** + +``` +── metadata ── [u8 = 29] [varint name_len + bytes] +── data ── +[varint stream_count = 4] +[presence stream] ← bool-RLE; num_values = feature_count +[offsets stream] ← Offset(String); per-present-feature dict indices + [u8 stream_type = 0x23] + [u8 encoding] + [varint num_values] ← = number of 1-bits in presence bitfield + [varint byte_length] + [offset data] +[dict lengths stream] ← Length(Dictionary) + [u8 stream_type = 0x36] + [u8 encoding] + [varint num_values] ← = dict size + [varint byte_length] + [dict length data] +[dict data stream] ← Data(Single/Shared) + [u8 stream_type = 0x11/0x12] + [u8 encoding = raw/None] + [varint num_values] ← = total dict UTF-8 bytes + [varint byte_length] + [raw UTF-8 dict bytes] +``` + +**v2 layout (`OptStrDict`, optional):** + +``` +[u8 = 31] [varint name_len + bytes] +← NO presence section for dict variants +[encoding_byte for dict_lengths] + has_explicit_count=1 ← dict size ≠ feature_count +[varint num_values] ← dict entry count +[dict_lengths data] +[encoding_byte for dict_data] + has_explicit_count=1 + logical=None, physical=None-noLen ← byte_length = num_values × 1 +[varint num_values] ← total dict UTF-8 bytes +[raw UTF-8 dict bytes] +[encoding_byte for indices] + has_explicit_count=0 ← = feature_count (index 0 = null, all features present) +[indices data] ← index 0 = null; indices 1..N map to dict entries 0..N-1 +``` + +**v2 layout (`StrDict`, non-optional):** + +Identical wire format to `OptStrDict`. The column type code (30 vs 31) tells the +decoder whether index 0 should be treated as null. + +--- + +### 8.3 StrFsst / OptStrFsst + +FSST-plain compresses string data using a per-column symbol table. The column stores a +symbol table, per-value lengths, and the compressed corpus. + +**FSST symbol table stream** (same for all FSST variants): + +``` +[encoding_byte] + has_explicit_count=1 + logical=None, physical=None-noLen ← raw bytes; byte_length = num_values × 1 +[varint num_values] ← number of raw symbol table bytes +[symbol table bytes] +``` + +Symbol lengths (how long each symbol is) precede the symbol table bytes. + +**v1 layout (`OptStr` + FSST-plain variant, stream_count = 5):** + +``` +[varint stream_count = 5] +[presence stream] ← bool-RLE header +[symbol_lengths stream] ← Length(Symbol) +[symbol_table stream] ← Data(Fsst) +[per-value lengths stream] ← Length(Dictionary) +[compressed corpus stream] ← Data(Single/Shared) +``` + +Each stream has a full 4-field header plus optional RLE metadata. + +**v2 layout (`OptStrFsst`, optional — own presence):** + +``` +[u8 = 33] [varint name_len + bytes] +[ceil(feature_count/8) bytes: bitfield] ← no prefix varint; registered as next group +[encoding_byte] [symbol_lengths data] ← has_explicit_count=1 +[encoding_byte] [symbol_table data] ← has_explicit_count=1, logical=None, physical=None-noLen +[encoding_byte] [per-value lengths] ← has_explicit_count=0; count = popcount(bitfield) +[encoding_byte] [compressed corpus] ← has_explicit_count=1, logical=None, physical=None-noLen +``` + +**v2 layout (`StrFsst`, non-optional):** + +Same but no presence section; per-value lengths count = `feature_count`. + +--- + +### 8.4 StrFsstDict / OptStrFsstDict + +FSST-dictionary stores the dictionary corpus FSST-compressed. Per-feature values are +indices into the decoded dictionary. + +**v1 layout (`OptStr` + FSST-dictionary variant, stream_count = 6):** + +``` +[varint stream_count = 6] +[presence stream] ← bool-RLE +[symbol_lengths stream] +[symbol_table stream] +[dict_lengths stream] +[compressed dict corpus stream] +[per-feature offsets stream] ← Offset(String); indices per present feature +``` + +**v2 layout (`OptStrFsstDict`, optional):** + +``` +[u8 = 35] [varint name_len + bytes] +← NO presence section +[encoding_byte] [symbol_lengths data] ← has_explicit_count=1 +[encoding_byte] [symbol_table data] ← has_explicit_count=1, logical=None, physical=None-noLen +[encoding_byte] [dict_lengths data] ← has_explicit_count=1 (dict size ≠ feature_count) +[encoding_byte] [dict_corpus data] ← has_explicit_count=1, logical=None, physical=None-noLen +[encoding_byte] [indices data] ← has_explicit_count=0 (= feature_count); index 0 = null +``` + +**v2 layout (`StrFsstDict`, non-optional):** + +Same wire format; decoder does not treat index 0 as null. + +--- + +## 9. SharedDict Columns + +A SharedDict column stores a shared string corpus (plain or FSST-compressed) that +multiple child columns index into. + +### Overview of changes + +| Aspect | v1 | v2 | +|---|---|---| +| Column types | `SharedDict (30)` — single type with runtime variant | `SharedDictPlain (36)`, `SharedDictFsst (37)` — flat types | +| Corpus encoding variant | Runtime: detected from stream types within stream_count group | Static: encoded in column type code | +| Children | `varint stream_count` + optional presence + offset stream per child | Full column definitions with type byte (`SharedDictChildRef`/`OptSharedDictChildRef`) | +| Optional child null encoding | Presence bitfield (separate stream per optional child) | Null-at-index-0 in the indices stream | +| Shared presence for children | Not possible | Possible for plain/FSST variants; **not** for dict children (null-at-0 convention) | + +### v1 layout + +``` +── metadata ── [u8 = 30] [varint name_len + bytes] + [varint child_col_count] + [u8 child_type₀] [varint name_len + bytes] + ... +── data ── +[varint stream_count] ← dict streams + (1 or 2 per child) +[dict stream(s)] ← lengths + data, or FSST 4-stream group +[per child:] + [varint child_stream_count] ← 1 or 2 depending on optionality + [optional presence stream] ← bool-RLE header + data + [offset/index stream] ← Offset(Key) or Offset(String) + [u8 stream_type = 0x23] + [u8 encoding] + [varint num_values] + [varint byte_length] + [index data] +``` + +### v2 layout (SharedDictPlain) + +``` +[u8 = 36] [varint name_len + bytes] +[encoding_byte] [dict_lengths data] ← has_explicit_count=1 +[encoding_byte] [dict_data bytes] ← has_explicit_count=1, logical=None, physical=None-noLen + +[varint child_column_count] +[child 0:] + [u8 = 38 (SharedDictChildRef) or 39 (OptSharedDictChildRef)] + [varint name_len + bytes] + ← NO presence section for child ref types (null-at-0 for optional) + [encoding_byte] [indices data] + has_explicit_count=0 ← = feature_count for both ref types; index 0 = null when type = 39 +[child 1:] + ... +``` + +### v2 layout (SharedDictFsst) + +``` +[u8 = 37] [varint name_len + bytes] +[encoding_byte] [symbol_lengths data] ← has_explicit_count=1 +[encoding_byte] [symbol_table data] ← has_explicit_count=1, logical=None, physical=None-noLen +[encoding_byte] [dict_lengths data] ← has_explicit_count=1 +[encoding_byte] [dict_corpus data] ← has_explicit_count=1, logical=None, physical=None-noLen + +[varint child_column_count] +[children identical to SharedDictPlain children...] +``` + +### Child type codes + +| Column type | Code | Null handling | +|---|---|---| +| `SharedDictChildRef` | 38 | No nulls; index 0 is a valid dict entry | +| `OptSharedDictChildRef` | 39 | index 0 = null; all other indices map to dict | + +--- + +## 10. RLE Encoding + +### v1 RLE stream header extras + +All RLE and DeltaRle streams (except bool-RLE presence streams) carry two extra varints +after `byte_length`: + +``` +[varint runs] ← number of (run_length, value) pairs +[varint num_rle_values] ← sum of all run_lengths = total decoded element count +``` + +Both fields are redundant: they can be derived from the stream data itself. + +### v2 changes + +Both fields are **eliminated**: + +- `num_rle_values` = `num_values`, already known from context (`feature_count`, + presence popcount, or explicit count in encoding byte). +- `runs` is recoverable by scanning: the data is interleaved `[run_length_varint, + value_varint]` pairs, so `runs = total_pairs` (found by reading to byte_length). + +Bool-RLE is eliminated entirely; presence data uses raw packed bitfields (§5). + +### Encoding byte for RLE streams + +| Logical | Encoding byte bits 6-4 | Physical implied | byte_length | +|---|---|---|---| +| `Rle` | `011` | VarInt | always follows | +| `DeltaRle` | `100` | VarInt | always follows | + +bits 3-2 (physical field) must be `00` for Rle/DeltaRle; they are reserved for future +RLE sub-variants. No extension byte follows. + +--- + +## 11. Morton Encoding + +### v1 Morton header extras + +Morton streams carry `bits` and `shift` after `byte_length`: + +``` +[varint bits] ← number of bits used per coordinate component +[varint shift] ← coordinate shift value +``` + +Morton sub-variant (plain/delta/rle) is encoded via `logical2` in the encoding byte. + +### v2 changes + +`bits` and `shift` are moved to an **extension byte** that is unconditionally present +whenever `logical = Morton` (no flag needed): + +``` +[encoding_byte] + logical = 5 (Morton) ← bits 6-4 + physical = ... ← bits 3-2 (all four values valid for Morton) +[extension_byte] ← always present; no flag + bits 7-6: Morton sub-variant (00=None, 01=Rle, 10=Delta, 11=reserved) + bits 5-0: reserved (must be 0) +[varint bits] +[varint shift] +``` + +The `logical2` field in v1's encoding byte is replaced by the Morton sub-variant in the +extension byte. The extension byte is implied by `logical=Morton` rather than by an +explicit `has_extension` flag. + +--- + +## 12. FastPFor Codec + +### v1 FastPFor wire format + +``` +[u32 BE] N = number of FastPFor-compressed u32 words +[N × u32 BE] FastPFor primary codec output +[remaining u32 BE words] VariableByte remainder codec output +``` + +All u32 words are stored **big-endian**. Block size is **256**. + +### v2 FastPFor wire format + +``` +[u32 LE] N = number of FastPFor-compressed u32 words +[N × u32 LE] FastPFor primary codec output +[remaining u32 LE words] VariableByte remainder codec output +``` + +Changes: +- All u32 words are stored **little-endian**. +- Block size is **128**. +- `physical=FastPFor128` (value 3 in bits 3-2) always carries `byte_length`; it marks + where the FastPFor stream ends and the next stream begins. + +--- + +## 13. Column Type Codes + +### v1 column type codes + +| Code | Type | Code | Type | +|------|------|------|------| +| 0 | Id | 1 | OptId | +| 2 | LongId | 3 | OptLongId | +| 4 | Geometry | — | — | +| 10 | Bool | 11 | OptBool | +| 12 | I8 | 13 | OptI8 | +| 14 | U8 | 15 | OptU8 | +| 16 | I32 | 17 | OptI32 | +| 18 | U32 | 19 | OptU32 | +| 20 | I64 | 21 | OptI64 | +| 22 | U64 | 23 | OptU64 | +| 24 | F32 | 25 | OptF32 | +| 26 | F64 | 27 | OptF64 | +| 28 | Str | 29 | OptStr | +| 30 | SharedDict | — | — | + +### v2 column type codes + +Specific numeric assignments are deferred to the final spec; only the variant names +are defined here. + +Every nullable scalar/string type has **three** variants: + +| Variant suffix | Presence mechanism | +|---|---| +| *(none)* — non-optional | No presence | +| `Opt*` — own-presence | Bitfield directly follows name; no prefix varint | +| `OptRef*` — shared-presence | `varint presence_group` follows name | + +**ID and scalar types:** + +| Non-optional | Own-presence | Shared-presence | +|---|---|---| +| `Id` | `OptId` | `OptRefId` | +| `LongId` | `OptLongId` | `OptRefLongId` | +| `Bool` | `OptBool` | `OptRefBool` | +| `I8` | `OptI8` | `OptRefI8` | +| `U8` | `OptU8` | `OptRefU8` | +| `I32` | `OptI32` | `OptRefI32` | +| `U32` | `OptU32` | `OptRefU32` | +| `I64` | `OptI64` | `OptRefI64` | +| `U64` | `OptU64` | `OptRefU64` | +| `F32` | `OptF32` | `OptRefF32` | +| `F64` | `OptF64` | `OptRefF64` | + +**String and shared-dict types:** + +| Non-optional | Own-presence | Shared-presence | +|---|---|---| +| `StrPlain` | `OptStrPlain` | `OptRefStrPlain` | +| `StrDict` | — (null-at-0; no presence bitfield) | +| `StrFsst` | `OptStrFsst` | `OptRefStrFsst` | +| `StrFsstDict` | — (null-at-0; no presence bitfield) | +| `SharedDictPlain` | — | — | +| `SharedDictFsst` | — | — | +| `SharedDictChildRef` | — (null-at-0; no presence bitfield) | + +**Geometry types** (no optional variants; geometry columns are always present): + +| Column type | Description | +|---|---| +| `GeoPoints` | Point geometries | +| `GeoPointsDict` | Point geometries with vertex dictionary | +| `GeoMultiPoints` | MultiPoint geometries | +| `GeoMultiPointsDict` | MultiPoint geometries with vertex dictionary | +| `GeoLines` | LineString / mixed point+line geometries | +| `GeoLinesDict` | LineString geometries with vertex dictionary | +| `GeoMultiLines` | MultiLineString geometries | +| `GeoMultiLinesDict` | MultiLineString geometries with vertex dictionary | +| `GeoPolygons` | Polygon / mixed geometries | +| `GeoPolygonsDict` | Polygon geometries with vertex dictionary | +| `GeoMultiPolygons` | MultiPolygon geometries | +| `GeoMultiPolygonsDict` | MultiPolygon geometries with vertex dictionary | +| `GeoTessPolygons` | Tessellated polygons (index buffer, no ring topology) | +| `GeoTessPolygonsWithOutlines` | Tessellated polygons with outline ring topology | + +**Invariants:** +- Dict-family string types (`OptStrDict`, `OptStrFsstDict`, `OptSharedDictChildRef`) have no + shared-presence variant because they encode null via index 0, not via a presence bitfield. +- Shared-presence (`OptRef*`) code = own-presence code with bit 7 set. +- Geometry column types have no optional variants; a missing geometry column is simply absent + from the layer. + +--- + +## 14. Stream Type Bytes (Eliminated) + +In v1 every stream begins with a `stream_type` byte that encodes the stream's role: + +``` +bits 7-4: category (0=Present, 1=Data, 2=Offset, 3=Length) +bits 3-0: subtype (DictionaryType / OffsetType / LengthType) +``` + +In v2 this byte is **eliminated from the wire format entirely**. Stream role is always +determinable without it: + +| Column type | How stream role is known | +|---|---| +| Scalar / ID | Single data stream; role is trivial | +| `Geo*` | Column type determines which streams are present; position in fixed sequence declares role | +| `StrPlain` / `OptStrPlain` | Positions: 1=lengths, 2=string-data | +| `StrDict` / `OptStrDict` | Positions: 1=dict-lengths, 2=dict-data, 3=indices | +| `StrFsst` / `OptStrFsst` | Positions: 1=sym-lengths, 2=sym-table, 3=lengths, 4=corpus | +| `StrFsstDict` / `OptStrFsstDict` | Positions: 1=sym-lengths, 2=sym-table, 3=dict-lengths, 4=dict-corpus, 5=indices | +| `SharedDictPlain` / `SharedDictFsst` | Corpus streams in fixed order; children are self-describing column definitions | + +The `StreamType` concept (`Present`, `Data`, `Offset`, `Length`) and its subtypes +(`DictionaryType`, `OffsetType`, `LengthType`) remain useful as descriptive labels for +documentation and in-memory representations, but they are no longer serialized to the +wire in v2. + +--- + +## 15. Overhead Comparison Table + +Per-stream header costs, typical small-integer varint values. + +| Stream Kind | v1 overhead (bytes) | v2 overhead (bytes) | Saving | +|---|---|---|---| +| Presence bitfield header (`Opt*`) | 4–14 | 0 (bitfield directly follows column type) | 4–14 | +| Shared presence (`*SharedPresence`) | full bitfield copy | 1 (presence_group varint) | ≈ ceil(N/8) | +| Non-optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | +| Non-optional scalar, None-noLen | 4–6 | 1 | 3–5 | +| Non-optional scalar, FastPFor128 | 5–7 | 1 (enc) + varint(size) | 2–4 | +| Optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | +| RLE stream, VarInt | 6–10 | 1 | 5–9 | +| Geometry: stream_count varint | 1–3 | 0 (encoded in column type) | 1–3 | +| Geometry: types stream | 4–6 | 1 | 3–5 | +| Geometry: aux stream, FastPFor | 6–10 | 1 + varint(count) + varint(size) | 2–4 | +| Morton vertex stream | 8–14 | 1 + ext + 2 varints | 4–8 | +| String variant declaration | 1 (Str/OptStr type) + 1–3 (stream_count) | 0 | 2–4 | +| Dict optional: presence + index stream | 4–10 + bool-RLE | 0 (null-at-0) | 4–10 + bitfield | + +**Estimated total saving for a typical tile (100 streams, 10 optional columns):** +600–1 000 bytes of stream-header metadata, plus elimination of bool-RLE presence data +(replaced by compact bitfields). diff --git a/mkdocs.yml b/mkdocs.yml index 3a2e6c0b2..ef82aa970 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -65,6 +65,7 @@ nav: - Encoding Algorithms: encodings.md - Specification: specification.md - Geometry Spec: geometry.md + - Migrating to v2: migrating-to-v2.md extra: social: From 87d93508604342440d5efb67b502ce0833e1eda8 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Sun, 19 Apr 2026 04:20:05 -0400 Subject: [PATCH 02/12] fix shared presence name --- docs/migrating-to-v2.md | 42 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/migrating-to-v2.md b/docs/migrating-to-v2.md index cdb897871..6e1801a71 100644 --- a/docs/migrating-to-v2.md +++ b/docs/migrating-to-v2.md @@ -121,7 +121,7 @@ depending on its column type: [data section] ``` -**With shared presence** (e.g. `I32SharedPresence`): references a previously declared bitfield: +**With shared presence** (e.g. `OptRefI32`): references a previously declared bitfield: ``` [u8 column_type] [varint name_len] [name bytes] @@ -307,7 +307,7 @@ distinct variants for any nullable type: ← num_values for the data stream = popcount(this bitfield) ``` -**`*SharedPresence`** — references a prior column's already-declared bitfield: +**`OptRef*`** — references a prior column's already-declared bitfield: ``` [varint presence_group] ← 0-based index of the group to reuse ← no bitfield bytes follow @@ -319,9 +319,9 @@ distinct variants for any nullable type: | Aspect | v1 | v2 | |---|---|---| | Encoding | Bool-RLE pairs | Raw packed bitfield (1 bit per feature) | -| Header overhead | 4–14 bytes | **0 bytes** for `Opt*` (bitfield directly follows column type); 1 byte (varint) for `*SharedPresence` | +| Header overhead | 4–14 bytes | **0 bytes** for `Opt*` (bitfield directly follows column type); 1 byte (varint) for `OptRef*` | | Size known in advance | No (stored in `byte_length`) | Yes: always `ceil(feature_count / 8)` bytes | -| Sharing identical bitfields | Not possible | `*SharedPresence` type with group index; bitfield stored once | +| Sharing identical bitfields | Not possible | `OptRef*` type with group index; bitfield stored once | | Null count for data stream | Stored in `num_rle_values` | `popcount(bitfield)`, computed at decode time | | "Own bitfield" marker | n/a | Encoded in column type — no `presence_ref = 0` byte needed | @@ -330,7 +330,7 @@ distinct variants for any nullable type: **Dict variants** (`OptStrDict`, `OptStrFsstDict`, `OptSharedDictChildRef`) do **not** have a presence section; they encode null via index 0 in the indices stream and therefore -have no corresponding `*SharedPresence` variant either. +have no corresponding `OptRef*` variant either. --- @@ -375,7 +375,7 @@ Optional — own presence (`Opt*`): [encoding_byte] [optional count/size] [data bytes] ``` -Optional — shared presence (`*SharedPresence`): +Optional — shared presence (`OptRef*`): ``` [u8 column_type] ← e.g. I32SharedPresence [optional: varint name_len + bytes] @@ -389,7 +389,7 @@ Optional — shared presence (`*SharedPresence`): |---|---|---| | Column type in metadata section | 1 (in meta pass) | 1 (inline) | | Name in metadata section | 0–33 (in meta pass) | 0–33 (inline) | -| Presence stream header | 4–14 | 0 (`Opt*` — bitfield directly follows) / 1 (`*SharedPresence` — group ref varint) | +| Presence stream header | 4–14 | 0 (`Opt*` — bitfield directly follows) / 1 (`OptRef*` — group ref varint) | | Presence data | RLE packed | Raw packed bits | | `stream_type` byte | 1 | 0 (eliminated) | | `num_values` | 1–5 | 0 (derivable from `feature_count` or presence popcount) | @@ -1026,20 +1026,20 @@ wire in v2. Per-stream header costs, typical small-integer varint values. -| Stream Kind | v1 overhead (bytes) | v2 overhead (bytes) | Saving | -|---|---|---|---| -| Presence bitfield header (`Opt*`) | 4–14 | 0 (bitfield directly follows column type) | 4–14 | -| Shared presence (`*SharedPresence`) | full bitfield copy | 1 (presence_group varint) | ≈ ceil(N/8) | -| Non-optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | -| Non-optional scalar, None-noLen | 4–6 | 1 | 3–5 | -| Non-optional scalar, FastPFor128 | 5–7 | 1 (enc) + varint(size) | 2–4 | -| Optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | -| RLE stream, VarInt | 6–10 | 1 | 5–9 | -| Geometry: stream_count varint | 1–3 | 0 (encoded in column type) | 1–3 | -| Geometry: types stream | 4–6 | 1 | 3–5 | -| Geometry: aux stream, FastPFor | 6–10 | 1 + varint(count) + varint(size) | 2–4 | -| Morton vertex stream | 8–14 | 1 + ext + 2 varints | 4–8 | -| String variant declaration | 1 (Str/OptStr type) + 1–3 (stream_count) | 0 | 2–4 | +| Stream Kind | v1 overhead (bytes) | v2 overhead (bytes) | Saving | +|----------------------------------------|---|---|---| +| Presence bitfield header (`Opt*`) | 4–14 | 0 (bitfield directly follows column type) | 4–14 | +| Shared presence (`OptRef*`) | full bitfield copy | 1 (presence_group varint) | ≈ ceil(N/8) | +| Non-optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | +| Non-optional scalar, None-noLen | 4–6 | 1 | 3–5 | +| Non-optional scalar, FastPFor128 | 5–7 | 1 (enc) + varint(size) | 2–4 | +| Optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | +| RLE stream, VarInt | 6–10 | 1 | 5–9 | +| Geometry: stream_count varint | 1–3 | 0 (encoded in column type) | 1–3 | +| Geometry: types stream | 4–6 | 1 | 3–5 | +| Geometry: aux stream, FastPFor | 6–10 | 1 + varint(count) + varint(size) | 2–4 | +| Morton vertex stream | 8–14 | 1 + ext + 2 varints | 4–8 | +| String variant declaration | 1 (Str/OptStr type) + 1–3 (stream_count) | 0 | 2–4 | | Dict optional: presence + index stream | 4–10 + bool-RLE | 0 (null-at-0) | 4–10 + bitfield | **Estimated total saving for a typical tile (100 streams, 10 optional columns):** From 1f92c98648ef21fa13a00c01d1ddbf49d9f5628c Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Tue, 21 Apr 2026 14:53:53 -0400 Subject: [PATCH 03/12] cleanup doc --- docs/migrating-to-v2.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/migrating-to-v2.md b/docs/migrating-to-v2.md index 6e1801a71..c3dfe44a1 100644 --- a/docs/migrating-to-v2.md +++ b/docs/migrating-to-v2.md @@ -6,7 +6,7 @@ and decoders. --- -## 1. Guiding Principles +## Guiding Principles | Principle | v1 | v2 | |---|---|---| @@ -36,10 +36,14 @@ and decoders. --- -## 2. Layer Envelope +## Layer Envelope ### v1 +`feature_count` is not stored. It is implied by the length of the geometry types stream. +Metadata and data are completely separated; the metadata section must be fully parsed +before any column data can be located. + ``` [varint body_len+1] [u8 tag = 1] @@ -58,10 +62,6 @@ and decoders. [streams for column N] ``` -`feature_count` is not stored. It is implied by the length of the geometry types stream. -Metadata and data are completely separated; the metadata section must be fully parsed -before any column data can be located. - ### v2 ``` @@ -100,7 +100,7 @@ before any column data can be located. --- -## 3. Column Layout (Meta + Data Merge) +## Column Layout (Meta + Data Merge) Every column — whether top-level or a SharedDict child — follows one of three templates depending on its column type: @@ -166,7 +166,7 @@ rule as v1). --- -## 4. Encoding Byte +## Encoding Byte Every integer stream in v2 is prefixed by exactly one **encoding byte**, whose layout replaces the two v1 bytes (stream_type + encoding) for streams where the type is implied. @@ -273,7 +273,7 @@ recomputing it on the fly is cheaper than storing an extra varint. `has_explici --- -## 5. Presence Streams +## Presence Streams ### v1 presence stream @@ -334,7 +334,7 @@ have no corresponding `OptRef*` variant either. --- -## 6. Scalar and ID Columns +## Scalar and ID Columns Covers: `Bool`, `I8`, `U8`, `I32`, `U32`, `I64`, `U64`, `F32`, `F64`, `Id`, `LongId` and their `Opt*` counterparts. @@ -405,7 +405,7 @@ Optional — shared presence (`OptRef*`): --- -## 7. Geometry Column +## Geometry Column ### v1 layout @@ -488,7 +488,7 @@ types present: --- -## 8. String Columns +## String Columns ### Overview of changes @@ -727,7 +727,7 @@ Same wire format; decoder does not treat index 0 as null. --- -## 9. SharedDict Columns +## SharedDict Columns A SharedDict column stores a shared string corpus (plain or FSST-compressed) that multiple child columns index into. @@ -803,7 +803,7 @@ multiple child columns index into. --- -## 10. RLE Encoding +## RLE Encoding ### v1 RLE stream header extras @@ -840,7 +840,7 @@ RLE sub-variants. No extension byte follows. --- -## 11. Morton Encoding +## Morton Encoding ### v1 Morton header extras @@ -875,7 +875,7 @@ explicit `has_extension` flag. --- -## 12. FastPFor Codec +## FastPFor Codec ### v1 FastPFor wire format @@ -903,7 +903,7 @@ Changes: --- -## 13. Column Type Codes +## Column Type Codes ### v1 column type codes @@ -993,7 +993,7 @@ Every nullable scalar/string type has **three** variants: --- -## 14. Stream Type Bytes (Eliminated) +## Stream Type Bytes (Eliminated) In v1 every stream begins with a `stream_type` byte that encodes the stream's role: @@ -1022,7 +1022,7 @@ wire in v2. --- -## 15. Overhead Comparison Table +## Overhead Comparison Table Per-stream header costs, typical small-integer varint values. From e7232cee4c7a9b4ca8d5306a5ae69f8ee11c37af Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Tue, 21 Apr 2026 15:27:42 -0400 Subject: [PATCH 04/12] docs --- .cursor/rules/karpathy-guidelines.mdc | 70 ++++++++++++++++++++++++ docs/migrating-to-v2.md | 78 +++++++++++++-------------- 2 files changed, 106 insertions(+), 42 deletions(-) create mode 100644 .cursor/rules/karpathy-guidelines.mdc diff --git a/.cursor/rules/karpathy-guidelines.mdc b/.cursor/rules/karpathy-guidelines.mdc new file mode 100644 index 000000000..b16f2e5aa --- /dev/null +++ b/.cursor/rules/karpathy-guidelines.mdc @@ -0,0 +1,70 @@ +--- +description: +alwaysApply: true +--- + +# Karpathy behavioral guidelines + +Behavioral guidelines to reduce common LLM coding mistakes. Merge with project-specific instructions as needed. + +**Tradeoff:** These guidelines bias toward caution over speed. For trivial tasks, use judgment. + +## 1. Think Before Coding + +**Don't assume. Don't hide confusion. Surface tradeoffs.** + +Before implementing: +- State your assumptions explicitly. If uncertain, ask. +- If multiple interpretations exist, present them - don't pick silently. +- If a simpler approach exists, say so. Push back when warranted. +- If something is unclear, stop. Name what's confusing. Ask. + +## 2. Simplicity First + +**Minimum code that solves the problem. Nothing speculative.** + +- No features beyond what was asked. +- No abstractions for single-use code. +- No "flexibility" or "configurability" that wasn't requested. +- No error handling for impossible scenarios. +- If you write 200 lines and it could be 50, rewrite it. + +Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify. + +## 3. Surgical Changes + +**Touch only what you must. Clean up only your own mess.** + +When editing existing code: +- Don't "improve" adjacent code, comments, or formatting. +- Don't refactor things that aren't broken. +- Match existing style, even if you'd do it differently. +- If you notice unrelated dead code, mention it - don't delete it. + +When your changes create orphans: +- Remove imports/variables/functions that YOUR changes made unused. +- Don't remove pre-existing dead code unless asked. + +The test: Every changed line should trace directly to the user's request. + +## 4. Goal-Driven Execution + +**Define success criteria. Loop until verified.** + +Transform tasks into verifiable goals: +- "Add validation" → "Write tests for invalid inputs, then make them pass" +- "Fix the bug" → "Write a test that reproduces it, then make it pass" +- "Refactor X" → "Ensure tests pass before and after" + +For multi-step tasks, state a brief plan: +``` +1. [Step] → verify: [check] +2. [Step] → verify: [check] +3. [Step] → verify: [check] +``` + +Strong success criteria let you loop independently. Weak criteria ("make it work") require constant clarification. + +--- + +**These guidelines are working if:** fewer unnecessary changes in diffs, fewer rewrites due to overcomplication, and clarifying questions come before implementation rather than after mistakes. diff --git a/docs/migrating-to-v2.md b/docs/migrating-to-v2.md index c3dfe44a1..c72208a9c 100644 --- a/docs/migrating-to-v2.md +++ b/docs/migrating-to-v2.md @@ -11,8 +11,8 @@ and decoders. | Principle | v1 | v2 | |---|---|---| | Feature count | Not stored; implied by geometry stream `num_values` | `feature_count` varint in every layer header | -| Column layout | Separate metadata section then data section | Column-by-column: type + config + data together | -| Stream type identification | Explicit `stream_type` byte on every stream | Eliminated; role implied by column type and stream position | +| Column layout | Separate metadata section then data section | Geometry section first, then counted columns encoded as type + config + data | +| Stream type identification | Explicit `stream_type` byte on every stream | Eliminated; role implied by `geometry_layout` or by column type plus stream position | | Count per stream | Always stored as varint | Omitted when equal to `feature_count` or `popcount(presence_bitfield)`; required only when neither applies | | Byte length per stream | Always stored as varint | Encoded in `physical` field: `None-noLen` omits it (derivable as count × width); `None-withLen`, `VarInt`, `FastPFor128` always carry it | | Presence streams | Bool-RLE stream with 4-field header | Raw packed bitfield, no header; sharable across columns via index reference | @@ -70,8 +70,11 @@ before any column data can be located. [varint name_len] [name bytes] [varint extent] [varint feature_count] ← NEW -[varint column_count] +── geometry section ────────────────────────────── +[u8 geometry_layout] ← NEW; one of the geometry layouts below +[geometry streams...] ── columns (merged meta + data) ────────────────── +[varint column_count] ← counts only non-geometry top-level columns [column 0: type + name? + presence? + data] [column 1: type + name? + presence? + data] ... @@ -80,8 +83,13 @@ before any column data can be located. **What changes:** - `feature_count` is inserted after `extent`. +- The geometry section now begins with `geometry_layout`, followed by the geometry + streams it declares. +- The geometry section appears before the counted columns in the layer body. - The metadata section is eliminated. Each column is fully self-describing; its type byte, optional name, optional presence section, and stream data appear contiguously. +- `column_count` now appears immediately before the counted columns and covers only + ids, scalars, strings, and shared-dict columns. **Derived invariants unlocked by `feature_count`:** @@ -129,8 +137,7 @@ depending on its column type: [data section] ``` -`has_name()` is false for `Id`, `OptId`, `LongId`, `OptLongId`, and `Geometry` (same -rule as v1). +`has_name()` is false for `Id`, `OptId`, `LongId`, and `OptLongId` (same rule as v1). ### v1 split vs. v2 merged — concrete example @@ -148,14 +155,15 @@ rule as v1). [i32 streams...] ``` -**v2** (one-pass layout): +**v2** (one-pass layout; geometry moved out of `column_count`): ``` [varint feature_count] -[varint 3] ← column_count -[0x04] ← Geometry column_type -[geometry_flags] +── geometry section ── +[0x04] ← geometry_layout [geometry streams...] +── columns ──────────── +[varint 2] ← column_count (Id, OptI32 only) [0x00] ← Id column_type [enc_byte] [data...] [0x11] ← OptI32 column_type (own presence — no prefix varint) @@ -405,7 +413,7 @@ Optional — shared presence (`OptRef*`): --- -## Geometry Column +## Geometry Section ### v1 layout @@ -435,18 +443,20 @@ Stream presence is implicit: `stream_count` streams follow, each identified by i ### v2 layout -The `geometry_flags` byte is eliminated entirely. The column type byte is one of -the named `Geo*` types below, which unambiguously specifies which streams are -present and in what order. No name is written (same rule as v1). +Geometry is no longer encoded as a regular column type. Instead, v2 starts with a +geometry section. That section begins with a single `geometry_layout` byte, which +selects one of the named layouts below and unambiguously specifies which geometry +streams are present and in what order. After the geometry section, `column_count` +and the regular columns follow. Geometry does not contribute to that count. ``` -[u8 column_type] ← one of the Geo* column types; no name bytes follow -[streams in fixed order determined by column_type; see table below] +[u8 geometry_layout] ← first byte of the geometry section +[geometry streams in fixed order determined by geometry_layout; see table below] ``` Every stream: `[encoding_byte] [optional: varint num_values] [optional: varint byte_length] [data]` -| Column type | Streams (in order, all `has_explicit_count = 1` except Types) | +| Geometry layout | Streams (in order, all `has_explicit_count = 1` except Types) | |---|---| | `GeoPoints` | Types¹, Vertices | | `GeoPointsDict` | Types¹, VertexData (dict), VertexOffsets | @@ -465,11 +475,16 @@ Every stream: `[encoding_byte] [optional: varint num_values] [optional: varint b ¹ Types stream: `has_explicit_count = 0` (count = `feature_count`) +This spec intentionally does **not** include the extra tessellated-with-rings layouts, for example, cases with ring topology but missing or +degenerate tessellation payloads. Here, those cases fall back to the corresponding +non-tessellated polygon layout (`GeoPolygons*` / `GeoMultiPolygons*` as applicable), +and no tessellation streams (`TriLengths`, `IndexBuffer`) are written. + **Dict columns** store a deduplicated vertex dictionary in VertexData and per-vertex indices in VertexOffsets. The vertex encoding (plain delta, CwDelta, Morton) is specified by the `logical` field in the VertexData encoding byte. -**Mixed-type columns** use the type whose stream set is a superset of all geometry +**Mixed-type layers** use the layout whose stream set is a superset of all geometry types present: - Point + LineString → `GeoLines` (Points consume no PartLengths entry) - LineString + Polygon → `GeoPolygons` (LineString vertex counts stored in RingLengths) @@ -479,9 +494,9 @@ types present: | Aspect | v1 | v2 | |---|---|---| -| Stream presence declaration | `varint stream_count` + `stream_type` per stream | Encoded in column type | +| Stream presence declaration | `varint stream_count` + `stream_type` per stream | Encoded in `geometry_layout` | | `stream_count` varint | 1–5 bytes | Eliminated | -| `geometry_flags` byte | n/a (v1 has no flags) | Eliminated (merged into column type) | +| Geometry selector | Geometry is a regular column (`column_type = 0x04`) | First byte of the geometry section; outside `column_count` | | `stream_type` per stream | 1 byte × N streams | Eliminated | | `num_values` for types stream | varint (= feature_count) | Omitted | | Vertex encoding style | Read from `stream_type` subtype | Read from `logical` field in encoding byte | @@ -965,31 +980,10 @@ Every nullable scalar/string type has **three** variants: | `SharedDictFsst` | — | — | | `SharedDictChildRef` | — (null-at-0; no presence bitfield) | -**Geometry types** (no optional variants; geometry columns are always present): - -| Column type | Description | -|---|---| -| `GeoPoints` | Point geometries | -| `GeoPointsDict` | Point geometries with vertex dictionary | -| `GeoMultiPoints` | MultiPoint geometries | -| `GeoMultiPointsDict` | MultiPoint geometries with vertex dictionary | -| `GeoLines` | LineString / mixed point+line geometries | -| `GeoLinesDict` | LineString geometries with vertex dictionary | -| `GeoMultiLines` | MultiLineString geometries | -| `GeoMultiLinesDict` | MultiLineString geometries with vertex dictionary | -| `GeoPolygons` | Polygon / mixed geometries | -| `GeoPolygonsDict` | Polygon geometries with vertex dictionary | -| `GeoMultiPolygons` | MultiPolygon geometries | -| `GeoMultiPolygonsDict` | MultiPolygon geometries with vertex dictionary | -| `GeoTessPolygons` | Tessellated polygons (index buffer, no ring topology) | -| `GeoTessPolygonsWithOutlines` | Tessellated polygons with outline ring topology | - **Invariants:** - Dict-family string types (`OptStrDict`, `OptStrFsstDict`, `OptSharedDictChildRef`) have no shared-presence variant because they encode null via index 0, not via a presence bitfield. - Shared-presence (`OptRef*`) code = own-presence code with bit 7 set. -- Geometry column types have no optional variants; a missing geometry column is simply absent - from the layer. --- @@ -1008,7 +1002,7 @@ determinable without it: | Column type | How stream role is known | |---|---| | Scalar / ID | Single data stream; role is trivial | -| `Geo*` | Column type determines which streams are present; position in fixed sequence declares role | +| Geometry section | `geometry_layout` determines which streams are present; position in fixed sequence declares role | | `StrPlain` / `OptStrPlain` | Positions: 1=lengths, 2=string-data | | `StrDict` / `OptStrDict` | Positions: 1=dict-lengths, 2=dict-data, 3=indices | | `StrFsst` / `OptStrFsst` | Positions: 1=sym-lengths, 2=sym-table, 3=lengths, 4=corpus | @@ -1035,7 +1029,7 @@ Per-stream header costs, typical small-integer varint values. | Non-optional scalar, FastPFor128 | 5–7 | 1 (enc) + varint(size) | 2–4 | | Optional scalar, VarInt | 4–6 | 1 (enc) + varint(size) | 2–4 | | RLE stream, VarInt | 6–10 | 1 | 5–9 | -| Geometry: stream_count varint | 1–3 | 0 (encoded in column type) | 1–3 | +| Geometry: stream_count varint | 1–3 | 0 (encoded in `geometry_layout`) | 1–3 | | Geometry: types stream | 4–6 | 1 | 3–5 | | Geometry: aux stream, FastPFor | 6–10 | 1 + varint(count) + varint(size) | 2–4 | | Morton vertex stream | 8–14 | 1 + ext + 2 varints | 4–8 | From b11f4eab9d3fb839064f345877f433f647a1ac5e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:35:28 +0000 Subject: [PATCH 05/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .cursor/rules/karpathy-guidelines.mdc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cursor/rules/karpathy-guidelines.mdc b/.cursor/rules/karpathy-guidelines.mdc index b16f2e5aa..7e7b02e35 100644 --- a/.cursor/rules/karpathy-guidelines.mdc +++ b/.cursor/rules/karpathy-guidelines.mdc @@ -1,5 +1,5 @@ --- -description: +description: alwaysApply: true --- From bf455dffd36ca2e189c0701c2ff37e3ab8b6d549 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Tue, 21 Apr 2026 17:46:15 -0400 Subject: [PATCH 06/12] try implementing v2 --- rust/mlt-core/src/decoder/layer.rs | 8 +- rust/mlt-core/src/decoder/model.rs | 6 + rust/mlt-core/src/lib.rs | 1 + rust/mlt-core/src/utils/test_helpers.rs | 3 +- rust/mlt-core/src/v2.rs | 1485 +++++++++++++++++++++++ rust/mlt-core/tests/unknown_layer.rs | 11 +- rust/mlt/src/convert.rs | 62 +- 7 files changed, 1563 insertions(+), 13 deletions(-) create mode 100644 rust/mlt-core/src/v2.rs diff --git a/rust/mlt-core/src/decoder/layer.rs b/rust/mlt-core/src/decoder/layer.rs index 00e7ea9c0..e3a2bb551 100644 --- a/rust/mlt-core/src/decoder/layer.rs +++ b/rust/mlt-core/src/decoder/layer.rs @@ -1,6 +1,7 @@ use crate::codecs::varint::parse_varint; use crate::decoder::{Layer01, Unknown}; use crate::utils::{parse_u8, take}; +use crate::v2::decode_v2_layer; use crate::{DecodeState, Decoder, Layer, MltError, MltRefResult, MltResult, ParsedLayer, Parser}; impl<'a, S: DecodeState> Layer<'a, S> { @@ -9,7 +10,7 @@ impl<'a, S: DecodeState> Layer<'a, S> { pub fn as_layer01(&self) -> Option<&Layer01<'a, S>> { match self { Self::Tag01(l) => Some(l), - Self::Unknown(_) => None, + Self::Tag02(_) | Self::Unknown(_) => None, } } @@ -18,7 +19,7 @@ impl<'a, S: DecodeState> Layer<'a, S> { pub fn into_layer01(self) -> Option> { match self { Self::Tag01(l) => Some(l), - Self::Unknown(_) => None, + Self::Tag02(_) | Self::Unknown(_) => None, } } } @@ -37,8 +38,8 @@ impl<'a> Layer<'a> { let (input, value) = take(input, size)?; let layer = match tag { - // For now, we only support tag 0x01 layers, but more will be added soon 1 => Layer::Tag01(Layer01::from_bytes(value, parser)?), + 2 => Layer::Tag02(decode_v2_layer(value)?), tag => Layer::Unknown(Unknown { tag, value }), }; @@ -52,6 +53,7 @@ impl<'a> Layer<'a> { pub fn decode_all(self, dec: &mut Decoder) -> MltResult> { match self { Layer::Tag01(lazy) => Ok(Layer::Tag01(lazy.decode_all(dec)?)), + Layer::Tag02(t) => Ok(Layer::Tag02(t)), Layer::Unknown(u) => Ok(Layer::Unknown(u)), } } diff --git a/rust/mlt-core/src/decoder/model.rs b/rust/mlt-core/src/decoder/model.rs index 13c563392..f6066e246 100644 --- a/rust/mlt-core/src/decoder/model.rs +++ b/rust/mlt-core/src/decoder/model.rs @@ -14,6 +14,11 @@ use crate::{DecodeState, Lazy, Parsed}; pub enum Layer<'a, S: DecodeState = Lazy> { /// MVT-compatible layer (tag = 1) Tag01(Layer01<'a, S>), + /// Experimental v2 layer (tag = 2), decoded eagerly to owned row-oriented form. + /// + /// Unlike `Tag01`, this variant is always fully decoded when parsed: the + /// `S` type parameter is ignored and the data is stored as a [`TileLayer01`]. + Tag02(TileLayer01), /// Unknown layer with tag, size, and value Unknown(Unknown<'a>), } @@ -26,6 +31,7 @@ where fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Tag01(l) => f.debug_tuple("Tag01").field(l).finish(), + Self::Tag02(t) => f.debug_tuple("Tag02").field(t).finish(), Self::Unknown(u) => f.debug_tuple("Unknown").field(u).finish(), } } diff --git a/rust/mlt-core/src/lib.rs b/rust/mlt-core/src/lib.rs index a1cacaec1..1b9071b08 100644 --- a/rust/mlt-core/src/lib.rs +++ b/rust/mlt-core/src/lib.rs @@ -23,6 +23,7 @@ pub(crate) mod decoder; pub mod encoder; pub(crate) mod errors; pub(crate) mod utils; +pub mod v2; pub use convert::{geojson, mvt}; pub use decoder::{ diff --git a/rust/mlt-core/src/utils/test_helpers.rs b/rust/mlt-core/src/utils/test_helpers.rs index 9ab84d7ff..91b146240 100644 --- a/rust/mlt-core/src/utils/test_helpers.rs +++ b/rust/mlt-core/src/utils/test_helpers.rs @@ -25,6 +25,7 @@ pub fn assert_empty(result: MltRefResult) -> T { pub fn into_layer01(layer: Layer) -> Layer01 { match layer { Layer::Tag01(layer01) => layer01, - Layer::Unknown(_) => panic!("expected Tag01 layer"), + Layer::Tag02(_) => panic!("expected Tag01 layer, got Tag02"), + Layer::Unknown(_) => panic!("expected Tag01 layer, got Unknown"), } } diff --git a/rust/mlt-core/src/v2.rs b/rust/mlt-core/src/v2.rs new file mode 100644 index 000000000..3512990aa --- /dev/null +++ b/rust/mlt-core/src/v2.rs @@ -0,0 +1,1485 @@ +//! MLT v2 experimental encoder and decoder. +//! +//! This module implements a minimal v2 wire format for round-trip experimentation. +//! The format uses tag `2` to distinguish v2 layers from v1 layers (tag `1`). +//! +//! Simplifications in this initial implementation: +//! - VarInt physical encoding only (no FastPFor, no Morton) +//! - ComponentwiseDelta + VarInt for vertex coordinates +//! - StrPlain (lengths + raw bytes) for strings (no FSST, no shared-dict) +//! - Own-presence bitfields for optional columns (no shared-presence / OptRef*) +//! - Pure single-geometry-type layers only (no mixed geometry) + +use geo_types::{Coord, Geometry, LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon}; +use integer_encoding::{VarInt as _, VarIntWriter as _}; +use zigzag::ZigZag as _; + +use crate::MltError; +use crate::MltResult; +use crate::codecs::varint::parse_varint; +use crate::codecs::zigzag::encode_componentwise_delta_vec2s; +use crate::decoder::{GeometryType, PropValue, TileFeature, TileLayer01}; + +// ── Encoding byte bit positions ─────────────────────────────────────────────── +// bit 7: has_explicit_count +// bits 6-4: logical (0=None, 1=Delta, 2=CwDelta, 3=Rle, 4=DeltaRle, 5=Morton) +// bits 3-2: physical (0=None-noLen, 1=None-withLen, 2=VarInt, 3=FastPFor128) +// bits 1-0: reserved (0) + +/// `logical=None, physical=VarInt, count from context` +const ENC_VARINT: u8 = 0x08; +/// `logical=None, physical=VarInt, explicit count follows` +const ENC_VARINT_EXPL: u8 = 0x88; +/// `logical=CwDelta, physical=VarInt, explicit count follows` +const ENC_CWDELTA_VARINT_EXPL: u8 = 0xA8; +/// `logical=None, physical=None-noLen, explicit count follows` +const ENC_RAW_EXPL: u8 = 0x80; + +// ── Geometry layout byte values ─────────────────────────────────────────────── + +/// MLT v2 geometry section layout codes. +/// +/// Encodes which geometry streams are present and in what order. +/// Dict and tessellation variants are excluded from this initial implementation. +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum GeoLayout { + /// Types, Vertices + Points = 0, + /// Types, GeoLengths, Vertices + MultiPoints = 2, + /// Types, PartLengths, Vertices + Lines = 4, + /// Types, GeoLengths, PartLengths, Vertices + MultiLines = 6, + /// Types, PartLengths, RingLengths, Vertices + Polygons = 8, + /// Types, GeoLengths, PartLengths, RingLengths, Vertices + MultiPolygons = 10, +} + +impl TryFrom for GeoLayout { + type Error = MltError; + fn try_from(v: u8) -> MltResult { + match v { + 0 => Ok(Self::Points), + 2 => Ok(Self::MultiPoints), + 4 => Ok(Self::Lines), + 6 => Ok(Self::MultiLines), + 8 => Ok(Self::Polygons), + 10 => Ok(Self::MultiPolygons), + _ => Err(MltError::NotImplemented("unsupported v2 geometry layout")), + } + } +} + +// ── Column type codes (reuse v1 codes for backward clarity) ────────────────── + +// IDs +const COL_ID: u8 = 0; +const COL_OPT_ID: u8 = 1; +// Scalars (mirror v1 ColumnType values) +const COL_BOOL: u8 = 10; +const COL_OPT_BOOL: u8 = 11; +const COL_I8: u8 = 12; +const COL_OPT_I8: u8 = 13; +const COL_U8: u8 = 14; +const COL_OPT_U8: u8 = 15; +const COL_I32: u8 = 16; +const COL_OPT_I32: u8 = 17; +const COL_U32: u8 = 18; +const COL_OPT_U32: u8 = 19; +const COL_I64: u8 = 20; +const COL_OPT_I64: u8 = 21; +const COL_U64: u8 = 22; +const COL_OPT_U64: u8 = 23; +const COL_F32: u8 = 24; +const COL_OPT_F32: u8 = 25; +const COL_F64: u8 = 26; +const COL_OPT_F64: u8 = 27; +// Strings: StrPlain / OptStrPlain +const COL_STR: u8 = 28; +const COL_OPT_STR: u8 = 29; + +// ── Encoder ─────────────────────────────────────────────────────────────────── + +impl TileLayer01 { + /// Encode this layer to the MLT **v2** experimental wire format. + /// + /// Returns a complete framed record: `[varint(body_len+1)][tag=2][body…]` + /// ready to be concatenated with other layers in a tile. + /// + /// Returns an empty slice for empty layers (no features). + pub fn encode_v2(&self) -> MltResult> { + if self.features.is_empty() { + return Ok(Vec::new()); + } + + let feature_count = self.features.len(); + + // ── Geometry ────────────────────────────────────────────────────────── + let geom = collect_geometry(&self.features)?; + + // ── IDs ─────────────────────────────────────────────────────────────── + let has_ids = self.features.iter().any(|f| f.id.is_some()); + let all_ids: Vec> = if has_ids { + self.features.iter().map(|f| f.id).collect() + } else { + Vec::new() + }; + + // ── Property columns ────────────────────────────────────────────────── + let prop_count = self.property_names.len(); + + // ── Build body ──────────────────────────────────────────────────────── + let mut body: Vec = Vec::new(); + + // name + let name_bytes = self.name.as_bytes(); + body.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; + body.extend_from_slice(name_bytes); + + // extent + body.write_varint(self.extent).map_err(MltError::from)?; + + // feature_count (v2 addition) + body.write_varint(feature_count as u32).map_err(MltError::from)?; + + // ── Geometry section ────────────────────────────────────────────────── + body.push(geom.layout as u8); + write_geometry_streams( + &mut body, + &geom.types, + geom.geo_lengths.as_deref(), + geom.part_lengths.as_deref(), + geom.ring_lengths.as_deref(), + &geom.vertices, + feature_count, + )?; + + // ── Columns ─────────────────────────────────────────────────────────── + let col_count = u32::try_from(usize::from(has_ids) + prop_count)?; + body.write_varint(col_count).map_err(MltError::from)?; + + // ID column + if has_ids { + write_id_column(&mut body, &all_ids, feature_count)?; + } + + // Property columns + for (i, name) in self.property_names.iter().enumerate() { + let vals: Vec<&PropValue> = self.features.iter().map(|f| &f.properties[i]).collect(); + write_property_column(&mut body, name, &vals, feature_count)?; + } + + // ── Frame: [varint(body_len+1)][tag=2][body] ────────────────────────── + let size = u32::try_from(body.len() + 1)?; // +1 for the tag byte + let mut out = Vec::with_capacity(5 + 1 + body.len()); + out.write_varint(size).map_err(MltError::from)?; + out.push(2_u8); // tag = 2 + out.extend_from_slice(&body); + + Ok(out) + } +} + +// ── Geometry collection ─────────────────────────────────────────────────────── + +struct CollectedGeometry { + layout: GeoLayout, + /// Geometry type per feature (u8 repr of GeometryType) + types: Vec, + /// Number of component geometries per multi-geometry feature + geo_lengths: Option>, + /// Number of rings per polygon, or vertices per line + part_lengths: Option>, + /// Number of vertices per ring + ring_lengths: Option>, + /// Flat vertex buffer `[x0, y0, x1, y1, …]` + vertices: Vec, +} + +fn collect_geometry(features: &[TileFeature]) -> MltResult { + let has_multi = features.iter().any(|f| { + matches!( + f.geometry, + Geometry::MultiPoint(_) | Geometry::MultiLineString(_) | Geometry::MultiPolygon(_) + ) + }); + let has_ring = features + .iter() + .any(|f| matches!(f.geometry, Geometry::Polygon(_) | Geometry::MultiPolygon(_))); + let has_line = features + .iter() + .any(|f| matches!(f.geometry, Geometry::LineString(_) | Geometry::MultiLineString(_))); + let has_part = has_line || has_ring; + + let layout = match (has_multi, has_ring, has_part) { + (_, true, _) if has_multi => GeoLayout::MultiPolygons, + (_, true, _) => GeoLayout::Polygons, + (_, false, true) if has_multi => GeoLayout::MultiLines, + (_, false, true) => GeoLayout::Lines, + (true, false, false) => GeoLayout::MultiPoints, + (false, false, false) => GeoLayout::Points, + }; + + let mut types = Vec::with_capacity(features.len()); + let mut geo_lengths: Option> = if has_multi { Some(Vec::new()) } else { None }; + let mut part_lengths: Option> = if has_part { Some(Vec::new()) } else { None }; + let mut ring_lengths: Option> = if has_ring { Some(Vec::new()) } else { None }; + let mut vertices: Vec = Vec::new(); + + for feat in features { + push_feature_geometry( + &feat.geometry, + &mut types, + &mut geo_lengths, + &mut part_lengths, + &mut ring_lengths, + &mut vertices, + )?; + } + + Ok(CollectedGeometry { layout, types, geo_lengths, part_lengths, ring_lengths, vertices }) +} + +fn push_coords(coords: impl Iterator>, vertices: &mut Vec) -> u32 { + let start = vertices.len(); + for c in coords { + vertices.push(c.x); + vertices.push(c.y); + } + u32::try_from((vertices.len() - start) / 2).expect("vertex count fits u32") +} + +/// For polygon rings: GeoJSON rings include the closing coordinate (first == last), +/// which MLT does not store. Drop the last coordinate if it equals the first. +fn ring_vertex_count(ring: &LineString, vertices: &mut Vec) -> u32 { + let coords = &ring.0; + let has_closing = coords.len() >= 2 && coords.first() == coords.last(); + let coords_to_write = if has_closing { &coords[..coords.len() - 1] } else { &coords[..] }; + push_coords(coords_to_write.iter().copied(), vertices) +} + +fn push_feature_geometry( + geom: &Geometry, + types: &mut Vec, + geo_lengths: &mut Option>, + part_lengths: &mut Option>, + ring_lengths: &mut Option>, + vertices: &mut Vec, +) -> MltResult<()> { + match geom { + Geometry::Point(pt) => { + types.push(GeometryType::Point as u32); + vertices.push(pt.0.x); + vertices.push(pt.0.y); + } + Geometry::MultiPoint(mp) => { + types.push(GeometryType::MultiPoint as u32); + let count = push_coords(mp.0.iter().map(|p| p.0), vertices); + geo_lengths.as_mut().map(|v| v.push(count)); + } + Geometry::LineString(ls) => { + types.push(GeometryType::LineString as u32); + let count = push_coords(ls.0.iter().copied(), vertices); + part_lengths.as_mut().map(|v| v.push(count)); + } + Geometry::MultiLineString(mls) => { + types.push(GeometryType::MultiLineString as u32); + geo_lengths.as_mut().map(|v| v.push(mls.0.len() as u32)); + for ls in &mls.0 { + let count = push_coords(ls.0.iter().copied(), vertices); + part_lengths.as_mut().map(|v| v.push(count)); + } + } + Geometry::Polygon(poly) => { + types.push(GeometryType::Polygon as u32); + let ring_count = 1 + poly.interiors().len(); + part_lengths.as_mut().map(|v| v.push(ring_count as u32)); + let ext_count = ring_vertex_count(poly.exterior(), vertices); + ring_lengths.as_mut().map(|v| v.push(ext_count)); + for hole in poly.interiors() { + let hole_count = ring_vertex_count(hole, vertices); + ring_lengths.as_mut().map(|v| v.push(hole_count)); + } + } + Geometry::MultiPolygon(mp) => { + types.push(GeometryType::MultiPolygon as u32); + geo_lengths.as_mut().map(|v| v.push(mp.0.len() as u32)); + for poly in &mp.0 { + let ring_count = 1 + poly.interiors().len(); + part_lengths.as_mut().map(|v| v.push(ring_count as u32)); + let ext_count = ring_vertex_count(poly.exterior(), vertices); + ring_lengths.as_mut().map(|v| v.push(ext_count)); + for hole in poly.interiors() { + let hole_count = ring_vertex_count(hole, vertices); + ring_lengths.as_mut().map(|v| v.push(hole_count)); + } + } + } + _other => { + return Err(MltError::NotImplemented( + "v2 encoder: unsupported geometry type (GeometryCollection not supported)", + )); + } + } + Ok(()) +} + +// ── Stream write helpers ────────────────────────────────────────────────────── + +/// Write a VarInt-encoded `u32` stream with implicit count (= `feature_count`). +fn write_u32_stream_implicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { + buf.push(ENC_VARINT); + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +/// Write a VarInt-encoded `u32` stream with explicit count. +fn write_u32_stream_explicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { + buf.push(ENC_VARINT_EXPL); + buf.write_varint(values.len() as u32).map_err(MltError::from)?; + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +/// Write a VarInt-encoded `u64` stream with implicit count. +fn write_u64_stream_implicit(buf: &mut Vec, values: &[u64]) -> MltResult<()> { + buf.push(ENC_VARINT); + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +/// Write a ZigZag+VarInt encoded `i32` stream with implicit count. +fn write_i32_stream_implicit(buf: &mut Vec, values: &[i32]) -> MltResult<()> { + buf.push(ENC_VARINT); + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(i32::encode(v)).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +/// Write a ZigZag+VarInt encoded `i64` stream with implicit count. +fn write_i64_stream_implicit(buf: &mut Vec, values: &[i64]) -> MltResult<()> { + buf.push(ENC_VARINT); + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(i64::encode(v)).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +/// Write a fixed-width raw byte stream with explicit count (number of data bytes). +fn write_raw_bytes(buf: &mut Vec, data: &[u8]) -> MltResult<()> { + buf.push(ENC_RAW_EXPL); + buf.write_varint(data.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(data); + Ok(()) +} + +/// Write vertex data using ComponentwiseDelta + VarInt with explicit count (vertex pairs). +fn write_cwdelta_vertices(buf: &mut Vec, flat_verts: &[i32]) -> MltResult<()> { + debug_assert!(flat_verts.len() % 2 == 0, "vertex buffer must have even length"); + buf.push(ENC_CWDELTA_VARINT_EXPL); + let pair_count = flat_verts.len() / 2; + buf.write_varint(pair_count as u32).map_err(MltError::from)?; + + let mut encoded: Vec = Vec::new(); + encode_componentwise_delta_vec2s(flat_verts, &mut encoded); + + let mut tmp = Vec::new(); + for v in &encoded { + tmp.write_varint(*v).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +fn write_geometry_streams( + buf: &mut Vec, + types: &[u32], + geo_lengths: Option<&[u32]>, + part_lengths: Option<&[u32]>, + ring_lengths: Option<&[u32]>, + vertices: &[i32], + feature_count: usize, +) -> MltResult<()> { + debug_assert_eq!(types.len(), feature_count); + // Types stream: count = feature_count (implicit) + write_u32_stream_implicit(buf, types)?; + if let Some(g) = geo_lengths { + write_u32_stream_explicit(buf, g)?; + } + if let Some(p) = part_lengths { + write_u32_stream_explicit(buf, p)?; + } + if let Some(r) = ring_lengths { + write_u32_stream_explicit(buf, r)?; + } + write_cwdelta_vertices(buf, vertices) +} + +// ── Presence bitfield ───────────────────────────────────────────────────────── + +/// Write `ceil(feature_count / 8)` bytes of a packed presence bitfield. +/// +/// Bit `i` (LSB-first within each byte) is set when `values[i]` is non-null. +fn write_presence(buf: &mut Vec, null_mask: &[bool]) { + let byte_count = null_mask.len().div_ceil(8); + let start = buf.len(); + buf.resize(start + byte_count, 0u8); + for (i, &present) in null_mask.iter().enumerate() { + if present { + buf[start + i / 8] |= 1 << (i % 8); + } + } +} + +// ── ID column ───────────────────────────────────────────────────────────────── + +fn write_id_column(buf: &mut Vec, ids: &[Option], _feature_count: usize) -> MltResult<()> { + let any_null = ids.iter().any(|id| id.is_none()); + + if any_null { + // OptId: presence bitfield + data for present values + buf.push(COL_OPT_ID); + let presence: Vec = ids.iter().map(|id| id.is_some()).collect(); + write_presence(buf, &presence); + let values: Vec = ids.iter().filter_map(|id| *id).collect(); + write_u64_stream_implicit(buf, &values)?; + } else { + // Id: all values present + buf.push(COL_ID); + let values: Vec = ids.iter().map(|id| id.unwrap_or(0)).collect(); + write_u64_stream_implicit(buf, &values)?; + } + + Ok(()) +} + +// ── Property column ─────────────────────────────────────────────────────────── + +fn write_property_column( + buf: &mut Vec, + name: &str, + values: &[&PropValue], + feature_count: usize, +) -> MltResult<()> { + debug_assert_eq!(values.len(), feature_count); + + // Determine the dominant type and whether any value is null + let any_null = values.iter().any(|v| is_null(v)); + + let col_type = prop_column_type(values[0], any_null); + buf.push(col_type); + + // name + let name_bytes = name.as_bytes(); + buf.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(name_bytes); + + // presence bitfield (for optional columns) + if any_null { + let presence: Vec = values.iter().map(|v| !is_null(v)).collect(); + write_presence(buf, &presence); + } + + // data streams + write_prop_data(buf, values, any_null) +} + +fn is_null(v: &PropValue) -> bool { + match v { + PropValue::Bool(o) => o.is_none(), + PropValue::I8(o) => o.is_none(), + PropValue::U8(o) => o.is_none(), + PropValue::I32(o) => o.is_none(), + PropValue::U32(o) => o.is_none(), + PropValue::I64(o) => o.is_none(), + PropValue::U64(o) => o.is_none(), + PropValue::F32(o) => o.is_none(), + PropValue::F64(o) => o.is_none(), + PropValue::Str(o) => o.is_none(), + } +} + +fn prop_column_type(sample: &PropValue, any_null: bool) -> u8 { + let (non_opt, opt) = match sample { + PropValue::Bool(_) => (COL_BOOL, COL_OPT_BOOL), + PropValue::I8(_) => (COL_I8, COL_OPT_I8), + PropValue::U8(_) => (COL_U8, COL_OPT_U8), + PropValue::I32(_) => (COL_I32, COL_OPT_I32), + PropValue::U32(_) => (COL_U32, COL_OPT_U32), + PropValue::I64(_) => (COL_I64, COL_OPT_I64), + PropValue::U64(_) => (COL_U64, COL_OPT_U64), + PropValue::F32(_) => (COL_F32, COL_OPT_F32), + PropValue::F64(_) => (COL_F64, COL_OPT_F64), + PropValue::Str(_) => (COL_STR, COL_OPT_STR), + }; + if any_null { opt } else { non_opt } +} + +fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> MltResult<()> { + match values[0] { + PropValue::Bool(_) => { + let data: Vec = + values.iter().filter(|v| !is_null(v)).map(|v| prop_bool(v) as u8).collect(); + write_raw_bytes(buf, &data)?; + } + PropValue::I8(_) => { + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| i8::encode(prop_i8(v)) as u8) + .collect(); + write_raw_bytes(buf, &data)?; + } + PropValue::U8(_) => { + let data: Vec = + values.iter().filter(|v| !is_null(v)).map(|v| prop_u8(v)).collect(); + write_raw_bytes(buf, &data)?; + } + PropValue::I32(_) => { + let data: Vec = + values.iter().filter(|v| !is_null(v)).map(|v| prop_i32(v)).collect(); + let implicit_count = !any_null; + if implicit_count { + write_i32_stream_implicit(buf, &data)?; + } else { + write_i32_stream_as_explicit(buf, &data)?; + } + } + PropValue::U32(_) => { + let data: Vec = + values.iter().filter(|v| !is_null(v)).map(|v| prop_u32(v)).collect(); + let implicit_count = !any_null; + if implicit_count { + write_u32_stream_implicit(buf, &data)?; + } else { + write_u32_popcount(buf, &data)?; + } + } + PropValue::I64(_) => { + let data: Vec = + values.iter().filter(|v| !is_null(v)).map(|v| prop_i64(v)).collect(); + if !any_null { + write_i64_stream_implicit(buf, &data)?; + } else { + write_i64_stream_as_explicit(buf, &data)?; + } + } + PropValue::U64(_) => { + let data: Vec = + values.iter().filter(|v| !is_null(v)).map(|v| prop_u64(v)).collect(); + if !any_null { + write_u64_stream_implicit(buf, &data)?; + } else { + write_u64_popcount(buf, &data)?; + } + } + PropValue::F32(_) => { + let mut data = Vec::new(); + for v in values.iter().filter(|v| !is_null(v)) { + data.extend_from_slice(&prop_f32(v).to_le_bytes()); + } + write_raw_bytes(buf, &data)?; + } + PropValue::F64(_) => { + let mut data = Vec::new(); + for v in values.iter().filter(|v| !is_null(v)) { + data.extend_from_slice(&prop_f64(v).to_le_bytes()); + } + write_raw_bytes(buf, &data)?; + } + PropValue::Str(_) => { + let strings: Vec> = values + .iter() + .map(|v| match v { + PropValue::Str(Some(s)) => Some(s.as_str()), + _ => None, + }) + .collect(); + write_string_col_data(buf, &strings, any_null)?; + } + } + Ok(()) +} + +/// Write string column data streams (lengths + raw bytes). +/// Only the non-null strings are written (count = popcount when any_null, else feature_count). +fn write_string_col_data(buf: &mut Vec, strings: &[Option<&str>], any_null: bool) -> MltResult<()> { + let present: Vec<&str> = strings.iter().filter_map(|s| *s).collect(); + let lengths: Vec = present.iter().map(|s| s.len() as u32).collect(); + let all_bytes: Vec = present.iter().flat_map(|s| s.as_bytes()).copied().collect(); + + // Lengths stream: count = popcount (implicit when any_null, else feature_count) + if any_null { + // count = popcount(presence), which the decoder can compute → use implicit + write_u32_stream_implicit(buf, &lengths)?; + } else { + write_u32_stream_implicit(buf, &lengths)?; + } + // String data: explicit count = total bytes + write_raw_bytes(buf, &all_bytes)?; + + Ok(()) +} + +// ── VarInt stream helpers for optional columns (count = popcount) ───────────── + +fn write_i32_stream_as_explicit(buf: &mut Vec, values: &[i32]) -> MltResult<()> { + buf.push(ENC_VARINT); // count = popcount (implicit for optional columns) + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(i32::encode(v)).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +fn write_i64_stream_as_explicit(buf: &mut Vec, values: &[i64]) -> MltResult<()> { + buf.push(ENC_VARINT); // count = popcount + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(i64::encode(v)).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +fn write_u32_popcount(buf: &mut Vec, values: &[u32]) -> MltResult<()> { + buf.push(ENC_VARINT); // count = popcount + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +fn write_u64_popcount(buf: &mut Vec, values: &[u64]) -> MltResult<()> { + buf.push(ENC_VARINT); // count = popcount + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v).map_err(MltError::from)?; + } + buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&tmp); + Ok(()) +} + +// ── PropValue field accessors ───────────────────────────────────────────────── + +fn prop_bool(v: &PropValue) -> bool { + if let PropValue::Bool(Some(b)) = v { *b } else { false } +} +fn prop_i8(v: &PropValue) -> i8 { + if let PropValue::I8(Some(b)) = v { *b } else { 0 } +} +fn prop_u8(v: &PropValue) -> u8 { + if let PropValue::U8(Some(b)) = v { *b } else { 0 } +} +fn prop_i32(v: &PropValue) -> i32 { + if let PropValue::I32(Some(b)) = v { *b } else { 0 } +} +fn prop_u32(v: &PropValue) -> u32 { + if let PropValue::U32(Some(b)) = v { *b } else { 0 } +} +fn prop_i64(v: &PropValue) -> i64 { + if let PropValue::I64(Some(b)) = v { *b } else { 0 } +} +fn prop_u64(v: &PropValue) -> u64 { + if let PropValue::U64(Some(b)) = v { *b } else { 0 } +} +fn prop_f32(v: &PropValue) -> f32 { + if let PropValue::F32(Some(b)) = v { *b } else { 0.0 } +} +fn prop_f64(v: &PropValue) -> f64 { + if let PropValue::F64(Some(b)) = v { *b } else { 0.0 } +} + +// ── Decoder ─────────────────────────────────────────────────────────────────── + +/// Decode an MLT **v2** layer body (the bytes after the `tag=2` byte). +/// +/// Returns a [`TileLayer01`] that can be used by the rest of the MLT tooling. +pub fn decode_v2_layer(data: &[u8]) -> MltResult { + // ── Header ──────────────────────────────────────────────────────────────── + let (data, name_len) = parse_varint::(data)?; + let name_len = name_len as usize; + if data.len() < name_len { + return Err(MltError::BufferUnderflow(name_len as u32, data.len())); + } + let name = std::str::from_utf8(&data[..name_len])?.to_string(); + let data = &data[name_len..]; + + let (data, extent) = parse_varint::(data)?; + let (data, feature_count) = parse_varint::(data)?; + let feature_count = feature_count as usize; + + // ── Geometry section ────────────────────────────────────────────────────── + if data.is_empty() { + return Err(MltError::MissingGeometry); + } + let layout = GeoLayout::try_from(data[0])?; + let data = &data[1..]; + + let (data, geometries) = decode_geometry(data, layout, feature_count)?; + + // ── Column count ────────────────────────────────────────────────────────── + let (mut data, col_count) = parse_varint::(data)?; + + // ── Columns ─────────────────────────────────────────────────────────────── + let mut feature_ids: Vec> = vec![None; feature_count]; + let mut property_names: Vec = Vec::new(); + let mut property_columns: Vec> = Vec::new(); + let mut presence_groups: Vec> = Vec::new(); + + for _ in 0..col_count { + if data.is_empty() { + return Err(MltError::BufferUnderflow(1, 0)); + } + let col_type = data[0]; + data = &data[1..]; + + match col_type { + COL_ID => { + let (new_data, ids) = decode_id_column(data, false, feature_count)?; + data = new_data; + for (i, id) in ids.into_iter().enumerate() { + feature_ids[i] = Some(id); + } + } + COL_OPT_ID => { + let (new_data, presence, ids) = decode_opt_id_column(data, feature_count)?; + data = new_data; + presence_groups.push(presence.clone()); + let mut iter = ids.into_iter(); + for (i, &present) in presence.iter().enumerate() { + if present { + feature_ids[i] = iter.next(); + } + } + } + COL_BOOL | COL_OPT_BOOL + | COL_I8 | COL_OPT_I8 + | COL_U8 | COL_OPT_U8 + | COL_I32 | COL_OPT_I32 + | COL_U32 | COL_OPT_U32 + | COL_I64 | COL_OPT_I64 + | COL_U64 | COL_OPT_U64 + | COL_F32 | COL_OPT_F32 + | COL_F64 | COL_OPT_F64 + | COL_STR | COL_OPT_STR => { + let (new_data, col_name, col_values) = + decode_property_column(data, col_type, feature_count, &mut presence_groups)?; + data = new_data; + property_names.push(col_name); + property_columns.push(col_values); + } + _ => { + return Err(MltError::NotImplemented("unsupported v2 column type")); + } + } + } + + // ── Assemble TileLayer01 ────────────────────────────────────────────────── + let prop_count = property_names.len(); + let features = geometries + .into_iter() + .enumerate() + .map(|(i, geom)| { + let id = feature_ids[i]; + let properties = (0..prop_count).map(|j| property_columns[j][i].clone()).collect(); + TileFeature { id, geometry: geom, properties } + }) + .collect(); + + Ok(TileLayer01 { name, extent, property_names, features }) +} + +// ── Geometry decoding ───────────────────────────────────────────────────────── + +fn decode_geometry<'a>( + data: &'a [u8], + layout: GeoLayout, + feature_count: usize, +) -> MltResult<(&'a [u8], Vec>)> { + // Types stream: count = feature_count (implicit) + let (data, type_vals) = read_u32_stream(data, feature_count, false)?; + let geo_types: Vec = type_vals + .iter() + .map(|&v| GeometryType::try_from(v as u8).map_err(MltError::from)) + .collect::>()?; + + // Optional auxiliary streams, depending on layout + let (data, geo_lengths) = if matches!( + layout, + GeoLayout::MultiPoints | GeoLayout::MultiLines | GeoLayout::MultiPolygons + ) { + let (d, v) = read_u32_stream(data, 0, true)?; + (d, Some(v)) + } else { + (data, None) + }; + + let (data, part_lengths) = if matches!( + layout, + GeoLayout::Lines | GeoLayout::MultiLines | GeoLayout::Polygons | GeoLayout::MultiPolygons + ) { + let (d, v) = read_u32_stream(data, 0, true)?; + (d, Some(v)) + } else { + (data, None) + }; + + let (data, ring_lengths) = if matches!(layout, GeoLayout::Polygons | GeoLayout::MultiPolygons) { + let (d, v) = read_u32_stream(data, 0, true)?; + (d, Some(v)) + } else { + (data, None) + }; + + // Vertices stream: explicit count (vertex pairs) + let (data, vertices) = read_cwdelta_stream(data)?; + + let geometries = reconstruct_geometries( + layout, + &geo_types, + geo_lengths.as_deref(), + part_lengths.as_deref(), + ring_lengths.as_deref(), + &vertices, + )?; + + Ok((data, geometries)) +} + +fn reconstruct_geometries( + layout: GeoLayout, + types: &[GeometryType], + geo_lengths: Option<&[u32]>, + part_lengths: Option<&[u32]>, + ring_lengths: Option<&[u32]>, + vertices: &[i32], +) -> MltResult>> { + let mut geoms = Vec::with_capacity(types.len()); + let mut vert_pos: usize = 0; // current position in vertices (counting i32 elements) + + match layout { + GeoLayout::Points => { + for _ in types { + let x = vertices[vert_pos]; + let y = vertices[vert_pos + 1]; + vert_pos += 2; + geoms.push(Geometry::Point(Point(Coord { x, y }))); + } + } + GeoLayout::MultiPoints => { + let geo_lens = geo_lengths.expect("MultiPoints requires GeoLengths"); + for &geo_len in geo_lens { + let pts: Vec> = (0..geo_len as usize) + .map(|_| { + let x = vertices[vert_pos]; + let y = vertices[vert_pos + 1]; + vert_pos += 2; + Point(Coord { x, y }) + }) + .collect(); + geoms.push(Geometry::MultiPoint(MultiPoint(pts))); + } + } + GeoLayout::Lines => { + let part_lens = part_lengths.expect("Lines requires PartLengths"); + for &part_len in part_lens { + let coords: Vec> = (0..part_len as usize) + .map(|_| { + let c = Coord { x: vertices[vert_pos], y: vertices[vert_pos + 1] }; + vert_pos += 2; + c + }) + .collect(); + geoms.push(Geometry::LineString(LineString(coords))); + } + } + GeoLayout::MultiLines => { + let geo_lens = geo_lengths.expect("MultiLines requires GeoLengths"); + let part_lens = part_lengths.expect("MultiLines requires PartLengths"); + let mut part_idx = 0; + for &geo_len in geo_lens { + let lines: Vec> = (0..geo_len as usize) + .map(|_| { + let part_len = part_lens[part_idx] as usize; + part_idx += 1; + let coords: Vec> = (0..part_len) + .map(|_| { + let c = Coord { x: vertices[vert_pos], y: vertices[vert_pos + 1] }; + vert_pos += 2; + c + }) + .collect(); + LineString(coords) + }) + .collect(); + geoms.push(Geometry::MultiLineString(MultiLineString(lines))); + } + } + GeoLayout::Polygons => { + let part_lens = part_lengths.expect("Polygons requires PartLengths"); + let ring_lens = ring_lengths.expect("Polygons requires RingLengths"); + let mut ring_idx = 0; + for &part_len in part_lens { + let rings: Vec> = (0..part_len as usize) + .map(|_| { + let ring_len = ring_lens[ring_idx] as usize; + ring_idx += 1; + let mut coords: Vec> = (0..ring_len) + .map(|_| { + let c = Coord { x: vertices[vert_pos], y: vertices[vert_pos + 1] }; + vert_pos += 2; + c + }) + .collect(); + // Close the ring (MLT omits the closing vertex) + if let Some(&first) = coords.first() { + coords.push(first); + } + LineString(coords) + }) + .collect(); + let mut ring_iter = rings.into_iter(); + let exterior = ring_iter.next().unwrap_or_else(|| LineString(vec![])); + let holes: Vec> = ring_iter.collect(); + geoms.push(Geometry::Polygon(Polygon::new(exterior, holes))); + } + } + GeoLayout::MultiPolygons => { + let geo_lens = geo_lengths.expect("MultiPolygons requires GeoLengths"); + let part_lens = part_lengths.expect("MultiPolygons requires PartLengths"); + let ring_lens = ring_lengths.expect("MultiPolygons requires RingLengths"); + let mut part_idx = 0; + let mut ring_idx = 0; + for &geo_len in geo_lens { + let polys: Vec> = (0..geo_len as usize) + .map(|_| { + let part_len = part_lens[part_idx] as usize; + part_idx += 1; + let rings: Vec> = (0..part_len) + .map(|_| { + let ring_len = ring_lens[ring_idx] as usize; + ring_idx += 1; + let mut coords: Vec> = (0..ring_len) + .map(|_| { + let c = Coord { + x: vertices[vert_pos], + y: vertices[vert_pos + 1], + }; + vert_pos += 2; + c + }) + .collect(); + if let Some(&first) = coords.first() { + coords.push(first); + } + LineString(coords) + }) + .collect(); + let mut ring_iter = rings.into_iter(); + let exterior = ring_iter.next().unwrap_or_else(|| LineString(vec![])); + Polygon::new(exterior, ring_iter.collect()) + }) + .collect(); + geoms.push(Geometry::MultiPolygon(MultiPolygon(polys))); + } + } + } + + Ok(geoms) +} + +// ── Stream read helpers ─────────────────────────────────────────────────────── + +/// Read one encoding byte, returning (remaining, has_explicit_count, logical, physical). +fn read_enc_byte(data: &[u8]) -> MltResult<(&[u8], bool, u8, u8)> { + if data.is_empty() { + return Err(MltError::BufferUnderflow(1, 0)); + } + let b = data[0]; + let has_explicit_count = (b & 0x80) != 0; + let logical = (b >> 4) & 0x07; + let physical = (b >> 2) & 0x03; + Ok((&data[1..], has_explicit_count, logical, physical)) +} + +/// Read a VarInt u32 stream. If `explicit` is true, read the count from the stream; +/// otherwise use `implicit_count`. After reading count, reads byte_length and the data. +fn read_u32_stream(data: &[u8], implicit_count: usize, explicit: bool) -> MltResult<(&[u8], Vec)> { + let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; + + if logical != 0 { + // Only None logical is supported here + return Err(MltError::NotImplemented("v2 decoder: unsupported logical encoding for integer stream")); + } + if physical != 2 { + return Err(MltError::NotImplemented("v2 decoder: only VarInt physical encoding supported for integer stream")); + } + + let (data, count) = if has_expl_count || explicit { + let (d, c) = parse_varint::(data)?; + (d, c as usize) + } else { + (data, implicit_count) + }; + + // byte_length + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + values.push(v); + pos += consumed; + } + + Ok((remaining, values)) +} + +/// Read a raw-bytes stream (physical=None-noLen, logical=None, explicit count = byte count). +fn read_raw_bytes_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { + let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; + + if logical != 0 || physical != 0 || !has_expl_count { + return Err(MltError::NotImplemented( + "v2 decoder: unexpected encoding byte for raw bytes stream", + )); + } + + let (data, count) = parse_varint::(data)?; + let count = count as usize; + if data.len() < count { + return Err(MltError::BufferUnderflow(count as u32, data.len())); + } + Ok((&data[count..], data[..count].to_vec())) +} + +/// Read a ZigZag+VarInt i32 stream (count from context = explicit). +fn read_i32_stream(data: &[u8], implicit_count: usize, _has_any_null: bool) -> MltResult<(&[u8], Vec)> { + let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; + + if logical != 0 || physical != 2 { + return Err(MltError::NotImplemented("v2 decoder: only VarInt/ZigZag supported for i32 stream")); + } + + let (data, count) = if has_expl_count { + let (d, c) = parse_varint::(data)?; + (d, c as usize) + } else { + (data, implicit_count) + }; + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + values.push(i32::decode(v)); + pos += consumed; + } + + Ok((remaining, values)) +} + +/// Read a ZigZag+VarInt i64 stream. +fn read_i64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec)> { + let (data, _has_expl_count, logical, physical) = read_enc_byte(data)?; + + if logical != 0 || physical != 2 { + return Err(MltError::NotImplemented("v2 decoder: only VarInt/ZigZag supported for i64 stream")); + } + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(implicit_count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u64::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(8, encoded.len() - pos))?; + values.push(i64::decode(v)); + pos += consumed; + } + + Ok((remaining, values)) +} + +/// Read a VarInt u64 stream. +fn read_u64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec)> { + let (data, _has_expl_count, logical, physical) = read_enc_byte(data)?; + + if logical != 0 || physical != 2 { + return Err(MltError::NotImplemented("v2 decoder: only VarInt supported for u64 stream")); + } + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let mut values = Vec::with_capacity(implicit_count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u64::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(8, encoded.len() - pos))?; + values.push(v); + pos += consumed; + } + + Ok((remaining, values)) +} + +/// Read a ComponentwiseDelta + VarInt vertex stream (explicit count = vertex pairs). +fn read_cwdelta_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { + let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; + + if logical != 2 || physical != 2 { + return Err(MltError::NotImplemented( + "v2 decoder: only CwDelta+VarInt supported for vertex stream", + )); + } + + let (data, pair_count) = if has_expl_count { + let (d, c) = parse_varint::(data)?; + (d, c as usize) + } else { + return Err(MltError::NotImplemented( + "v2 decoder: vertex stream must have explicit count", + )); + }; + + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + // Decode VarInt u32 values + let coord_count = pair_count * 2; + let mut u32_vals = Vec::with_capacity(coord_count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + u32_vals.push(v); + pos += consumed; + } + + // Apply inverse CwDelta + let mut result = Vec::with_capacity(coord_count); + let mut last_x: i32 = 0; + let mut last_y: i32 = 0; + for chunk in u32_vals.chunks_exact(2) { + last_x = last_x.wrapping_add(i32::decode(chunk[0])); + last_y = last_y.wrapping_add(i32::decode(chunk[1])); + result.push(last_x); + result.push(last_y); + } + + Ok((remaining, result)) +} + + +// ── Presence bitfield ───────────────────────────────────────────────────────── + +/// Read `ceil(feature_count / 8)` bytes of presence bitfield and return a `Vec`. +fn read_presence(data: &[u8], feature_count: usize) -> MltResult<(&[u8], Vec)> { + let byte_count = feature_count.div_ceil(8); + if data.len() < byte_count { + return Err(MltError::BufferUnderflow(byte_count as u32, data.len())); + } + let mut presence = Vec::with_capacity(feature_count); + for i in 0..feature_count { + presence.push((data[i / 8] >> (i % 8)) & 1 == 1); + } + Ok((&data[byte_count..], presence)) +} + +// ── ID column decoding ──────────────────────────────────────────────────────── + +fn decode_id_column<'a>( + data: &'a [u8], + _optional: bool, + feature_count: usize, +) -> MltResult<(&'a [u8], Vec)> { + read_u64_stream(data, feature_count) +} + +fn decode_opt_id_column<'a>( + data: &'a [u8], + feature_count: usize, +) -> MltResult<(&'a [u8], Vec, Vec)> { + let (data, presence) = read_presence(data, feature_count)?; + let popcount = presence.iter().filter(|&&p| p).count(); + let (data, ids) = read_u64_stream(data, popcount)?; + Ok((data, presence, ids)) +} + +// ── Property column decoding ────────────────────────────────────────────────── + +fn decode_property_column<'a>( + data: &'a [u8], + col_type: u8, + feature_count: usize, + presence_groups: &mut Vec>, +) -> MltResult<(&'a [u8], String, Vec)> { + // name + let (data, name_len) = parse_varint::(data)?; + let name_len = name_len as usize; + if data.len() < name_len { + return Err(MltError::BufferUnderflow(name_len as u32, data.len())); + } + let name = std::str::from_utf8(&data[..name_len])?.to_string(); + let data = &data[name_len..]; + + let is_optional = matches!( + col_type, + COL_OPT_BOOL | COL_OPT_I8 | COL_OPT_U8 + | COL_OPT_I32 | COL_OPT_U32 + | COL_OPT_I64 | COL_OPT_U64 + | COL_OPT_F32 | COL_OPT_F64 + | COL_OPT_STR + ); + + let (data, presence) = if is_optional { + let (d, p) = read_presence(data, feature_count)?; + presence_groups.push(p.clone()); + (d, Some(p)) + } else { + (data, None) + }; + + let popcount = presence.as_ref().map_or(feature_count, |p| p.iter().filter(|&&b| b).count()); + + let (data, values) = decode_column_data(data, col_type, feature_count, popcount, &presence)?; + + Ok((data, name, values)) +} + +fn decode_column_data<'a>( + data: &'a [u8], + col_type: u8, + feature_count: usize, + popcount: usize, + presence: &Option>, +) -> MltResult<(&'a [u8], Vec)> { + match col_type { + COL_BOOL | COL_OPT_BOOL => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let values = expand_with_presence( + bytes.iter().map(|&b| PropValue::Bool(Some(b != 0))), + presence, + feature_count, + PropValue::Bool(None), + ); + Ok((data, values)) + } + COL_I8 | COL_OPT_I8 => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let values = expand_with_presence( + bytes.iter().map(|&b| PropValue::I8(Some(i8::decode(b as u32 as u8)))), + presence, + feature_count, + PropValue::I8(None), + ); + Ok((data, values)) + } + COL_U8 | COL_OPT_U8 => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let values = expand_with_presence( + bytes.into_iter().map(|b| PropValue::U8(Some(b))), + presence, + feature_count, + PropValue::U8(None), + ); + Ok((data, values)) + } + COL_I32 | COL_OPT_I32 => { + let any_null = col_type == COL_OPT_I32; + let (data, ints) = read_i32_stream(data, popcount, any_null)?; + let values = expand_with_presence( + ints.into_iter().map(|v| PropValue::I32(Some(v))), + presence, + feature_count, + PropValue::I32(None), + ); + Ok((data, values)) + } + COL_U32 | COL_OPT_U32 => { + let (data, ints) = read_u32_stream(data, popcount, false)?; + let values = expand_with_presence( + ints.into_iter().map(|v| PropValue::U32(Some(v))), + presence, + feature_count, + PropValue::U32(None), + ); + Ok((data, values)) + } + COL_I64 | COL_OPT_I64 => { + let (data, ints) = read_i64_stream(data, popcount)?; + let values = expand_with_presence( + ints.into_iter().map(|v| PropValue::I64(Some(v))), + presence, + feature_count, + PropValue::I64(None), + ); + Ok((data, values)) + } + COL_U64 | COL_OPT_U64 => { + let (data, ints) = read_u64_stream(data, popcount)?; + let values = expand_with_presence( + ints.into_iter().map(|v| PropValue::U64(Some(v))), + presence, + feature_count, + PropValue::U64(None), + ); + Ok((data, values)) + } + COL_F32 | COL_OPT_F32 => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let floats: Vec = bytes + .chunks_exact(4) + .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]])) + .collect(); + let values = expand_with_presence( + floats.into_iter().map(|v| PropValue::F32(Some(v))), + presence, + feature_count, + PropValue::F32(None), + ); + Ok((data, values)) + } + COL_F64 | COL_OPT_F64 => { + let (data, bytes) = read_raw_bytes_stream(data)?; + let floats: Vec = bytes + .chunks_exact(8) + .map(|c| f64::from_le_bytes([c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7]])) + .collect(); + let values = expand_with_presence( + floats.into_iter().map(|v| PropValue::F64(Some(v))), + presence, + feature_count, + PropValue::F64(None), + ); + Ok((data, values)) + } + COL_STR | COL_OPT_STR => { + decode_string_column(data, popcount, presence, feature_count) + } + _ => Err(MltError::NotImplemented("v2 decoder: unknown column type")), + } +} + +fn decode_string_column<'a>( + data: &'a [u8], + popcount: usize, + presence: &Option>, + feature_count: usize, +) -> MltResult<(&'a [u8], Vec)> { + // Lengths stream (count = popcount, implicit) + let (data, lengths) = read_u32_stream(data, popcount, false)?; + // String data (explicit byte count) + let (data, str_bytes) = read_raw_bytes_stream(data)?; + + // Reconstruct strings from lengths + let mut strings: Vec> = Vec::with_capacity(popcount); + let mut offset = 0; + for &len in &lengths { + let len = len as usize; + let s = std::str::from_utf8(&str_bytes[offset..offset + len])?.to_string(); + strings.push(Some(s)); + offset += len; + } + + let values = expand_with_presence( + strings.into_iter().map(|s| PropValue::Str(s)), + presence, + feature_count, + PropValue::Str(None), + ); + Ok((data, values)) +} + +/// Expand a stream of non-null values back to `feature_count` values by +/// inserting `null_val` for absent features according to the presence bitfield. +fn expand_with_presence>( + present_values: I, + presence: &Option>, + feature_count: usize, + null_val: PropValue, +) -> Vec { + match presence { + None => present_values.collect(), + Some(pres) => { + let mut result = Vec::with_capacity(feature_count); + let mut iter = present_values; + for &present in pres { + if present { + result.push(iter.next().unwrap_or(null_val.clone())); + } else { + result.push(null_val.clone()); + } + } + result + } + } +} diff --git a/rust/mlt-core/tests/unknown_layer.rs b/rust/mlt-core/tests/unknown_layer.rs index c5e06bf8c..6960a7f73 100644 --- a/rust/mlt-core/tests/unknown_layer.rs +++ b/rust/mlt-core/tests/unknown_layer.rs @@ -72,9 +72,10 @@ fn unknown_zero_length_body() { #[test] fn multiple_layers_mixed_unknown_and_tag01() { - // Build two unknown layers back-to-back (tags 2 and 3, since tag=1 is Tag01). - let mut raw = unknown_layer_bytes(2, b"hello"); - raw.extend_from_slice(&unknown_layer_bytes(3, b"world")); + // Build two unknown layers back-to-back. Tags 1 and 2 are known (Tag01 and Tag02), + // so use tags 10 and 11 for these "unknown" tests. + let mut raw = unknown_layer_bytes(10, b"hello"); + raw.extend_from_slice(&unknown_layer_bytes(11, b"world")); let layers = Parser::default() .parse_layers(&raw) @@ -85,12 +86,12 @@ fn multiple_layers_mixed_unknown_and_tag01() { let Layer::Unknown(u0) = &layers[0] else { panic!("expected Unknown at index 0"); }; - assert_eq!(u0.tag(), 2u32); + assert_eq!(u0.tag(), 10u32); assert_eq!(u0.data(), b"hello"); let Layer::Unknown(u1) = &layers[1] else { panic!("expected Unknown at index 1"); }; - assert_eq!(u1.tag(), 3u32); + assert_eq!(u1.tag(), 11u32); assert_eq!(u1.data(), b"world"); } diff --git a/rust/mlt/src/convert.rs b/rust/mlt/src/convert.rs index 2cd384f48..c96029658 100644 --- a/rust/mlt/src/convert.rs +++ b/rust/mlt/src/convert.rs @@ -104,6 +104,9 @@ pub struct ConvertArgs { /// Disable grouping of similar string columns into shared dictionaries #[clap(long, default_value = "false")] no_shared_dict: bool, + /// Encode output as experimental MLT v2 format (tag=2 layers) + #[clap(long, default_value = "false")] + v2: bool, } pub fn convert(args: &ConvertArgs) -> AnyResult<()> { @@ -116,6 +119,10 @@ pub fn convert(args: &ConvertArgs) -> AnyResult<()> { ..Default::default() }; + if args.v2 && is_mbtiles_extension(&args.input) { + bail!("--v2 is not supported with .mbtiles input (only file-based conversion)"); + } + if is_mbtiles_extension(&args.input) { if !is_mbtiles_extension(&args.output) { bail!( @@ -145,8 +152,9 @@ pub fn convert(args: &ConvertArgs) -> AnyResult<()> { }; let failed = AtomicUsize::new(0); + let use_v2 = args.v2; files.par_iter().for_each(|file| { - if let Err(e) = convert_file(file, base, &args.output, cfg) { + if let Err(e) = convert_file(file, base, &args.output, cfg, use_v2) { eprintln!("error: {}: {e:#}", file.display()); failed.fetch_add(1, Ordering::Relaxed); } @@ -159,7 +167,13 @@ pub fn convert(args: &ConvertArgs) -> AnyResult<()> { Ok(()) } -fn convert_file(file: &Path, base: &Path, output: &Path, cfg: EncoderConfig) -> AnyResult<()> { +fn convert_file( + file: &Path, + base: &Path, + output: &Path, + cfg: EncoderConfig, + v2: bool, +) -> AnyResult<()> { let rel = file .strip_prefix(base) .with_context(|| format!("stripping prefix from {}", file.display()))?; @@ -172,8 +186,16 @@ fn convert_file(file: &Path, base: &Path, output: &Path, cfg: EncoderConfig) -> let buffer = fs::read(file).with_context(|| format!("reading {}", file.display()))?; let out_bytes = if is_mlt_extension(file) { - convert_mlt_buffer(&buffer, cfg) - .with_context(|| format!("converting MLT {}", file.display()))? + if v2 { + convert_mlt_buffer_v2(&buffer) + .with_context(|| format!("converting MLT→v2 {}", file.display()))? + } else { + convert_mlt_buffer(&buffer, cfg) + .with_context(|| format!("converting MLT {}", file.display()))? + } + } else if v2 { + convert_mvt_buffer_v2(buffer) + .with_context(|| format!("converting MVT→v2 {}", file.display()))? } else { convert_mvt_buffer(buffer, cfg) .with_context(|| format!("converting MVT {}", file.display()))? @@ -257,6 +279,38 @@ fn convert_mvt_buffer(buffer: Vec, cfg: EncoderConfig) -> AnyResult> Ok(out) } +/// Re-encode an MLT tile as v2, decoding each Tag01/Tag02 layer and re-encoding as v2. +fn convert_mlt_buffer_v2(buffer: &[u8]) -> AnyResult> { + let layers = Parser::default().parse_layers(buffer)?; + let mut dec = Decoder::default(); + let mut out: Vec = Vec::new(); + + for layer in layers { + match layer { + Layer::Tag01(l) => { + let tile = l.into_tile(&mut dec)?; + out.extend_from_slice(&tile.encode_v2()?); + } + Layer::Tag02(tile) => { + out.extend_from_slice(&tile.encode_v2()?); + } + Layer::Unknown(_) => {} + _ => {} + } + } + + Ok(out) +} + +/// Convert an MVT tile to the experimental MLT v2 format. +fn convert_mvt_buffer_v2(buffer: Vec) -> AnyResult> { + let mut out: Vec = Vec::new(); + for tile in mvt_to_tile_layers(buffer)? { + out.extend_from_slice(&tile.encode_v2()?); + } + Ok(out) +} + /// Stage 1 of the pipeline: stream raw MVT tiles from the source `.mbtiles` file in /// `BATCH_SIZE` chunks and forward each batch to the compute stage via `raw_tx`. /// From ec8245dd0d9613577d8968f31f49331668a91c13 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Apr 2026 21:49:07 +0000 Subject: [PATCH 07/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- rust/mlt-core/src/v2.rs | 259 +++++++++++++++++++++++++++++----------- 1 file changed, 190 insertions(+), 69 deletions(-) diff --git a/rust/mlt-core/src/v2.rs b/rust/mlt-core/src/v2.rs index 3512990aa..7f917c733 100644 --- a/rust/mlt-core/src/v2.rs +++ b/rust/mlt-core/src/v2.rs @@ -10,7 +10,9 @@ //! - Own-presence bitfields for optional columns (no shared-presence / OptRef*) //! - Pure single-geometry-type layers only (no mixed geometry) -use geo_types::{Coord, Geometry, LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon}; +use geo_types::{ + Coord, Geometry, LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon, +}; use integer_encoding::{VarInt as _, VarIntWriter as _}; use zigzag::ZigZag as _; @@ -136,14 +138,16 @@ impl TileLayer01 { // name let name_bytes = self.name.as_bytes(); - body.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; + body.write_varint(name_bytes.len() as u32) + .map_err(MltError::from)?; body.extend_from_slice(name_bytes); // extent body.write_varint(self.extent).map_err(MltError::from)?; // feature_count (v2 addition) - body.write_varint(feature_count as u32).map_err(MltError::from)?; + body.write_varint(feature_count as u32) + .map_err(MltError::from)?; // ── Geometry section ────────────────────────────────────────────────── body.push(geom.layout as u8); @@ -209,9 +213,12 @@ fn collect_geometry(features: &[TileFeature]) -> MltResult { let has_ring = features .iter() .any(|f| matches!(f.geometry, Geometry::Polygon(_) | Geometry::MultiPolygon(_))); - let has_line = features - .iter() - .any(|f| matches!(f.geometry, Geometry::LineString(_) | Geometry::MultiLineString(_))); + let has_line = features.iter().any(|f| { + matches!( + f.geometry, + Geometry::LineString(_) | Geometry::MultiLineString(_) + ) + }); let has_part = has_line || has_ring; let layout = match (has_multi, has_ring, has_part) { @@ -240,7 +247,14 @@ fn collect_geometry(features: &[TileFeature]) -> MltResult { )?; } - Ok(CollectedGeometry { layout, types, geo_lengths, part_lengths, ring_lengths, vertices }) + Ok(CollectedGeometry { + layout, + types, + geo_lengths, + part_lengths, + ring_lengths, + vertices, + }) } fn push_coords(coords: impl Iterator>, vertices: &mut Vec) -> u32 { @@ -257,7 +271,11 @@ fn push_coords(coords: impl Iterator>, vertices: &mut Vec fn ring_vertex_count(ring: &LineString, vertices: &mut Vec) -> u32 { let coords = &ring.0; let has_closing = coords.len() >= 2 && coords.first() == coords.last(); - let coords_to_write = if has_closing { &coords[..coords.len() - 1] } else { &coords[..] }; + let coords_to_write = if has_closing { + &coords[..coords.len() - 1] + } else { + &coords[..] + }; push_coords(coords_to_write.iter().copied(), vertices) } @@ -344,7 +362,8 @@ fn write_u32_stream_implicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> /// Write a VarInt-encoded `u32` stream with explicit count. fn write_u32_stream_explicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { buf.push(ENC_VARINT_EXPL); - buf.write_varint(values.len() as u32).map_err(MltError::from)?; + buf.write_varint(values.len() as u32) + .map_err(MltError::from)?; let mut tmp = Vec::new(); for &v in values { tmp.write_varint(v).map_err(MltError::from)?; @@ -393,17 +412,22 @@ fn write_i64_stream_implicit(buf: &mut Vec, values: &[i64]) -> MltResult<()> /// Write a fixed-width raw byte stream with explicit count (number of data bytes). fn write_raw_bytes(buf: &mut Vec, data: &[u8]) -> MltResult<()> { buf.push(ENC_RAW_EXPL); - buf.write_varint(data.len() as u32).map_err(MltError::from)?; + buf.write_varint(data.len() as u32) + .map_err(MltError::from)?; buf.extend_from_slice(data); Ok(()) } /// Write vertex data using ComponentwiseDelta + VarInt with explicit count (vertex pairs). fn write_cwdelta_vertices(buf: &mut Vec, flat_verts: &[i32]) -> MltResult<()> { - debug_assert!(flat_verts.len() % 2 == 0, "vertex buffer must have even length"); + debug_assert!( + flat_verts.len() % 2 == 0, + "vertex buffer must have even length" + ); buf.push(ENC_CWDELTA_VARINT_EXPL); let pair_count = flat_verts.len() / 2; - buf.write_varint(pair_count as u32).map_err(MltError::from)?; + buf.write_varint(pair_count as u32) + .map_err(MltError::from)?; let mut encoded: Vec = Vec::new(); encode_componentwise_delta_vec2s(flat_verts, &mut encoded); @@ -497,7 +521,8 @@ fn write_property_column( // name let name_bytes = name.as_bytes(); - buf.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; + buf.write_varint(name_bytes.len() as u32) + .map_err(MltError::from)?; buf.extend_from_slice(name_bytes); // presence bitfield (for optional columns) @@ -544,8 +569,11 @@ fn prop_column_type(sample: &PropValue, any_null: bool) -> u8 { fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> MltResult<()> { match values[0] { PropValue::Bool(_) => { - let data: Vec = - values.iter().filter(|v| !is_null(v)).map(|v| prop_bool(v) as u8).collect(); + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_bool(v) as u8) + .collect(); write_raw_bytes(buf, &data)?; } PropValue::I8(_) => { @@ -557,13 +585,19 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> write_raw_bytes(buf, &data)?; } PropValue::U8(_) => { - let data: Vec = - values.iter().filter(|v| !is_null(v)).map(|v| prop_u8(v)).collect(); + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_u8(v)) + .collect(); write_raw_bytes(buf, &data)?; } PropValue::I32(_) => { - let data: Vec = - values.iter().filter(|v| !is_null(v)).map(|v| prop_i32(v)).collect(); + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_i32(v)) + .collect(); let implicit_count = !any_null; if implicit_count { write_i32_stream_implicit(buf, &data)?; @@ -572,8 +606,11 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> } } PropValue::U32(_) => { - let data: Vec = - values.iter().filter(|v| !is_null(v)).map(|v| prop_u32(v)).collect(); + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_u32(v)) + .collect(); let implicit_count = !any_null; if implicit_count { write_u32_stream_implicit(buf, &data)?; @@ -582,8 +619,11 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> } } PropValue::I64(_) => { - let data: Vec = - values.iter().filter(|v| !is_null(v)).map(|v| prop_i64(v)).collect(); + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_i64(v)) + .collect(); if !any_null { write_i64_stream_implicit(buf, &data)?; } else { @@ -591,8 +631,11 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> } } PropValue::U64(_) => { - let data: Vec = - values.iter().filter(|v| !is_null(v)).map(|v| prop_u64(v)).collect(); + let data: Vec = values + .iter() + .filter(|v| !is_null(v)) + .map(|v| prop_u64(v)) + .collect(); if !any_null { write_u64_stream_implicit(buf, &data)?; } else { @@ -629,7 +672,11 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> /// Write string column data streams (lengths + raw bytes). /// Only the non-null strings are written (count = popcount when any_null, else feature_count). -fn write_string_col_data(buf: &mut Vec, strings: &[Option<&str>], any_null: bool) -> MltResult<()> { +fn write_string_col_data( + buf: &mut Vec, + strings: &[Option<&str>], + any_null: bool, +) -> MltResult<()> { let present: Vec<&str> = strings.iter().filter_map(|s| *s).collect(); let lengths: Vec = present.iter().map(|s| s.len() as u32).collect(); let all_bytes: Vec = present.iter().flat_map(|s| s.as_bytes()).copied().collect(); @@ -696,31 +743,67 @@ fn write_u64_popcount(buf: &mut Vec, values: &[u64]) -> MltResult<()> { // ── PropValue field accessors ───────────────────────────────────────────────── fn prop_bool(v: &PropValue) -> bool { - if let PropValue::Bool(Some(b)) = v { *b } else { false } + if let PropValue::Bool(Some(b)) = v { + *b + } else { + false + } } fn prop_i8(v: &PropValue) -> i8 { - if let PropValue::I8(Some(b)) = v { *b } else { 0 } + if let PropValue::I8(Some(b)) = v { + *b + } else { + 0 + } } fn prop_u8(v: &PropValue) -> u8 { - if let PropValue::U8(Some(b)) = v { *b } else { 0 } + if let PropValue::U8(Some(b)) = v { + *b + } else { + 0 + } } fn prop_i32(v: &PropValue) -> i32 { - if let PropValue::I32(Some(b)) = v { *b } else { 0 } + if let PropValue::I32(Some(b)) = v { + *b + } else { + 0 + } } fn prop_u32(v: &PropValue) -> u32 { - if let PropValue::U32(Some(b)) = v { *b } else { 0 } + if let PropValue::U32(Some(b)) = v { + *b + } else { + 0 + } } fn prop_i64(v: &PropValue) -> i64 { - if let PropValue::I64(Some(b)) = v { *b } else { 0 } + if let PropValue::I64(Some(b)) = v { + *b + } else { + 0 + } } fn prop_u64(v: &PropValue) -> u64 { - if let PropValue::U64(Some(b)) = v { *b } else { 0 } + if let PropValue::U64(Some(b)) = v { + *b + } else { + 0 + } } fn prop_f32(v: &PropValue) -> f32 { - if let PropValue::F32(Some(b)) = v { *b } else { 0.0 } + if let PropValue::F32(Some(b)) = v { + *b + } else { + 0.0 + } } fn prop_f64(v: &PropValue) -> f64 { - if let PropValue::F64(Some(b)) = v { *b } else { 0.0 } + if let PropValue::F64(Some(b)) = v { + *b + } else { + 0.0 + } } // ── Decoder ─────────────────────────────────────────────────────────────────── @@ -786,16 +869,10 @@ pub fn decode_v2_layer(data: &[u8]) -> MltResult { } } } - COL_BOOL | COL_OPT_BOOL - | COL_I8 | COL_OPT_I8 - | COL_U8 | COL_OPT_U8 - | COL_I32 | COL_OPT_I32 - | COL_U32 | COL_OPT_U32 - | COL_I64 | COL_OPT_I64 - | COL_U64 | COL_OPT_U64 - | COL_F32 | COL_OPT_F32 - | COL_F64 | COL_OPT_F64 - | COL_STR | COL_OPT_STR => { + COL_BOOL | COL_OPT_BOOL | COL_I8 | COL_OPT_I8 | COL_U8 | COL_OPT_U8 | COL_I32 + | COL_OPT_I32 | COL_U32 | COL_OPT_U32 | COL_I64 | COL_OPT_I64 | COL_U64 + | COL_OPT_U64 | COL_F32 | COL_OPT_F32 | COL_F64 | COL_OPT_F64 | COL_STR + | COL_OPT_STR => { let (new_data, col_name, col_values) = decode_property_column(data, col_type, feature_count, &mut presence_groups)?; data = new_data; @@ -815,12 +892,23 @@ pub fn decode_v2_layer(data: &[u8]) -> MltResult { .enumerate() .map(|(i, geom)| { let id = feature_ids[i]; - let properties = (0..prop_count).map(|j| property_columns[j][i].clone()).collect(); - TileFeature { id, geometry: geom, properties } + let properties = (0..prop_count) + .map(|j| property_columns[j][i].clone()) + .collect(); + TileFeature { + id, + geometry: geom, + properties, + } }) .collect(); - Ok(TileLayer01 { name, extent, property_names, features }) + Ok(TileLayer01 { + name, + extent, + property_names, + features, + }) } // ── Geometry decoding ───────────────────────────────────────────────────────── @@ -919,7 +1007,10 @@ fn reconstruct_geometries( for &part_len in part_lens { let coords: Vec> = (0..part_len as usize) .map(|_| { - let c = Coord { x: vertices[vert_pos], y: vertices[vert_pos + 1] }; + let c = Coord { + x: vertices[vert_pos], + y: vertices[vert_pos + 1], + }; vert_pos += 2; c }) @@ -938,7 +1029,10 @@ fn reconstruct_geometries( part_idx += 1; let coords: Vec> = (0..part_len) .map(|_| { - let c = Coord { x: vertices[vert_pos], y: vertices[vert_pos + 1] }; + let c = Coord { + x: vertices[vert_pos], + y: vertices[vert_pos + 1], + }; vert_pos += 2; c }) @@ -960,7 +1054,10 @@ fn reconstruct_geometries( ring_idx += 1; let mut coords: Vec> = (0..ring_len) .map(|_| { - let c = Coord { x: vertices[vert_pos], y: vertices[vert_pos + 1] }; + let c = Coord { + x: vertices[vert_pos], + y: vertices[vert_pos + 1], + }; vert_pos += 2; c }) @@ -1038,15 +1135,23 @@ fn read_enc_byte(data: &[u8]) -> MltResult<(&[u8], bool, u8, u8)> { /// Read a VarInt u32 stream. If `explicit` is true, read the count from the stream; /// otherwise use `implicit_count`. After reading count, reads byte_length and the data. -fn read_u32_stream(data: &[u8], implicit_count: usize, explicit: bool) -> MltResult<(&[u8], Vec)> { +fn read_u32_stream( + data: &[u8], + implicit_count: usize, + explicit: bool, +) -> MltResult<(&[u8], Vec)> { let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; if logical != 0 { // Only None logical is supported here - return Err(MltError::NotImplemented("v2 decoder: unsupported logical encoding for integer stream")); + return Err(MltError::NotImplemented( + "v2 decoder: unsupported logical encoding for integer stream", + )); } if physical != 2 { - return Err(MltError::NotImplemented("v2 decoder: only VarInt physical encoding supported for integer stream")); + return Err(MltError::NotImplemented( + "v2 decoder: only VarInt physical encoding supported for integer stream", + )); } let (data, count) = if has_expl_count || explicit { @@ -1096,11 +1201,17 @@ fn read_raw_bytes_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { } /// Read a ZigZag+VarInt i32 stream (count from context = explicit). -fn read_i32_stream(data: &[u8], implicit_count: usize, _has_any_null: bool) -> MltResult<(&[u8], Vec)> { +fn read_i32_stream( + data: &[u8], + implicit_count: usize, + _has_any_null: bool, +) -> MltResult<(&[u8], Vec)> { let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; if logical != 0 || physical != 2 { - return Err(MltError::NotImplemented("v2 decoder: only VarInt/ZigZag supported for i32 stream")); + return Err(MltError::NotImplemented( + "v2 decoder: only VarInt/ZigZag supported for i32 stream", + )); } let (data, count) = if has_expl_count { @@ -1135,7 +1246,9 @@ fn read_i64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec< let (data, _has_expl_count, logical, physical) = read_enc_byte(data)?; if logical != 0 || physical != 2 { - return Err(MltError::NotImplemented("v2 decoder: only VarInt/ZigZag supported for i64 stream")); + return Err(MltError::NotImplemented( + "v2 decoder: only VarInt/ZigZag supported for i64 stream", + )); } let (data, byte_len) = parse_varint::(data)?; @@ -1163,7 +1276,9 @@ fn read_u64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec< let (data, _has_expl_count, logical, physical) = read_enc_byte(data)?; if logical != 0 || physical != 2 { - return Err(MltError::NotImplemented("v2 decoder: only VarInt supported for u64 stream")); + return Err(MltError::NotImplemented( + "v2 decoder: only VarInt supported for u64 stream", + )); } let (data, byte_len) = parse_varint::(data)?; @@ -1238,7 +1353,6 @@ fn read_cwdelta_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { Ok((remaining, result)) } - // ── Presence bitfield ───────────────────────────────────────────────────────── /// Read `ceil(feature_count / 8)` bytes of presence bitfield and return a `Vec`. @@ -1293,11 +1407,16 @@ fn decode_property_column<'a>( let is_optional = matches!( col_type, - COL_OPT_BOOL | COL_OPT_I8 | COL_OPT_U8 - | COL_OPT_I32 | COL_OPT_U32 - | COL_OPT_I64 | COL_OPT_U64 - | COL_OPT_F32 | COL_OPT_F64 - | COL_OPT_STR + COL_OPT_BOOL + | COL_OPT_I8 + | COL_OPT_U8 + | COL_OPT_I32 + | COL_OPT_U32 + | COL_OPT_I64 + | COL_OPT_U64 + | COL_OPT_F32 + | COL_OPT_F64 + | COL_OPT_STR ); let (data, presence) = if is_optional { @@ -1308,7 +1427,9 @@ fn decode_property_column<'a>( (data, None) }; - let popcount = presence.as_ref().map_or(feature_count, |p| p.iter().filter(|&&b| b).count()); + let popcount = presence + .as_ref() + .map_or(feature_count, |p| p.iter().filter(|&&b| b).count()); let (data, values) = decode_column_data(data, col_type, feature_count, popcount, &presence)?; @@ -1336,7 +1457,9 @@ fn decode_column_data<'a>( COL_I8 | COL_OPT_I8 => { let (data, bytes) = read_raw_bytes_stream(data)?; let values = expand_with_presence( - bytes.iter().map(|&b| PropValue::I8(Some(i8::decode(b as u32 as u8)))), + bytes + .iter() + .map(|&b| PropValue::I8(Some(i8::decode(b as u32 as u8)))), presence, feature_count, PropValue::I8(None), @@ -1422,9 +1545,7 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_STR | COL_OPT_STR => { - decode_string_column(data, popcount, presence, feature_count) - } + COL_STR | COL_OPT_STR => decode_string_column(data, popcount, presence, feature_count), _ => Err(MltError::NotImplemented("v2 decoder: unknown column type")), } } From 3fdad3939934087f4121ea3aec1ec42a04bdaa20 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Wed, 22 Apr 2026 01:00:31 -0400 Subject: [PATCH 08/12] wip --- justfile | 6 + rust/mlt-core/src/v2.rs | 1233 ++++++++++++++++++++++++++++++++++----- rust/mlt/src/convert.rs | 95 +-- 3 files changed, 1115 insertions(+), 219 deletions(-) diff --git a/justfile b/justfile index 5cce53ca8..6505bc339 100755 --- a/justfile +++ b/justfile @@ -69,6 +69,12 @@ ci-extract-version language tag: mlt *args: cargo run --manifest-path {{join(justfile_directory(), 'rust', 'Cargo.toml')}} --package mlt -- "$@" +# Run the mlt CLI tool with the given arguments from current dir. +[no-cd] +[positional-arguments] # avoids shell expansions +mlt-rel *args: + cargo run --release --manifest-path {{join(justfile_directory(), 'rust', 'Cargo.toml')}} --package mlt -- "$@" + # Ensure a command is available assert-cmd command: #!/usr/bin/env bash diff --git a/rust/mlt-core/src/v2.rs b/rust/mlt-core/src/v2.rs index 7f917c733..21b918627 100644 --- a/rust/mlt-core/src/v2.rs +++ b/rust/mlt-core/src/v2.rs @@ -3,12 +3,30 @@ //! This module implements a minimal v2 wire format for round-trip experimentation. //! The format uses tag `2` to distinguish v2 layers from v1 layers (tag `1`). //! -//! Simplifications in this initial implementation: -//! - VarInt physical encoding only (no FastPFor, no Morton) -//! - ComponentwiseDelta + VarInt for vertex coordinates -//! - StrPlain (lengths + raw bytes) for strings (no FSST, no shared-dict) -//! - Own-presence bitfields for optional columns (no shared-presence / OptRef*) -//! - Pure single-geometry-type layers only (no mixed geometry) +//! Encoding strategies (each stream picks the smallest result automatically): +//! +//! - **Feature ordering**: tries `Unsorted`, `SpatialMorton`, and `SpatialHilbert`, keeping +//! the ordering that produces the smallest encoded tile. Mirrors the v1 optimizer. +//! - **Geometry types stream**: tries VarInt and RLE. +//! For single-type layers (all Points, all Lines …) RLE reduces from N bytes to ~3 bytes. +//! - **Vertex data**: ComponentwiseDelta applied first, then tries: +//! - VarInt (1 byte per small delta) +//! - FastPFor128 LE (bit-packing in blocks of 128; wins for layers with many vertices) +//! - **Integer ID columns**: tries plain VarInt and Delta+VarInt. +//! Sequential OSM IDs encode as 1-byte deltas each. +//! - **String columns**: tries three encodings, keeps smallest: +//! - StrPlain / OptStrPlain: per-value byte lengths + raw UTF-8 bytes. +//! - StrDict / OptStrDict: deduplicated dictionary + per-feature indices. +//! Best for low-cardinality columns (road class, surface type …). +//! - StrFsst / OptStrFsst: FSST symbol-table compression. +//! Best for high-cardinality columns (street names, place names …). +//! - **Presence**: raw packed bitfields for optional columns (no bool-RLE header). +//! - **Shared dictionaries**: string columns with similar content are grouped using MinHash +//! similarity (same algorithm as v1). Each group encodes one shared dictionary corpus +//! (plain or FSST-compressed, whichever is smaller) followed by per-child index streams. +//! Mirrors v1's `ColumnType::SharedDict` optimization. Note: property column order in the +//! decoded layer may differ from the original when grouping is applied (shared-dict children +//! are emitted together before the next non-grouped column). use geo_types::{ Coord, Geometry, LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon, @@ -16,11 +34,24 @@ use geo_types::{ use integer_encoding::{VarInt as _, VarIntWriter as _}; use zigzag::ZigZag as _; +use fastpfor::{AnyLenCodec as _, FastPFor128}; + use crate::MltError; use crate::MltResult; +use crate::codecs::fsst::compress_fsst; use crate::codecs::varint::parse_varint; use crate::codecs::zigzag::encode_componentwise_delta_vec2s; use crate::decoder::{GeometryType, PropValue, TileFeature, TileLayer01}; +use std::collections::HashMap; + +use crate::encoder::{ + EncoderConfig, SortStrategy, StagedSharedDict, StagedSharedDictItem, StringGroup, + group_string_properties, spatial_sort_likely_to_help, +}; + +/// Minimum feature count above which the bounding-box heuristic is applied +/// before deciding whether to try spatial sort trials. Mirrors v1's threshold. +const SORT_TRIAL_THRESHOLD: usize = 512; // ── Encoding byte bit positions ─────────────────────────────────────────────── // bit 7: has_explicit_count @@ -32,8 +63,14 @@ use crate::decoder::{GeometryType, PropValue, TileFeature, TileLayer01}; const ENC_VARINT: u8 = 0x08; /// `logical=None, physical=VarInt, explicit count follows` const ENC_VARINT_EXPL: u8 = 0x88; +/// `logical=Delta, physical=VarInt, count from context` +const ENC_DELTA_VARINT: u8 = 0x18; /// `logical=CwDelta, physical=VarInt, explicit count follows` const ENC_CWDELTA_VARINT_EXPL: u8 = 0xA8; +/// `logical=CwDelta, physical=FastPFor128, explicit count follows` +const ENC_CWDELTA_FP128_EXPL: u8 = 0xAC; +/// `logical=Rle, physical=reserved (00), count from context; byte_length always follows` +const ENC_RLE: u8 = 0x30; /// `logical=None, physical=None-noLen, explicit count follows` const ENC_RAW_EXPL: u8 = 0x80; @@ -102,21 +139,72 @@ const COL_OPT_F64: u8 = 27; // Strings: StrPlain / OptStrPlain const COL_STR: u8 = 28; const COL_OPT_STR: u8 = 29; +// Strings: StrDict / OptStrDict (no presence stream; for OptStrDict index 0 = null) +const COL_STR_DICT: u8 = 30; +const COL_OPT_STR_DICT: u8 = 31; +// Strings: StrFsst / OptStrFsst (FSST-compressed; presence bitfield when optional) +const COL_STR_FSST: u8 = 32; +const COL_OPT_STR_FSST: u8 = 33; +// Strings: SharedDict (plain) / SharedDictFsst (FSST corpus) — cross-column shared dictionary. +// One encoded column entry covers N child string columns that share similar string values. +const COL_STR_SHARED_DICT: u8 = 34; +const COL_STR_SHARED_DICT_FSST: u8 = 35; + +/// Flag byte for each child within a shared-dict column: bit 0 = child has null values. +const CHILD_OPTIONAL: u8 = 0x01; // ── Encoder ─────────────────────────────────────────────────────────────────── impl TileLayer01 { /// Encode this layer to the MLT **v2** experimental wire format. /// + /// Respects the same [`EncoderConfig`] flags as the v1 encoder: + /// - `try_spatial_morton_sort` / `try_spatial_hilbert_sort`: spatial sort trials. + /// - `allow_fsst`: FSST string compression. + /// - `allow_fpf`: FastPFor128 vertex compression. + /// - `allow_shared_dict`: cross-column shared dictionary grouping. + /// /// Returns a complete framed record: `[varint(body_len+1)][tag=2][body…]` /// ready to be concatenated with other layers in a tile. /// - /// Returns an empty slice for empty layers (no features). - pub fn encode_v2(&self) -> MltResult> { + /// Returns an empty `Vec` for layers with no features. + pub fn encode_v2(&self, cfg: EncoderConfig) -> MltResult> { if self.features.is_empty() { return Ok(Vec::new()); } + // Baseline: unsorted feature order. + let mut best = self.encode_v2_with_sort(cfg)?; + + // Apply the same spatial-sort heuristic as the v1 optimizer. + let try_spatial = self.features.len() < SORT_TRIAL_THRESHOLD + || spatial_sort_likely_to_help(self); + + if try_spatial { + let strategies = [ + (SortStrategy::SpatialMorton, cfg.try_spatial_morton_sort), + (SortStrategy::SpatialHilbert, cfg.try_spatial_hilbert_sort), + ]; + for (strategy, enabled) in strategies { + if !enabled { + continue; + } + let mut candidate = self.clone(); + candidate.sort(strategy); + let bytes = candidate.encode_v2_with_sort(cfg)?; + if bytes.len() < best.len() { + best = bytes; + } + } + } + + Ok(best) + } + + /// Encode this layer to v2 bytes with features in their **current** order. + /// + /// Called by [`encode_v2`] once per sort-strategy trial. + fn encode_v2_with_sort(&self, cfg: EncoderConfig) -> MltResult> { let feature_count = self.features.len(); // ── Geometry ────────────────────────────────────────────────────────── @@ -130,8 +218,72 @@ impl TileLayer01 { Vec::new() }; - // ── Property columns ────────────────────────────────────────────────── - let prop_count = self.property_names.len(); + // ── Collect column chunks ───────────────────────────────────────────── + // We must know the total column count before writing it, so collect all + // encoded chunks first, then write col_count + chunks together. + // + // For shared-dict groups, we compare the shared-dict chunk against encoding + // each member individually and keep whichever is smaller. + + // Group similar string columns with MinHash (mirrors v1); skip when disabled. + let groups = if cfg.allow_shared_dict { + group_string_properties(self) + } else { + Vec::new() + }; + let col_to_group: HashMap = groups + .iter() + .flat_map(|g| g.columns.iter().map(move |(_, i)| (*i, g))) + .collect(); + let mut group_start: HashMap = + groups.iter().map(|g| (g.columns[0].1, g)).collect(); + + let mut col_chunks: Vec> = Vec::new(); + + // ID column (single chunk) + if has_ids { + let mut chunk = Vec::new(); + write_id_column(&mut chunk, &all_ids, feature_count)?; + col_chunks.push(chunk); + } + + // Property columns + for (col_idx, name) in self.property_names.iter().enumerate() { + if let Some(g) = group_start.remove(&col_idx) { + // Try shared-dict encoding (all group members as one encoded column). + let shared_chunk = encode_shared_dict_chunk(g, &self.features, cfg.allow_fsst)?; + + // Try individual encoding for every group member as a baseline. + let individual_chunks: Vec> = g + .columns + .iter() + .map(|(_, c_idx)| { + let full_name = &self.property_names[*c_idx]; + let vals: Vec<&PropValue> = + self.features.iter().map(|f| &f.properties[*c_idx]).collect(); + let mut chunk = Vec::new(); + write_property_column( + &mut chunk, full_name, &vals, feature_count, cfg.allow_fsst, + )?; + Ok(chunk) + }) + .collect::>()?; + let individual_total: usize = individual_chunks.iter().map(|c| c.len()).sum(); + + if shared_chunk.len() <= individual_total { + col_chunks.push(shared_chunk); + } else { + col_chunks.extend(individual_chunks); + } + } else if !col_to_group.contains_key(&col_idx) { + let vals: Vec<&PropValue> = + self.features.iter().map(|f| &f.properties[col_idx]).collect(); + let mut chunk = Vec::new(); + write_property_column(&mut chunk, name, &vals, feature_count, cfg.allow_fsst)?; + col_chunks.push(chunk); + } + // else: column absorbed into a shared-dict group handled above. + } // ── Build body ──────────────────────────────────────────────────────── let mut body: Vec = Vec::new(); @@ -159,21 +311,14 @@ impl TileLayer01 { geom.ring_lengths.as_deref(), &geom.vertices, feature_count, + cfg.allow_fpf, )?; // ── Columns ─────────────────────────────────────────────────────────── - let col_count = u32::try_from(usize::from(has_ids) + prop_count)?; - body.write_varint(col_count).map_err(MltError::from)?; - - // ID column - if has_ids { - write_id_column(&mut body, &all_ids, feature_count)?; - } - - // Property columns - for (i, name) in self.property_names.iter().enumerate() { - let vals: Vec<&PropValue> = self.features.iter().map(|f| &f.properties[i]).collect(); - write_property_column(&mut body, name, &vals, feature_count)?; + body.write_varint(col_chunks.len() as u32) + .map_err(MltError::from)?; + for chunk in col_chunks { + body.extend_from_slice(&chunk); } // ── Frame: [varint(body_len+1)][tag=2][body] ────────────────────────── @@ -345,17 +490,76 @@ fn push_feature_geometry( Ok(()) } +// ── Stream body builders (return raw bytes, no enc_byte prefix) ─────────────── + +/// Build the VarInt body for a `u32` slice (no enc_byte, no count, no length). +fn build_varint_body_u32(values: &[u32]) -> MltResult> { + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v).map_err(MltError::from)?; + } + Ok(tmp) +} + +/// Build the RLE body for a `u32` slice: pairs of `(run_length, value)` as VarInts. +fn build_rle_body_u32(values: &[u32]) -> MltResult> { + let mut tmp = Vec::new(); + let mut i = 0; + while i < values.len() { + let val = values[i]; + let mut run: u32 = 1; + while i + (run as usize) < values.len() && values[i + (run as usize)] == val { + run += 1; + } + tmp.write_varint(run).map_err(MltError::from)?; + tmp.write_varint(val).map_err(MltError::from)?; + i += run as usize; + } + Ok(tmp) +} + +/// Build the Delta+VarInt body for a `u64` slice (deltas from the previous value). +/// The first element is the original value (delta from 0). +fn build_delta_body_u64(values: &[u64]) -> MltResult> { + let mut tmp = Vec::new(); + let mut prev = 0u64; + for &v in values { + let delta = v.wrapping_sub(prev); + tmp.write_varint(delta).map_err(MltError::from)?; + prev = v; + } + Ok(tmp) +} + // ── Stream write helpers ────────────────────────────────────────────────────── /// Write a VarInt-encoded `u32` stream with implicit count (= `feature_count`). fn write_u32_stream_implicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { buf.push(ENC_VARINT); - let mut tmp = Vec::new(); - for &v in values { - tmp.write_varint(v).map_err(MltError::from)?; + let body = build_varint_body_u32(values)?; + buf.write_varint(body.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&body); + Ok(()) +} + +/// Write a `u32` types stream choosing the smaller of VarInt vs RLE encoding. +/// +/// For pure single-type layers `[0, 0, 0, …]` this reduces from N bytes to ~3. +fn write_types_stream(buf: &mut Vec, values: &[u32]) -> MltResult<()> { + let varint_body = build_varint_body_u32(values)?; + let rle_body = build_rle_body_u32(values)?; + + // VarInt overhead: 1 (enc_byte) + varint(body_len) + body + // RLE overhead: 1 (enc_byte) + varint(body_len) + body (same shape) + if rle_body.len() < varint_body.len() { + buf.push(ENC_RLE); + buf.write_varint(rle_body.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&rle_body); + } else { + buf.push(ENC_VARINT); + buf.write_varint(varint_body.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&varint_body); } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(&tmp); Ok(()) } @@ -364,12 +568,34 @@ fn write_u32_stream_explicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> buf.push(ENC_VARINT_EXPL); buf.write_varint(values.len() as u32) .map_err(MltError::from)?; - let mut tmp = Vec::new(); - for &v in values { - tmp.write_varint(v).map_err(MltError::from)?; + let body = build_varint_body_u32(values)?; + buf.write_varint(body.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&body); + Ok(()) +} + +/// Write a `u64` stream choosing the smaller of plain VarInt vs Delta+VarInt. +/// +/// For sequential IDs (common in OSM) delta encoding reduces each ID to 1 byte. +fn write_u64_stream_best(buf: &mut Vec, values: &[u64]) -> MltResult<()> { + let plain_body = { + let mut tmp = Vec::new(); + for &v in values { + tmp.write_varint(v).map_err(MltError::from)?; + } + tmp + }; + let delta_body = build_delta_body_u64(values)?; + + if delta_body.len() < plain_body.len() { + buf.push(ENC_DELTA_VARINT); + buf.write_varint(delta_body.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&delta_body); + } else { + buf.push(ENC_VARINT); + buf.write_varint(plain_body.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(&plain_body); } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(&tmp); Ok(()) } @@ -419,25 +645,70 @@ fn write_raw_bytes(buf: &mut Vec, data: &[u8]) -> MltResult<()> { } /// Write vertex data using ComponentwiseDelta + VarInt with explicit count (vertex pairs). -fn write_cwdelta_vertices(buf: &mut Vec, flat_verts: &[i32]) -> MltResult<()> { +fn write_cwdelta_vertices_varint(pair_count: u32, encoded: &[u32]) -> MltResult> { + let mut out = Vec::new(); + out.push(ENC_CWDELTA_VARINT_EXPL); + out.write_varint(pair_count).map_err(MltError::from)?; + let mut tmp = Vec::new(); + for &v in encoded { + tmp.write_varint(v).map_err(MltError::from)?; + } + out.write_varint(tmp.len() as u32).map_err(MltError::from)?; + out.extend_from_slice(&tmp); + Ok(out) +} + +/// Write vertex data using ComponentwiseDelta + FastPFor128 with explicit count (vertex pairs). +/// +/// FastPFor128 requires the input length to be a multiple of 128. The tail is handled +/// by the VarByte portion of the `Composition(FastPFor128, VariableByte)` codec. +/// Bytes are stored in little-endian u32 order (no byte-swap needed on x86). +fn write_cwdelta_vertices_fp128(pair_count: u32, encoded: &[u32]) -> MltResult> { + if encoded.is_empty() { + // FP128 can't encode an empty slice; fall back handled by caller. + return Ok(Vec::new()); + } + let mut scratch: Vec = Vec::with_capacity(encoded.len() + 1024); + FastPFor128::default() + .encode(encoded, &mut scratch) + .map_err(|_| MltError::NotImplemented("v2: FastPFor128 encode error"))?; + + // v2 wire format: little-endian u32 words (no byte swap on LE hosts). + let byte_data: Vec = scratch + .iter() + .flat_map(|&w| w.to_le_bytes()) + .collect(); + + let mut out = Vec::new(); + out.push(ENC_CWDELTA_FP128_EXPL); + out.write_varint(pair_count).map_err(MltError::from)?; + out.write_varint(byte_data.len() as u32).map_err(MltError::from)?; + out.extend_from_slice(&byte_data); + Ok(out) +} + +/// Write vertex data: tries CwDelta+FastPFor128 and CwDelta+VarInt, keeps the smaller. +/// When `allow_fpf` is false only VarInt is tried (mirrors `EncoderConfig::allow_fpf`). +fn write_cwdelta_vertices(buf: &mut Vec, flat_verts: &[i32], allow_fpf: bool) -> MltResult<()> { debug_assert!( flat_verts.len() % 2 == 0, "vertex buffer must have even length" ); - buf.push(ENC_CWDELTA_VARINT_EXPL); - let pair_count = flat_verts.len() / 2; - buf.write_varint(pair_count as u32) - .map_err(MltError::from)?; + let pair_count = (flat_verts.len() / 2) as u32; let mut encoded: Vec = Vec::new(); encode_componentwise_delta_vec2s(flat_verts, &mut encoded); - let mut tmp = Vec::new(); - for v in &encoded { - tmp.write_varint(*v).map_err(MltError::from)?; + let varint_bytes = write_cwdelta_vertices_varint(pair_count, &encoded)?; + + if allow_fpf { + let fp128_bytes = write_cwdelta_vertices_fp128(pair_count, &encoded)?; + if !fp128_bytes.is_empty() && fp128_bytes.len() < varint_bytes.len() { + buf.extend_from_slice(&fp128_bytes); + return Ok(()); + } } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(&tmp); + buf.extend_from_slice(&varint_bytes); Ok(()) } @@ -449,10 +720,11 @@ fn write_geometry_streams( ring_lengths: Option<&[u32]>, vertices: &[i32], feature_count: usize, + allow_fpf: bool, ) -> MltResult<()> { debug_assert_eq!(types.len(), feature_count); - // Types stream: count = feature_count (implicit) - write_u32_stream_implicit(buf, types)?; + // Types stream: count = feature_count (implicit); prefer RLE when all same type. + write_types_stream(buf, types)?; if let Some(g) = geo_lengths { write_u32_stream_explicit(buf, g)?; } @@ -462,7 +734,7 @@ fn write_geometry_streams( if let Some(r) = ring_lengths { write_u32_stream_explicit(buf, r)?; } - write_cwdelta_vertices(buf, vertices) + write_cwdelta_vertices(buf, vertices, allow_fpf) } // ── Presence bitfield ───────────────────────────────────────────────────────── @@ -492,12 +764,12 @@ fn write_id_column(buf: &mut Vec, ids: &[Option], _feature_count: usize let presence: Vec = ids.iter().map(|id| id.is_some()).collect(); write_presence(buf, &presence); let values: Vec = ids.iter().filter_map(|id| *id).collect(); - write_u64_stream_implicit(buf, &values)?; + write_u64_stream_best(buf, &values)?; } else { - // Id: all values present + // Id: all values present; Delta+VarInt compresses sequential OSM IDs well. buf.push(COL_ID); let values: Vec = ids.iter().map(|id| id.unwrap_or(0)).collect(); - write_u64_stream_implicit(buf, &values)?; + write_u64_stream_best(buf, &values)?; } Ok(()) @@ -510,28 +782,82 @@ fn write_property_column( name: &str, values: &[&PropValue], feature_count: usize, + allow_fsst: bool, ) -> MltResult<()> { debug_assert_eq!(values.len(), feature_count); - // Determine the dominant type and whether any value is null let any_null = values.iter().any(|v| is_null(v)); + // String columns: choose the smallest available encoding. + if matches!(values[0], PropValue::Str(_)) { + let strings: Vec> = values + .iter() + .map(|v| match v { + PropValue::Str(Some(s)) => Some(s.as_str()), + _ => None, + }) + .collect(); + + let plain_bytes = build_string_plain_data(&strings, any_null)?; + let dict_bytes = build_string_dict_data(&strings, any_null)?; + let fsst_bytes = if allow_fsst { + build_string_fsst_data(&strings, any_null)? + } else { + Vec::new() + }; + + let name_bytes = name.as_bytes(); + + // Choose whichever encoding yields the fewest bytes. + let best_size = if fsst_bytes.is_empty() { + plain_bytes.len().min(dict_bytes.len()) + } else { + plain_bytes.len().min(dict_bytes.len()).min(fsst_bytes.len()) + }; + + if !fsst_bytes.is_empty() && best_size == fsst_bytes.len() { + // FSST: presence bitfield (when optional) + 4 compressed streams. + buf.push(if any_null { COL_OPT_STR_FSST } else { COL_STR_FSST }); + buf.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(name_bytes); + if any_null { + let presence: Vec = strings.iter().map(|s| s.is_some()).collect(); + write_presence(buf, &presence); + } + buf.extend_from_slice(&fsst_bytes); + } else if best_size == dict_bytes.len() { + // Dict: no separate presence (null = index 0 for OptStrDict). + buf.push(if any_null { COL_OPT_STR_DICT } else { COL_STR_DICT }); + buf.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(name_bytes); + buf.extend_from_slice(&dict_bytes); + } else { + // Plain: presence bitfield when optional. + buf.push(if any_null { COL_OPT_STR } else { COL_STR }); + buf.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; + buf.extend_from_slice(name_bytes); + if any_null { + let presence: Vec = strings.iter().map(|s| s.is_some()).collect(); + write_presence(buf, &presence); + } + buf.extend_from_slice(&plain_bytes); + } + return Ok(()); + } + let col_type = prop_column_type(values[0], any_null); buf.push(col_type); - // name let name_bytes = name.as_bytes(); buf.write_varint(name_bytes.len() as u32) .map_err(MltError::from)?; buf.extend_from_slice(name_bytes); - // presence bitfield (for optional columns) if any_null { let presence: Vec = values.iter().map(|v| !is_null(v)).collect(); write_presence(buf, &presence); } - // data streams write_prop_data(buf, values, any_null) } @@ -598,12 +924,7 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> .filter(|v| !is_null(v)) .map(|v| prop_i32(v)) .collect(); - let implicit_count = !any_null; - if implicit_count { - write_i32_stream_implicit(buf, &data)?; - } else { - write_i32_stream_as_explicit(buf, &data)?; - } + write_i32_stream_implicit(buf, &data)?; } PropValue::U32(_) => { let data: Vec = values @@ -611,12 +932,7 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> .filter(|v| !is_null(v)) .map(|v| prop_u32(v)) .collect(); - let implicit_count = !any_null; - if implicit_count { - write_u32_stream_implicit(buf, &data)?; - } else { - write_u32_popcount(buf, &data)?; - } + write_u32_stream_implicit(buf, &data)?; } PropValue::I64(_) => { let data: Vec = values @@ -636,11 +952,7 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> .filter(|v| !is_null(v)) .map(|v| prop_u64(v)) .collect(); - if !any_null { - write_u64_stream_implicit(buf, &data)?; - } else { - write_u64_popcount(buf, &data)?; - } + write_u64_stream_implicit(buf, &data)?; } PropValue::F32(_) => { let mut data = Vec::new(); @@ -657,87 +969,328 @@ fn write_prop_data(buf: &mut Vec, values: &[&PropValue], any_null: bool) -> write_raw_bytes(buf, &data)?; } PropValue::Str(_) => { - let strings: Vec> = values - .iter() - .map(|v| match v { - PropValue::Str(Some(s)) => Some(s.as_str()), - _ => None, - }) - .collect(); - write_string_col_data(buf, &strings, any_null)?; + // Handled earlier in write_property_column; should not reach here. + unreachable!("Str columns are handled before write_prop_data"); } } Ok(()) } -/// Write string column data streams (lengths + raw bytes). -/// Only the non-null strings are written (count = popcount when any_null, else feature_count). -fn write_string_col_data( - buf: &mut Vec, - strings: &[Option<&str>], - any_null: bool, -) -> MltResult<()> { +/// Build the bytes for a StrPlain / OptStrPlain column data section. +/// Returns: lengths_stream + raw_bytes_stream (no col_type, no name, no presence). +fn build_string_plain_data(strings: &[Option<&str>], _any_null: bool) -> MltResult> { let present: Vec<&str> = strings.iter().filter_map(|s| *s).collect(); let lengths: Vec = present.iter().map(|s| s.len() as u32).collect(); let all_bytes: Vec = present.iter().flat_map(|s| s.as_bytes()).copied().collect(); - // Lengths stream: count = popcount (implicit when any_null, else feature_count) - if any_null { - // count = popcount(presence), which the decoder can compute → use implicit - write_u32_stream_implicit(buf, &lengths)?; - } else { - write_u32_stream_implicit(buf, &lengths)?; + let mut buf = Vec::new(); + // Lengths stream: count = len(present), encoded with implicit count. + write_u32_stream_implicit(&mut buf, &lengths)?; + // String data: raw bytes with explicit byte count. + write_raw_bytes(&mut buf, &all_bytes)?; + Ok(buf) +} + +/// Build the bytes for a StrDict / OptStrDict column data section. +/// +/// Wire layout: `dict_lengths_stream | dict_data_stream | indices_stream` +/// +/// - `dict_lengths_stream`: VarInt, explicit count = number of unique values. +/// - `dict_data_stream`: raw bytes, explicit count = total UTF-8 bytes in dict. +/// - `indices_stream`: VarInt, implicit count = feature_count. +/// - `OptStrDict`: index 0 = null; indices 1..N map to dict entries 0..N-1. +/// - `StrDict`: indices 0..N-1 map to dict entries directly. +fn build_string_dict_data(strings: &[Option<&str>], any_null: bool) -> MltResult> { + // Build ordered dictionary (preserves first-occurrence order). + let mut dict: Vec<&str> = Vec::new(); + let mut dict_index: HashMap<&str, u32> = HashMap::new(); + + for s in strings.iter().filter_map(|s| *s) { + if !dict_index.contains_key(s) { + let idx = dict.len() as u32; + dict.push(s); + dict_index.insert(s, idx); + } } - // String data: explicit count = total bytes - write_raw_bytes(buf, &all_bytes)?; - Ok(()) -} + // Build per-feature index stream. + let mut indices: Vec = Vec::with_capacity(strings.len()); + for s in strings { + match s { + None => { + // OptStrDict: null = index 0; StrDict should not have nulls. + indices.push(0); + } + Some(s) => { + let dict_pos = dict_index[*s]; + if any_null { + // Shift by 1: index 0 reserved for null. + indices.push(dict_pos + 1); + } else { + indices.push(dict_pos); + } + } + } + } -// ── VarInt stream helpers for optional columns (count = popcount) ───────────── + let dict_lengths: Vec = dict.iter().map(|s| s.len() as u32).collect(); + let dict_bytes: Vec = dict.iter().flat_map(|s| s.as_bytes()).copied().collect(); + + let mut buf = Vec::new(); + // dict_lengths: explicit count = number of dict entries. + write_u32_stream_explicit(&mut buf, &dict_lengths)?; + // dict_data: raw bytes. + write_raw_bytes(&mut buf, &dict_bytes)?; + // indices: implicit count = feature_count. + write_u32_stream_implicit(&mut buf, &indices)?; + Ok(buf) +} -fn write_i32_stream_as_explicit(buf: &mut Vec, values: &[i32]) -> MltResult<()> { - buf.push(ENC_VARINT); // count = popcount (implicit for optional columns) - let mut tmp = Vec::new(); - for &v in values { - tmp.write_varint(i32::encode(v)).map_err(MltError::from)?; +/// Build the bytes for a StrFsst / OptStrFsst column data section. +/// +/// Wire layout: `sym_lengths_stream | sym_bytes_stream | val_lengths_stream | corpus_stream` +/// +/// - `sym_lengths_stream`: VarInt, explicit count = number of symbols. +/// - `sym_bytes_stream`: raw bytes, explicit count = total symbol bytes. +/// - `val_lengths_stream`: VarInt, implicit count = popcount (or feature_count if non-optional). +/// - `corpus_stream`: raw bytes, explicit count = compressed corpus size. +fn build_string_fsst_data(strings: &[Option<&str>], _any_null: bool) -> MltResult> { + let present: Vec<&str> = strings.iter().filter_map(|s| *s).collect(); + // FSST requires at least one string to train on. + if present.is_empty() { + // Fall back to an empty structure that the decoder can handle. + let mut buf = Vec::new(); + write_u32_stream_explicit(&mut buf, &[])?; // 0 symbols + write_raw_bytes(&mut buf, &[])?; // 0 symbol bytes + write_u32_stream_implicit(&mut buf, &[])?; // 0 lengths + write_raw_bytes(&mut buf, &[])?; // 0 corpus bytes + return Ok(buf); } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(&tmp); - Ok(()) + + let raw = compress_fsst(&present); + + let mut buf = Vec::new(); + write_u32_stream_explicit(&mut buf, &raw.symbol_lengths)?; + write_raw_bytes(&mut buf, &raw.symbol_bytes)?; + write_u32_stream_implicit(&mut buf, &raw.value_lengths)?; + write_raw_bytes(&mut buf, &raw.corpus)?; + Ok(buf) } -fn write_i64_stream_as_explicit(buf: &mut Vec, values: &[i64]) -> MltResult<()> { - buf.push(ENC_VARINT); // count = popcount - let mut tmp = Vec::new(); - for &v in values { - tmp.write_varint(i64::encode(v)).map_err(MltError::from)?; +// ── Shared-dictionary encoding ──────────────────────────────────────────────── + +/// Build the per-feature index stream for one child of a shared-dict group. +/// +/// - Non-optional child: index `k` → dict entry `k` (0-based). +/// - Optional child: index `0` = null; index `k` (k ≥ 1) → dict entry `k-1`. +fn build_item_indices( + item: &StagedSharedDictItem, + span_to_idx: &HashMap<(u32, u32), u32>, + optional: bool, +) -> Vec { + let mut span_iter = item.dense_spans(); + item.presence_bools() + .map(|present| { + if present { + let span = span_iter.next().expect("v2 SharedDict: presence/dense mismatch"); + let dict_idx = *span_to_idx + .get(&span) + .expect("v2 SharedDict: span not in dict"); + if optional { dict_idx + 1 } else { dict_idx } + } else { + 0 // null slot (only reachable when `optional`) + } + }) + .collect() +} + +/// Build the encoded bytes for a plain-corpus shared-dict column (`COL_STR_SHARED_DICT`). +/// +/// Wire layout: +/// ```text +/// [u8: COL_STR_SHARED_DICT] +/// [varint: prefix_len] [prefix bytes] +/// [varint: child_count] +/// [dict_lengths: ENC_VARINT_EXPL, count = dict_entry_count, body] +/// [dict_data: ENC_RAW_EXPL, count = total UTF-8 bytes] +/// for each child: +/// [u8: child_flags] bit 0 = optional +/// [varint: suffix_len] [suffix bytes] +/// [indices: ENC_VARINT, implicit count = feature_count, body] +/// non-optional: 0..N-1 → dict index +/// optional: 0 = null, 1..N → dict index + 1 +/// ``` +fn build_shared_dict_plain( + prefix: &str, + items: &[StagedSharedDictItem], + dict_strings: &[&str], + span_to_idx: &HashMap<(u32, u32), u32>, +) -> MltResult> { + let mut buf = Vec::new(); + buf.push(COL_STR_SHARED_DICT); + + let prefix_bytes = prefix.as_bytes(); + buf.write_varint(prefix_bytes.len() as u32) + .map_err(MltError::from)?; + buf.extend_from_slice(prefix_bytes); + buf.write_varint(items.len() as u32).map_err(MltError::from)?; + + let dict_lengths: Vec = dict_strings.iter().map(|s| s.len() as u32).collect(); + let dict_bytes: Vec = dict_strings + .iter() + .flat_map(|s| s.as_bytes()) + .copied() + .collect(); + write_u32_stream_explicit(&mut buf, &dict_lengths)?; + write_raw_bytes(&mut buf, &dict_bytes)?; + + for item in items { + let optional = item.has_presence(); + buf.push(if optional { CHILD_OPTIONAL } else { 0 }); + let suffix_bytes = item.suffix.as_bytes(); + buf.write_varint(suffix_bytes.len() as u32) + .map_err(MltError::from)?; + buf.extend_from_slice(suffix_bytes); + let indices = build_item_indices(item, span_to_idx, optional); + write_u32_stream_implicit(&mut buf, &indices)?; } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(&tmp); - Ok(()) + + Ok(buf) } -fn write_u32_popcount(buf: &mut Vec, values: &[u32]) -> MltResult<()> { - buf.push(ENC_VARINT); // count = popcount - let mut tmp = Vec::new(); - for &v in values { - tmp.write_varint(v).map_err(MltError::from)?; +/// Build the encoded bytes for a FSST-corpus shared-dict column (`COL_STR_SHARED_DICT_FSST`). +/// +/// Same as [`build_shared_dict_plain`] but the corpus section becomes: +/// ```text +/// [sym_lengths: ENC_VARINT_EXPL] [sym_bytes: ENC_RAW_EXPL] +/// [val_lengths: ENC_VARINT_EXPL, count = dict_entry_count] [corpus: ENC_RAW_EXPL] +/// ``` +/// Per-child index semantics are identical to the plain variant. +fn build_shared_dict_fsst( + prefix: &str, + items: &[StagedSharedDictItem], + dict_strings: &[&str], + span_to_idx: &HashMap<(u32, u32), u32>, +) -> MltResult> { + let raw = compress_fsst(dict_strings); + + let mut buf = Vec::new(); + buf.push(COL_STR_SHARED_DICT_FSST); + + let prefix_bytes = prefix.as_bytes(); + buf.write_varint(prefix_bytes.len() as u32) + .map_err(MltError::from)?; + buf.extend_from_slice(prefix_bytes); + buf.write_varint(items.len() as u32).map_err(MltError::from)?; + + write_u32_stream_explicit(&mut buf, &raw.symbol_lengths)?; + write_raw_bytes(&mut buf, &raw.symbol_bytes)?; + write_u32_stream_explicit(&mut buf, &raw.value_lengths)?; + write_raw_bytes(&mut buf, &raw.corpus)?; + + for item in items { + let optional = item.has_presence(); + buf.push(if optional { CHILD_OPTIONAL } else { 0 }); + let suffix_bytes = item.suffix.as_bytes(); + buf.write_varint(suffix_bytes.len() as u32) + .map_err(MltError::from)?; + buf.extend_from_slice(suffix_bytes); + let indices = build_item_indices(item, span_to_idx, optional); + write_u32_stream_implicit(&mut buf, &indices)?; } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(&tmp); - Ok(()) + + Ok(buf) } -fn write_u64_popcount(buf: &mut Vec, values: &[u64]) -> MltResult<()> { - buf.push(ENC_VARINT); // count = popcount - let mut tmp = Vec::new(); - for &v in values { - tmp.write_varint(v).map_err(MltError::from)?; +/// Encode a shared-dict group, automatically choosing plain vs FSST corpus. +/// +/// Returns the encoded bytes for a single shared-dict column entry. The caller +/// should compare this against individually-encoded columns and use whichever is +/// smaller (see [`encode_v2_with_sort`]). +/// +/// When `allow_fsst` is false only the plain-corpus variant is tried. +fn encode_shared_dict_chunk( + group: &StringGroup, + features: &[TileFeature], + allow_fsst: bool, +) -> MltResult> { + // Extract per-column string values in stable column-index order. + let mut order: Vec = (0..group.columns.len()).collect(); + order.sort_by_key(|&i| group.columns[i].1); + + let columns: Vec<(String, Vec>)> = order + .iter() + .map(|&i| { + let (suffix, col_idx) = &group.columns[i]; + let values: Vec> = features + .iter() + .map(|f| match f.properties.get(*col_idx) { + Some(PropValue::Str(Some(s))) => Some(s.clone()), + _ => None, + }) + .collect(); + (suffix.clone(), values) + }) + .collect(); + + // Build the shared corpus (dedup strings across all children into one buffer). + let shared_dict = StagedSharedDict::new( + group.prefix.clone(), + columns + .iter() + .map(|(s, v)| (s.as_str(), v.iter().map(|o| o.as_deref()))), + )?; + + // Unique dictionary entries (sorted, deduped spans → stable dict order). + let dict_spans = { + let mut s: Vec<(u32, u32)> = shared_dict + .items + .iter() + .flat_map(|item| item.dense_spans()) + .collect(); + s.sort_unstable(); + s.dedup(); + s + }; + let dict_strings: Vec<&str> = dict_spans + .iter() + .map(|&(s, e)| { + shared_dict + .get((s, e)) + .ok_or(MltError::NotImplemented("v2: shared dict span OOB")) + }) + .collect::>>()?; + let span_to_idx: HashMap<(u32, u32), u32> = dict_spans + .iter() + .copied() + .enumerate() + .map(|(i, span)| (span, i as u32)) + .collect(); + + let plain_buf = + build_shared_dict_plain(&group.prefix, &shared_dict.items, &dict_strings, &span_to_idx)?; + + if allow_fsst && !dict_strings.is_empty() { + let fsst_buf = build_shared_dict_fsst( + &group.prefix, + &shared_dict.items, + &dict_strings, + &span_to_idx, + )?; + if fsst_buf.len() < plain_buf.len() { + return Ok(fsst_buf); + } } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(&tmp); - Ok(()) + + Ok(plain_buf) +} + +// ── VarInt stream helpers for optional columns (count = popcount) ───────────── +// These reuse the same wire format as their _implicit counterparts; the count is +// determined by the decoder from context (feature_count or popcount). + +fn write_i64_stream_as_explicit(buf: &mut Vec, values: &[i64]) -> MltResult<()> { + write_i64_stream_implicit(buf, values) } // ── PropValue field accessors ───────────────────────────────────────────────── @@ -879,6 +1432,29 @@ pub fn decode_v2_layer(data: &[u8]) -> MltResult { property_names.push(col_name); property_columns.push(col_values); } + COL_STR_DICT | COL_OPT_STR_DICT => { + let (new_data, col_name, col_values) = + decode_string_dict_column(data, col_type, feature_count)?; + data = new_data; + property_names.push(col_name); + property_columns.push(col_values); + } + COL_STR_FSST | COL_OPT_STR_FSST => { + let (new_data, col_name, col_values) = + decode_string_fsst_column(data, col_type, feature_count)?; + data = new_data; + property_names.push(col_name); + property_columns.push(col_values); + } + COL_STR_SHARED_DICT | COL_STR_SHARED_DICT_FSST => { + let (new_data, columns) = + decode_shared_dict_v2(data, col_type, feature_count)?; + data = new_data; + for (col_name, col_values) in columns { + property_names.push(col_name); + property_columns.push(col_values); + } + } _ => { return Err(MltError::NotImplemented("unsupported v2 column type")); } @@ -1133,8 +1709,12 @@ fn read_enc_byte(data: &[u8]) -> MltResult<(&[u8], bool, u8, u8)> { Ok((&data[1..], has_explicit_count, logical, physical)) } -/// Read a VarInt u32 stream. If `explicit` is true, read the count from the stream; -/// otherwise use `implicit_count`. After reading count, reads byte_length and the data. +/// Read a VarInt (or RLE) u32 stream. +/// +/// If `explicit` is true, read the count from the stream; otherwise use `implicit_count`. +/// Supports: +/// - `logical=0` (None) + `physical=2` (VarInt): plain VarInt stream. +/// - `logical=3` (Rle): byte_length always present; data is `(run_len, value)` VarInt pairs. fn read_u32_stream( data: &[u8], implicit_count: usize, @@ -1142,15 +1722,38 @@ fn read_u32_stream( ) -> MltResult<(&[u8], Vec)> { let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; - if logical != 0 { - // Only None logical is supported here - return Err(MltError::NotImplemented( - "v2 decoder: unsupported logical encoding for integer stream", - )); + // ── RLE: logical=3, physical bits reserved (00) ────────────────────────── + if logical == 3 { + // byte_length always follows for RLE; no separate count field. + let (data, byte_len) = parse_varint::(data)?; + let byte_len = byte_len as usize; + if data.len() < byte_len { + return Err(MltError::BufferUnderflow(byte_len as u32, data.len())); + } + let encoded = &data[..byte_len]; + let remaining = &data[byte_len..]; + + let capacity = if has_expl_count || explicit { implicit_count } else { 16 }; + let mut values = Vec::with_capacity(capacity); + let mut pos = 0; + while pos < byte_len { + let (run_len, c1) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + pos += c1; + let (val, c2) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + pos += c2; + for _ in 0..run_len { + values.push(val); + } + } + return Ok((remaining, values)); } - if physical != 2 { + + // ── VarInt: logical=0, physical=2 ──────────────────────────────────────── + if logical != 0 || physical != 2 { return Err(MltError::NotImplemented( - "v2 decoder: only VarInt physical encoding supported for integer stream", + "v2 decoder: unsupported encoding for u32 stream", )); } @@ -1161,7 +1764,6 @@ fn read_u32_stream( (data, implicit_count) }; - // byte_length let (data, byte_len) = parse_varint::(data)?; let byte_len = byte_len as usize; if data.len() < byte_len { @@ -1271,13 +1873,17 @@ fn read_i64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec< Ok((remaining, values)) } -/// Read a VarInt u64 stream. +/// Read a VarInt (or Delta+VarInt) u64 stream. +/// +/// Supports: +/// - `logical=0` (None) + `physical=2` (VarInt): plain VarInt. +/// - `logical=1` (Delta) + `physical=2` (VarInt): VarInt-encoded deltas, prefix-sum to recover. fn read_u64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec)> { let (data, _has_expl_count, logical, physical) = read_enc_byte(data)?; - if logical != 0 || physical != 2 { + if physical != 2 || (logical != 0 && logical != 1) { return Err(MltError::NotImplemented( - "v2 decoder: only VarInt supported for u64 stream", + "v2 decoder: only VarInt / Delta+VarInt supported for u64 stream", )); } @@ -1298,16 +1904,30 @@ fn read_u64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec< pos += consumed; } + // Apply prefix-sum to undo delta encoding. + if logical == 1 { + let mut acc = 0u64; + for v in &mut values { + acc = acc.wrapping_add(*v); + *v = acc; + } + } + Ok((remaining, values)) } -/// Read a ComponentwiseDelta + VarInt vertex stream (explicit count = vertex pairs). +/// Read a ComponentwiseDelta + (VarInt or FastPFor128) vertex stream (explicit count = vertex pairs). fn read_cwdelta_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; - if logical != 2 || physical != 2 { + if logical != 2 { return Err(MltError::NotImplemented( - "v2 decoder: only CwDelta+VarInt supported for vertex stream", + "v2 decoder: only CwDelta logical encoding supported for vertex stream", + )); + } + if physical != 2 && physical != 3 { + return Err(MltError::NotImplemented( + "v2 decoder: only VarInt(2) or FastPFor128(3) physical encoding for vertex stream", )); } @@ -1328,16 +1948,35 @@ fn read_cwdelta_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { let encoded = &data[..byte_len]; let remaining = &data[byte_len..]; - // Decode VarInt u32 values let coord_count = pair_count * 2; - let mut u32_vals = Vec::with_capacity(coord_count); - let mut pos = 0; - while pos < byte_len { - let (v, consumed) = u32::decode_var(&encoded[pos..]) - .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; - u32_vals.push(v); - pos += consumed; - } + + let u32_vals: Vec = if physical == 3 { + // FastPFor128: bytes are LE u32 words + if !byte_len.is_multiple_of(4) { + return Err(MltError::NotImplemented("v2 FP128: byte length not multiple of 4")); + } + let words: Vec = encoded + .chunks_exact(4) + .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]])) + .collect(); + let mut result = Vec::with_capacity(coord_count + 128); + FastPFor128::default() + .decode(&words, &mut result, Some(coord_count as u32)) + .map_err(|_| MltError::NotImplemented("v2: FastPFor128 decode error"))?; + result.truncate(coord_count); + result + } else { + // VarInt + let mut vals = Vec::with_capacity(coord_count); + let mut pos = 0; + while pos < byte_len { + let (v, consumed) = u32::decode_var(&encoded[pos..]) + .ok_or(MltError::BufferUnderflow(4, encoded.len() - pos))?; + vals.push(v); + pos += consumed; + } + vals + }; // Apply inverse CwDelta let mut result = Vec::with_capacity(coord_count); @@ -1580,6 +2219,296 @@ fn decode_string_column<'a>( Ok((data, values)) } +/// Decode a StrFsst (col_type=32) or OptStrFsst (col_type=33) column. +/// +/// Wire layout: `[presence?] | sym_lengths | sym_bytes | val_lengths | corpus` +fn decode_string_fsst_column<'a>( + data: &'a [u8], + col_type: u8, + feature_count: usize, +) -> MltResult<(&'a [u8], String, Vec)> { + let optional = col_type == COL_OPT_STR_FSST; + + // name + let (data, name_len) = parse_varint::(data)?; + let name_len = name_len as usize; + if data.len() < name_len { + return Err(MltError::BufferUnderflow(name_len as u32, data.len())); + } + let name = std::str::from_utf8(&data[..name_len])?.to_string(); + let data = &data[name_len..]; + + // presence (only for optional) + let (data, presence) = if optional { + let (d, p) = read_presence(data, feature_count)?; + (d, Some(p)) + } else { + (data, None) + }; + let popcount = presence + .as_ref() + .map(|p| p.iter().filter(|&&x| x).count()) + .unwrap_or(feature_count); + + // val_lengths uses an implicit count (= popcount) in the per-column FSST variant. + let (data, present_strings) = read_fsst_strings(data, popcount, false)?; + + let values = if let Some(pres) = &presence { + let mut result = Vec::with_capacity(feature_count); + let mut iter = present_strings.into_iter(); + for &present in pres { + if present { + result.push(PropValue::Str(iter.next().map(Some).unwrap_or(None))); + } else { + result.push(PropValue::Str(None)); + } + } + result + } else { + present_strings + .into_iter() + .map(|s| PropValue::Str(Some(s))) + .collect() + }; + + Ok((data, name, values)) +} + +/// Decompress an FSST-encoded corpus into individual strings. +fn fsst_decompress( + sym_lengths: &[u32], + sym_bytes: &[u8], + val_lengths: &[u32], + corpus: &[u8], +) -> MltResult> { + // Build per-symbol offset table. + let mut sym_offsets = vec![0usize; sym_lengths.len() + 1]; + for (i, &len) in sym_lengths.iter().enumerate() { + sym_offsets[i + 1] = sym_offsets[i] + len as usize; + } + + // Decompress corpus: 0xFF → literal next byte; else → expand symbol. + let mut output: Vec = Vec::new(); + let mut i = 0; + while i < corpus.len() { + let sym_idx = corpus[i] as usize; + if sym_idx == 255 { + i += 1; + if i < corpus.len() { + output.push(corpus[i]); + } + } else if sym_idx < sym_lengths.len() { + let start = sym_offsets[sym_idx]; + let end = sym_offsets[sym_idx + 1]; + output.extend_from_slice(&sym_bytes[start..end]); + } + i += 1; + } + + // Split by value lengths. + let mut strings = Vec::with_capacity(val_lengths.len()); + let mut offset = 0usize; + for &len in val_lengths { + let end = offset + len as usize; + let s = std::str::from_utf8(output.get(offset..end).ok_or( + MltError::BufferUnderflow(len, output.len() - offset), + )?) + .map_err(MltError::from)? + .to_string(); + strings.push(s); + offset = end; + } + Ok(strings) +} + +/// Decode a StrDict (col_type=30) or OptStrDict (col_type=31) column. +/// +/// Wire layout: `dict_lengths_stream | dict_data_stream | indices_stream` +/// +/// - `StrDict`: all features have a value; index `k` → dict entry `k`. +/// - `OptStrDict`: index `0` = null; index `k` (k≥1) → dict entry `k-1`. +fn decode_string_dict_column<'a>( + data: &'a [u8], + col_type: u8, + feature_count: usize, +) -> MltResult<(&'a [u8], String, Vec)> { + let optional = col_type == COL_OPT_STR_DICT; + + // name + let (data, name_len) = parse_varint::(data)?; + let name_len = name_len as usize; + if data.len() < name_len { + return Err(MltError::BufferUnderflow(name_len as u32, data.len())); + } + let name = std::str::from_utf8(&data[..name_len])?.to_string(); + let data = &data[name_len..]; + + // dict_lengths: explicit count = number of dict entries + let (data, dict_lengths) = read_u32_stream(data, 0, true)?; + // dict_data: raw bytes + let (data, dict_bytes) = read_raw_bytes_stream(data)?; + // indices: implicit count = feature_count + let (data, indices) = read_u32_stream(data, feature_count, false)?; + + // Reconstruct dictionary entries from lengths + raw bytes. + let dict = build_strings_from_lengths(&dict_lengths, &dict_bytes)?; + + // Map per-feature indices back to PropValue. + let mut values = Vec::with_capacity(feature_count); + for &idx in &indices { + let pv = if optional { + if idx == 0 { + PropValue::Str(None) + } else { + let entry = dict + .get(idx as usize - 1) + .ok_or(MltError::NotImplemented("v2 StrDict: index out of range"))?; + PropValue::Str(Some(entry.clone())) + } + } else { + let entry = dict + .get(idx as usize) + .ok_or(MltError::NotImplemented("v2 StrDict: index out of range"))?; + PropValue::Str(Some(entry.clone())) + }; + values.push(pv); + } + + Ok((data, name, values)) +} + +/// Decode a `COL_STR_SHARED_DICT` or `COL_STR_SHARED_DICT_FSST` column. +/// +/// Returns a list of `(property_name, per_feature_values)` — one entry per child column in +/// the group — so the caller can append them individually to `property_names` / +/// `property_columns`. +/// +/// Wire layout (plain corpus, `COL_STR_SHARED_DICT`): +/// ```text +/// [varint: prefix_len] [prefix bytes] +/// [varint: child_count] +/// [dict_lengths: ENC_VARINT_EXPL] [dict_data: ENC_RAW_EXPL] +/// for each child: +/// [u8: flags] bit 0 = optional +/// [varint: suffix_len] [suffix bytes] +/// [indices: ENC_VARINT, implicit count = feature_count] +/// ``` +/// FSST variant (`COL_STR_SHARED_DICT_FSST`) replaces the corpus section with: +/// ```text +/// [sym_lengths: ENC_VARINT_EXPL] [sym_bytes: ENC_RAW_EXPL] +/// [val_lengths: ENC_VARINT_EXPL] [corpus: ENC_RAW_EXPL] +/// ``` +fn decode_shared_dict_v2<'a>( + data: &'a [u8], + col_type: u8, + feature_count: usize, +) -> MltResult<(&'a [u8], Vec<(String, Vec)>)> { + let use_fsst = col_type == COL_STR_SHARED_DICT_FSST; + + let (data, prefix_len) = parse_varint::(data)?; + let prefix_len = prefix_len as usize; + if data.len() < prefix_len { + return Err(MltError::BufferUnderflow(prefix_len as u32, data.len())); + } + let prefix = std::str::from_utf8(&data[..prefix_len])?.to_string(); + let data = &data[prefix_len..]; + + let (data, child_count) = parse_varint::(data)?; + let child_count = child_count as usize; + + // Decode shared dictionary corpus into a Vec (one entry per dict index). + let (data, dict_strings): (&[u8], Vec) = if use_fsst { + // val_lengths has an explicit count in the FSST shared-dict variant. + read_fsst_strings(data, 0, true)? + } else { + let (data, dict_lengths) = read_u32_stream(data, 0, true)?; + let (data, dict_bytes) = read_raw_bytes_stream(data)?; + (data, build_strings_from_lengths(&dict_lengths, &dict_bytes)?) + }; + + let mut result = Vec::with_capacity(child_count); + let mut data = data; + + for _ in 0..child_count { + if data.is_empty() { + return Err(MltError::BufferUnderflow(1, 0)); + } + let flags = data[0]; + data = &data[1..]; + let optional = (flags & CHILD_OPTIONAL) != 0; + + let (d, suffix_len) = parse_varint::(data)?; + let suffix_len = suffix_len as usize; + if d.len() < suffix_len { + return Err(MltError::BufferUnderflow(suffix_len as u32, d.len())); + } + let suffix = std::str::from_utf8(&d[..suffix_len])?.to_string(); + data = &d[suffix_len..]; + + let col_name = format!("{prefix}{suffix}"); + + // indices: implicit count = feature_count + let (d, indices) = read_u32_stream(data, feature_count, false)?; + data = d; + + let values: MltResult> = indices + .iter() + .map(|&idx| { + if optional && idx == 0 { + Ok(PropValue::Str(None)) + } else { + let dict_idx = if optional { idx as usize - 1 } else { idx as usize }; + let s = dict_strings + .get(dict_idx) + .ok_or(MltError::NotImplemented("v2 SharedDict: index out of range"))? + .clone(); + Ok(PropValue::Str(Some(s))) + } + }) + .collect(); + + result.push((col_name, values?)); + } + + Ok((data, result)) +} + +/// Reconstruct strings from a parallel `(lengths, flat_bytes)` pair. +fn build_strings_from_lengths(lengths: &[u32], bytes: &[u8]) -> MltResult> { + let mut strings = Vec::with_capacity(lengths.len()); + let mut offset = 0usize; + for &len in lengths { + let end = offset + len as usize; + let s = std::str::from_utf8( + bytes + .get(offset..end) + .ok_or(MltError::BufferUnderflow(len, bytes.len().saturating_sub(offset)))?, + ) + .map_err(MltError::from)? + .to_string(); + strings.push(s); + offset = end; + } + Ok(strings) +} + +/// Read FSST-encoded streams and decompress them into plain strings. +/// +/// `val_explicit` controls whether the value-lengths count is read from the stream +/// (`true`) or inferred from `val_count` (`false`). +fn read_fsst_strings<'a>( + data: &'a [u8], + val_count: usize, + val_explicit: bool, +) -> MltResult<(&'a [u8], Vec)> { + let (data, sym_lengths) = read_u32_stream(data, 0, true)?; + let (data, sym_bytes) = read_raw_bytes_stream(data)?; + let (data, val_lengths) = read_u32_stream(data, val_count, val_explicit)?; + let (data, corpus) = read_raw_bytes_stream(data)?; + let strings = fsst_decompress(&sym_lengths, &sym_bytes, &val_lengths, &corpus)?; + Ok((data, strings)) +} + /// Expand a stream of non-null values back to `feature_count` values by /// inserting `null_val` for absent features according to the presence bitfield. fn expand_with_presence>( diff --git a/rust/mlt/src/convert.rs b/rust/mlt/src/convert.rs index c96029658..23132ae5c 100644 --- a/rust/mlt/src/convert.rs +++ b/rust/mlt/src/convert.rs @@ -119,10 +119,6 @@ pub fn convert(args: &ConvertArgs) -> AnyResult<()> { ..Default::default() }; - if args.v2 && is_mbtiles_extension(&args.input) { - bail!("--v2 is not supported with .mbtiles input (only file-based conversion)"); - } - if is_mbtiles_extension(&args.input) { if !is_mbtiles_extension(&args.output) { bail!( @@ -134,7 +130,7 @@ pub fn convert(args: &ConvertArgs) -> AnyResult<()> { .enable_io() .enable_time() .build()? - .block_on(convert_mbtiles_async(args, cfg)); + .block_on(convert_mbtiles_async(args, cfg, args.v2)); } let mut files: Vec = Vec::new(); @@ -152,9 +148,8 @@ pub fn convert(args: &ConvertArgs) -> AnyResult<()> { }; let failed = AtomicUsize::new(0); - let use_v2 = args.v2; files.par_iter().for_each(|file| { - if let Err(e) = convert_file(file, base, &args.output, cfg, use_v2) { + if let Err(e) = convert_file(file, base, &args.output, cfg, args.v2) { eprintln!("error: {}: {e:#}", file.display()); failed.fetch_add(1, Ordering::Relaxed); } @@ -186,19 +181,13 @@ fn convert_file( let buffer = fs::read(file).with_context(|| format!("reading {}", file.display()))?; let out_bytes = if is_mlt_extension(file) { - if v2 { - convert_mlt_buffer_v2(&buffer) - .with_context(|| format!("converting MLT→v2 {}", file.display()))? - } else { - convert_mlt_buffer(&buffer, cfg) - .with_context(|| format!("converting MLT {}", file.display()))? - } - } else if v2 { - convert_mvt_buffer_v2(buffer) - .with_context(|| format!("converting MVT→v2 {}", file.display()))? + let label = if v2 { "MLT→v2" } else { "MLT" }; + convert_mlt_buffer(&buffer, cfg, v2) + .with_context(|| format!("converting {label} {}", file.display()))? } else { - convert_mvt_buffer(buffer, cfg) - .with_context(|| format!("converting MVT {}", file.display()))? + let label = if v2 { "MVT→v2" } else { "MVT" }; + convert_mvt_buffer(buffer, cfg, v2) + .with_context(|| format!("converting {label} {}", file.display()))? }; fs::write(&out_path, &out_bytes).with_context(|| format!("writing {}", out_path.display()))?; @@ -241,12 +230,13 @@ fn is_mbtiles_extension(path: &Path) -> bool { matches!(path.extension().and_then(OsStr::to_str), Some("mbtiles")) } -/// Re-encode an MLT tile using automatic encoding selection. +/// Re-encode an MLT tile, dispatching to v1 or v2 encoding based on `v2`. /// -/// Every Tag01 layer is fully decoded to [`mlt_core::TileLayer01`] and then -/// re-encoded via [`mlt_core::TileLayer01::encode`]. Unknown layer tags are -/// passed through unchanged. -fn convert_mlt_buffer(buffer: &[u8], cfg: EncoderConfig) -> AnyResult> { +/// For v1: every `Tag01` layer is re-encoded via [`TileLayer01::encode`]; +/// `Unknown` layers are passed through unchanged. +/// For v2: `Tag01` and `Tag02` layers are re-encoded via [`TileLayer01::encode_v2`]; +/// `Unknown` layers are dropped (they have no v2 equivalent). +fn convert_mlt_buffer(buffer: &[u8], cfg: EncoderConfig, v2: bool) -> AnyResult> { let layers = Parser::default().parse_layers(buffer)?; let mut dec = Decoder::default(); let mut out: Vec = Vec::new(); @@ -255,46 +245,15 @@ fn convert_mlt_buffer(buffer: &[u8], cfg: EncoderConfig) -> AnyResult> { match layer { Layer::Tag01(l) => { let tile = l.into_tile(&mut dec)?; - out.extend_from_slice(&tile.encode(cfg)?); + let bytes = if v2 { tile.encode_v2(cfg)? } else { tile.encode(cfg)? }; + out.extend_from_slice(&bytes); } - Layer::Unknown(u) => { - out.extend(EncodedUnknown::from(u).write_to(Encoder::default())?.data); + Layer::Tag02(tile) if v2 => { + out.extend_from_slice(&tile.encode_v2(cfg)?); } - _ => {} - } - } - - Ok(out) -} - -/// Convert an MVT tile to an MLT tile using automatic encoding selection. -/// -/// Each MVT layer is converted to a [`mlt_core::TileLayer01`] and encoded -/// via [`mlt_core::TileLayer01::encode`]. -fn convert_mvt_buffer(buffer: Vec, cfg: EncoderConfig) -> AnyResult> { - let mut out: Vec = Vec::new(); - for tile in mvt_to_tile_layers(buffer)? { - out.extend_from_slice(&tile.encode(cfg)?); - } - Ok(out) -} - -/// Re-encode an MLT tile as v2, decoding each Tag01/Tag02 layer and re-encoding as v2. -fn convert_mlt_buffer_v2(buffer: &[u8]) -> AnyResult> { - let layers = Parser::default().parse_layers(buffer)?; - let mut dec = Decoder::default(); - let mut out: Vec = Vec::new(); - - for layer in layers { - match layer { - Layer::Tag01(l) => { - let tile = l.into_tile(&mut dec)?; - out.extend_from_slice(&tile.encode_v2()?); - } - Layer::Tag02(tile) => { - out.extend_from_slice(&tile.encode_v2()?); + Layer::Unknown(u) if !v2 => { + out.extend(EncodedUnknown::from(u).write_to(Encoder::default())?.data); } - Layer::Unknown(_) => {} _ => {} } } @@ -302,11 +261,12 @@ fn convert_mlt_buffer_v2(buffer: &[u8]) -> AnyResult> { Ok(out) } -/// Convert an MVT tile to the experimental MLT v2 format. -fn convert_mvt_buffer_v2(buffer: Vec) -> AnyResult> { +/// Convert an MVT tile to MLT, dispatching to v1 or v2 encoding based on `v2`. +fn convert_mvt_buffer(buffer: Vec, cfg: EncoderConfig, v2: bool) -> AnyResult> { let mut out: Vec = Vec::new(); for tile in mvt_to_tile_layers(buffer)? { - out.extend_from_slice(&tile.encode_v2()?); + let bytes = if v2 { tile.encode_v2(cfg)? } else { tile.encode(cfg)? }; + out.extend_from_slice(&bytes); } Ok(out) } @@ -356,6 +316,7 @@ async fn mbtiles_compute( mlt_tx: Sender, encoding: Encoding, cfg: EncoderConfig, + v2: bool, ) -> AnyResult<()> { while let Some(batch) = raw_rx.recv().await { let results = spawn_blocking(move || { @@ -371,7 +332,7 @@ async fn mbtiles_compute( Encoding::Zstd => decode_zstd(&data)?, Encoding::Uncompressed | Encoding::Internal => data, }; - let mlt = convert_mvt_buffer(mvt, cfg)?; + let mlt = convert_mvt_buffer(mvt, cfg, v2)?; Ok((coord, mlt)) }) .collect::>() @@ -486,7 +447,7 @@ async fn mbtiles_writer( /// ``` /// /// WAL mode is enabled on the destination file for better write throughput. -async fn convert_mbtiles_async(args: &ConvertArgs, cfg: EncoderConfig) -> AnyResult<()> { +async fn convert_mbtiles_async(args: &ConvertArgs, cfg: EncoderConfig, v2: bool) -> AnyResult<()> { // ── source ──────────────────────────────────────────────────────────────── let src = Mbtiles::new(&args.input)?; let mut src_conn = src.open_readonly().await?; @@ -559,7 +520,7 @@ async fn convert_mbtiles_async(args: &ConvertArgs, cfg: EncoderConfig) -> AnyRes let ((), (), total) = tokio::try_join!( mbtiles_reader(src, src_conn, raw_tx), - mbtiles_compute(raw_rx, mlt_tx, tile_info.encoding, cfg), + mbtiles_compute(raw_rx, mlt_tx, tile_info.encoding, cfg, v2), mbtiles_writer(dst, dst_conn, mlt_rx, mbt_type, bar.clone()), )?; From eacf88ca16a207a84fd4303175d0e9d8e5572d4a Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Wed, 22 Apr 2026 01:28:06 -0400 Subject: [PATCH 09/12] v2 --- rust/mlt-core/src/v2.rs | 562 ++++++++++++++++++++++------------------ rust/mlt/src/convert.rs | 12 +- 2 files changed, 324 insertions(+), 250 deletions(-) diff --git a/rust/mlt-core/src/v2.rs b/rust/mlt-core/src/v2.rs index 21b918627..30d8d27d1 100644 --- a/rust/mlt-core/src/v2.rs +++ b/rust/mlt-core/src/v2.rs @@ -28,26 +28,25 @@ //! decoded layer may differ from the original when grouping is applied (shared-dict children //! are emitted together before the next non-grouped column). +use std::collections::HashMap; + +use fastpfor::{AnyLenCodec as _, FastPFor128}; use geo_types::{ Coord, Geometry, LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon, }; use integer_encoding::{VarInt as _, VarIntWriter as _}; use zigzag::ZigZag as _; -use fastpfor::{AnyLenCodec as _, FastPFor128}; - -use crate::MltError; -use crate::MltResult; use crate::codecs::fsst::compress_fsst; use crate::codecs::varint::parse_varint; use crate::codecs::zigzag::encode_componentwise_delta_vec2s; use crate::decoder::{GeometryType, PropValue, TileFeature, TileLayer01}; -use std::collections::HashMap; - use crate::encoder::{ EncoderConfig, SortStrategy, StagedSharedDict, StagedSharedDictItem, StringGroup, group_string_properties, spatial_sort_likely_to_help, }; +use crate::utils::{BinarySerializer as _, parse_string}; +use crate::{MltError, MltResult}; /// Minimum feature count above which the bounding-box heuristic is applied /// before deciding whether to try spatial sort trials. Mirrors v1's threshold. @@ -114,41 +113,88 @@ impl TryFrom for GeoLayout { // ── Column type codes (reuse v1 codes for backward clarity) ────────────────── -// IDs -const COL_ID: u8 = 0; -const COL_OPT_ID: u8 = 1; -// Scalars (mirror v1 ColumnType values) -const COL_BOOL: u8 = 10; -const COL_OPT_BOOL: u8 = 11; -const COL_I8: u8 = 12; -const COL_OPT_I8: u8 = 13; -const COL_U8: u8 = 14; -const COL_OPT_U8: u8 = 15; -const COL_I32: u8 = 16; -const COL_OPT_I32: u8 = 17; -const COL_U32: u8 = 18; -const COL_OPT_U32: u8 = 19; -const COL_I64: u8 = 20; -const COL_OPT_I64: u8 = 21; -const COL_U64: u8 = 22; -const COL_OPT_U64: u8 = 23; -const COL_F32: u8 = 24; -const COL_OPT_F32: u8 = 25; -const COL_F64: u8 = 26; -const COL_OPT_F64: u8 = 27; -// Strings: StrPlain / OptStrPlain -const COL_STR: u8 = 28; -const COL_OPT_STR: u8 = 29; -// Strings: StrDict / OptStrDict (no presence stream; for OptStrDict index 0 = null) -const COL_STR_DICT: u8 = 30; -const COL_OPT_STR_DICT: u8 = 31; -// Strings: StrFsst / OptStrFsst (FSST-compressed; presence bitfield when optional) -const COL_STR_FSST: u8 = 32; -const COL_OPT_STR_FSST: u8 = 33; -// Strings: SharedDict (plain) / SharedDictFsst (FSST corpus) — cross-column shared dictionary. -// One encoded column entry covers N child string columns that share similar string values. -const COL_STR_SHARED_DICT: u8 = 34; -const COL_STR_SHARED_DICT_FSST: u8 = 35; +/// Wire column type byte for MLT v2 property columns. +/// +/// Each variant maps to its discriminant value on the wire. +/// Optional variants encode the same data as their non-optional counterpart +/// but are preceded by a packed presence bitfield (except `OptStrDict`, which +/// uses index 0 to signal null). +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq)] +enum ColType { + // IDs + Id = 0, + OptId = 1, + // Scalars (mirror v1 ColumnType values) + Bool = 10, + OptBool = 11, + I8 = 12, + OptI8 = 13, + U8 = 14, + OptU8 = 15, + I32 = 16, + OptI32 = 17, + U32 = 18, + OptU32 = 19, + I64 = 20, + OptI64 = 21, + U64 = 22, + OptU64 = 23, + F32 = 24, + OptF32 = 25, + F64 = 26, + OptF64 = 27, + // Strings: plain lengths + raw bytes (presence bitfield when optional) + Str = 28, + OptStr = 29, + // Strings: deduplicated dictionary; index 0 = null for OptStrDict + StrDict = 30, + OptStrDict = 31, + // Strings: FSST-compressed corpus (presence bitfield when optional) + StrFsst = 32, + OptStrFsst = 33, + // Strings: cross-column shared dictionary (plain corpus) + StrSharedDict = 34, + // Strings: cross-column shared dictionary (FSST corpus) + StrSharedDictFsst = 35, +} + +impl TryFrom for ColType { + type Error = MltError; + fn try_from(v: u8) -> MltResult { + match v { + 0 => Ok(Self::Id), + 1 => Ok(Self::OptId), + 10 => Ok(Self::Bool), + 11 => Ok(Self::OptBool), + 12 => Ok(Self::I8), + 13 => Ok(Self::OptI8), + 14 => Ok(Self::U8), + 15 => Ok(Self::OptU8), + 16 => Ok(Self::I32), + 17 => Ok(Self::OptI32), + 18 => Ok(Self::U32), + 19 => Ok(Self::OptU32), + 20 => Ok(Self::I64), + 21 => Ok(Self::OptI64), + 22 => Ok(Self::U64), + 23 => Ok(Self::OptU64), + 24 => Ok(Self::F32), + 25 => Ok(Self::OptF32), + 26 => Ok(Self::F64), + 27 => Ok(Self::OptF64), + 28 => Ok(Self::Str), + 29 => Ok(Self::OptStr), + 30 => Ok(Self::StrDict), + 31 => Ok(Self::OptStrDict), + 32 => Ok(Self::StrFsst), + 33 => Ok(Self::OptStrFsst), + 34 => Ok(Self::StrSharedDict), + 35 => Ok(Self::StrSharedDictFsst), + _ => Err(MltError::NotImplemented("unsupported v2 column type")), + } + } +} /// Flag byte for each child within a shared-dict column: bit 0 = child has null values. const CHILD_OPTIONAL: u8 = 0x01; @@ -177,8 +223,8 @@ impl TileLayer01 { let mut best = self.encode_v2_with_sort(cfg)?; // Apply the same spatial-sort heuristic as the v1 optimizer. - let try_spatial = self.features.len() < SORT_TRIAL_THRESHOLD - || spatial_sort_likely_to_help(self); + let try_spatial = + self.features.len() < SORT_TRIAL_THRESHOLD || spatial_sort_likely_to_help(self); if try_spatial { let strategies = [ @@ -259,11 +305,18 @@ impl TileLayer01 { .iter() .map(|(_, c_idx)| { let full_name = &self.property_names[*c_idx]; - let vals: Vec<&PropValue> = - self.features.iter().map(|f| &f.properties[*c_idx]).collect(); + let vals: Vec<&PropValue> = self + .features + .iter() + .map(|f| &f.properties[*c_idx]) + .collect(); let mut chunk = Vec::new(); write_property_column( - &mut chunk, full_name, &vals, feature_count, cfg.allow_fsst, + &mut chunk, + full_name, + &vals, + feature_count, + cfg.allow_fsst, )?; Ok(chunk) }) @@ -276,8 +329,11 @@ impl TileLayer01 { col_chunks.extend(individual_chunks); } } else if !col_to_group.contains_key(&col_idx) { - let vals: Vec<&PropValue> = - self.features.iter().map(|f| &f.properties[col_idx]).collect(); + let vals: Vec<&PropValue> = self + .features + .iter() + .map(|f| &f.properties[col_idx]) + .collect(); let mut chunk = Vec::new(); write_property_column(&mut chunk, name, &vals, feature_count, cfg.allow_fsst)?; col_chunks.push(chunk); @@ -289,10 +345,7 @@ impl TileLayer01 { let mut body: Vec = Vec::new(); // name - let name_bytes = self.name.as_bytes(); - body.write_varint(name_bytes.len() as u32) - .map_err(MltError::from)?; - body.extend_from_slice(name_bytes); + body.write_string(&self.name).map_err(MltError::from)?; // extent body.write_varint(self.extent).map_err(MltError::from)?; @@ -537,7 +590,8 @@ fn build_delta_body_u64(values: &[u64]) -> MltResult> { fn write_u32_stream_implicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { buf.push(ENC_VARINT); let body = build_varint_body_u32(values)?; - buf.write_varint(body.len() as u32).map_err(MltError::from)?; + buf.write_varint(body.len() as u32) + .map_err(MltError::from)?; buf.extend_from_slice(&body); Ok(()) } @@ -553,11 +607,13 @@ fn write_types_stream(buf: &mut Vec, values: &[u32]) -> MltResult<()> { // RLE overhead: 1 (enc_byte) + varint(body_len) + body (same shape) if rle_body.len() < varint_body.len() { buf.push(ENC_RLE); - buf.write_varint(rle_body.len() as u32).map_err(MltError::from)?; + buf.write_varint(rle_body.len() as u32) + .map_err(MltError::from)?; buf.extend_from_slice(&rle_body); } else { buf.push(ENC_VARINT); - buf.write_varint(varint_body.len() as u32).map_err(MltError::from)?; + buf.write_varint(varint_body.len() as u32) + .map_err(MltError::from)?; buf.extend_from_slice(&varint_body); } Ok(()) @@ -569,7 +625,8 @@ fn write_u32_stream_explicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> buf.write_varint(values.len() as u32) .map_err(MltError::from)?; let body = build_varint_body_u32(values)?; - buf.write_varint(body.len() as u32).map_err(MltError::from)?; + buf.write_varint(body.len() as u32) + .map_err(MltError::from)?; buf.extend_from_slice(&body); Ok(()) } @@ -589,11 +646,13 @@ fn write_u64_stream_best(buf: &mut Vec, values: &[u64]) -> MltResult<()> { if delta_body.len() < plain_body.len() { buf.push(ENC_DELTA_VARINT); - buf.write_varint(delta_body.len() as u32).map_err(MltError::from)?; + buf.write_varint(delta_body.len() as u32) + .map_err(MltError::from)?; buf.extend_from_slice(&delta_body); } else { buf.push(ENC_VARINT); - buf.write_varint(plain_body.len() as u32).map_err(MltError::from)?; + buf.write_varint(plain_body.len() as u32) + .map_err(MltError::from)?; buf.extend_from_slice(&plain_body); } Ok(()) @@ -674,15 +733,13 @@ fn write_cwdelta_vertices_fp128(pair_count: u32, encoded: &[u32]) -> MltResult = scratch - .iter() - .flat_map(|&w| w.to_le_bytes()) - .collect(); + let byte_data: Vec = scratch.iter().flat_map(|&w| w.to_le_bytes()).collect(); let mut out = Vec::new(); out.push(ENC_CWDELTA_FP128_EXPL); out.write_varint(pair_count).map_err(MltError::from)?; - out.write_varint(byte_data.len() as u32).map_err(MltError::from)?; + out.write_varint(byte_data.len() as u32) + .map_err(MltError::from)?; out.extend_from_slice(&byte_data); Ok(out) } @@ -760,14 +817,14 @@ fn write_id_column(buf: &mut Vec, ids: &[Option], _feature_count: usize if any_null { // OptId: presence bitfield + data for present values - buf.push(COL_OPT_ID); + buf.push(ColType::OptId as u8); let presence: Vec = ids.iter().map(|id| id.is_some()).collect(); write_presence(buf, &presence); let values: Vec = ids.iter().filter_map(|id| *id).collect(); write_u64_stream_best(buf, &values)?; } else { // Id: all values present; Delta+VarInt compresses sequential OSM IDs well. - buf.push(COL_ID); + buf.push(ColType::Id as u8); let values: Vec = ids.iter().map(|id| id.unwrap_or(0)).collect(); write_u64_stream_best(buf, &values)?; } @@ -806,52 +863,63 @@ fn write_property_column( Vec::new() }; - let name_bytes = name.as_bytes(); - - // Choose whichever encoding yields the fewest bytes. + // Choose whichever encoding yields the fewest bytes, then write once. + // Dict encoding uses index 0 for null — no separate presence bitfield. let best_size = if fsst_bytes.is_empty() { plain_bytes.len().min(dict_bytes.len()) } else { - plain_bytes.len().min(dict_bytes.len()).min(fsst_bytes.len()) + plain_bytes + .len() + .min(dict_bytes.len()) + .min(fsst_bytes.len()) }; - if !fsst_bytes.is_empty() && best_size == fsst_bytes.len() { - // FSST: presence bitfield (when optional) + 4 compressed streams. - buf.push(if any_null { COL_OPT_STR_FSST } else { COL_STR_FSST }); - buf.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(name_bytes); - if any_null { - let presence: Vec = strings.iter().map(|s| s.is_some()).collect(); - write_presence(buf, &presence); - } - buf.extend_from_slice(&fsst_bytes); - } else if best_size == dict_bytes.len() { - // Dict: no separate presence (null = index 0 for OptStrDict). - buf.push(if any_null { COL_OPT_STR_DICT } else { COL_STR_DICT }); - buf.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(name_bytes); - buf.extend_from_slice(&dict_bytes); - } else { - // Plain: presence bitfield when optional. - buf.push(if any_null { COL_OPT_STR } else { COL_STR }); - buf.write_varint(name_bytes.len() as u32).map_err(MltError::from)?; - buf.extend_from_slice(name_bytes); - if any_null { - let presence: Vec = strings.iter().map(|s| s.is_some()).collect(); - write_presence(buf, &presence); - } - buf.extend_from_slice(&plain_bytes); + let (col_type, data, with_presence) = + if !fsst_bytes.is_empty() && best_size == fsst_bytes.len() { + ( + if any_null { + ColType::OptStrFsst + } else { + ColType::StrFsst + }, + &fsst_bytes, + any_null, + ) + } else if best_size == dict_bytes.len() { + ( + if any_null { + ColType::OptStrDict + } else { + ColType::StrDict + }, + &dict_bytes, + false, + ) + } else { + ( + if any_null { + ColType::OptStr + } else { + ColType::Str + }, + &plain_bytes, + any_null, + ) + }; + + buf.push(col_type as u8); + buf.write_string(name).map_err(MltError::from)?; + if with_presence { + let presence: Vec = strings.iter().map(|s| s.is_some()).collect(); + write_presence(buf, &presence); } + buf.extend_from_slice(data); return Ok(()); } let col_type = prop_column_type(values[0], any_null); - buf.push(col_type); - - let name_bytes = name.as_bytes(); - buf.write_varint(name_bytes.len() as u32) - .map_err(MltError::from)?; - buf.extend_from_slice(name_bytes); + buf.push(col_type as u8); + buf.write_string(name).map_err(MltError::from)?; if any_null { let presence: Vec = values.iter().map(|v| !is_null(v)).collect(); @@ -876,18 +944,18 @@ fn is_null(v: &PropValue) -> bool { } } -fn prop_column_type(sample: &PropValue, any_null: bool) -> u8 { +fn prop_column_type(sample: &PropValue, any_null: bool) -> ColType { let (non_opt, opt) = match sample { - PropValue::Bool(_) => (COL_BOOL, COL_OPT_BOOL), - PropValue::I8(_) => (COL_I8, COL_OPT_I8), - PropValue::U8(_) => (COL_U8, COL_OPT_U8), - PropValue::I32(_) => (COL_I32, COL_OPT_I32), - PropValue::U32(_) => (COL_U32, COL_OPT_U32), - PropValue::I64(_) => (COL_I64, COL_OPT_I64), - PropValue::U64(_) => (COL_U64, COL_OPT_U64), - PropValue::F32(_) => (COL_F32, COL_OPT_F32), - PropValue::F64(_) => (COL_F64, COL_OPT_F64), - PropValue::Str(_) => (COL_STR, COL_OPT_STR), + PropValue::Bool(_) => (ColType::Bool, ColType::OptBool), + PropValue::I8(_) => (ColType::I8, ColType::OptI8), + PropValue::U8(_) => (ColType::U8, ColType::OptU8), + PropValue::I32(_) => (ColType::I32, ColType::OptI32), + PropValue::U32(_) => (ColType::U32, ColType::OptU32), + PropValue::I64(_) => (ColType::I64, ColType::OptI64), + PropValue::U64(_) => (ColType::U64, ColType::OptU64), + PropValue::F32(_) => (ColType::F32, ColType::OptF32), + PropValue::F64(_) => (ColType::F64, ColType::OptF64), + PropValue::Str(_) => (ColType::Str, ColType::OptStr), }; if any_null { opt } else { non_opt } } @@ -1061,9 +1129,9 @@ fn build_string_fsst_data(strings: &[Option<&str>], _any_null: bool) -> MltResul // Fall back to an empty structure that the decoder can handle. let mut buf = Vec::new(); write_u32_stream_explicit(&mut buf, &[])?; // 0 symbols - write_raw_bytes(&mut buf, &[])?; // 0 symbol bytes + write_raw_bytes(&mut buf, &[])?; // 0 symbol bytes write_u32_stream_implicit(&mut buf, &[])?; // 0 lengths - write_raw_bytes(&mut buf, &[])?; // 0 corpus bytes + write_raw_bytes(&mut buf, &[])?; // 0 corpus bytes return Ok(buf); } @@ -1092,7 +1160,9 @@ fn build_item_indices( item.presence_bools() .map(|present| { if present { - let span = span_iter.next().expect("v2 SharedDict: presence/dense mismatch"); + let span = span_iter + .next() + .expect("v2 SharedDict: presence/dense mismatch"); let dict_idx = *span_to_idx .get(&span) .expect("v2 SharedDict: span not in dict"); @@ -1127,13 +1197,10 @@ fn build_shared_dict_plain( span_to_idx: &HashMap<(u32, u32), u32>, ) -> MltResult> { let mut buf = Vec::new(); - buf.push(COL_STR_SHARED_DICT); - - let prefix_bytes = prefix.as_bytes(); - buf.write_varint(prefix_bytes.len() as u32) + buf.push(ColType::StrSharedDict as u8); + buf.write_string(prefix).map_err(MltError::from)?; + buf.write_varint(items.len() as u32) .map_err(MltError::from)?; - buf.extend_from_slice(prefix_bytes); - buf.write_varint(items.len() as u32).map_err(MltError::from)?; let dict_lengths: Vec = dict_strings.iter().map(|s| s.len() as u32).collect(); let dict_bytes: Vec = dict_strings @@ -1147,10 +1214,7 @@ fn build_shared_dict_plain( for item in items { let optional = item.has_presence(); buf.push(if optional { CHILD_OPTIONAL } else { 0 }); - let suffix_bytes = item.suffix.as_bytes(); - buf.write_varint(suffix_bytes.len() as u32) - .map_err(MltError::from)?; - buf.extend_from_slice(suffix_bytes); + buf.write_string(&item.suffix).map_err(MltError::from)?; let indices = build_item_indices(item, span_to_idx, optional); write_u32_stream_implicit(&mut buf, &indices)?; } @@ -1175,13 +1239,10 @@ fn build_shared_dict_fsst( let raw = compress_fsst(dict_strings); let mut buf = Vec::new(); - buf.push(COL_STR_SHARED_DICT_FSST); - - let prefix_bytes = prefix.as_bytes(); - buf.write_varint(prefix_bytes.len() as u32) + buf.push(ColType::StrSharedDictFsst as u8); + buf.write_string(prefix).map_err(MltError::from)?; + buf.write_varint(items.len() as u32) .map_err(MltError::from)?; - buf.extend_from_slice(prefix_bytes); - buf.write_varint(items.len() as u32).map_err(MltError::from)?; write_u32_stream_explicit(&mut buf, &raw.symbol_lengths)?; write_raw_bytes(&mut buf, &raw.symbol_bytes)?; @@ -1191,10 +1252,7 @@ fn build_shared_dict_fsst( for item in items { let optional = item.has_presence(); buf.push(if optional { CHILD_OPTIONAL } else { 0 }); - let suffix_bytes = item.suffix.as_bytes(); - buf.write_varint(suffix_bytes.len() as u32) - .map_err(MltError::from)?; - buf.extend_from_slice(suffix_bytes); + buf.write_string(&item.suffix).map_err(MltError::from)?; let indices = build_item_indices(item, span_to_idx, optional); write_u32_stream_implicit(&mut buf, &indices)?; } @@ -1267,8 +1325,12 @@ fn encode_shared_dict_chunk( .map(|(i, span)| (span, i as u32)) .collect(); - let plain_buf = - build_shared_dict_plain(&group.prefix, &shared_dict.items, &dict_strings, &span_to_idx)?; + let plain_buf = build_shared_dict_plain( + &group.prefix, + &shared_dict.items, + &dict_strings, + &span_to_idx, + )?; if allow_fsst && !dict_strings.is_empty() { let fsst_buf = build_shared_dict_fsst( @@ -1366,13 +1428,8 @@ fn prop_f64(v: &PropValue) -> f64 { /// Returns a [`TileLayer01`] that can be used by the rest of the MLT tooling. pub fn decode_v2_layer(data: &[u8]) -> MltResult { // ── Header ──────────────────────────────────────────────────────────────── - let (data, name_len) = parse_varint::(data)?; - let name_len = name_len as usize; - if data.len() < name_len { - return Err(MltError::BufferUnderflow(name_len as u32, data.len())); - } - let name = std::str::from_utf8(&data[..name_len])?.to_string(); - let data = &data[name_len..]; + let (data, name) = parse_string(data)?; + let name = name.to_string(); let (data, extent) = parse_varint::(data)?; let (data, feature_count) = parse_varint::(data)?; @@ -1400,18 +1457,18 @@ pub fn decode_v2_layer(data: &[u8]) -> MltResult { if data.is_empty() { return Err(MltError::BufferUnderflow(1, 0)); } - let col_type = data[0]; + let col_type = ColType::try_from(data[0])?; data = &data[1..]; match col_type { - COL_ID => { - let (new_data, ids) = decode_id_column(data, false, feature_count)?; + ColType::Id => { + let (new_data, ids) = decode_id_column(data, feature_count)?; data = new_data; for (i, id) in ids.into_iter().enumerate() { feature_ids[i] = Some(id); } } - COL_OPT_ID => { + ColType::OptId => { let (new_data, presence, ids) = decode_opt_id_column(data, feature_count)?; data = new_data; presence_groups.push(presence.clone()); @@ -1422,42 +1479,54 @@ pub fn decode_v2_layer(data: &[u8]) -> MltResult { } } } - COL_BOOL | COL_OPT_BOOL | COL_I8 | COL_OPT_I8 | COL_U8 | COL_OPT_U8 | COL_I32 - | COL_OPT_I32 | COL_U32 | COL_OPT_U32 | COL_I64 | COL_OPT_I64 | COL_U64 - | COL_OPT_U64 | COL_F32 | COL_OPT_F32 | COL_F64 | COL_OPT_F64 | COL_STR - | COL_OPT_STR => { + ColType::Bool + | ColType::OptBool + | ColType::I8 + | ColType::OptI8 + | ColType::U8 + | ColType::OptU8 + | ColType::I32 + | ColType::OptI32 + | ColType::U32 + | ColType::OptU32 + | ColType::I64 + | ColType::OptI64 + | ColType::U64 + | ColType::OptU64 + | ColType::F32 + | ColType::OptF32 + | ColType::F64 + | ColType::OptF64 + | ColType::Str + | ColType::OptStr => { let (new_data, col_name, col_values) = decode_property_column(data, col_type, feature_count, &mut presence_groups)?; data = new_data; property_names.push(col_name); property_columns.push(col_values); } - COL_STR_DICT | COL_OPT_STR_DICT => { + ColType::StrDict | ColType::OptStrDict => { let (new_data, col_name, col_values) = decode_string_dict_column(data, col_type, feature_count)?; data = new_data; property_names.push(col_name); property_columns.push(col_values); } - COL_STR_FSST | COL_OPT_STR_FSST => { + ColType::StrFsst | ColType::OptStrFsst => { let (new_data, col_name, col_values) = decode_string_fsst_column(data, col_type, feature_count)?; data = new_data; property_names.push(col_name); property_columns.push(col_values); } - COL_STR_SHARED_DICT | COL_STR_SHARED_DICT_FSST => { - let (new_data, columns) = - decode_shared_dict_v2(data, col_type, feature_count)?; + ColType::StrSharedDict | ColType::StrSharedDictFsst => { + let (new_data, columns) = decode_shared_dict_v2(data, col_type, feature_count)?; data = new_data; for (col_name, col_values) in columns { property_names.push(col_name); property_columns.push(col_values); } } - _ => { - return Err(MltError::NotImplemented("unsupported v2 column type")); - } } } @@ -1733,7 +1802,11 @@ fn read_u32_stream( let encoded = &data[..byte_len]; let remaining = &data[byte_len..]; - let capacity = if has_expl_count || explicit { implicit_count } else { 16 }; + let capacity = if has_expl_count || explicit { + implicit_count + } else { + 16 + }; let mut values = Vec::with_capacity(capacity); let mut pos = 0; while pos < byte_len { @@ -1953,7 +2026,9 @@ fn read_cwdelta_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { let u32_vals: Vec = if physical == 3 { // FastPFor128: bytes are LE u32 words if !byte_len.is_multiple_of(4) { - return Err(MltError::NotImplemented("v2 FP128: byte length not multiple of 4")); + return Err(MltError::NotImplemented( + "v2 FP128: byte length not multiple of 4", + )); } let words: Vec = encoded .chunks_exact(4) @@ -2009,11 +2084,7 @@ fn read_presence(data: &[u8], feature_count: usize) -> MltResult<(&[u8], Vec( - data: &'a [u8], - _optional: bool, - feature_count: usize, -) -> MltResult<(&'a [u8], Vec)> { +fn decode_id_column<'a>(data: &'a [u8], feature_count: usize) -> MltResult<(&'a [u8], Vec)> { read_u64_stream(data, feature_count) } @@ -2031,31 +2102,25 @@ fn decode_opt_id_column<'a>( fn decode_property_column<'a>( data: &'a [u8], - col_type: u8, + col_type: ColType, feature_count: usize, presence_groups: &mut Vec>, ) -> MltResult<(&'a [u8], String, Vec)> { - // name - let (data, name_len) = parse_varint::(data)?; - let name_len = name_len as usize; - if data.len() < name_len { - return Err(MltError::BufferUnderflow(name_len as u32, data.len())); - } - let name = std::str::from_utf8(&data[..name_len])?.to_string(); - let data = &data[name_len..]; + let (data, name) = parse_string(data)?; + let name = name.to_string(); let is_optional = matches!( col_type, - COL_OPT_BOOL - | COL_OPT_I8 - | COL_OPT_U8 - | COL_OPT_I32 - | COL_OPT_U32 - | COL_OPT_I64 - | COL_OPT_U64 - | COL_OPT_F32 - | COL_OPT_F64 - | COL_OPT_STR + ColType::OptBool + | ColType::OptI8 + | ColType::OptU8 + | ColType::OptI32 + | ColType::OptU32 + | ColType::OptI64 + | ColType::OptU64 + | ColType::OptF32 + | ColType::OptF64 + | ColType::OptStr ); let (data, presence) = if is_optional { @@ -2077,13 +2142,13 @@ fn decode_property_column<'a>( fn decode_column_data<'a>( data: &'a [u8], - col_type: u8, + col_type: ColType, feature_count: usize, popcount: usize, presence: &Option>, ) -> MltResult<(&'a [u8], Vec)> { match col_type { - COL_BOOL | COL_OPT_BOOL => { + ColType::Bool | ColType::OptBool => { let (data, bytes) = read_raw_bytes_stream(data)?; let values = expand_with_presence( bytes.iter().map(|&b| PropValue::Bool(Some(b != 0))), @@ -2093,7 +2158,7 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_I8 | COL_OPT_I8 => { + ColType::I8 | ColType::OptI8 => { let (data, bytes) = read_raw_bytes_stream(data)?; let values = expand_with_presence( bytes @@ -2105,7 +2170,7 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_U8 | COL_OPT_U8 => { + ColType::U8 | ColType::OptU8 => { let (data, bytes) = read_raw_bytes_stream(data)?; let values = expand_with_presence( bytes.into_iter().map(|b| PropValue::U8(Some(b))), @@ -2115,8 +2180,8 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_I32 | COL_OPT_I32 => { - let any_null = col_type == COL_OPT_I32; + ColType::I32 | ColType::OptI32 => { + let any_null = col_type == ColType::OptI32; let (data, ints) = read_i32_stream(data, popcount, any_null)?; let values = expand_with_presence( ints.into_iter().map(|v| PropValue::I32(Some(v))), @@ -2126,7 +2191,7 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_U32 | COL_OPT_U32 => { + ColType::U32 | ColType::OptU32 => { let (data, ints) = read_u32_stream(data, popcount, false)?; let values = expand_with_presence( ints.into_iter().map(|v| PropValue::U32(Some(v))), @@ -2136,7 +2201,7 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_I64 | COL_OPT_I64 => { + ColType::I64 | ColType::OptI64 => { let (data, ints) = read_i64_stream(data, popcount)?; let values = expand_with_presence( ints.into_iter().map(|v| PropValue::I64(Some(v))), @@ -2146,7 +2211,7 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_U64 | COL_OPT_U64 => { + ColType::U64 | ColType::OptU64 => { let (data, ints) = read_u64_stream(data, popcount)?; let values = expand_with_presence( ints.into_iter().map(|v| PropValue::U64(Some(v))), @@ -2156,7 +2221,7 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_F32 | COL_OPT_F32 => { + ColType::F32 | ColType::OptF32 => { let (data, bytes) = read_raw_bytes_stream(data)?; let floats: Vec = bytes .chunks_exact(4) @@ -2170,7 +2235,7 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_F64 | COL_OPT_F64 => { + ColType::F64 | ColType::OptF64 => { let (data, bytes) = read_raw_bytes_stream(data)?; let floats: Vec = bytes .chunks_exact(8) @@ -2184,8 +2249,20 @@ fn decode_column_data<'a>( ); Ok((data, values)) } - COL_STR | COL_OPT_STR => decode_string_column(data, popcount, presence, feature_count), - _ => Err(MltError::NotImplemented("v2 decoder: unknown column type")), + ColType::Str | ColType::OptStr => { + decode_string_column(data, popcount, presence, feature_count) + } + // These variants are dispatched before reaching this function. + ColType::Id + | ColType::OptId + | ColType::StrDict + | ColType::OptStrDict + | ColType::StrFsst + | ColType::OptStrFsst + | ColType::StrSharedDict + | ColType::StrSharedDictFsst => { + unreachable!("ColType::{col_type:?} is handled before decode_column_data") + } } } @@ -2219,24 +2296,18 @@ fn decode_string_column<'a>( Ok((data, values)) } -/// Decode a StrFsst (col_type=32) or OptStrFsst (col_type=33) column. +/// Decode a `StrFsst` or `OptStrFsst` column. /// /// Wire layout: `[presence?] | sym_lengths | sym_bytes | val_lengths | corpus` fn decode_string_fsst_column<'a>( data: &'a [u8], - col_type: u8, + col_type: ColType, feature_count: usize, ) -> MltResult<(&'a [u8], String, Vec)> { - let optional = col_type == COL_OPT_STR_FSST; + let optional = col_type == ColType::OptStrFsst; - // name - let (data, name_len) = parse_varint::(data)?; - let name_len = name_len as usize; - if data.len() < name_len { - return Err(MltError::BufferUnderflow(name_len as u32, data.len())); - } - let name = std::str::from_utf8(&data[..name_len])?.to_string(); - let data = &data[name_len..]; + let (data, name) = parse_string(data)?; + let name = name.to_string(); // presence (only for optional) let (data, presence) = if optional { @@ -2310,9 +2381,11 @@ fn fsst_decompress( let mut offset = 0usize; for &len in val_lengths { let end = offset + len as usize; - let s = std::str::from_utf8(output.get(offset..end).ok_or( - MltError::BufferUnderflow(len, output.len() - offset), - )?) + let s = std::str::from_utf8( + output + .get(offset..end) + .ok_or(MltError::BufferUnderflow(len, output.len() - offset))?, + ) .map_err(MltError::from)? .to_string(); strings.push(s); @@ -2321,7 +2394,7 @@ fn fsst_decompress( Ok(strings) } -/// Decode a StrDict (col_type=30) or OptStrDict (col_type=31) column. +/// Decode a `StrDict` or `OptStrDict` column. /// /// Wire layout: `dict_lengths_stream | dict_data_stream | indices_stream` /// @@ -2329,19 +2402,13 @@ fn fsst_decompress( /// - `OptStrDict`: index `0` = null; index `k` (k≥1) → dict entry `k-1`. fn decode_string_dict_column<'a>( data: &'a [u8], - col_type: u8, + col_type: ColType, feature_count: usize, ) -> MltResult<(&'a [u8], String, Vec)> { - let optional = col_type == COL_OPT_STR_DICT; + let optional = col_type == ColType::OptStrDict; - // name - let (data, name_len) = parse_varint::(data)?; - let name_len = name_len as usize; - if data.len() < name_len { - return Err(MltError::BufferUnderflow(name_len as u32, data.len())); - } - let name = std::str::from_utf8(&data[..name_len])?.to_string(); - let data = &data[name_len..]; + let (data, name) = parse_string(data)?; + let name = name.to_string(); // dict_lengths: explicit count = number of dict entries let (data, dict_lengths) = read_u32_stream(data, 0, true)?; @@ -2400,18 +2467,13 @@ fn decode_string_dict_column<'a>( /// ``` fn decode_shared_dict_v2<'a>( data: &'a [u8], - col_type: u8, + col_type: ColType, feature_count: usize, ) -> MltResult<(&'a [u8], Vec<(String, Vec)>)> { - let use_fsst = col_type == COL_STR_SHARED_DICT_FSST; + let use_fsst = col_type == ColType::StrSharedDictFsst; - let (data, prefix_len) = parse_varint::(data)?; - let prefix_len = prefix_len as usize; - if data.len() < prefix_len { - return Err(MltError::BufferUnderflow(prefix_len as u32, data.len())); - } - let prefix = std::str::from_utf8(&data[..prefix_len])?.to_string(); - let data = &data[prefix_len..]; + let (data, prefix) = parse_string(data)?; + let prefix = prefix.to_string(); let (data, child_count) = parse_varint::(data)?; let child_count = child_count as usize; @@ -2423,7 +2485,10 @@ fn decode_shared_dict_v2<'a>( } else { let (data, dict_lengths) = read_u32_stream(data, 0, true)?; let (data, dict_bytes) = read_raw_bytes_stream(data)?; - (data, build_strings_from_lengths(&dict_lengths, &dict_bytes)?) + ( + data, + build_strings_from_lengths(&dict_lengths, &dict_bytes)?, + ) }; let mut result = Vec::with_capacity(child_count); @@ -2437,13 +2502,9 @@ fn decode_shared_dict_v2<'a>( data = &data[1..]; let optional = (flags & CHILD_OPTIONAL) != 0; - let (d, suffix_len) = parse_varint::(data)?; - let suffix_len = suffix_len as usize; - if d.len() < suffix_len { - return Err(MltError::BufferUnderflow(suffix_len as u32, d.len())); - } - let suffix = std::str::from_utf8(&d[..suffix_len])?.to_string(); - data = &d[suffix_len..]; + let (d, suffix) = parse_string(data)?; + let suffix = suffix.to_string(); + data = d; let col_name = format!("{prefix}{suffix}"); @@ -2457,10 +2518,16 @@ fn decode_shared_dict_v2<'a>( if optional && idx == 0 { Ok(PropValue::Str(None)) } else { - let dict_idx = if optional { idx as usize - 1 } else { idx as usize }; + let dict_idx = if optional { + idx as usize - 1 + } else { + idx as usize + }; let s = dict_strings .get(dict_idx) - .ok_or(MltError::NotImplemented("v2 SharedDict: index out of range"))? + .ok_or(MltError::NotImplemented( + "v2 SharedDict: index out of range", + ))? .clone(); Ok(PropValue::Str(Some(s))) } @@ -2479,11 +2546,10 @@ fn build_strings_from_lengths(lengths: &[u32], bytes: &[u8]) -> MltResult AnyResult< match layer { Layer::Tag01(l) => { let tile = l.into_tile(&mut dec)?; - let bytes = if v2 { tile.encode_v2(cfg)? } else { tile.encode(cfg)? }; + let bytes = if v2 { + tile.encode_v2(cfg)? + } else { + tile.encode(cfg)? + }; out.extend_from_slice(&bytes); } Layer::Tag02(tile) if v2 => { @@ -265,7 +269,11 @@ fn convert_mlt_buffer(buffer: &[u8], cfg: EncoderConfig, v2: bool) -> AnyResult< fn convert_mvt_buffer(buffer: Vec, cfg: EncoderConfig, v2: bool) -> AnyResult> { let mut out: Vec = Vec::new(); for tile in mvt_to_tile_layers(buffer)? { - let bytes = if v2 { tile.encode_v2(cfg)? } else { tile.encode(cfg)? }; + let bytes = if v2 { + tile.encode_v2(cfg)? + } else { + tile.encode(cfg)? + }; out.extend_from_slice(&bytes); } Ok(out) From ab2a2c3059385f52a8f6e824eb6e3259b343f2f1 Mon Sep 17 00:00:00 2001 From: Yuri Astrakhan Date: Wed, 22 Apr 2026 01:39:02 -0400 Subject: [PATCH 10/12] cleanup --- rust/mlt-core/src/v2.rs | 287 +++++++++++++++++----------------------- 1 file changed, 118 insertions(+), 169 deletions(-) diff --git a/rust/mlt-core/src/v2.rs b/rust/mlt-core/src/v2.rs index 30d8d27d1..ee9344211 100644 --- a/rust/mlt-core/src/v2.rs +++ b/rust/mlt-core/src/v2.rs @@ -35,6 +35,7 @@ use geo_types::{ Coord, Geometry, LineString, MultiLineString, MultiPoint, MultiPolygon, Point, Polygon, }; use integer_encoding::{VarInt as _, VarIntWriter as _}; +use strum::FromRepr; use zigzag::ZigZag as _; use crate::codecs::fsst::compress_fsst; @@ -52,26 +53,44 @@ use crate::{MltError, MltResult}; /// before deciding whether to try spatial sort trials. Mirrors v1's threshold. const SORT_TRIAL_THRESHOLD: usize = 512; -// ── Encoding byte bit positions ─────────────────────────────────────────────── -// bit 7: has_explicit_count -// bits 6-4: logical (0=None, 1=Delta, 2=CwDelta, 3=Rle, 4=DeltaRle, 5=Morton) -// bits 3-2: physical (0=None-noLen, 1=None-withLen, 2=VarInt, 3=FastPFor128) +// ── Encoding byte bit layout ────────────────────────────────────────────────── +// bit 7: has_explicit_count (1 = count varint precedes byte_length) +// bits 6-4: logical encoding (0=None, 1=Delta, 2=CwDelta, 3=Rle) +// bits 3-2: physical encoding (0=None-noLen, 1=None-withLen, 2=VarInt, 3=FastPFor128) // bits 1-0: reserved (0) -/// `logical=None, physical=VarInt, count from context` -const ENC_VARINT: u8 = 0x08; -/// `logical=None, physical=VarInt, explicit count follows` -const ENC_VARINT_EXPL: u8 = 0x88; -/// `logical=Delta, physical=VarInt, count from context` -const ENC_DELTA_VARINT: u8 = 0x18; -/// `logical=CwDelta, physical=VarInt, explicit count follows` -const ENC_CWDELTA_VARINT_EXPL: u8 = 0xA8; -/// `logical=CwDelta, physical=FastPFor128, explicit count follows` -const ENC_CWDELTA_FP128_EXPL: u8 = 0xAC; -/// `logical=Rle, physical=reserved (00), count from context; byte_length always follows` -const ENC_RLE: u8 = 0x30; -/// `logical=None, physical=None-noLen, explicit count follows` -const ENC_RAW_EXPL: u8 = 0x80; +/// Stream encoding byte for MLT v2 wire format. +/// +/// Each variant name encodes the logical+physical combination and whether an +/// explicit count varint precedes the byte-length varint. +/// Variants ending in `Expl` carry `bit7=1` and are followed by a count field. +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, FromRepr)] +enum Enc { + /// logical=None, physical=VarInt, count from context (implicit) + VarInt = 0x08, + /// logical=None, physical=VarInt, explicit count follows + VarIntExpl = 0x88, + /// logical=Delta, physical=VarInt, count from context (implicit) + DeltaVarInt = 0x18, + /// logical=CwDelta, physical=VarInt, explicit count follows + CwDeltaVarInt = 0xA8, + /// logical=CwDelta, physical=FastPFor128, explicit count follows + CwDeltaFp128 = 0xAC, + /// logical=Rle, physical=reserved; byte_length always follows (no count field) + Rle = 0x30, + /// logical=None, physical=None-noLen, explicit count (= byte count) follows + RawExpl = 0x80, +} + +impl TryFrom for Enc { + type Error = MltError; + fn try_from(v: u8) -> MltResult { + Self::from_repr(v).ok_or(MltError::NotImplemented( + "unsupported v2 stream encoding byte", + )) + } +} // ── Geometry layout byte values ─────────────────────────────────────────────── @@ -80,7 +99,7 @@ const ENC_RAW_EXPL: u8 = 0x80; /// Encodes which geometry streams are present and in what order. /// Dict and tessellation variants are excluded from this initial implementation. #[repr(u8)] -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, FromRepr)] pub enum GeoLayout { /// Types, Vertices Points = 0, @@ -99,15 +118,7 @@ pub enum GeoLayout { impl TryFrom for GeoLayout { type Error = MltError; fn try_from(v: u8) -> MltResult { - match v { - 0 => Ok(Self::Points), - 2 => Ok(Self::MultiPoints), - 4 => Ok(Self::Lines), - 6 => Ok(Self::MultiLines), - 8 => Ok(Self::Polygons), - 10 => Ok(Self::MultiPolygons), - _ => Err(MltError::NotImplemented("unsupported v2 geometry layout")), - } + Self::from_repr(v).ok_or(MltError::NotImplemented("unsupported v2 geometry layout")) } } @@ -120,7 +131,7 @@ impl TryFrom for GeoLayout { /// but are preceded by a packed presence bitfield (except `OptStrDict`, which /// uses index 0 to signal null). #[repr(u8)] -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, FromRepr)] enum ColType { // IDs Id = 0, @@ -162,37 +173,7 @@ enum ColType { impl TryFrom for ColType { type Error = MltError; fn try_from(v: u8) -> MltResult { - match v { - 0 => Ok(Self::Id), - 1 => Ok(Self::OptId), - 10 => Ok(Self::Bool), - 11 => Ok(Self::OptBool), - 12 => Ok(Self::I8), - 13 => Ok(Self::OptI8), - 14 => Ok(Self::U8), - 15 => Ok(Self::OptU8), - 16 => Ok(Self::I32), - 17 => Ok(Self::OptI32), - 18 => Ok(Self::U32), - 19 => Ok(Self::OptU32), - 20 => Ok(Self::I64), - 21 => Ok(Self::OptI64), - 22 => Ok(Self::U64), - 23 => Ok(Self::OptU64), - 24 => Ok(Self::F32), - 25 => Ok(Self::OptF32), - 26 => Ok(Self::F64), - 27 => Ok(Self::OptF64), - 28 => Ok(Self::Str), - 29 => Ok(Self::OptStr), - 30 => Ok(Self::StrDict), - 31 => Ok(Self::OptStrDict), - 32 => Ok(Self::StrFsst), - 33 => Ok(Self::OptStrFsst), - 34 => Ok(Self::StrSharedDict), - 35 => Ok(Self::StrSharedDictFsst), - _ => Err(MltError::NotImplemented("unsupported v2 column type")), - } + Self::from_repr(v).ok_or(MltError::NotImplemented("unsupported v2 column type")) } } @@ -345,14 +326,13 @@ impl TileLayer01 { let mut body: Vec = Vec::new(); // name - body.write_string(&self.name).map_err(MltError::from)?; + body.write_string(&self.name)?; // extent - body.write_varint(self.extent).map_err(MltError::from)?; + body.write_varint(self.extent)?; // feature_count (v2 addition) - body.write_varint(feature_count as u32) - .map_err(MltError::from)?; + body.write_varint(feature_count as u32)?; // ── Geometry section ────────────────────────────────────────────────── body.push(geom.layout as u8); @@ -368,8 +348,7 @@ impl TileLayer01 { )?; // ── Columns ─────────────────────────────────────────────────────────── - body.write_varint(col_chunks.len() as u32) - .map_err(MltError::from)?; + body.write_varint(col_chunks.len() as u32)?; for chunk in col_chunks { body.extend_from_slice(&chunk); } @@ -377,7 +356,7 @@ impl TileLayer01 { // ── Frame: [varint(body_len+1)][tag=2][body] ────────────────────────── let size = u32::try_from(body.len() + 1)?; // +1 for the tag byte let mut out = Vec::with_capacity(5 + 1 + body.len()); - out.write_varint(size).map_err(MltError::from)?; + out.write_varint(size)?; out.push(2_u8); // tag = 2 out.extend_from_slice(&body); @@ -549,7 +528,7 @@ fn push_feature_geometry( fn build_varint_body_u32(values: &[u32]) -> MltResult> { let mut tmp = Vec::new(); for &v in values { - tmp.write_varint(v).map_err(MltError::from)?; + tmp.write_varint(v)?; } Ok(tmp) } @@ -564,8 +543,8 @@ fn build_rle_body_u32(values: &[u32]) -> MltResult> { while i + (run as usize) < values.len() && values[i + (run as usize)] == val { run += 1; } - tmp.write_varint(run).map_err(MltError::from)?; - tmp.write_varint(val).map_err(MltError::from)?; + tmp.write_varint(run)?; + tmp.write_varint(val)?; i += run as usize; } Ok(tmp) @@ -578,7 +557,7 @@ fn build_delta_body_u64(values: &[u64]) -> MltResult> { let mut prev = 0u64; for &v in values { let delta = v.wrapping_sub(prev); - tmp.write_varint(delta).map_err(MltError::from)?; + tmp.write_varint(delta)?; prev = v; } Ok(tmp) @@ -588,10 +567,9 @@ fn build_delta_body_u64(values: &[u64]) -> MltResult> { /// Write a VarInt-encoded `u32` stream with implicit count (= `feature_count`). fn write_u32_stream_implicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { - buf.push(ENC_VARINT); + buf.push(Enc::VarInt as u8); let body = build_varint_body_u32(values)?; - buf.write_varint(body.len() as u32) - .map_err(MltError::from)?; + buf.write_varint(body.len() as u32)?; buf.extend_from_slice(&body); Ok(()) } @@ -606,14 +584,12 @@ fn write_types_stream(buf: &mut Vec, values: &[u32]) -> MltResult<()> { // VarInt overhead: 1 (enc_byte) + varint(body_len) + body // RLE overhead: 1 (enc_byte) + varint(body_len) + body (same shape) if rle_body.len() < varint_body.len() { - buf.push(ENC_RLE); - buf.write_varint(rle_body.len() as u32) - .map_err(MltError::from)?; + buf.push(Enc::Rle as u8); + buf.write_varint(rle_body.len() as u32)?; buf.extend_from_slice(&rle_body); } else { - buf.push(ENC_VARINT); - buf.write_varint(varint_body.len() as u32) - .map_err(MltError::from)?; + buf.push(Enc::VarInt as u8); + buf.write_varint(varint_body.len() as u32)?; buf.extend_from_slice(&varint_body); } Ok(()) @@ -621,12 +597,10 @@ fn write_types_stream(buf: &mut Vec, values: &[u32]) -> MltResult<()> { /// Write a VarInt-encoded `u32` stream with explicit count. fn write_u32_stream_explicit(buf: &mut Vec, values: &[u32]) -> MltResult<()> { - buf.push(ENC_VARINT_EXPL); - buf.write_varint(values.len() as u32) - .map_err(MltError::from)?; + buf.push(Enc::VarIntExpl as u8); + buf.write_varint(values.len() as u32)?; let body = build_varint_body_u32(values)?; - buf.write_varint(body.len() as u32) - .map_err(MltError::from)?; + buf.write_varint(body.len() as u32)?; buf.extend_from_slice(&body); Ok(()) } @@ -638,21 +612,19 @@ fn write_u64_stream_best(buf: &mut Vec, values: &[u64]) -> MltResult<()> { let plain_body = { let mut tmp = Vec::new(); for &v in values { - tmp.write_varint(v).map_err(MltError::from)?; + tmp.write_varint(v)?; } tmp }; let delta_body = build_delta_body_u64(values)?; if delta_body.len() < plain_body.len() { - buf.push(ENC_DELTA_VARINT); - buf.write_varint(delta_body.len() as u32) - .map_err(MltError::from)?; + buf.push(Enc::DeltaVarInt as u8); + buf.write_varint(delta_body.len() as u32)?; buf.extend_from_slice(&delta_body); } else { - buf.push(ENC_VARINT); - buf.write_varint(plain_body.len() as u32) - .map_err(MltError::from)?; + buf.push(Enc::VarInt as u8); + buf.write_varint(plain_body.len() as u32)?; buf.extend_from_slice(&plain_body); } Ok(()) @@ -660,45 +632,44 @@ fn write_u64_stream_best(buf: &mut Vec, values: &[u64]) -> MltResult<()> { /// Write a VarInt-encoded `u64` stream with implicit count. fn write_u64_stream_implicit(buf: &mut Vec, values: &[u64]) -> MltResult<()> { - buf.push(ENC_VARINT); + buf.push(Enc::VarInt as u8); let mut tmp = Vec::new(); for &v in values { - tmp.write_varint(v).map_err(MltError::from)?; + tmp.write_varint(v)?; } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.write_varint(tmp.len() as u32)?; buf.extend_from_slice(&tmp); Ok(()) } /// Write a ZigZag+VarInt encoded `i32` stream with implicit count. fn write_i32_stream_implicit(buf: &mut Vec, values: &[i32]) -> MltResult<()> { - buf.push(ENC_VARINT); + buf.push(Enc::VarInt as u8); let mut tmp = Vec::new(); for &v in values { - tmp.write_varint(i32::encode(v)).map_err(MltError::from)?; + tmp.write_varint(i32::encode(v))?; } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.write_varint(tmp.len() as u32)?; buf.extend_from_slice(&tmp); Ok(()) } /// Write a ZigZag+VarInt encoded `i64` stream with implicit count. fn write_i64_stream_implicit(buf: &mut Vec, values: &[i64]) -> MltResult<()> { - buf.push(ENC_VARINT); + buf.push(Enc::VarInt as u8); let mut tmp = Vec::new(); for &v in values { - tmp.write_varint(i64::encode(v)).map_err(MltError::from)?; + tmp.write_varint(i64::encode(v))?; } - buf.write_varint(tmp.len() as u32).map_err(MltError::from)?; + buf.write_varint(tmp.len() as u32)?; buf.extend_from_slice(&tmp); Ok(()) } /// Write a fixed-width raw byte stream with explicit count (number of data bytes). fn write_raw_bytes(buf: &mut Vec, data: &[u8]) -> MltResult<()> { - buf.push(ENC_RAW_EXPL); - buf.write_varint(data.len() as u32) - .map_err(MltError::from)?; + buf.push(Enc::RawExpl as u8); + buf.write_varint(data.len() as u32)?; buf.extend_from_slice(data); Ok(()) } @@ -706,13 +677,13 @@ fn write_raw_bytes(buf: &mut Vec, data: &[u8]) -> MltResult<()> { /// Write vertex data using ComponentwiseDelta + VarInt with explicit count (vertex pairs). fn write_cwdelta_vertices_varint(pair_count: u32, encoded: &[u32]) -> MltResult> { let mut out = Vec::new(); - out.push(ENC_CWDELTA_VARINT_EXPL); - out.write_varint(pair_count).map_err(MltError::from)?; + out.push(Enc::CwDeltaVarInt as u8); + out.write_varint(pair_count)?; let mut tmp = Vec::new(); for &v in encoded { - tmp.write_varint(v).map_err(MltError::from)?; + tmp.write_varint(v)?; } - out.write_varint(tmp.len() as u32).map_err(MltError::from)?; + out.write_varint(tmp.len() as u32)?; out.extend_from_slice(&tmp); Ok(out) } @@ -736,10 +707,9 @@ fn write_cwdelta_vertices_fp128(pair_count: u32, encoded: &[u32]) -> MltResult = scratch.iter().flat_map(|&w| w.to_le_bytes()).collect(); let mut out = Vec::new(); - out.push(ENC_CWDELTA_FP128_EXPL); - out.write_varint(pair_count).map_err(MltError::from)?; - out.write_varint(byte_data.len() as u32) - .map_err(MltError::from)?; + out.push(Enc::CwDeltaFp128 as u8); + out.write_varint(pair_count)?; + out.write_varint(byte_data.len() as u32)?; out.extend_from_slice(&byte_data); Ok(out) } @@ -908,7 +878,7 @@ fn write_property_column( }; buf.push(col_type as u8); - buf.write_string(name).map_err(MltError::from)?; + buf.write_string(name)?; if with_presence { let presence: Vec = strings.iter().map(|s| s.is_some()).collect(); write_presence(buf, &presence); @@ -919,7 +889,7 @@ fn write_property_column( let col_type = prop_column_type(values[0], any_null); buf.push(col_type as u8); - buf.write_string(name).map_err(MltError::from)?; + buf.write_string(name)?; if any_null { let presence: Vec = values.iter().map(|v| !is_null(v)).collect(); @@ -1198,9 +1168,8 @@ fn build_shared_dict_plain( ) -> MltResult> { let mut buf = Vec::new(); buf.push(ColType::StrSharedDict as u8); - buf.write_string(prefix).map_err(MltError::from)?; - buf.write_varint(items.len() as u32) - .map_err(MltError::from)?; + buf.write_string(prefix)?; + buf.write_varint(items.len() as u32)?; let dict_lengths: Vec = dict_strings.iter().map(|s| s.len() as u32).collect(); let dict_bytes: Vec = dict_strings @@ -1214,7 +1183,7 @@ fn build_shared_dict_plain( for item in items { let optional = item.has_presence(); buf.push(if optional { CHILD_OPTIONAL } else { 0 }); - buf.write_string(&item.suffix).map_err(MltError::from)?; + buf.write_string(&item.suffix)?; let indices = build_item_indices(item, span_to_idx, optional); write_u32_stream_implicit(&mut buf, &indices)?; } @@ -1240,9 +1209,8 @@ fn build_shared_dict_fsst( let mut buf = Vec::new(); buf.push(ColType::StrSharedDictFsst as u8); - buf.write_string(prefix).map_err(MltError::from)?; - buf.write_varint(items.len() as u32) - .map_err(MltError::from)?; + buf.write_string(prefix)?; + buf.write_varint(items.len() as u32)?; write_u32_stream_explicit(&mut buf, &raw.symbol_lengths)?; write_raw_bytes(&mut buf, &raw.symbol_bytes)?; @@ -1252,7 +1220,7 @@ fn build_shared_dict_fsst( for item in items { let optional = item.has_presence(); buf.push(if optional { CHILD_OPTIONAL } else { 0 }); - buf.write_string(&item.suffix).map_err(MltError::from)?; + buf.write_string(&item.suffix)?; let indices = build_item_indices(item, span_to_idx, optional); write_u32_stream_implicit(&mut buf, &indices)?; } @@ -1766,16 +1734,12 @@ fn reconstruct_geometries( // ── Stream read helpers ─────────────────────────────────────────────────────── -/// Read one encoding byte, returning (remaining, has_explicit_count, logical, physical). -fn read_enc_byte(data: &[u8]) -> MltResult<(&[u8], bool, u8, u8)> { +/// Read one encoding byte and return the recognised `Enc` variant. +fn read_enc(data: &[u8]) -> MltResult<(&[u8], Enc)> { if data.is_empty() { return Err(MltError::BufferUnderflow(1, 0)); } - let b = data[0]; - let has_explicit_count = (b & 0x80) != 0; - let logical = (b >> 4) & 0x07; - let physical = (b >> 2) & 0x03; - Ok((&data[1..], has_explicit_count, logical, physical)) + Ok((&data[1..], Enc::try_from(data[0])?)) } /// Read a VarInt (or RLE) u32 stream. @@ -1789,10 +1753,10 @@ fn read_u32_stream( implicit_count: usize, explicit: bool, ) -> MltResult<(&[u8], Vec)> { - let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; + let (data, enc) = read_enc(data)?; - // ── RLE: logical=3, physical bits reserved (00) ────────────────────────── - if logical == 3 { + // ── RLE ────────────────────────────────────────────────────────────────── + if enc == Enc::Rle { // byte_length always follows for RLE; no separate count field. let (data, byte_len) = parse_varint::(data)?; let byte_len = byte_len as usize; @@ -1802,12 +1766,7 @@ fn read_u32_stream( let encoded = &data[..byte_len]; let remaining = &data[byte_len..]; - let capacity = if has_expl_count || explicit { - implicit_count - } else { - 16 - }; - let mut values = Vec::with_capacity(capacity); + let mut values = Vec::with_capacity(if explicit { implicit_count } else { 16 }); let mut pos = 0; while pos < byte_len { let (run_len, c1) = u32::decode_var(&encoded[pos..]) @@ -1823,14 +1782,14 @@ fn read_u32_stream( return Ok((remaining, values)); } - // ── VarInt: logical=0, physical=2 ──────────────────────────────────────── - if logical != 0 || physical != 2 { + // ── VarInt ─────────────────────────────────────────────────────────────── + if !matches!(enc, Enc::VarInt | Enc::VarIntExpl) { return Err(MltError::NotImplemented( "v2 decoder: unsupported encoding for u32 stream", )); } - let (data, count) = if has_expl_count || explicit { + let (data, count) = if enc == Enc::VarIntExpl || explicit { let (d, c) = parse_varint::(data)?; (d, c as usize) } else { @@ -1859,9 +1818,9 @@ fn read_u32_stream( /// Read a raw-bytes stream (physical=None-noLen, logical=None, explicit count = byte count). fn read_raw_bytes_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { - let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; + let (data, enc) = read_enc(data)?; - if logical != 0 || physical != 0 || !has_expl_count { + if enc != Enc::RawExpl { return Err(MltError::NotImplemented( "v2 decoder: unexpected encoding byte for raw bytes stream", )); @@ -1881,15 +1840,15 @@ fn read_i32_stream( implicit_count: usize, _has_any_null: bool, ) -> MltResult<(&[u8], Vec)> { - let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; + let (data, enc) = read_enc(data)?; - if logical != 0 || physical != 2 { + if !matches!(enc, Enc::VarInt | Enc::VarIntExpl) { return Err(MltError::NotImplemented( "v2 decoder: only VarInt/ZigZag supported for i32 stream", )); } - let (data, count) = if has_expl_count { + let (data, count) = if enc == Enc::VarIntExpl { let (d, c) = parse_varint::(data)?; (d, c as usize) } else { @@ -1918,9 +1877,9 @@ fn read_i32_stream( /// Read a ZigZag+VarInt i64 stream. fn read_i64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec)> { - let (data, _has_expl_count, logical, physical) = read_enc_byte(data)?; + let (data, enc) = read_enc(data)?; - if logical != 0 || physical != 2 { + if !matches!(enc, Enc::VarInt | Enc::VarIntExpl) { return Err(MltError::NotImplemented( "v2 decoder: only VarInt/ZigZag supported for i64 stream", )); @@ -1952,9 +1911,9 @@ fn read_i64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec< /// - `logical=0` (None) + `physical=2` (VarInt): plain VarInt. /// - `logical=1` (Delta) + `physical=2` (VarInt): VarInt-encoded deltas, prefix-sum to recover. fn read_u64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec)> { - let (data, _has_expl_count, logical, physical) = read_enc_byte(data)?; + let (data, enc) = read_enc(data)?; - if physical != 2 || (logical != 0 && logical != 1) { + if !matches!(enc, Enc::VarInt | Enc::DeltaVarInt) { return Err(MltError::NotImplemented( "v2 decoder: only VarInt / Delta+VarInt supported for u64 stream", )); @@ -1978,7 +1937,7 @@ fn read_u64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec< } // Apply prefix-sum to undo delta encoding. - if logical == 1 { + if enc == Enc::DeltaVarInt { let mut acc = 0u64; for v in &mut values { acc = acc.wrapping_add(*v); @@ -1991,26 +1950,18 @@ fn read_u64_stream(data: &[u8], implicit_count: usize) -> MltResult<(&[u8], Vec< /// Read a ComponentwiseDelta + (VarInt or FastPFor128) vertex stream (explicit count = vertex pairs). fn read_cwdelta_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { - let (data, has_expl_count, logical, physical) = read_enc_byte(data)?; + let (data, enc) = read_enc(data)?; - if logical != 2 { - return Err(MltError::NotImplemented( - "v2 decoder: only CwDelta logical encoding supported for vertex stream", - )); - } - if physical != 2 && physical != 3 { + if !matches!(enc, Enc::CwDeltaVarInt | Enc::CwDeltaFp128) { return Err(MltError::NotImplemented( - "v2 decoder: only VarInt(2) or FastPFor128(3) physical encoding for vertex stream", + "v2 decoder: only CwDelta+VarInt or CwDelta+FastPFor128 supported for vertex stream", )); } - let (data, pair_count) = if has_expl_count { + // Both CwDelta variants carry an explicit pair count. + let (data, pair_count) = { let (d, c) = parse_varint::(data)?; (d, c as usize) - } else { - return Err(MltError::NotImplemented( - "v2 decoder: vertex stream must have explicit count", - )); }; let (data, byte_len) = parse_varint::(data)?; @@ -2023,7 +1974,7 @@ fn read_cwdelta_stream(data: &[u8]) -> MltResult<(&[u8], Vec)> { let coord_count = pair_count * 2; - let u32_vals: Vec = if physical == 3 { + let u32_vals: Vec = if enc == Enc::CwDeltaFp128 { // FastPFor128: bytes are LE u32 words if !byte_len.is_multiple_of(4) { return Err(MltError::NotImplemented( @@ -2385,8 +2336,7 @@ fn fsst_decompress( output .get(offset..end) .ok_or(MltError::BufferUnderflow(len, output.len() - offset))?, - ) - .map_err(MltError::from)? + )? .to_string(); strings.push(s); offset = end; @@ -2549,8 +2499,7 @@ fn build_strings_from_lengths(lengths: &[u32], bytes: &[u8]) -> MltResult Date: Tue, 2 Jun 2026 18:48:51 -0400 Subject: [PATCH 11/12] wip --- rust/mlt-core/src/decoder/layer.rs | 8 ++++---- rust/mlt-core/src/decoder/model.rs | 2 ++ rust/mlt-core/tests/unknown_layer.rs | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/rust/mlt-core/src/decoder/layer.rs b/rust/mlt-core/src/decoder/layer.rs index e3a2bb551..d0a3ed5c7 100644 --- a/rust/mlt-core/src/decoder/layer.rs +++ b/rust/mlt-core/src/decoder/layer.rs @@ -10,7 +10,7 @@ impl<'a, S: DecodeState> Layer<'a, S> { pub fn as_layer01(&self) -> Option<&Layer01<'a, S>> { match self { Self::Tag01(l) => Some(l), - Self::Tag02(_) | Self::Unknown(_) => None, + _ => None, } } @@ -19,7 +19,7 @@ impl<'a, S: DecodeState> Layer<'a, S> { pub fn into_layer01(self) -> Option> { match self { Self::Tag01(l) => Some(l), - Self::Tag02(_) | Self::Unknown(_) => None, + _ => None, } } } @@ -52,8 +52,8 @@ impl<'a> Layer<'a> { /// `Layer::Tag01(lazy)` and call the individual methods on [`Layer01`]. pub fn decode_all(self, dec: &mut Decoder) -> MltResult> { match self { - Layer::Tag01(lazy) => Ok(Layer::Tag01(lazy.decode_all(dec)?)), - Layer::Tag02(t) => Ok(Layer::Tag02(t)), + Layer::Tag01(v) => Ok(Layer::Tag01(v.decode_all(dec)?)), + Layer::Tag02(v) => Ok(Layer::Tag02(v)), Layer::Unknown(u) => Ok(Layer::Unknown(u)), } } diff --git a/rust/mlt-core/src/decoder/model.rs b/rust/mlt-core/src/decoder/model.rs index 9dd5687ab..3293c27f6 100644 --- a/rust/mlt-core/src/decoder/model.rs +++ b/rust/mlt-core/src/decoder/model.rs @@ -122,6 +122,8 @@ pub struct Layer01<'a, S: DecodeState = Lazy> { pub(crate) layer_order: Vec, } +impl StrParse for Layer{} + pub type ParsedLayer01<'a> = Layer01<'a, Parsed>; impl<'a, S> fmt::Debug for Layer01<'a, S> diff --git a/rust/mlt-core/tests/unknown_layer.rs b/rust/mlt-core/tests/unknown_layer.rs index 6960a7f73..64bf53456 100644 --- a/rust/mlt-core/tests/unknown_layer.rs +++ b/rust/mlt-core/tests/unknown_layer.rs @@ -86,12 +86,12 @@ fn multiple_layers_mixed_unknown_and_tag01() { let Layer::Unknown(u0) = &layers[0] else { panic!("expected Unknown at index 0"); }; - assert_eq!(u0.tag(), 10u32); + assert_eq!(u0.tag(), 10); assert_eq!(u0.data(), b"hello"); let Layer::Unknown(u1) = &layers[1] else { panic!("expected Unknown at index 1"); }; - assert_eq!(u1.tag(), 11u32); + assert_eq!(u1.tag(), 11); assert_eq!(u1.data(), b"world"); } From b63cf3aba385970cb2505e1aad6e025b8d7eca38 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 22:50:39 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- rust/mlt-core/src/decoder/model.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/mlt-core/src/decoder/model.rs b/rust/mlt-core/src/decoder/model.rs index 3293c27f6..5dd10c60a 100644 --- a/rust/mlt-core/src/decoder/model.rs +++ b/rust/mlt-core/src/decoder/model.rs @@ -122,7 +122,7 @@ pub struct Layer01<'a, S: DecodeState = Lazy> { pub(crate) layer_order: Vec, } -impl StrParse for Layer{} +impl StrParse for Layer {} pub type ParsedLayer01<'a> = Layer01<'a, Parsed>;