Write some docs (#2080)

spiraldb · Jan 27, 2025 · cc55754 · cc55754
1 parent 81d3bf4
commit cc55754
Show file tree

Hide file tree

Showing 31 changed files with 1,230 additions and 474 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -240,6 +240,8 @@ jobs:
       - uses: ./.github/actions/setup-flatc
       - name: Install Protoc
         uses: arduino/setup-protoc@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
       - name: "regenerate all .fbs/.proto Rust code"
         run: |
           cargo xtask generate-fbs

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -22,25 +22,12 @@ jobs:
 
       - name: build Python and Rust docs
         run: |
-          uv run make -C docs python-and-rust-html
-      - name: commit python docs to gh-pages-bench
-        run: |
-          set -ex
-
-          built_sha=$(git rev-parse HEAD)
-
-          rm -rf docs/_build/html/rust/CACHETAG.DIR docs/_build/html/rust/debug
-
-          mkdir /tmp/html
-          mv docs/_build/html /tmp/html/docs
-
-          mkdir -p /tmp/html/dev
-          mv benchmarks-website /tmp/html/dev/bench
+          uv run make -C docs html
       - name: Upload static files as artifact
         id: deployment
         uses: actions/upload-pages-artifact@v3
         with:
-          path: /tmp/html/
+          path: docs/_build/html
   deploy:
     environment:
       name: github-pages

diff --git a/docs/Makefile b/docs/Makefile
@@ -8,7 +8,7 @@ SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     = .
 BUILDDIR      = _build
 
-.PHONY: Makefile help rust-html
+.PHONY: Makefile help
 
 # Put it first so that "make" without argument is like "make help".
 help:
@@ -19,24 +19,5 @@ help:
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
-python-and-rust-html: html rust-html
-	true  # need a non-empty rule to prevent matching the %: rule.
-
 serve:
-	mkdir -p _build/vortex
-	-ln -s ../html _build/vortex/docs  # makes absolute links like /vortex/docs/rust/html work correctly
-	echo The docs are served at http://localhost:8000/vortex/docs/
-	(cd _build/ && python3 -m http.server)
-
-watch:
-	fswatch -o -e '#[^#]*#' -e '__pycache__' -e '\.#.*' -e target -e .git -e .venv -e pyvortex/python/vortex/_lib.abi3.so -e docs/_build -e docs/a.vortex ../ | xargs -L1 /bin/bash -c 'make python-and-rust-html'
-
-rust-html:
-	RUSTDOCFLAGS="--enable-index-page -Z unstable-options" cargo doc \
-		--no-deps \
-		--workspace \
-		--exclude bench-vortex \
-		--exclude xtask \
-		--all-features \
-		--target-dir \
-		$(BUILDDIR)/html/rust
+	sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)/html"
diff --git a/docs/README.md b/docs/README.md
@@ -9,24 +9,18 @@ inherits some of its doc strings from Rust docstrings:
 cd ../pyvortex && uv run maturin develop
 ```
 
-Build just the Python docs:
+Build the Vortex docs:
 
 ```
 uv run make html
 ```
 
-Build the Python and Rust docs and place the rust docs at `_build/rust/html`:
+## Development
 
-```
-uv run make python-and-rust-html
-```
-
-## Viewing
-
-After building:
+Live-reloading (ish) build of the docs:
 
 ```
-open pyvortex/_build/html/index.html
+uv run make serve
 ```
 
 ## Python Doctests

diff --git a/docs/concepts/arrays.md b/docs/concepts/arrays.md
@@ -0,0 +1,142 @@
+# Vortex Arrays
+
+An array is the in-memory representation of data in Vortex. It has a [length](#length), a [data type](#data-type), an
+[encoding](#encodings), some number of [children](#children), and some number of [buffers](#buffers).
+All arrays in Vortex are represented by an `ArrayData`, which in psuedo-code looks something like this:
+
+```rust
+struct ArrayData {
+    encoding: Encoding,
+    dtype: DType,
+    len: usize,
+    metadata: ByteBuffer,
+    children: [ArrayData],
+    buffers: [ByteBuffer],
+    statistics: Statistics,
+}
+```
+
+This document goes into detail about each of these fields as well as the mechanics behind the encoding vtables.
+
+**Owned vs Viewed**
+
+As with other possibly large recursive data structures in Vortex, arrays can be either _owned_ or _viewed_.
+Owned arrays are heap-allocated, while viewed arrays are lazily unwrapped from an underlying FlatBuffer representation.
+This allows Vortex to efficiently load and work with very wide schemas without needing to deserialize the full array
+in memory.
+
+This abstraction is hidden from users inside an `ArrayData` object.
+
+## Encodings
+
+An encoding acts as the virtual function table (vtable) for an `ArrayData`.
+
+### VTable
+
+The full vtable definition is quite expansive, is split across many Rust traits, and has many optional functions. Here
+is an overview:
+
+* `id`: returns the unique identifier for the encoding.
+* `validate`: validates the array's buffers and children after loading from disk.
+* `accept`: a function for accepting an `ArrayVisitor` and walking the arrays children.
+* `into_canonical`: decodes the array into a canonical encoding.
+* `into_arrow`: decodes the array into an Arrow array.
+* `metadata`
+    * `validate`: validates the array's metadata buffer.
+    * `display`: returns a human-readable representation of the array metadata.
+* `validity`
+    * `is_valid`: returns whether the element at a given row is valid.
+    * `logical_validity`: returns the validity bit-mask for an array, indicating which values are non-null.
+* `compute`: a collection of compute functions vtables.
+    * `filter`: a function for filtering the array using a given selection mask.
+    * ...
+* `statistics`: a function for computing a statistic for the array data, for example `min`.
+* `variants`: a collection of optional DType-specific functions for operation over the array.
+    * `struct`: functions for operating over arrays with a `StructDType`.
+        * `get_field`: returns the array for a given field of the struct.
+        * ...
+    * ...
+
+Encoding vtables can even be constructed from non-static sources, such as _WebAssembly_ modules, which enables the
+[forward compatibility](/specs/file-format.md#forward-compatibility) feature of the Vortex File Format.
+
+See the [Writing an Encoding](/rust/writing-an-encoding) guide for more information.
+
+### Canonical Encodings
+
+Each logical data type in Vortex has an associated canonical encoding. All encodings must support decompression into
+their canonical form.
+
+Note that Vortex also supports decompressing into intermediate encodings, such as dictionary encoding, which may be
+better suited to a particular operation or compute engine.
+
+The canonical encodings are support **zero-copy** conversion to and from _Apache Arrow_ arrays.
+
+| Data Type          | Canonical Encoding   |
+|--------------------|----------------------|
+| `DType::Null`      | `NullEncoding`       |
+| `DType::Bool`      | `BoolEncoding`       |
+| `DType::Primitive` | `PrimitiveEncoding`  |
+| `DType::UTF8`      | `VarBinViewEncoding` |
+| `DType::Binary`    | `VarBinViewEncoding` |
+| `DType::Struct`    | `StructEncoding`     |
+| `DType::List`      | `ListEncoding`       |
+| `DType::Extension` | `ExtensionEncoding`  |
+
+(data-type)=
+
+## Data Type
+
+The array's [data type](/concepts/dtypes) is a logical definition of the data held within the array and does not
+confer any specific meaning on the array's children or buffers.
+
+Another way to think about logical data types is that they represent the type of the scalar value you might read
+out of the array.
+
+## Length
+
+The length of an array can almost always be inferred by encoding from its children and buffers. But given how
+important the length is for many operations, it is stored directly in the `ArrayData` object for faster access.
+
+## Metadata
+
+Each array can store a small amount of metadata in the form of a byte buffer. This is typically not much more than
+8 bytes and does not have any alignment guarantees. This is used by encodings to store any additional information they
+might need in order to access their children or buffers.
+
+For example, a dictionary encoding stores the length of its `values` child, and the primitive type of its `codes` child.
+
+## Children
+
+Arrays can have some number of child arrays. These differ from buffers in that they are logically typed, meaning the
+encoding cannot make assumptions about the layout of these children when implementing its vtable.
+
+Dictionary encoding is an example of where child arrays might be used, with one array representing the unique
+dictionary values and another array representing the codes indexing into those values.
+
+## Buffers
+
+Buffers store binary data with a declared alignment. They act as the terminal nodes in the recursive structure of
+an array.
+
+They are not considered by the recursive compressor, although general-purpose compression may still be used
+at write-time.
+
+For example, a bit-packed array stores packed integers in binary form. These would be stored in a buffer with an
+alignment sufficient for SIMD unpacking operations.
+
+## Statistics
+
+Arrays carry their own statistics with them, allowing many compute functions to short-circuit or optimise their
+implementations. Currently, the available statistics are:
+
+- `null_count`: The number of null values in the array.
+- `true_count`: The number of `true` values in a boolean array.
+- `run_count`: The number of consecutive runs in an array.
+- `is_constant`: Whether the array only holds a single unique value
+- `is_sorted`: Whether the array values are sorted.
+- `is_strict_sorted`: Whether the array values are sorted and unique.
+- `min`: The minimum value in the array.
+- `max`: The maximum value in the array.
+- `uncompressed_size`: The size of the array in memory before any compression.
+
diff --git a/docs/concepts/compute.md b/docs/concepts/compute.md
@@ -0,0 +1,37 @@
+# Vortex Compute
+
+Encoding vtables can define optional implementations of compute functions where it's possible to optimize the
+implementation beyond the default behavior of canonicalizing the array and then performing the operation.
+
+For example, `DictEncoding` defines an implementation of compare where given a constant right-hand side argument,
+the operation is performed only over the dictionary values and the result is wrapped up with the original dictionary
+codes.
+
+## Compute Functions
+
+* `binary_boolean(lhs: ArrayData, rhs: ArrayData, BinaryOperator) -> ArrayData`
+    * Compute `And`, `AndKleene`, `Or`, `OrKleene` operations over two boolean arrays.
+* `binary_numeric(lhs: ArrayData, rhs: ArrayData, BinaryOperator) -> ArrayData`
+    * Compute `Add`, `Sub`, `RSub`, `Mul`, `Div`, `RDiv` operations over two numeric arrays.
+* `compare(lhs: ArrayData, rhs: ArrayData, CompareOperator) -> ArrayData`
+    * Compute `Eq`, `NotEq`, `Gt`, `Gte`, `Lt`, `Lte` operations over two arrays.
+* `try_cast(ArrayData, DType) -> ArrayData`
+    * Try to cast the array to the specified data type.
+* `fill_forward(ArrayData) -> ArrayData`
+    * Fill forward null values with the most recent non-null value.
+* `fill_null(ArrayData, Scalar) -> ArrayData`
+    * Fill null values with the specified scalar value.
+* `invert_fn(ArrayData) -> ArrayData`
+    * Invert the boolean values of the array.
+* `like(ArrayData, pattern: ArrayData) -> ArrayData`
+    * Perform a `LIKE` operation over two arrays.
+* `scalar_at(ArrayData, index) -> Scalar`
+    * Get the scalar value at the specified index.
+* `search_sorted(ArrayData, Scalar) -> SearchResult`
+    * Search for the specified scalar value in the sorted array.
+* `slice(ArrayData, start, end) -> ArrayData`
+    * Slice the array from the start to the end index.
+* `take(ArrayData, indices: ArrayData) -> ArrayData`
+    * Take the specified nullable indices from the array.
+* `filter(ArrayData, mask: Mask) -> ArrayData`
+    * Filter the array based on the given mask.
diff --git a/docs/concepts/dtypes.md b/docs/concepts/dtypes.md
@@ -0,0 +1,95 @@
+# Vortex Data Types
+
+A core principle of Vortex is that its data types (or `dtypes`) are _logical_ rather than _physical_.
+This means that the dtype has no bearing on how the data is actually stored in memory, and is instead used to define
+the domain of values an array may hold.
+
+For example, a `u32` dtype represents an unsigned integer domain with values between `0` and `2^32 - 1`, even though
+the underlying array may store values dictionary-encoded, run-length encoded (RLE), or in any other format!
+
+This principle enables many of Vortex's advanced features. For example, performing compute directly on
+compressed data.
+
+:::{admonition} What is a schema?!
+:class: tip
+It is worth noting now that Vortex has no concept of a _schema_, instead preferring to use a struct dtype to represent
+columnar data. This means you can write a Vortex file containing a single integer array just as well as writing one
+with many columns.
+:::
+
+**Owned vs Viewed**
+
+As with other possibly large recursive data structures in Vortex, dtypes can be either _owned_ or _viewed_.
+Owned dtypes are heap-allocated, while viewed dtypes are lazily unwrapped from an underlying FlatBuffer representation.
+This allows Vortex to efficiently load and work with very wide data types without needing to deserialize the full type
+in memory.
+
+## Logical Types
+
+The following table lists the built-in dtypes in Vortex, each of which can be marked as either nullable or non-nullable.
+
+| Name        | Domain                                      |
+|-------------|---------------------------------------------|
+| `Null`      | `null`                                      |
+| `Bool`      | `true`, `false`                             |
+| `Primitive` | See [Primitive](#primitive)                 |
+| `UTF8`      | Variable length valid utf-8 encoded strings |
+| `Binary`    | Arbitrary variable length bytes             |
+| `Struct`    | See [Struct](#struct)                       |
+| `List`      | See [List](#list)                           |
+| `Extension` | See [Extension](#extension)                 |
+
+:::{note}
+There are additional logical types that Vortex does not yet support, for example fixed-length binary, utf-8, and list
+types, as well as a map type. These may be added in future versions.
+:::
+
+### Primitive
+
+Primitive dtypes are an enumeration of different fixed-width primitive values.
+
+| Name  | Domain                  |
+|-------|-------------------------|
+| `I8`  | 8-bit signed integer    |
+| `I16` | 16-bit signed integer   |
+| `I32` | 32-bit signed integer   |
+| `I64` | 64-bit signed integer   |
+| `U8`  | 8-bit unsigned integer  |
+| `U16` | 16-bit unsigned integer |
+| `U32` | 32-bit unsigned integer |
+| `U64` | 64-bit unsigned integer |
+| `F16` | IEEE 754-2008 half      |
+| `F32` | IEEE 754-1985 single    |
+| `F64` | IEEE 754-1985 double    |
+
+### Struct
+
+A `Struct` dtype is an ordered collection of named fields, each of which has its own logical dtype.
+
+### List
+
+A `List` dtype has a single _element type_, itself a logical dtype, and represents an array of variable-length
+sequences of elements of that type.
+
+### Extension
+
+An `Extension` dtype is a logical dtype with an `id`, a `storage` dtype, and a `metadata` field. The `id` and `metadata`
+fields together may implicitly restrict the domain of values of the `storage` dtype.
+
+For example, a `vortex.date` type is logically stored as a `U32` representing the number of days since the Unix epoch.
+
+## Vs. Arrow
+
+This section helps those familiar with Apache Arrow to quickly understand the differences vs. Vortex's dtypes.
+
+* In Arrow, nullability is tied to a {obj}`pyarrow.Field` rather than the data type.
+  Data types in Vortex instead always define explicit `nullability`.
+* In Arrow, there are multiple ways to describe the same logical data type, for example {func}`pyarrow.string` and
+  {func}`pyarrow.large_string` both represent UTF-8 values. In Vortex, there is a single `UTF8` dtype.
+* In Arrow, encoded data is described with additional data types, for example {func}`pyarrow.dictionary`. In Vortex,
+  encodings are a distinct concept from dtypes.
+* In Arrow, date and time types are defined as first-class data types. In Vortex, these are represented as `Extension`
+  dtypes since that can be composed of other more primitive logical dtypes.
+* In Arrow, tables and record batches have a _schema_ that defines the types of the columns. Vortex makes no
+  distinction between a data type and a schema. Columnar data can be stored with a struct dtype, and integer data can
+  be stored equally well without a top-level struct.