docs: update user-defined-functions for 0.19.x

MarcoGorelli · MarcoGorelli · commit 356bd52e206c · 2023-12-16T19:15:12.000Z
diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml
@@ -51,8 +51,8 @@ python:
   interpolate: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.interpolate.html
   fill_nan: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.fill_nan.html
   operators: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/operators.html
-  map: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.map.html
-  apply: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.apply.html
+  map_batches: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html
+  map_elements: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html
   over: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.over.html
   implode: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.implode.html
   DataFrame.explode: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.explode.html
diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py
@@ -11,22 +11,25 @@
         "values": [10, 7, 1],
     }
 )
+print(df)
+# --8<-- [end:dataframe]
 
+# --8<-- [start:shift_map_batches]
 out = df.group_by("keys", maintain_order=True).agg(
-    pl.col("values").map_batches(lambda s: s.shift()).alias("shift_map"),
+    pl.col("values").map_batches(lambda s: s.shift()).alias("shift_map_batches"),
     pl.col("values").shift().alias("shift_expression"),
 )
-print(df)
-# --8<-- [end:dataframe]
+print(out)
+# --8<-- [end:shift_map_batches]
 
 
-# --8<-- [start:apply]
+# --8<-- [start:map_elements]
 out = df.group_by("keys", maintain_order=True).agg(
-    pl.col("values").map_elements(lambda s: s.shift()).alias("shift_map"),
+    pl.col("values").map_elements(lambda s: s.shift()).alias("shift_map_elements"),
     pl.col("values").shift().alias("shift_expression"),
 )
 print(out)
-# --8<-- [end:apply]
+# --8<-- [end:map_elements]
 
 # --8<-- [start:counter]
 counter = 0
@@ -39,7 +42,7 @@ def add_counter(val: int) -> int:
 
 
 out = df.select(
-    pl.col("values").map_elements(add_counter).alias("solution_apply"),
+    pl.col("values").map_elements(add_counter).alias("solution_map_elements"),
     (pl.col("values") + pl.int_range(1, pl.count() + 1)).alias("solution_expr"),
 )
 print(out)
@@ -49,7 +52,7 @@ def add_counter(val: int) -> int:
 out = df.select(
     pl.struct(["keys", "values"])
     .map_elements(lambda x: len(x["keys"]) + x["values"])
-    .alias("solution_apply"),
+    .alias("solution_map_elements"),
     (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
 )
 print(out)
diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs
@@ -6,36 +6,43 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         "keys" => &["a", "a", "b"],
         "values" => &[10, 7, 1],
     )?;
+    println!("{}", df);
+    // --8<-- [end:dataframe]
 
+    // --8<-- [start:shift_map_batches]
     let out = df
         .clone()
         .lazy()
         .group_by(["keys"])
         .agg([
             col("values")
                 .map(|s| Ok(Some(s.shift(1))), GetOutput::default())
-                .alias("shift_map"),
+                // note: the `'shift_map_batches'` alias is just there to show how you
+                // get the same output as in the Python API example.
+                .alias("shift_map_batches"),
             col("values").shift(lit(1)).alias("shift_expression"),
         ])
         .collect()?;
 
     println!("{}", out);
-    // --8<-- [end:dataframe]
+    // --8<-- [end:shift_map_batches]
 
-    // --8<-- [start:apply]
+    // --8<-- [start:map_elements]
     let out = df
         .clone()
         .lazy()
         .group_by([col("keys")])
         .agg([
             col("values")
                 .apply(|s| Ok(Some(s.shift(1))), GetOutput::default())
-                .alias("shift_map"),
+                // note: the `'shift_map_elements'` alias is just there to show how you
+                // get the same output as in the Python API example.
+                .alias("shift_map_elements"),
             col("values").shift(lit(1)).alias("shift_expression"),
         ])
         .collect()?;
     println!("{}", out);
-    // --8<-- [end:apply]
+    // --8<-- [end:map_elements]
 
     // --8<-- [start:counter]
 
@@ -75,7 +82,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                     },
                     GetOutput::from_type(DataType::Int32),
                 )
-                .alias("solution_apply"),
+                // note: the `'solution_map_elements'` alias is just there to show how you
+                // get the same output as in the Python API example.
+                .alias("solution_map_elements"),
             (col("keys").str().count_matches(lit("."), true) + col("values"))
                 .alias("solution_expr"),
         ])
diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md
@@ -1,9 +1,5 @@
 # User-defined functions (Python)
 
-!!! warning "Not updated for Python Polars `0.19.0`"
-
-    This section of the user guide still needs to be updated for the latest Polars release.
-
 You should be convinced by now that Polars expressions are so powerful and flexible that there is much less need for custom Python functions
 than in other libraries.
 
@@ -12,28 +8,28 @@ over data in Polars.
 
 For this we provide the following expressions:
 
-- `map`
-- `apply`
+- `map_batches`
+- `map_elements`
 
-## To `map` or to `apply`.
+## To `map_batches` or to `map_elements`.
 
 These functions have an important distinction in how they operate and consequently what data they will pass to the user.
 
-A `map` passes the `Series` backed by the `expression` as is.
+`map_batches` passes the `Series` backed by the `expression` as is.
 
-`map` follows the same rules in both the `select` and the `group_by` context, this will
+`map_batches` follows the same rules in both the `select` and the `group_by` context; this will
 mean that the `Series` represents a column in a `DataFrame`. Note that in the `group_by` context, that column is not yet
 aggregated!
 
-Use cases for `map` are for instance passing the `Series` in an expression to a third party library. Below we show how
-we could use `map` to pass an expression column to a neural network model.
+Use cases for `map_batches` are for instance passing the `Series` in an expression to a third party library. Below we show how
+we could use `map_batches` to pass an expression column to a neural network model.
 
 === ":fontawesome-brands-python: Python"
-[:material-api: `map`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.map.html)
+[:material-api: `map_batches`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html)
 
 ```python
 df.with_columns([
-    pl.col("features").map(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations")
+    pl.col("features").map_batches(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations")
 ])
 ```
 
@@ -45,9 +41,9 @@ df.with_columns([
 ])
 ```
 
-Use cases for `map` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why.
+Use cases for `map_batches` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why.
 
-{{code_block('user-guide/expressions/user-defined-functions','dataframe',['map'])}}
+{{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}}
 
 ```python exec="on" result="text" session="user-guide/udf"
 --8<-- "python/user-guide/expressions/user-defined-functions.py:setup"
@@ -68,75 +64,65 @@ If we would then apply a `shift` operation to the right, we'd expect:
 "b" -> [null]
 ```
 
-Now, let's print and see what we've got.
+Let's try that out and see what we get:
 
-```python
-print(out)
-```
+{{code_block('user-guide/expressions/user-defined-functions','shift_map_batches',['map_batches'])}}
 
-```
-shape: (2, 3)
-┌──────┬────────────┬──────────────────┐
-│ keys ┆ shift_map  ┆ shift_expression │
-│ ---  ┆ ---        ┆ ---              │
-│ str  ┆ list[i64]  ┆ list[i64]        │
-╞══════╪════════════╪══════════════════╡
-│ a    ┆ [null, 10] ┆ [null, 10]       │
-│ b    ┆ [7]        ┆ [null]           │
-└──────┴────────────┴──────────────────┘
+```python exec="on" result="text" session="user-guide/udf"
+--8<-- "python/user-guide/expressions/user-defined-functions.py:shift_map_batches"
 ```
 
 Ouch.. we clearly get the wrong results here. Group `"b"` even got a value from group `"a"` 😵.
 
-This went horribly wrong, because the `map` applies the function before we aggregate! So that means the whole column `[10, 7, 1`\] got shifted to `[null, 10, 7]` and was then aggregated.
+This went horribly wrong, because `map_batches` applies the function before we aggregate! So that means the whole column `[10, 7, 1]` got shifted to `[null, 10, 7]` and was then aggregated.
 
-So my advice is to never use `map` in the `group_by` context unless you know you need it and know what you are doing.
+So my advice is to never use `map_batches` in the `group_by` context unless you know you need it and know what you are doing.
 
-## To `apply`
+## To `map_elements`
 
-Luckily we can fix previous example with `apply`. `apply` works on the smallest logical elements for that operation.
+Luckily we can fix the previous example with `map_elements`. `map_elements` works on the smallest logical elements for that operation.
 
 That is:
 
 - `select context` -> single elements
 - `group by context` -> single groups
 
-So with `apply` we should be able to fix our example:
+So with `map_elements` we should be able to fix our example:
 
-{{code_block('user-guide/expressions/user-defined-functions','apply',['apply'])}}
+{{code_block('user-guide/expressions/user-defined-functions','map_elements',['map_elements'])}}
 
 ```python exec="on" result="text" session="user-guide/udf"
---8<-- "python/user-guide/expressions/user-defined-functions.py:apply"
+--8<-- "python/user-guide/expressions/user-defined-functions.py:map_elements"
 ```
 
 And observe, a valid result! 🎉
 
-## `apply` in the `select` context
+## `map_elements` in the `select` context
 
-In the `select` context, the `apply` expression passes elements of the column to the Python function.
+In the `select` context, the `map_elements` expression passes elements of the column to the Python function.
 
 _Note that you are now running Python, this will be slow._
 
 Let's go through some examples to see what to expect. We will continue with the `DataFrame` we defined at the start of
-this section and show an example with the `apply` function and a counter example where we use the expression API to
+this section and show an example with the `map_elements` function and a counter example where we use the expression API to
 achieve the same goals.
 
 ### Adding a counter
 
 In this example we create a global `counter` and then add the integer `1` to the global state at every element processed.
 Every iteration the result of the increment will be added to the element value.
 
-> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this apply is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees.
+> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this `apply` is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees.
 
-{{code_block('user-guide/expressions/user-defined-functions','counter',['apply'])}}
+{{code_block('user-guide/expressions/user-defined-functions','counter',['map_elements'])}}
 
 ```python exec="on" result="text" session="user-guide/udf"
 --8<-- "python/user-guide/expressions/user-defined-functions.py:counter"
 ```
 
 ### Combining multiple column values
 
-If we want to have access to values of different columns in a single `apply` function call, we can create `struct` data
+If we want to have access to values of different columns in a single `map_elements` function call, we can create a `struct` data
 type. This data type collects those columns as fields in the `struct`. So if we'd create a struct from the columns
 `"keys"` and `"values"`, we would get the following struct elements:
 
@@ -150,7 +136,7 @@ type. This data type collects those columns as fields in the `struct`. So if we'
 
 In Python, those would be passed as `dict` to the calling Python function and can thus be indexed by `field: str`. In Rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast.
 
-{{code_block('user-guide/expressions/user-defined-functions','combine',['apply','struct'])}}
+{{code_block('user-guide/expressions/user-defined-functions','combine',['map_elements','struct'])}}
 
 ```python exec="on" result="text" session="user-guide/udf"
 --8<-- "python/user-guide/expressions/user-defined-functions.py:combine"