From 356bd52e206ceb29f5cde585037d1ee3211d31a6 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Dec 2023 11:28:40 +0000 Subject: [PATCH 1/2] docs: update user-defined-functions for 0.19.x --- docs/_build/API_REFERENCE_LINKS.yml | 4 +- .../expressions/user-defined-functions.py | 19 ++--- .../expressions/user-defined-functions.rs | 21 ++++-- .../expressions/user-defined-functions.md | 72 ++++++++----------- 4 files changed, 57 insertions(+), 59 deletions(-) diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index 51959eae6199..d2cb2b70e2b3 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -51,8 +51,8 @@ python: interpolate: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.interpolate.html fill_nan: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.fill_nan.html operators: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/operators.html - map: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.map.html - apply: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.apply.html + map_batches: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html + map_elements: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html over: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.over.html implode: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.implode.html DataFrame.explode: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.explode.html diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 
920812babd93..16f0da8dca76 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -11,22 +11,25 @@ "values": [10, 7, 1], } ) +print(df) +# --8<-- [end:dataframe] +# --8<-- [start:shift_map_batches] out = df.group_by("keys", maintain_order=True).agg( - pl.col("values").map_batches(lambda s: s.shift()).alias("shift_map"), + pl.col("values").map_batches(lambda s: s.shift()).alias("shift_map_batches"), pl.col("values").shift().alias("shift_expression"), ) -print(df) -# --8<-- [end:dataframe] +print(out) +# --8<-- [end:shift_map_batches] -# --8<-- [start:apply] +# --8<-- [start:map_elements] out = df.group_by("keys", maintain_order=True).agg( - pl.col("values").map_elements(lambda s: s.shift()).alias("shift_map"), + pl.col("values").map_elements(lambda s: s.shift()).alias("shift_map_elements"), pl.col("values").shift().alias("shift_expression"), ) print(out) -# --8<-- [end:apply] +# --8<-- [end:map_elements] # --8<-- [start:counter] counter = 0 @@ -39,7 +42,7 @@ def add_counter(val: int) -> int: out = df.select( - pl.col("values").map_elements(add_counter).alias("solution_apply"), + pl.col("values").map_elements(add_counter).alias("solution_map_elements"), (pl.col("values") + pl.int_range(1, pl.count() + 1)).alias("solution_expr"), ) print(out) @@ -49,7 +52,7 @@ def add_counter(val: int) -> int: out = df.select( pl.struct(["keys", "values"]) .map_elements(lambda x: len(x["keys"]) + x["values"]) - .alias("solution_apply"), + .alias("solution_map_elements"), (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"), ) print(out) diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index 44e8e69fc5f9..82663fe0b3ff 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -6,7 +6,10 @@ 
fn main() -> Result<(), Box<dyn std::error::Error>> { "keys" => &["a", "a", "b"], "values" => &[10, 7, 1], )?; + println!("{}", df); + // --8<-- [end:dataframe] + // --8<-- [start:shift_map_batches] let out = df .clone() .lazy() @@ -14,15 +17,17 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { .agg([ col("values") .map(|s| Ok(Some(s.shift(1))), GetOutput::default()) - .alias("shift_map"), + // note: the `'shift_map_batches'` alias is just there to show how you + // get the same output as in the Python API example. + .alias("shift_map_batches"), col("values").shift(lit(1)).alias("shift_expression"), ]) .collect()?; println!("{}", out); - // --8<-- [end:dataframe] + // --8<-- [end:shift_map_batches] - // --8<-- [start:apply] + // --8<-- [start:map_elements] let out = df .clone() .lazy() @@ -30,12 +35,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { .agg([ col("values") .apply(|s| Ok(Some(s.shift(1))), GetOutput::default()) - .alias("shift_map"), + // note: the `'shift_map_elements'` alias is just there to show how you + // get the same output as in the Python API example. + .alias("shift_map_elements"), col("values").shift(lit(1)).alias("shift_expression"), ]) .collect()?; println!("{}", out); - // --8<-- [end:apply] + // --8<-- [end:map_elements] // --8<-- [start:counter] @@ -75,7 +82,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { }, GetOutput::from_type(DataType::Int32), ) - .alias("solution_apply"), + // note: the `'solution_map_elements'` alias is just there to show how you + // get the same output as in the Python API example. + .alias("solution_map_elements"), (col("keys").str().count_matches(lit("."), true) + col("values")) .alias("solution_expr"), ]) diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 25764a414ef2..785cf080fb95 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -1,9 +1,5 @@ # User-defined functions (Python) -!!! 
warning "Not updated for Python Polars `0.19.0`" - - This section of the user guide still needs to be updated for the latest Polars release. - You should be convinced by now that Polars expressions are so powerful and flexible that there is much less need for custom Python functions than in other libraries. @@ -12,28 +8,28 @@ over data in Polars. For this we provide the following expressions: -- `map` -- `apply` +- `map_batches` +- `map_elements` -## To `map` or to `apply`. +## To `map_batches` or to `map_elements`. These functions have an important distinction in how they operate and consequently what data they will pass to the user. -A `map` passes the `Series` backed by the `expression` as is. +A `map_batches` passes the `Series` backed by the `expression` as is. -`map` follows the same rules in both the `select` and the `group_by` context, this will +`map_batches` follows the same rules in both the `select` and the `group_by` context, this will mean that the `Series` represents a column in a `DataFrame`. Note that in the `group_by` context, that column is not yet aggregated! -Use cases for `map` are for instance passing the `Series` in an expression to a third party library. Below we show how -we could use `map` to pass an expression column to a neural network model. +Use cases for `map_batches` are for instance passing the `Series` in an expression to a third party library. Below we show how +we could use `map_batches` to pass an expression column to a neural network model. 
=== ":fontawesome-brands-python: Python" -[:material-api: `map`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.map.html) +[:material-api: `map_batches`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) ```python df.with_columns([ - pl.col("features").map(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations") + pl.col("features").map_batches(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations") ]) ``` @@ -45,9 +41,9 @@ df.with_columns([ ]) ``` -Use cases for `map` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why. +Use cases for `map_batches` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why. -{{code_block('user-guide/expressions/user-defined-functions','dataframe',['map'])}} +{{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}} ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:setup" @@ -68,57 +64,47 @@ If we would then apply a `shift` operation to the right, we'd expect: "b" -> [null] ``` -Now, let's print and see what we've got. +Let's try that out and see what we get: -```python -print(out) -``` +{{code_block('user-guide/expressions/user-defined-functions','shift_map_batches',['map_batches'])}} -``` -shape: (2, 3) -┌──────┬────────────┬──────────────────┐ -│ keys ┆ shift_map ┆ shift_expression │ -│ --- ┆ --- ┆ --- │ -│ str ┆ list[i64] ┆ list[i64] │ -╞══════╪════════════╪══════════════════╡ -│ a ┆ [null, 10] ┆ [null, 10] │ -│ b ┆ [7] ┆ [null] │ -└──────┴────────────┴──────────────────┘ +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:shift_map_batches" ``` Ouch.. 
we clearly get the wrong results here. Group `"b"` even got a value from group `"a"` 😵. -This went horribly wrong, because the `map` applies the function before we aggregate! So that means the whole column `[10, 7, 1`\] got shifted to `[null, 10, 7]` and was then aggregated. +This went horribly wrong, because the `map_batches` applies the function before we aggregate! So that means the whole column `[10, 7, 1]` got shifted to `[null, 10, 7]` and was then aggregated. -So my advice is to never use `map` in the `group_by` context unless you know you need it and know what you are doing. +So my advice is to never use `map_batches` in the `group_by` context unless you know you need it and know what you are doing. -## To `apply` +## To `map_elements` -Luckily we can fix previous example with `apply`. `apply` works on the smallest logical elements for that operation. +Luckily we can fix the previous example with `map_elements`. `map_elements` works on the smallest logical elements for that operation. That is: - `select context` -> single elements - `group by context` -> single groups -So with `apply` we should be able to fix our example: +So with `map_elements` we should be able to fix our example: -{{code_block('user-guide/expressions/user-defined-functions','apply',['apply'])}} +{{code_block('user-guide/expressions/user-defined-functions','map_elements',['map_elements'])}} ```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:apply" +--8<-- "python/user-guide/expressions/user-defined-functions.py:map_elements" ``` And observe, a valid result! 🎉 -## `apply` in the `select` context +## `map_elements` in the `select` context -In the `select` context, the `apply` expression passes elements of the column to the Python function. +In the `select` context, the `map_elements` expression passes elements of the column to the Python function. 
_Note that you are now running Python, this will be slow._ Let's go through some examples to see what to expect. We will continue with the `DataFrame` we defined at the start of -this section and show an example with the `apply` function and a counter example where we use the expression API to +this section and show an example with the `map_elements` function and a counter example where we use the expression API to achieve the same goals. ### Adding a counter @@ -126,9 +112,9 @@ achieve the same goals. In this example we create a global `counter` and then add the integer `1` to the global state at every element processed. Every iteration the result of the increment will be added to the element value. -> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this apply is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees. +> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this `apply` is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees. -{{code_block('user-guide/expressions/user-defined-functions','counter',['apply'])}} +{{code_block('user-guide/expressions/user-defined-functions','counter',['map_elements'])}} ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:counter" @@ -136,7 +122,7 @@ Every iteration the result of the increment will be added to the element value. 
### Combining multiple column values -If we want to have access to values of different columns in a single `apply` function call, we can create `struct` data +If we want to have access to values of different columns in a single `map_elements` function call, we can create `struct` data type. This data type collects those columns as fields in the `struct`. So if we'd create a struct from the columns `"keys"` and `"values"`, we would get the following struct elements: @@ -150,7 +136,7 @@ type. This data type collects those columns as fields in the `struct`. So if we' In Python, those would be passed as `dict` to the calling Python function and can thus be indexed by `field: str`. In Rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast. -{{code_block('user-guide/expressions/user-defined-functions','combine',['apply','struct'])}} +{{code_block('user-guide/expressions/user-defined-functions','combine',['map_elements','struct'])}} ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:combine" From baef62d6cc8f06fa1ee2d4f7cb882dcd58e7f94f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Dec 2023 09:06:22 +0000 Subject: [PATCH 2/2] avoid invalid references in Rust docs --- docs/_build/API_REFERENCE_LINKS.yml | 4 ---- docs/user-guide/expressions/user-defined-functions.md | 11 +++++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index d2cb2b70e2b3..35565e96d492 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -51,8 +51,6 @@ python: interpolate: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.interpolate.html fill_nan: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.fill_nan.html 
operators: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/operators.html - map_batches: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html - map_elements: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html over: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.over.html implode: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.implode.html DataFrame.explode: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.explode.html @@ -273,8 +271,6 @@ rust: concat_list: name: concat_lst link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/fn.concat_lst.html - map: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.map - apply: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.apply over: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.over alias: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/enum.Expr.html#method.alias diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 785cf080fb95..3d508a4225da 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -66,7 +66,7 @@ If we would then apply a `shift` operation to the right, we'd expect: Let's try that out and see what we get: -{{code_block('user-guide/expressions/user-defined-functions','shift_map_batches',['map_batches'])}} +{{code_block('user-guide/expressions/user-defined-functions','shift_map_batches',[])}} ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:shift_map_batches" @@ -89,7 +89,10 @@ That is: So with `map_elements` we 
should be able to fix our example: -{{code_block('user-guide/expressions/user-defined-functions','map_elements',['map_elements'])}} +=== ":fontawesome-brands-python: Python" +[:material-api: `map_elements`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html) + +{{code_block('user-guide/expressions/user-defined-functions','map_elements',[])}} ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:map_elements" @@ -114,7 +117,7 @@ Every iteration the result of the increment will be added to the element value. > Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this `apply` is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees. -{{code_block('user-guide/expressions/user-defined-functions','counter',['map_elements'])}} +{{code_block('user-guide/expressions/user-defined-functions','counter',[])}} ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:counter" @@ -136,7 +139,7 @@ type. This data type collects those columns as fields in the `struct`. So if we' In Python, those would be passed as `dict` to the calling Python function and can thus be indexed by `field: str`. In Rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast. -{{code_block('user-guide/expressions/user-defined-functions','combine',['map_elements','struct'])}} +{{code_block('user-guide/expressions/user-defined-functions','combine',[])}} ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:combine"