Skip to content

Commit bb8efc5

Browse files
feat: Add a dedicated remove method for DataFrame and LazyFrame (#21259)
1 parent e0a3bb5 commit bb8efc5

File tree

13 files changed

+559
-172
lines changed

13 files changed

+559
-172
lines changed

crates/polars-lazy/src/frame/mod.rs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -928,9 +928,10 @@ impl LazyFrame {
928928
Ok(())
929929
}
930930

931-
/// Filter by some predicate expression.
931+
/// Filter frame rows, retaining those that match a predicate expression.
932932
///
933-
/// The expression must yield boolean values.
933+
/// The expression must yield boolean values (note that rows where the
934+
/// predicate resolves to `null` are *not* included in the resulting frame).
934935
///
935936
/// # Example
936937
///
@@ -950,6 +951,27 @@ impl LazyFrame {
950951
Self::from_logical_plan(lp, opt_state)
951952
}
952953

954+
/// Remove frame rows that match a predicate expression.
955+
///
956+
/// The expression must yield boolean values (note that rows where the
957+
/// predicate resolves to `null` are *not* removed from the resulting frame).
958+
///
959+
/// # Example
960+
///
961+
/// ```rust
962+
/// use polars_core::prelude::*;
963+
/// use polars_lazy::prelude::*;
964+
///
965+
/// fn example(df: DataFrame) -> LazyFrame {
966+
/// df.lazy()
967+
/// .remove(col("sepal_width").is_null())
968+
/// .select([col("sepal_width"), col("sepal_length")])
969+
/// }
970+
/// ```
971+
pub fn remove(self, predicate: Expr) -> Self {
972+
self.filter(predicate.neq_missing(lit(true)))
973+
}
974+
953975
/// Select (and optionally rename, with [`alias`](crate::dsl::Expr::alias)) columns from the query.
954976
///
955977
/// Columns can be selected with [`col`];

crates/polars-python/src/lazyframe/general.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -875,6 +875,11 @@ impl PyLazyFrame {
875875
ldf.filter(predicate.inner).into()
876876
}
877877

878+
fn remove(&mut self, predicate: PyExpr) -> Self {
879+
let ldf = self.ldf.clone();
880+
ldf.remove(predicate.inner).into()
881+
}
882+
878883
fn select(&mut self, exprs: Vec<PyExpr>) -> Self {
879884
let ldf = self.ldf.clone();
880885
let exprs = exprs.to_exprs();

crates/polars-sql/src/context.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -996,11 +996,11 @@ impl SQLContext {
996996
filter_expression = all_horizontal([filter_expression])?;
997997
}
998998
lf = self.process_subqueries(lf, vec![&mut filter_expression]);
999-
if invert_filter {
1000-
// negate the filter (being careful about null values)
1001-
filter_expression = filter_expression.neq_missing(lit(true))
1002-
}
1003-
lf = lf.filter(filter_expression);
999+
lf = if invert_filter {
1000+
lf.remove(filter_expression)
1001+
} else {
1002+
lf.filter(filter_expression)
1003+
};
10041004
}
10051005
Ok(lf)
10061006
}

py-polars/docs/source/reference/dataframe/modify_select.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Manipulation/selection
4444
DataFrame.pipe
4545
DataFrame.pivot
4646
DataFrame.rechunk
47+
DataFrame.remove
4748
DataFrame.rename
4849
DataFrame.replace_column
4950
DataFrame.reverse

py-polars/docs/source/reference/lazyframe/modify_select.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Manipulation/selection
3232
LazyFrame.limit
3333
LazyFrame.melt
3434
LazyFrame.merge_sorted
35+
LazyFrame.remove
3536
LazyFrame.rename
3637
LazyFrame.reverse
3738
LazyFrame.rolling

py-polars/polars/dataframe/frame.py

Lines changed: 169 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4892,30 +4892,33 @@ def filter(
48924892
**constraints: Any,
48934893
) -> DataFrame:
48944894
"""
4895-
Filter the rows in the DataFrame based on one or more predicate expressions.
4895+
Filter rows, retaining those that match the given predicate expression(s).
48964896
48974897
The original order of the remaining rows is preserved.
48984898
4899-
Rows where the filter does not evaluate to True are discarded, including nulls.
4899+
Only rows where the predicate resolves as True are retained; when the
4900+
predicate result is False (or null), the row is discarded.
49004901
49014902
Parameters
49024903
----------
49034904
predicates
4904-
Expression(s) that evaluates to a boolean Series.
4905+
Expression(s) that evaluate to a boolean Series.
49054906
constraints
49064907
Column filters; use `name = value` to filter columns by the supplied value.
49074908
Each constraint will behave the same as `pl.col(name).eq(value)`, and
4908-
will be implicitly joined with the other filter conditions using `&`.
4909+
be implicitly joined with the other filter conditions using `&`.
49094910
49104911
Notes
49114912
-----
4912-
If you are transitioning from pandas and performing filter operations based on
4913-
the comparison of two or more columns, please note that in Polars,
4914-
any comparison involving null values will always result in null.
4915-
As a result, these rows will be filtered out.
4916-
Ensure to handle null values appropriately to avoid unintended filtering
4917-
(See examples below).
4913+
If you are transitioning from Pandas, and performing filter operations based on
4914+
the comparison of two or more columns, please note that in Polars any comparison
4915+
involving `null` values will result in a `null` result, *not* boolean True or
4916+
False. As a result, these rows will not be retained. Ensure that null values
4917+
are handled appropriately to avoid unexpected behaviour (see examples below).
49184918
4919+
See Also
4920+
--------
4921+
remove
49194922
49204923
Examples
49214924
--------
@@ -4927,7 +4930,7 @@ def filter(
49274930
... }
49284931
... )
49294932
4930-
Filter on one condition:
4933+
Filter rows matching a condition:
49314934
49324935
>>> df.filter(pl.col("foo") > 1)
49334936
shape: (3, 3)
@@ -4943,7 +4946,9 @@ def filter(
49434946
49444947
Filter on multiple conditions, combined with and/or operators:
49454948
4946-
>>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a"))
4949+
>>> df.filter(
4950+
... (pl.col("foo") < 3) & (pl.col("ham") == "a"),
4951+
... )
49474952
shape: (1, 3)
49484953
┌─────┬─────┬─────┐
49494954
│ foo ┆ bar ┆ ham │
@@ -4953,7 +4958,9 @@ def filter(
49534958
│ 1 ┆ 6 ┆ a │
49544959
└─────┴─────┴─────┘
49554960
4956-
>>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c"))
4961+
>>> df.filter(
4962+
... (pl.col("foo") == 1) | (pl.col("ham") == "c"),
4963+
... )
49574964
shape: (2, 3)
49584965
┌─────┬─────┬─────┐
49594966
│ foo ┆ bar ┆ ham │
@@ -4992,9 +4999,11 @@ def filter(
49924999
│ 2 ┆ 7 ┆ b │
49935000
└─────┴─────┴─────┘
49945001
4995-
Filter by comparing two columns against each other
5002+
Filter by comparing two columns against each other:
49965003
4997-
>>> df.filter(pl.col("foo") == pl.col("bar"))
5004+
>>> df.filter(
5005+
... pl.col("foo") == pl.col("bar"),
5006+
... )
49985007
shape: (1, 3)
49995008
┌─────┬─────┬─────┐
50005009
│ foo ┆ bar ┆ ham │
@@ -5004,7 +5013,9 @@ def filter(
50045013
│ 0 ┆ 0 ┆ f │
50055014
└─────┴─────┴─────┘
50065015
5007-
>>> df.filter(pl.col("foo") != pl.col("bar"))
5016+
>>> df.filter(
5017+
... pl.col("foo") != pl.col("bar"),
5018+
... )
50085019
shape: (3, 3)
50095020
┌─────┬─────┬─────┐
50105021
│ foo ┆ bar ┆ ham │
@@ -5019,7 +5030,9 @@ def filter(
50195030
Notice how the row with `None` values is filtered out. In order to keep the
50205031
same behavior as pandas, use:
50215032
5022-
>>> df.filter(pl.col("foo").ne_missing(pl.col("bar")))
5033+
>>> df.filter(
5034+
... pl.col("foo").ne_missing(pl.col("bar")),
5035+
... )
50235036
shape: (5, 3)
50245037
┌──────┬──────┬─────┐
50255038
│ foo ┆ bar ┆ ham │
@@ -5032,10 +5045,148 @@ def filter(
50325045
│ 4 ┆ null ┆ d │
50335046
│ null ┆ 9 ┆ e │
50345047
└──────┴──────┴─────┘
5035-
50365048
"""
50375049
return self.lazy().filter(*predicates, **constraints).collect(_eager=True)
50385050

5051+
def remove(
5052+
self,
5053+
*predicates: (
5054+
IntoExprColumn
5055+
| Iterable[IntoExprColumn]
5056+
| bool
5057+
| list[bool]
5058+
| np.ndarray[Any, Any]
5059+
),
5060+
**constraints: Any,
5061+
) -> DataFrame:
5062+
"""
5063+
Remove rows, dropping those that match the given predicate expression(s).
5064+
5065+
The original order of the remaining rows is preserved.
5066+
5067+
Rows where the filter predicate does not evaluate to True are retained
5068+
(this includes rows where the predicate evaluates as `null`).
5069+
5070+
Parameters
5071+
----------
5072+
predicates
5073+
Expression(s) that evaluate to a boolean Series.
5074+
constraints
5075+
Column filters; use `name = value` to filter columns using the supplied
5076+
value. Each constraint behaves the same as `pl.col(name).eq(value)`,
5077+
and is implicitly joined with the other filter conditions using `&`.
5078+
5079+
Notes
5080+
-----
5081+
If you are transitioning from Pandas, and performing filter operations based on
5082+
the comparison of two or more columns, please note that in Polars any comparison
5083+
involving `null` values will result in a `null` result, *not* boolean True or
5084+
False. As a result, these rows will not be removed. Ensure that null values
5085+
are handled appropriately to avoid unexpected behaviour (see examples below).
5086+
5087+
See Also
5088+
--------
5089+
filter
5090+
5091+
Examples
5092+
--------
5093+
>>> df = pl.DataFrame(
5094+
... {
5095+
... "foo": [2, 3, None, 4, 0],
5096+
... "bar": [5, 6, None, None, 0],
5097+
... "ham": ["a", "b", None, "c", "d"],
5098+
... }
5099+
... )
5100+
5101+
Remove rows matching a condition:
5102+
5103+
>>> df.remove(pl.col("bar") >= 5)
5104+
shape: (3, 3)
5105+
┌──────┬──────┬──────┐
5106+
│ foo ┆ bar ┆ ham │
5107+
│ --- ┆ --- ┆ --- │
5108+
│ i64 ┆ i64 ┆ str │
5109+
╞══════╪══════╪══════╡
5110+
│ null ┆ null ┆ null │
5111+
│ 4 ┆ null ┆ c │
5112+
│ 0 ┆ 0 ┆ d │
5113+
└──────┴──────┴──────┘
5114+
5115+
Discard rows based on multiple conditions, combined with and/or operators:
5116+
5117+
>>> df.remove(
5118+
... (pl.col("foo") >= 0) & (pl.col("bar") >= 0),
5119+
... )
5120+
shape: (2, 3)
5121+
┌──────┬──────┬──────┐
5122+
│ foo ┆ bar ┆ ham │
5123+
│ --- ┆ --- ┆ --- │
5124+
│ i64 ┆ i64 ┆ str │
5125+
╞══════╪══════╪══════╡
5126+
│ null ┆ null ┆ null │
5127+
│ 4 ┆ null ┆ c │
5128+
└──────┴──────┴──────┘
5129+
5130+
>>> df.remove(
5131+
... (pl.col("foo") >= 0) | (pl.col("bar") >= 0),
5132+
... )
5133+
shape: (1, 3)
5134+
┌──────┬──────┬──────┐
5135+
│ foo ┆ bar ┆ ham │
5136+
│ --- ┆ --- ┆ --- │
5137+
│ i64 ┆ i64 ┆ str │
5138+
╞══════╪══════╪══════╡
5139+
│ null ┆ null ┆ null │
5140+
└──────┴──────┴──────┘
5141+
5142+
Provide multiple constraints using `*args` syntax:
5143+
5144+
>>> df.remove(
5145+
... pl.col("ham").is_not_null(),
5146+
... pl.col("bar") >= 0,
5147+
... )
5148+
shape: (2, 3)
5149+
┌──────┬──────┬──────┐
5150+
│ foo ┆ bar ┆ ham │
5151+
│ --- ┆ --- ┆ --- │
5152+
│ i64 ┆ i64 ┆ str │
5153+
╞══════╪══════╪══════╡
5154+
│ null ┆ null ┆ null │
5155+
│ 4 ┆ null ┆ c │
5156+
└──────┴──────┴──────┘
5157+
5158+
Provide constraint(s) using `**kwargs` syntax:
5159+
5160+
>>> df.remove(foo=0, bar=0)
5161+
shape: (4, 3)
5162+
┌──────┬──────┬──────┐
5163+
│ foo ┆ bar ┆ ham │
5164+
│ --- ┆ --- ┆ --- │
5165+
│ i64 ┆ i64 ┆ str │
5166+
╞══════╪══════╪══════╡
5167+
│ 2 ┆ 5 ┆ a │
5168+
│ 3 ┆ 6 ┆ b │
5169+
│ null ┆ null ┆ null │
5170+
│ 4 ┆ null ┆ c │
5171+
└──────┴──────┴──────┘
5172+
5173+
Remove rows by comparing two columns against each other:
5174+
5175+
>>> df.remove(
5176+
... pl.col("foo").ne_missing(pl.col("bar")),
5177+
... )
5178+
shape: (2, 3)
5179+
┌──────┬──────┬──────┐
5180+
│ foo ┆ bar ┆ ham │
5181+
│ --- ┆ --- ┆ --- │
5182+
│ i64 ┆ i64 ┆ str │
5183+
╞══════╪══════╪══════╡
5184+
│ null ┆ null ┆ null │
5185+
│ 0 ┆ 0 ┆ d │
5186+
└──────┴──────┴──────┘
5187+
"""
5188+
return self.lazy().remove(*predicates, **constraints).collect(_eager=True)
5189+
50395190
@overload
50405191
def glimpse(
50415192
self,
@@ -7293,7 +7444,6 @@ def join_asof(
72937444
│ Netherlands ┆ 2018-08-01 ┆ 17.32 ┆ 910 │
72947445
│ Netherlands ┆ 2019-01-01 ┆ 17.4 ┆ 910 │
72957446
└─────────────┴────────────┴────────────┴──────┘
7296-
72977447
"""
72987448
if not isinstance(other, DataFrame):
72997449
msg = f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}"
@@ -7628,7 +7778,6 @@ def join_where(
76287778
│ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
76297779
│ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
76307780
└─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
7631-
76327781
"""
76337782
if not isinstance(other, DataFrame):
76347783
msg = f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}"

0 commit comments

Comments
 (0)