From 884f61964980bac959f52b70a8c71342e9989bfd Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 20 Dec 2025 11:07:30 -0500 Subject: [PATCH 1/7] ENH: Generalize pd.col and expressions --- pandas/core/col.py | 176 +++++++++++++++++++----------------- pandas/core/frame.py | 6 +- pandas/core/generic.py | 7 ++ pandas/core/indexing.py | 12 +++ pandas/core/reshape/tile.py | 5 + pandas/tests/test_col.py | 131 ++++++++++++++++++++++----- 6 files changed, 228 insertions(+), 109 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index a91a53dd19fde..da837626e93a0 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -11,10 +11,11 @@ from pandas.util._decorators import set_module -from pandas.core.series import Series - if TYPE_CHECKING: - from pandas import DataFrame + from pandas import ( + DataFrame, + Series, + ) # Used only for generating the str repr of expressions. @@ -48,13 +49,15 @@ def _parse_args(df: DataFrame, *args: Any) -> tuple[Series]: # Parse `args`, evaluating any expressions we encounter. - return tuple([x(df) if isinstance(x, Expression) else x for x in args]) + return tuple( + [x._eval_expression(df) if isinstance(x, Expression) else x for x in args] + ) def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[str, Any]: # Parse `kwargs`, evaluating any expressions we encounter. return { - key: val(df) if isinstance(val, Expression) else val + key: val._eval_expression(df) if isinstance(val, Expression) else val for key, val in kwargs.items() } @@ -85,105 +88,134 @@ class Expression: This is not meant to be instantiated directly. Instead, use :meth:`pandas.col`. """ - def __init__(self, func: Callable[[DataFrame], Any], repr_str: str) -> None: + def __init__( + self, + func: Callable[[DataFrame], Any], + repr_str: str, + needs_parenthese: bool = False, + ) -> None: self._func = func self._repr_str = repr_str + self._needs_parentheses = needs_parenthese - def __call__(self, df: DataFrame) -> Any: + def _eval_expression(self, df: DataFrame) -> Any: return self._func(df) - def _with_binary_op(self, op: str, other: Any) -> Expression: + def _with_op(self, op: str, other: Any) -> Expression: op_symbol = _OP_SYMBOLS.get(op, op) - if isinstance(other, Expression): - if op.startswith("__r"): - repr_str = f"({other._repr_str} {op_symbol} {self._repr_str})" - else: - repr_str = f"({self._repr_str} {op_symbol} {other._repr_str})" - return Expression(lambda df: getattr(self(df), op)(other(df)), repr_str) + if op == "__getitem__": + needs_parentheses = False + repr_str = f"{self!r}[{other!r}]" else: + needs_parentheses = True + self_repr = f"{self!r}" + if self._needs_parentheses: + self_repr = f"({self_repr})" + other_repr = f"{other!r}" + if isinstance(other, Expression) and other._needs_parentheses: + other_repr = f"({other_repr})" + if op.startswith("__r"): - repr_str = f"({other!r} {op_symbol} {self._repr_str})" + repr_str = f"{other_repr} {op_symbol} {self_repr}" else: - repr_str = f"({self._repr_str} {op_symbol} {other!r})" - return Expression(lambda df: getattr(self(df), op)(other), repr_str) + repr_str = f"{self_repr} {op_symbol} {other_repr}" + + if isinstance(other, Expression): + return Expression( + lambda df: getattr(self._eval_expression(df), op)( + other._eval_expression(df) + ), + repr_str, + needs_parenthese=needs_parentheses, + ) + else: + return Expression( + lambda df: getattr(self._eval_expression(df), op)(other), + repr_str, + needs_parenthese=needs_parentheses, + ) # Binary ops def __add__(self, other: Any) -> Expression: - return self._with_binary_op("__add__", other) + return self._with_op("__add__", other) def __radd__(self, other: Any) -> Expression: - return self._with_binary_op("__radd__", other) + return self._with_op("__radd__", other) def __sub__(self, other: Any) -> Expression: - return self._with_binary_op("__sub__", other) + return self._with_op("__sub__", other) def __rsub__(self, other: Any) -> Expression: - return self._with_binary_op("__rsub__", other) + return self._with_op("__rsub__", other) def __mul__(self, other: Any) -> Expression: - return self._with_binary_op("__mul__", other) + return self._with_op("__mul__", other) def __rmul__(self, other: Any) -> Expression: - return self._with_binary_op("__rmul__", other) + return self._with_op("__rmul__", other) def __truediv__(self, other: Any) -> Expression: - return self._with_binary_op("__truediv__", other) + return self._with_op("__truediv__", other) def __rtruediv__(self, other: Any) -> Expression: - return self._with_binary_op("__rtruediv__", other) + return self._with_op("__rtruediv__", other) def __floordiv__(self, other: Any) -> Expression: - return self._with_binary_op("__floordiv__", other) + return self._with_op("__floordiv__", other) def __rfloordiv__(self, other: Any) -> Expression: - return self._with_binary_op("__rfloordiv__", other) + return self._with_op("__rfloordiv__", other) def __ge__(self, other: Any) -> Expression: - return self._with_binary_op("__ge__", other) + return self._with_op("__ge__", other) def __gt__(self, other: Any) -> Expression: - return self._with_binary_op("__gt__", other) + return self._with_op("__gt__", other) def __le__(self, other: Any) -> Expression: - return self._with_binary_op("__le__", other) + return self._with_op("__le__", other) def __lt__(self, other: Any) -> Expression: - return self._with_binary_op("__lt__", other) + return self._with_op("__lt__", other) def __eq__(self, other: object) -> Expression: # type: ignore[override] - return self._with_binary_op("__eq__", other) + return self._with_op("__eq__", other) def __ne__(self, other: object) -> Expression: # type: ignore[override] - return self._with_binary_op("__ne__", other) + return self._with_op("__ne__", other) def __mod__(self, other: Any) -> Expression: - return self._with_binary_op("__mod__", other) + return self._with_op("__mod__", other) def __rmod__(self, other: Any) -> Expression: - return self._with_binary_op("__rmod__", other) + return self._with_op("__rmod__", other) # Logical ops def __and__(self, other: Any) -> Expression: - return self._with_binary_op("__and__", other) + return self._with_op("__and__", other) def __rand__(self, other: Any) -> Expression: - return self._with_binary_op("__rand__", other) + return self._with_op("__rand__", other) def __or__(self, other: Any) -> Expression: - return self._with_binary_op("__or__", other) + return self._with_op("__or__", other) def __ror__(self, other: Any) -> Expression: - return self._with_binary_op("__ror__", other) + return self._with_op("__ror__", other) def __xor__(self, other: Any) -> Expression: - return self._with_binary_op("__xor__", other) + return self._with_op("__xor__", other) def __rxor__(self, other: Any) -> Expression: - return self._with_binary_op("__rxor__", other) + return self._with_op("__rxor__", other) def __invert__(self) -> Expression: - return Expression(lambda df: ~self(df), f"(~{self._repr_str})") + return Expression( + lambda df: ~self._eval_expression(df), + f"~{self._repr_str}", + needs_parenthese=True, + ) def __array_ufunc__( self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any @@ -198,57 +230,35 @@ def func(df: DataFrame) -> Any: return Expression(func, repr_str) - # Everything else - def __getattr__(self, attr: str, /) -> Any: - if attr in Series._accessors: - return NamespaceExpression(self, attr) + def __getitem__(self, item: Any) -> Expression: + return self._with_op("__getitem__", item) - def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: - parsed_args = _parse_args(df, *args) + def _call_from_func(self, func, **kwargs) -> Expression: + def wrapped(df: DataFrame) -> Any: parsed_kwargs = _parse_kwargs(df, **kwargs) - return getattr(self(df), attr)(*parsed_args, **parsed_kwargs) + return func(**parsed_kwargs) - def wrapper(*args: Any, **kwargs: Any) -> Expression: - args_str = _pretty_print_args_kwargs(*args, **kwargs) - repr_str = f"{self._repr_str}.{attr}({args_str})" + args_str = _pretty_print_args_kwargs(**kwargs) + repr_str = func.__name__ + "(" + args_str + ")" - return Expression(lambda df: func(df, *args, **kwargs), repr_str) - - return wrapper - - def __repr__(self) -> str: - return self._repr_str or "Expr(...)" - - -class NamespaceExpression: - def __init__(self, func: Expression, namespace: str) -> None: - self._func = func - self._namespace = namespace - - def __call__(self, df: DataFrame) -> Any: - return self._func(df) - - def __getattr__(self, attr: str) -> Any: - if isinstance(getattr(getattr(Series, self._namespace), attr), property): - repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}" - return Expression( - lambda df: getattr(getattr(self(df), self._namespace), attr), - repr_str, - ) + return Expression(wrapped, repr_str) + def __call__(self, *args, **kwargs): def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: parsed_args = _parse_args(df, *args) parsed_kwargs = _parse_kwargs(df, **kwargs) - return getattr(getattr(self(df), self._namespace), attr)( - *parsed_args, **parsed_kwargs - ) + return self._eval_expression(df)(*parsed_args, **parsed_kwargs) + + args_str = _pretty_print_args_kwargs(*args, **kwargs) + repr_str = f"{self._repr_str}({args_str})" + return Expression(lambda df: func(df, *args, **kwargs), repr_str) - def wrapper(*args: Any, **kwargs: Any) -> Expression: - args_str = _pretty_print_args_kwargs(*args, **kwargs) - repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}({args_str})" - return Expression(lambda df: func(df, *args, **kwargs), repr_str) + def __getattr__(self, name: str, /) -> Any: + repr_str = f"{self._repr_str}.{name}" + return Expression(lambda df: getattr(self._eval_expression(df), name), repr_str) - return wrapper + def __repr__(self) -> str: + return self._repr_str or "Expr(...)" @set_module("pandas") diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a7535835f4b84..969402451cbd7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -146,6 +146,7 @@ ) from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.arrays.string_ import StringDtype +from pandas.core.col import Expression from pandas.core.construction import ( ensure_wrapped_if_datetimelike, sanitize_array, @@ -5520,7 +5521,10 @@ def assign(self, **kwargs) -> DataFrame: data = self.copy(deep=False) for k, v in kwargs.items(): - data[k] = com.apply_if_callable(v, data) + if isinstance(v, Expression): + data[k] = v._eval_expression(data) + else: + data[k] = com.apply_if_callable(v, data) return data def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c0be5568170d0..875ccb7e3e32a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9934,6 +9934,8 @@ def where( 3 True True 4 True True """ + from pandas.core.col import Expression + inplace = validate_bool_kwarg(inplace, "inplace") if inplace: if not CHAINED_WARNING_DISABLED: @@ -9946,6 +9948,11 @@ def where( stacklevel=2, ) + if isinstance(cond, Expression): + cond = cond._eval_expression(self) + if isinstance(other, Expression): + other = other._eval_expression(self) + other = common.apply_if_callable(other, self) return self._where(cond, other, inplace=inplace, axis=axis, level=level) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a476415d6c7c0..9e944b814e7cf 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -916,6 +916,8 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: @final def __setitem__(self, key, value) -> None: + from pandas.core.col import Expression + if not CHAINED_WARNING_DISABLED: if sys.getrefcount(self.obj) <= REF_COUNT_IDX: warnings.warn( @@ -925,8 +927,14 @@ def __setitem__(self, key, value) -> None: check_dict_or_set_indexers(key) if isinstance(key, tuple): key = (list(x) if is_iterator(x) else x for x in key) + key = ( + x._eval_expression(self.obj) if isinstance(x, Expression) else x + for x in key + ) key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: + if isinstance(key, Expression): + key = key._eval_expression(self.obj) maybe_callable = com.apply_if_callable(key, self.obj) key = self._raise_callable_usage(key, maybe_callable) indexer = self._get_setitem_indexer(key) @@ -1199,9 +1207,13 @@ def __getitem__(self, key): return self.obj._get_value(*key, takeable=self._takeable) return self._getitem_tuple(key) else: + from pandas.core.col import Expression + # we by definition only have the 0th axis axis = self.axis or 0 + if isinstance(key, Expression): + key = key._eval_expression(self.obj) maybe_callable = com.apply_if_callable(key, self.obj) maybe_callable = self._raise_callable_usage(key, maybe_callable) return self._getitem_axis(maybe_callable, axis=axis) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 659e82d979a91..e2d4e7496f6fc 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -43,6 +43,7 @@ ) import pandas.core.algorithms as algos from pandas.core.arrays.datetimelike import dtype_to_unit +from pandas.core.col import Expression if TYPE_CHECKING: from collections.abc import Callable @@ -359,6 +360,10 @@ def qcut( >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3]) """ + if isinstance(x, Expression): + return x._call_from_func( + qcut, x=x, q=q, labels=labels, retbins=retbins, precision=precision + ) original = x x_idx = _preprocess_for_cut(x) x_idx, _ = _coerce_to_type(x_idx) diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index cf7901a912279..c553a09e18de3 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._libs.properties import cache_readonly + import pandas as pd import pandas._testing as tm from pandas.api.typing import Expression @@ -13,25 +15,25 @@ ("expr", "expected_values", "expected_str"), [ (pd.col("a"), [1, 2], "col('a')"), - (pd.col("a") * 2, [2, 4], "(col('a') * 2)"), + (pd.col("a") * 2, [2, 4], "col('a') * 2"), (pd.col("a").sum(), [3, 3], "col('a').sum()"), - (pd.col("a") + 1, [2, 3], "(col('a') + 1)"), - (1 + pd.col("a"), [2, 3], "(1 + col('a'))"), - (pd.col("a") - 1, [0, 1], "(col('a') - 1)"), - (1 - pd.col("a"), [0, -1], "(1 - col('a'))"), - (pd.col("a") * 1, [1, 2], "(col('a') * 1)"), - (1 * pd.col("a"), [1, 2], "(1 * col('a'))"), - (pd.col("a") / 1, [1.0, 2.0], "(col('a') / 1)"), - (1 / pd.col("a"), [1.0, 0.5], "(1 / col('a'))"), - (pd.col("a") // 1, [1, 2], "(col('a') // 1)"), - (1 // pd.col("a"), [1, 0], "(1 // col('a'))"), - (pd.col("a") % 1, [0, 0], "(col('a') % 1)"), - (1 % pd.col("a"), [0, 1], "(1 % col('a'))"), - (pd.col("a") > 1, [False, True], "(col('a') > 1)"), - (pd.col("a") >= 1, [True, True], "(col('a') >= 1)"), - (pd.col("a") < 1, [False, False], "(col('a') < 1)"), - (pd.col("a") <= 1, [True, False], "(col('a') <= 1)"), - (pd.col("a") == 1, [True, False], "(col('a') == 1)"), + (pd.col("a") + 1, [2, 3], "col('a') + 1"), + (1 + pd.col("a"), [2, 3], "1 + col('a')"), + (pd.col("a") - 1, [0, 1], "col('a') - 1"), + (1 - pd.col("a"), [0, -1], "1 - col('a')"), + (pd.col("a") * 1, [1, 2], "col('a') * 1"), + (1 * pd.col("a"), [1, 2], "1 * col('a')"), + (pd.col("a") / 1, [1.0, 2.0], "col('a') / 1"), + (1 / pd.col("a"), [1.0, 0.5], "1 / col('a')"), + (pd.col("a") // 1, [1, 2], "col('a') // 1"), + (1 // pd.col("a"), [1, 0], "1 // col('a')"), + (pd.col("a") % 1, [0, 0], "col('a') % 1"), + (1 % pd.col("a"), [0, 1], "1 % col('a')"), + (pd.col("a") > 1, [False, True], "col('a') > 1"), + (pd.col("a") >= 1, [True, True], "col('a') >= 1"), + (pd.col("a") < 1, [False, False], "col('a') < 1"), + (pd.col("a") <= 1, [True, False], "col('a') <= 1"), + (pd.col("a") == 1, [True, False], "col('a') == 1"), (np.power(pd.col("a"), 2), [1, 4], "power(col('a'), 2)"), (np.divide(pd.col("a"), pd.col("a")), [1.0, 1.0], "divide(col('a'), col('a'))"), ], @@ -105,37 +107,37 @@ def mean(self): ( pd.col("a") & pd.col("b"), [False, False, True, False], - "(col('a') & col('b'))", + "col('a') & col('b')", ), ( pd.col("a") & True, [True, False, True, False], - "(col('a') & True)", + "col('a') & True", ), ( pd.col("a") | pd.col("b"), [True, True, True, True], - "(col('a') | col('b'))", + "col('a') | col('b')", ), ( pd.col("a") | False, [True, False, True, False], - "(col('a') | False)", + "col('a') | False", ), ( pd.col("a") ^ pd.col("b"), [True, True, False, True], - "(col('a') ^ col('b'))", + "col('a') ^ col('b')", ), ( pd.col("a") ^ True, [False, True, False, True], - "(col('a') ^ True)", + "col('a') ^ True", ), ( ~pd.col("a"), [False, True, False, True], - "(~col('a'))", + "~col('a')", ), ], ) @@ -159,3 +161,82 @@ def test_col_logical_ops( result = df.loc[expr] expected = df[expected_values] tm.assert_frame_equal(result, expected) + + +def test_compound_op() -> None: + df = pd.DataFrame({"a": [1, 2, 3]}) + expr = (pd.col("a") + 1) * (pd.col("a") + 2) + expected_str = "(col('a') + 1) * (col('a') + 2)" + assert str(expr) == expected_str + + result = df.assign(b=expr) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [6, 12, 20]}) + tm.assert_frame_equal(result, expected) + + +def test_getitem() -> None: + df = pd.DataFrame({"a": [1, 2, 3]}) + expr = pd.col("a")[1] + expected_str = "col('a')[1]" + + assert str(expr) == expected_str + + result = df.assign(b=expr) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) + tm.assert_frame_equal(result, expected) + + +def test_property() -> None: + df = pd.DataFrame({"a": [1, 2, 3]}) + expr = pd.col("a").index + expected_str = "col('a').index" + + assert str(expr) == expected_str + + result = df.assign(b=expr) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]}) + tm.assert_frame_equal(result, expected) + + +def test_cached_property() -> None: + # Ensure test is valid + assert isinstance(pd.Index.dtype, cache_readonly) + + df = pd.DataFrame({"a": [1, 2, 3]}) + expr = pd.col("a").index.dtype + expected_str = "col('a').index.dtype" + assert str(expr) == expected_str + + result = df.assign(b=expr) + expected = pd.DataFrame({"a": [1, 2, 3], "b": np.int64}) + tm.assert_frame_equal(result, expected) + + +def test_qcut() -> None: + df = pd.DataFrame({"a": [1, 2, 3]}) + expr = pd.qcut(pd.col("a"), 3) + expected_str = "qcut(x=\"col('a')\", q=3, labels=None, retbins=False, precision=3)" + assert str(expr) == expected_str, str(expr) + + result = df.assign(b=expr) + expected = pd.DataFrame({"a": [1, 2, 3], "b": pd.qcut(df["a"], 3)}) + tm.assert_frame_equal(result, expected) + + +def test_where() -> None: + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expr = pd.col("a").where(pd.col("b") == 5, 100) + expected_str = "col('a').where(col('b') == 5, 100)" + assert str(expr) == expected_str, str(expr) + + result = df.assign(c=expr) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [100, 2, 100]}) + tm.assert_frame_equal(result, expected) + + expr = pd.col("a").where(pd.col("b") == 5, pd.col("a") + 1) + expected_str = "col('a').where(col('b') == 5, col('a') + 1)" + assert str(expr) == expected_str, str(expr) + + result = df.assign(c=expr) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [2, 2, 4]}) + tm.assert_frame_equal(result, expected) From 4a59f56281946474b685c9b7e4e3383c2f5b6173 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 20 Dec 2025 11:14:46 -0500 Subject: [PATCH 2/7] docs --- doc/source/whatsnew/v3.0.0.rst | 6 ++---- pandas/tests/test_col.py | 6 ++++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7d65ca781d81e..98e02a0409416 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -164,10 +164,8 @@ The expression object returned by :func:`col` supports all standard operators (like ``+``, ``-``, ``*``, ``/``, etc.) and all Series methods and namespaces (like ``pd.col("name").sum()``, ``pd.col("name").str.upper()``, etc.). -Currently, the ``pd.col()`` syntax can be used in any place which accepts a -callable that takes the calling DataFrame as first argument and returns a -Series, like ``lambda df: df[col_name]``. -This includes :meth:`DataFrame.assign`, :meth:`DataFrame.loc`, and getitem/setitem. +Currently, the ``pd.col()`` syntax can be used in :meth:`DataFrame.assign`, +:meth:`DataFrame.loc`, and getitem/setitem. It is expected that the support for ``pd.col()`` will be expanded to more methods in future releases. diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index c553a09e18de3..18970f90eebb2 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -164,6 +164,7 @@ def test_col_logical_ops( def test_compound_op() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 df = pd.DataFrame({"a": [1, 2, 3]}) expr = (pd.col("a") + 1) * (pd.col("a") + 2) expected_str = "(col('a') + 1) * (col('a') + 2)" @@ -175,6 +176,7 @@ def test_compound_op() -> None: def test_getitem() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 df = pd.DataFrame({"a": [1, 2, 3]}) expr = pd.col("a")[1] expected_str = "col('a')[1]" @@ -187,6 +189,7 @@ def test_getitem() -> None: def test_property() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 df = pd.DataFrame({"a": [1, 2, 3]}) expr = pd.col("a").index expected_str = "col('a').index" @@ -199,6 +202,7 @@ def test_property() -> None: def test_cached_property() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 # Ensure test is valid assert isinstance(pd.Index.dtype, cache_readonly) @@ -213,6 +217,7 @@ def test_cached_property() -> None: def test_qcut() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 df = pd.DataFrame({"a": [1, 2, 3]}) expr = pd.qcut(pd.col("a"), 3) expected_str = "qcut(x=\"col('a')\", q=3, labels=None, retbins=False, precision=3)" @@ -224,6 +229,7 @@ def test_qcut() -> None: def test_where() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expr = pd.col("a").where(pd.col("b") == 5, 100) expected_str = "col('a').where(col('b') == 5, 100)" From 1eabb0400ef5f49369ff9af1fb93c1dc77ff84d3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 20 Dec 2025 11:44:34 -0500 Subject: [PATCH 3/7] Refinements --- doc/source/whatsnew/v3.0.0.rst | 6 ++- pandas/core/col.py | 7 +++- pandas/core/common.py | 6 ++- pandas/core/frame.py | 6 +-- pandas/core/generic.py | 6 --- pandas/core/indexing.py | 12 ------ pandas/core/reshape/tile.py | 2 +- pandas/tests/test_col.py | 67 +++++++++++++++++++++++++++------- 8 files changed, 69 insertions(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 98e02a0409416..7d65ca781d81e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -164,8 +164,10 @@ The expression object returned by :func:`col` supports all standard operators (like ``+``, ``-``, ``*``, ``/``, etc.) and all Series methods and namespaces (like ``pd.col("name").sum()``, ``pd.col("name").str.upper()``, etc.). -Currently, the ``pd.col()`` syntax can be used in :meth:`DataFrame.assign`, -:meth:`DataFrame.loc`, and getitem/setitem. +Currently, the ``pd.col()`` syntax can be used in any place which accepts a +callable that takes the calling DataFrame as first argument and returns a +Series, like ``lambda df: df[col_name]``. +This includes :meth:`DataFrame.assign`, :meth:`DataFrame.loc`, and getitem/setitem. It is expected that the support for ``pd.col()`` will be expanded to more methods in future releases. diff --git a/pandas/core/col.py b/pandas/core/col.py index da837626e93a0..5fa88499eae98 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -233,7 +233,7 @@ def func(df: DataFrame) -> Any: def __getitem__(self, item: Any) -> Expression: return self._with_op("__getitem__", item) - def _call_from_func(self, func, **kwargs) -> Expression: + def _call_with_func(self, func, **kwargs) -> Expression: def wrapped(df: DataFrame) -> Any: parsed_kwargs = _parse_kwargs(df, **kwargs) return func(**parsed_kwargs) @@ -254,7 +254,10 @@ def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: return Expression(lambda df: func(df, *args, **kwargs), repr_str) def __getattr__(self, name: str, /) -> Any: - repr_str = f"{self._repr_str}.{name}" + repr_str = f"{self!r}" + if self._needs_parentheses: + repr_str = f"({repr_str})" + repr_str += f".{name}" return Expression(lambda df: getattr(self._eval_expression(df), name), repr_str) def __repr__(self) -> str: diff --git a/pandas/core/common.py b/pandas/core/common.py index 7b6ba2d9010a7..3ca6586222ca1 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -50,6 +50,8 @@ ) from pandas.core.dtypes.inference import iterable_not_string +from pandas.core.col import Expression + if TYPE_CHECKING: from pandas._typing import ( AnyArrayLike, @@ -383,7 +385,9 @@ def apply_if_callable(maybe_callable, obj, **kwargs): obj : NDFrame **kwargs """ - if callable(maybe_callable): + if isinstance(maybe_callable, Expression): + return maybe_callable._eval_expression(obj, **kwargs) + elif callable(maybe_callable): return maybe_callable(obj, **kwargs) return maybe_callable diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 969402451cbd7..a7535835f4b84 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -146,7 +146,6 @@ ) from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.arrays.string_ import StringDtype -from pandas.core.col import Expression from pandas.core.construction import ( ensure_wrapped_if_datetimelike, sanitize_array, @@ -5521,10 +5520,7 @@ def assign(self, **kwargs) -> DataFrame: data = self.copy(deep=False) for k, v in kwargs.items(): - if isinstance(v, Expression): - data[k] = v._eval_expression(data) - else: - data[k] = com.apply_if_callable(v, data) + data[k] = com.apply_if_callable(v, data) return data def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 875ccb7e3e32a..d757370b1975b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9934,7 +9934,6 @@ def where( 3 True True 4 True True """ - from pandas.core.col import Expression inplace = validate_bool_kwarg(inplace, "inplace") if inplace: @@ -9948,11 +9947,6 @@ def where( stacklevel=2, ) - if isinstance(cond, Expression): - cond = cond._eval_expression(self) - if isinstance(other, Expression): - other = other._eval_expression(self) - other = common.apply_if_callable(other, self) return self._where(cond, other, inplace=inplace, axis=axis, level=level) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9e944b814e7cf..a476415d6c7c0 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -916,8 +916,6 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: @final def __setitem__(self, key, value) -> None: - from pandas.core.col import Expression - if not CHAINED_WARNING_DISABLED: if sys.getrefcount(self.obj) <= REF_COUNT_IDX: warnings.warn( @@ -927,14 +925,8 @@ def __setitem__(self, key, value) -> None: check_dict_or_set_indexers(key) if isinstance(key, tuple): key = (list(x) if is_iterator(x) else x for x in key) - key = ( - x._eval_expression(self.obj) if isinstance(x, Expression) else x - for x in key - ) key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: - if isinstance(key, Expression): - key = key._eval_expression(self.obj) maybe_callable = com.apply_if_callable(key, self.obj) key = self._raise_callable_usage(key, maybe_callable) indexer = self._get_setitem_indexer(key) @@ -1207,13 +1199,9 @@ def __getitem__(self, key): return self.obj._get_value(*key, takeable=self._takeable) return self._getitem_tuple(key) else: - from pandas.core.col import Expression - # we by definition only have the 0th axis axis = self.axis or 0 - if isinstance(key, Expression): - key = key._eval_expression(self.obj) maybe_callable = com.apply_if_callable(key, self.obj) maybe_callable = self._raise_callable_usage(key, maybe_callable) return self._getitem_axis(maybe_callable, axis=axis) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index e2d4e7496f6fc..263b8c04ed51b 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -361,7 +361,7 @@ def qcut( array([0, 0, 1, 2, 3]) """ if isinstance(x, Expression): - return x._call_from_func( + return x._call_with_func( qcut, x=x, q=q, labels=labels, retbins=retbins, precision=precision ) original = x diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index 18970f90eebb2..ee5af6922da49 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -36,16 +36,67 @@ (pd.col("a") == 1, [True, False], "col('a') == 1"), (np.power(pd.col("a"), 2), [1, 4], "power(col('a'), 2)"), (np.divide(pd.col("a"), pd.col("a")), [1.0, 1.0], "divide(col('a'), col('a'))"), + ( + (pd.col("a") + 1) * (pd.col("b") + 2), + [10, 18], + "(col('a') + 1) * (col('b') + 2)", + ), + ( + (pd.col("a") - 1).astype("bool"), + [False, True], + "(col('a') - 1).astype('bool')", + ), ], ) def test_col_simple( expr: Expression, expected_values: list[object], expected_str: str ) -> None: df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + assert str(expr) == expected_str + result = df.assign(c=expr) expected = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": expected_values}) tm.assert_frame_equal(result, expected) - assert str(expr) == expected_str + + +def test_frame_getitem() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + expr = pd.col("a") == 2 + result = df[expr] + expected = df.iloc[[1]] + tm.assert_frame_equal(result, expected) + + +def test_frame_setitem() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + expr = pd.col("a") == 2 + + result = df.copy() + result[expr] = 100 + expected = pd.DataFrame({"a": [1, 100], "b": [3, 100]}) + tm.assert_frame_equal(result, expected) + + +def test_frame_loc() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + expr = pd.col("a") == 2 + result = df.copy() + result.loc[expr, "b"] = 100 + expected = pd.DataFrame({"a": [1, 2], "b": [3, 100]}) + tm.assert_frame_equal(result, expected) + + +def test_frame_iloc() -> None: + # https://github.com/pandas-dev/pandas/pull/63439 + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + expr = pd.col("a") == 2 + result = df.copy() + result.iloc[expr, 1] = 100 + expected = pd.DataFrame({"a": [1, 2], "b": [3, 100]}) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -163,19 +214,7 @@ def test_col_logical_ops( tm.assert_frame_equal(result, expected) -def test_compound_op() -> None: - # https://github.com/pandas-dev/pandas/pull/63439 - df = pd.DataFrame({"a": [1, 2, 3]}) - expr = (pd.col("a") + 1) * (pd.col("a") + 2) - expected_str = "(col('a') + 1) * (col('a') + 2)" - assert str(expr) == expected_str - - result = df.assign(b=expr) - expected = pd.DataFrame({"a": [1, 2, 3], "b": [6, 12, 20]}) - tm.assert_frame_equal(result, expected) - - -def test_getitem() -> None: +def test_expression_getitem() -> None: # https://github.com/pandas-dev/pandas/pull/63439 df = pd.DataFrame({"a": [1, 2, 3]}) expr = pd.col("a")[1] From 690022bb88537b752d72fda57615e36c20062922 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 20 Dec 2025 11:47:23 -0500 Subject: [PATCH 4/7] Cleanup --- pandas/core/generic.py | 1 - pandas/tests/test_col.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d757370b1975b..c0be5568170d0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9934,7 +9934,6 @@ def where( 3 True True 4 True True """ - inplace = validate_bool_kwarg(inplace, "inplace") if inplace: if not CHAINED_WARNING_DISABLED: diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index ee5af6922da49..d7238a9185534 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -52,11 +52,10 @@ def test_col_simple( expr: Expression, expected_values: list[object], expected_str: str ) -> None: df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - assert str(expr) == expected_str - result = df.assign(c=expr) expected = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": expected_values}) tm.assert_frame_equal(result, expected) + assert str(expr) == expected_str def test_frame_getitem() -> None: From 94382540dddd6cd87b60cae5db7d35c25913b902 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 20 Dec 2025 12:16:17 -0500 Subject: [PATCH 5/7] Fix repr --- pandas/core/col.py | 9 ++------- pandas/tests/test_col.py | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 5fa88499eae98..e07c7fe3f855a 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -63,13 +63,8 @@ def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[str, Any]: def _pretty_print_args_kwargs(*args: Any, **kwargs: Any) -> str: - inputs_repr = ", ".join( - arg._repr_str if isinstance(arg, Expression) else repr(arg) for arg in args - ) - kwargs_repr = ", ".join( - f"{k}={v._repr_str if isinstance(v, Expression) else v!r}" - for k, v in kwargs.items() - ) + inputs_repr = ", ".join(repr(arg) for arg in args) + kwargs_repr = ", ".join(f"{k}={v!r}" for k, v in kwargs.items()) all_args = [] if inputs_repr: diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index d7238a9185534..55db17cca8d99 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -258,7 +258,7 @@ def test_qcut() -> None: # https://github.com/pandas-dev/pandas/pull/63439 df = pd.DataFrame({"a": [1, 2, 3]}) expr = pd.qcut(pd.col("a"), 3) - expected_str = "qcut(x=\"col('a')\", q=3, labels=None, retbins=False, precision=3)" + expected_str = "qcut(x=col('a'), q=3, labels=None, retbins=False, precision=3)" assert str(expr) == expected_str, str(expr) result = df.assign(b=expr) From 987a1619d507c8bb04c60730e8f34df0c525a6de Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 20 Dec 2025 12:17:14 -0500 Subject: [PATCH 6/7] type-hints --- pandas/core/col.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index e07c7fe3f855a..f02f7b5ddcdc5 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -228,7 +228,7 @@ def func(df: DataFrame) -> Any: def __getitem__(self, item: Any) -> Expression: return self._with_op("__getitem__", item) - def _call_with_func(self, func, **kwargs) -> Expression: + def _call_with_func(self, func: Callable, **kwargs: Any) -> Expression: def wrapped(df: DataFrame) -> Any: parsed_kwargs = _parse_kwargs(df, **kwargs) return func(**parsed_kwargs) @@ -238,7 +238,7 @@ def wrapped(df: DataFrame) -> Any: return Expression(wrapped, repr_str) - def __call__(self, *args, **kwargs): + def __call__(self, *args: Any, **kwargs: Any): def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: parsed_args = _parse_args(df, *args) parsed_kwargs = _parse_kwargs(df, **kwargs) From fcdb3063f25f7ff74bc02188de486864423a752e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 20 Dec 2025 12:44:56 -0500 Subject: [PATCH 7/7] fixup --- pandas/core/col.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index f02f7b5ddcdc5..ad9271c6161c9 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -238,7 +238,7 @@ def wrapped(df: DataFrame) -> Any: return Expression(wrapped, repr_str) - def __call__(self, *args: Any, **kwargs: Any): + def __call__(self, *args: Any, **kwargs: Any) -> Expression: def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: parsed_args = _parse_args(df, *args) parsed_kwargs = _parse_kwargs(df, **kwargs)