diff --git a/pandas/core/col.py b/pandas/core/col.py
index a91a53dd19fde..ad9271c6161c9 100644
--- a/pandas/core/col.py
+++ b/pandas/core/col.py
@@ -11,10 +11,11 @@
 
 from pandas.util._decorators import set_module
 
-from pandas.core.series import Series
-
 if TYPE_CHECKING:
-    from pandas import DataFrame
+    from pandas import (
+        DataFrame,
+        Series,
+    )
 
 
 # Used only for generating the str repr of expressions.
@@ -48,25 +49,22 @@
 
 def _parse_args(df: DataFrame, *args: Any) -> tuple[Series]:
     # Parse `args`, evaluating any expressions we encounter.
-    return tuple([x(df) if isinstance(x, Expression) else x for x in args])
+    return tuple(
+        [x._eval_expression(df) if isinstance(x, Expression) else x for x in args]
+    )
 
 
 def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[str, Any]:
     # Parse `kwargs`, evaluating any expressions we encounter.
     return {
-        key: val(df) if isinstance(val, Expression) else val
+        key: val._eval_expression(df) if isinstance(val, Expression) else val
         for key, val in kwargs.items()
     }
 
 
 def _pretty_print_args_kwargs(*args: Any, **kwargs: Any) -> str:
-    inputs_repr = ", ".join(
-        arg._repr_str if isinstance(arg, Expression) else repr(arg) for arg in args
-    )
-    kwargs_repr = ", ".join(
-        f"{k}={v._repr_str if isinstance(v, Expression) else v!r}"
-        for k, v in kwargs.items()
-    )
+    inputs_repr = ", ".join(repr(arg) for arg in args)
+    kwargs_repr = ", ".join(f"{k}={v!r}" for k, v in kwargs.items())
 
     all_args = []
     if inputs_repr:
@@ -85,105 +83,136 @@ class Expression:
     This is not meant to be instantiated directly. Instead, use :meth:`pandas.col`.
     """
 
-    def __init__(self, func: Callable[[DataFrame], Any], repr_str: str) -> None:
+    def __init__(
+        self,
+        func: Callable[[DataFrame], Any],
+        repr_str: str,
+        needs_parenthese: bool = False,
+    ) -> None:
         self._func = func
         self._repr_str = repr_str
+        self._needs_parentheses = needs_parenthese
 
-    def __call__(self, df: DataFrame) -> Any:
+    def _eval_expression(self, df: DataFrame) -> Any:
+        """Evaluate this deferred expression against ``df``."""
         return self._func(df)
 
-    def _with_binary_op(self, op: str, other: Any) -> Expression:
+    def _with_op(self, op: str, other: Any) -> Expression:
+        """Defer ``self OP other`` (or ``self[other]``) and build its repr."""
         op_symbol = _OP_SYMBOLS.get(op, op)
 
-        if isinstance(other, Expression):
-            if op.startswith("__r"):
-                repr_str = f"({other._repr_str} {op_symbol} {self._repr_str})"
-            else:
-                repr_str = f"({self._repr_str} {op_symbol} {other._repr_str})"
-            return Expression(lambda df: getattr(self(df), op)(other(df)), repr_str)
+        if op == "__getitem__":
+            needs_parentheses = False
+            repr_str = f"{self!r}[{other!r}]"
         else:
+            needs_parentheses = True
+            self_repr = f"{self!r}"
+            if self._needs_parentheses:
+                self_repr = f"({self_repr})"
+            other_repr = f"{other!r}"
+            if isinstance(other, Expression) and other._needs_parentheses:
+                other_repr = f"({other_repr})"
+
             if op.startswith("__r"):
-                repr_str = f"({other!r} {op_symbol} {self._repr_str})"
+                repr_str = f"{other_repr} {op_symbol} {self_repr}"
             else:
-                repr_str = f"({self._repr_str} {op_symbol} {other!r})"
-            return Expression(lambda df: getattr(self(df), op)(other), repr_str)
+                repr_str = f"{self_repr} {op_symbol} {other_repr}"
+
+        if isinstance(other, Expression):
+            return Expression(
+                lambda df: getattr(self._eval_expression(df), op)(
+                    other._eval_expression(df)
+                ),
+                repr_str,
+                needs_parenthese=needs_parentheses,
+            )
+        else:
+            return Expression(
+                lambda df: getattr(self._eval_expression(df), op)(other),
+                repr_str,
+                needs_parenthese=needs_parentheses,
+            )
 
     # Binary ops
     def __add__(self, other: Any) -> Expression:
-        return self._with_binary_op("__add__", other)
+        return self._with_op("__add__", other)
 
     def __radd__(self, other: Any) -> Expression:
-        return self._with_binary_op("__radd__", other)
+        return self._with_op("__radd__", other)
 
     def __sub__(self, other: Any) -> Expression:
-        return self._with_binary_op("__sub__", other)
+        return self._with_op("__sub__", other)
 
     def __rsub__(self, other: Any) -> Expression:
-        return self._with_binary_op("__rsub__", other)
+        return self._with_op("__rsub__", other)
 
     def __mul__(self, other: Any) -> Expression:
-        return self._with_binary_op("__mul__", other)
+        return self._with_op("__mul__", other)
 
     def __rmul__(self, other: Any) -> Expression:
-        return self._with_binary_op("__rmul__", other)
+        return self._with_op("__rmul__", other)
 
     def __truediv__(self, other: Any) -> Expression:
-        return self._with_binary_op("__truediv__", other)
+        return self._with_op("__truediv__", other)
 
     def __rtruediv__(self, other: Any) -> Expression:
-        return self._with_binary_op("__rtruediv__", other)
+        return self._with_op("__rtruediv__", other)
 
     def __floordiv__(self, other: Any) -> Expression:
-        return self._with_binary_op("__floordiv__", other)
+        return self._with_op("__floordiv__", other)
 
     def __rfloordiv__(self, other: Any) -> Expression:
-        return self._with_binary_op("__rfloordiv__", other)
+        return self._with_op("__rfloordiv__", other)
 
     def __ge__(self, other: Any) -> Expression:
-        return self._with_binary_op("__ge__", other)
+        return self._with_op("__ge__", other)
 
     def __gt__(self, other: Any) -> Expression:
-        return self._with_binary_op("__gt__", other)
+        return self._with_op("__gt__", other)
 
     def __le__(self, other: Any) -> Expression:
-        return self._with_binary_op("__le__", other)
+        return self._with_op("__le__", other)
 
     def __lt__(self, other: Any) -> Expression:
-        return self._with_binary_op("__lt__", other)
+        return self._with_op("__lt__", other)
 
     def __eq__(self, other: object) -> Expression:  # type: ignore[override]
-        return self._with_binary_op("__eq__", other)
+        return self._with_op("__eq__", other)
 
     def __ne__(self, other: object) -> Expression:  # type: ignore[override]
-        return self._with_binary_op("__ne__", other)
+        return self._with_op("__ne__", other)
 
     def __mod__(self, other: Any) -> Expression:
-        return self._with_binary_op("__mod__", other)
+        return self._with_op("__mod__", other)
 
     def __rmod__(self, other: Any) -> Expression:
-        return self._with_binary_op("__rmod__", other)
+        return self._with_op("__rmod__", other)
 
     # Logical ops
     def __and__(self, other: Any) -> Expression:
-        return self._with_binary_op("__and__", other)
+        return self._with_op("__and__", other)
 
     def __rand__(self, other: Any) -> Expression:
-        return self._with_binary_op("__rand__", other)
+        return self._with_op("__rand__", other)
 
     def __or__(self, other: Any) -> Expression:
-        return self._with_binary_op("__or__", other)
+        return self._with_op("__or__", other)
 
     def __ror__(self, other: Any) -> Expression:
-        return self._with_binary_op("__ror__", other)
+        return self._with_op("__ror__", other)
 
     def __xor__(self, other: Any) -> Expression:
-        return self._with_binary_op("__xor__", other)
+        return self._with_op("__xor__", other)
 
     def __rxor__(self, other: Any) -> Expression:
-        return self._with_binary_op("__rxor__", other)
+        return self._with_op("__rxor__", other)
 
     def __invert__(self) -> Expression:
-        return Expression(lambda df: ~self(df), f"(~{self._repr_str})")
+        return Expression(
+            lambda df: ~self._eval_expression(df),
+            f"~{self._repr_str}",
+            needs_parenthese=True,
+        )
 
     def __array_ufunc__(
         self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any
@@ -198,57 +227,48 @@ def func(df: DataFrame) -> Any:
 
         return Expression(func, repr_str)
 
-    # Everything else
-    def __getattr__(self, attr: str, /) -> Any:
-        if attr in Series._accessors:
-            return NamespaceExpression(self, attr)
+    def __getitem__(self, item: Any) -> Expression:
+        return self._with_op("__getitem__", item)
+
+    # ``__getitem__`` never raises IndexError, so opt out of the legacy
+    # sequence protocol; otherwise ``iter(expr)`` would never terminate.
+    __iter__ = None
 
-        def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any:
-            parsed_args = _parse_args(df, *args)
+    def _call_with_func(self, func: Callable, **kwargs: Any) -> Expression:
+        # Defer ``func(**kwargs)``; Expression values are evaluated first.
+        def wrapped(df: DataFrame) -> Any:
             parsed_kwargs = _parse_kwargs(df, **kwargs)
-            return getattr(self(df), attr)(*parsed_args, **parsed_kwargs)
-
-        def wrapper(*args: Any, **kwargs: Any) -> Expression:
-            args_str = _pretty_print_args_kwargs(*args, **kwargs)
-            repr_str = f"{self._repr_str}.{attr}({args_str})"
-
-            return Expression(lambda df: func(df, *args, **kwargs), repr_str)
-
-        return wrapper
-
-    def __repr__(self) -> str:
-        return self._repr_str or "Expr(...)"
-
-
-class NamespaceExpression:
-    def __init__(self, func: Expression, namespace: str) -> None:
-        self._func = func
-        self._namespace = namespace
+            return func(**parsed_kwargs)
 
-    def __call__(self, df: DataFrame) -> Any:
-        return self._func(df)
+        args_str = _pretty_print_args_kwargs(**kwargs)
+        repr_str = func.__name__ + "(" + args_str + ")"
 
-    def __getattr__(self, attr: str) -> Any:
-        if isinstance(getattr(getattr(Series, self._namespace), attr), property):
-            repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}"
-            return Expression(
-                lambda df: getattr(getattr(self(df), self._namespace), attr),
-                repr_str,
-            )
+        return Expression(wrapped, repr_str)
 
+    def __call__(self, *args: Any, **kwargs: Any) -> Expression:
+        # Defer calling the evaluated expression, e.g. ``col("a").sum()``.
         def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any:
             parsed_args = _parse_args(df, *args)
             parsed_kwargs = _parse_kwargs(df, **kwargs)
-            return getattr(getattr(self(df), self._namespace), attr)(
-                *parsed_args, **parsed_kwargs
-            )
+            return self._eval_expression(df)(*parsed_args, **parsed_kwargs)
+
+        args_str = _pretty_print_args_kwargs(*args, **kwargs)
+        repr_str = f"{self._repr_str}({args_str})"
+        return Expression(lambda df: func(df, *args, **kwargs), repr_str)
 
-        def wrapper(*args: Any, **kwargs: Any) -> Expression:
-            args_str = _pretty_print_args_kwargs(*args, **kwargs)
-            repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}({args_str})"
-            return Expression(lambda df: func(df, *args, **kwargs), repr_str)
+    def __getattr__(self, name: str, /) -> Any:
+        # Protocol probes (copy, pickle, hasattr checks) look up dunder names
+        # and rely on AttributeError when unsupported; never defer those.
+        if name.startswith("__") and name.endswith("__"):
+            raise AttributeError(name)
+        repr_str = f"{self!r}"
+        if self._needs_parentheses:
+            repr_str = f"({repr_str})"
+        repr_str += f".{name}"
+        return Expression(lambda df: getattr(self._eval_expression(df), name), repr_str)
 
-        return wrapper
+    def __repr__(self) -> str:
+        return self._repr_str or "Expr(...)"
 
 
 @set_module("pandas")
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 7b6ba2d9010a7..3ca6586222ca1 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -50,6 +50,8 @@
 )
 from pandas.core.dtypes.inference import iterable_not_string
 
+from pandas.core.col import Expression
+
 if TYPE_CHECKING:
     from pandas._typing import (
         AnyArrayLike,
@@ -383,7 +385,9 @@ def apply_if_callable(maybe_callable, obj, **kwargs):
     obj : NDFrame
     **kwargs
     """
-    if callable(maybe_callable):
+    if isinstance(maybe_callable, Expression):
+        return maybe_callable._eval_expression(obj, **kwargs)
+    elif callable(maybe_callable):
         return maybe_callable(obj, **kwargs)
 
     return maybe_callable
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 659e82d979a91..263b8c04ed51b 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -43,6 +43,7 @@
 )
 import pandas.core.algorithms as algos
 from pandas.core.arrays.datetimelike import dtype_to_unit
+from pandas.core.col import Expression
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -359,6 +360,16 @@ def qcut(
     >>> pd.qcut(range(5), 4, labels=False)
     array([0, 0, 1, 2, 3])
     """
+    if isinstance(x, Expression):
+        return x._call_with_func(
+            qcut,
+            x=x,
+            q=q,
+            labels=labels,
+            retbins=retbins,
+            precision=precision,
+            duplicates=duplicates,
+        )
     original = x
     x_idx = _preprocess_for_cut(x)
     x_idx, _ = _coerce_to_type(x_idx)
diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py
index cf7901a912279..55db17cca8d99 100644
--- a/pandas/tests/test_col.py
+++ b/pandas/tests/test_col.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._libs.properties import cache_readonly
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.api.typing import Expression
@@ -13,27 +15,37 @@
     ("expr", "expected_values", "expected_str"),
     [
         (pd.col("a"), [1, 2], "col('a')"),
-        (pd.col("a") * 2, [2, 4], "(col('a') * 2)"),
+        (pd.col("a") * 2, [2, 4], "col('a') * 2"),
         (pd.col("a").sum(), [3, 3], "col('a').sum()"),
-        (pd.col("a") + 1, [2, 3], "(col('a') + 1)"),
-        (1 + pd.col("a"), [2, 3], "(1 + col('a'))"),
-        (pd.col("a") - 1, [0, 1], "(col('a') - 1)"),
-        (1 - pd.col("a"), [0, -1], "(1 - col('a'))"),
-        (pd.col("a") * 1, [1, 2], "(col('a') * 1)"),
-        (1 * pd.col("a"), [1, 2], "(1 * col('a'))"),
-        (pd.col("a") / 1, [1.0, 2.0], "(col('a') / 1)"),
-        (1 / pd.col("a"), [1.0, 0.5], "(1 / col('a'))"),
-        (pd.col("a") // 1, [1, 2], "(col('a') // 1)"),
-        (1 // pd.col("a"), [1, 0], "(1 // col('a'))"),
-        (pd.col("a") % 1, [0, 0], "(col('a') % 1)"),
-        (1 % pd.col("a"), [0, 1], "(1 % col('a'))"),
-        (pd.col("a") > 1, [False, True], "(col('a') > 1)"),
-        (pd.col("a") >= 1, [True, True], "(col('a') >= 1)"),
-        (pd.col("a") < 1, [False, False], "(col('a') < 1)"),
-        (pd.col("a") <= 1, [True, False], "(col('a') <= 1)"),
-        (pd.col("a") == 1, [True, False], "(col('a') == 1)"),
+        (pd.col("a") + 1, [2, 3], "col('a') + 1"),
+        (1 + pd.col("a"), [2, 3], "1 + col('a')"),
+        (pd.col("a") - 1, [0, 1], "col('a') - 1"),
+        (1 - pd.col("a"), [0, -1], "1 - col('a')"),
+        (pd.col("a") * 1, [1, 2], "col('a') * 1"),
+        (1 * pd.col("a"), [1, 2], "1 * col('a')"),
+        (pd.col("a") / 1, [1.0, 2.0], "col('a') / 1"),
+        (1 / pd.col("a"), [1.0, 0.5], "1 / col('a')"),
+        (pd.col("a") // 1, [1, 2], "col('a') // 1"),
+        (1 // pd.col("a"), [1, 0], "1 // col('a')"),
+        (pd.col("a") % 1, [0, 0], "col('a') % 1"),
+        (1 % pd.col("a"), [0, 1], "1 % col('a')"),
+        (pd.col("a") > 1, [False, True], "col('a') > 1"),
+        (pd.col("a") >= 1, [True, True], "col('a') >= 1"),
+        (pd.col("a") < 1, [False, False], "col('a') < 1"),
+        (pd.col("a") <= 1, [True, False], "col('a') <= 1"),
+        (pd.col("a") == 1, [True, False], "col('a') == 1"),
         (np.power(pd.col("a"), 2), [1, 4], "power(col('a'), 2)"),
         (np.divide(pd.col("a"), pd.col("a")), [1.0, 1.0], "divide(col('a'), col('a'))"),
+        (
+            (pd.col("a") + 1) * (pd.col("b") + 2),
+            [10, 18],
+            "(col('a') + 1) * (col('b') + 2)",
+        ),
+        (
+            (pd.col("a") - 1).astype("bool"),
+            [False, True],
+            "(col('a') - 1).astype('bool')",
+        ),
     ],
 )
 def test_col_simple(
@@ -46,6 +58,46 @@ def test_col_simple(
     assert str(expr) == expected_str
 
 
+def test_frame_getitem() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    expr = pd.col("a") == 2
+    result = df[expr]
+    expected = df.iloc[[1]]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_setitem() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    expr = pd.col("a") == 2
+
+    result = df.copy()
+    result[expr] = 100
+    expected = pd.DataFrame({"a": [1, 100], "b": [3, 100]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_loc() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    expr = pd.col("a") == 2
+    result = df.copy()
+    result.loc[expr, "b"] = 100
+    expected = pd.DataFrame({"a": [1, 2], "b": [3, 100]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_iloc() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    expr = pd.col("a") == 2
+    result = df.copy()
+    result.iloc[expr, 1] = 100
+    expected = pd.DataFrame({"a": [1, 2], "b": [3, 100]})
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     ("expr", "expected_values", "expected_str"),
     [
@@ -105,37 +157,37 @@ def mean(self):
         (
             pd.col("a") & pd.col("b"),
             [False, False, True, False],
-            "(col('a') & col('b'))",
+            "col('a') & col('b')",
         ),
         (
             pd.col("a") & True,
             [True, False, True, False],
-            "(col('a') & True)",
+            "col('a') & True",
         ),
         (
             pd.col("a") | pd.col("b"),
             [True, True, True, True],
-            "(col('a') | col('b'))",
+            "col('a') | col('b')",
         ),
         (
             pd.col("a") | False,
             [True, False, True, False],
-            "(col('a') | False)",
+            "col('a') | False",
         ),
         (
             pd.col("a") ^ pd.col("b"),
             [True, True, False, True],
-            "(col('a') ^ col('b'))",
+            "col('a') ^ col('b')",
         ),
         (
             pd.col("a") ^ True,
             [False, True, False, True],
-            "(col('a') ^ True)",
+            "col('a') ^ True",
         ),
         (
             ~pd.col("a"),
             [False, True, False, True],
-            "(~col('a'))",
+            "~col('a')",
         ),
     ],
 )
@@ -159,3 +211,86 @@ def test_col_logical_ops(
     result = df.loc[expr]
     expected = df[expected_values]
     tm.assert_frame_equal(result, expected)
+
+
+def test_expression_getitem() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    expr = pd.col("a")[1]
+    expected_str = "col('a')[1]"
+
+    assert str(expr) == expected_str
+
+    result = df.assign(b=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_expression_not_iterable() -> None:
+    # Expression defines __getitem__, so it must opt out of the legacy
+    # sequence protocol; otherwise iteration would never terminate.
+    with pytest.raises(TypeError, match="not iterable"):
+        iter(pd.col("a"))
+
+
+def test_property() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    expr = pd.col("a").index
+    expected_str = "col('a').index"
+
+    assert str(expr) == expected_str
+
+    result = df.assign(b=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cached_property() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    # Ensure test is valid
+    assert isinstance(pd.Index.dtype, cache_readonly)
+
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    expr = pd.col("a").index.dtype
+    expected_str = "col('a').index.dtype"
+    assert str(expr) == expected_str
+
+    result = df.assign(b=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": np.int64})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_qcut() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    expr = pd.qcut(pd.col("a"), 3)
+    expected_str = (
+        "qcut(x=col('a'), q=3, labels=None, retbins=False, precision=3, "
+        "duplicates='raise')"
+    )
+    assert str(expr) == expected_str, str(expr)
+
+    result = df.assign(b=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": pd.qcut(df["a"], 3)})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_where() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    expr = pd.col("a").where(pd.col("b") == 5, 100)
+    expected_str = "col('a').where(col('b') == 5, 100)"
+    assert str(expr) == expected_str, str(expr)
+
+    result = df.assign(c=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [100, 2, 100]})
+    tm.assert_frame_equal(result, expected)
+
+    expr = pd.col("a").where(pd.col("b") == 5, pd.col("a") + 1)
+    expected_str = "col('a').where(col('b') == 5, col('a') + 1)"
+    assert str(expr) == expected_str, str(expr)
+
+    result = df.assign(c=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [2, 2, 4]})
+    tm.assert_frame_equal(result, expected)