From 65482ac0ffd52c487c6423b430ffebc18e3350b6 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 17 Dec 2025 09:52:24 -0800 Subject: [PATCH 1/4] PERF: avoid NumPy fallback in ArrowStringArray._from_sequence for integer types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When converting ArrowExtensionArray to string dtype, use PyArrow's native pc.cast() for integer and string types where the string representation matches Python's str(). This avoids unnecessary conversion through NumPy. Float and boolean types still fall back to lib.ensure_string_array because PyArrow's string representation differs from Python's str(): - Float: 1.0 -> "1" (PyArrow) vs "1.0" (Python) - Bool: True -> "true" (PyArrow) vs "True" (Python) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pandas/core/arrays/string_arrow.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8bad206eea028..d43eb48a04857 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -210,6 +210,23 @@ def _from_sequence( result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) pa_arr = pa.array(result, mask=na_values, type=pa.large_string()) + elif isinstance(scalars, ArrowExtensionArray): + pa_type = scalars._pa_array.type + # Use PyArrow's native cast for integer and string types where + # the string representation matches Python's str(). + # Float and boolean have different representations in PyArrow + # (e.g., 1.0 -> "1" instead of "1.0", True -> "true" instead of "True") + if ( + pa.types.is_integer(pa_type) + or pa.types.is_large_string(pa_type) + or pa.types.is_string(pa_type) + ): + pa_arr = pc.cast(scalars._pa_array, pa.large_string()) + else: + # Fall back for types where PyArrow's string representation + # differs from Python's str() + result = lib.ensure_string_array(scalars, copy=copy) + pa_arr = pa.array(result, type=pa.large_string(), from_pandas=True) elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): pa_arr = pc.cast(scalars, pa.large_string()) else: From d368576c9cebebc6175dcdabb6621f5c71e6690a Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 18 Dec 2025 10:15:59 -0800 Subject: [PATCH 2/4] update test --- pandas/tests/copy_view/test_astype.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 0075a7ed59795..a21c394b60d54 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW - from pandas import ( DataFrame, Series, @@ -218,10 +216,7 @@ def test_convert_dtypes(using_infer_string): df_orig = df.copy() df2 = df.convert_dtypes() - if HAS_PYARROW: - assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) From c3284babf46d6c28008b8ebe09b32867408abec2 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 18 Dec 2025 20:17:53 -0800 Subject: [PATCH 3/4] TST: fix test_convert_dtypes for PANDAS_FUTURE_INFER_STRING=0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using_infer_string is False, string columns start as object dtype and get converted to Arrow, so memory is not shared. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pandas/tests/copy_view/test_astype.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index a21c394b60d54..c436391739ab2 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -216,7 +216,12 @@ def test_convert_dtypes(using_infer_string): df_orig = df.copy() df2 = df.convert_dtypes() - assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if using_infer_string: + # String column is already Arrow-backed, so memory is shared + assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + # String column converts from object to Arrow, no memory sharing + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) From c19fb29b9969485ca3307b2036bcbcbe3033ca4c Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 19 Dec 2025 11:04:21 -0800 Subject: [PATCH 4/4] use arrow for bool array --- pandas/core/arrays/string_arrow.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d43eb48a04857..3506a8e3a5279 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -212,16 +212,19 @@ def _from_sequence( pa_arr = pa.array(result, mask=na_values, type=pa.large_string()) elif isinstance(scalars, ArrowExtensionArray): pa_type = scalars._pa_array.type - # Use PyArrow's native cast for integer and string types where - # the string representation matches Python's str(). - # Float and boolean have different representations in PyArrow - # (e.g., 1.0 -> "1" instead of "1.0", True -> "true" instead of "True") + # Use PyArrow's native cast for integer, string, and boolean types. + # Float has different representation in PyArrow: 1.0 -> "1" instead + # of "1.0", and uses different scientific notation (1e+10 vs 1e10). + # Boolean needs capitalize (true -> True, false -> False). if ( pa.types.is_integer(pa_type) or pa.types.is_large_string(pa_type) or pa.types.is_string(pa_type) + or pa.types.is_boolean(pa_type) ): pa_arr = pc.cast(scalars._pa_array, pa.large_string()) + if pa.types.is_boolean(pa_type): + pa_arr = pc.utf8_capitalize(pa_arr) else: # Fall back for types where PyArrow's string representation # differs from Python's str()