Skip to content

feat: support typed pyarrow.Scalar in assignment #1930

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jul 23, 2025
2 changes: 2 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,8 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]:

def infer_literal_type(literal) -> typing.Optional[Dtype]:
# Maybe also normalize literal to canonical python representation to remove this burden from compilers?
if isinstance(literal, pa.Scalar):
return arrow_dtype_to_bigframes_dtype(literal.type)
if pd.api.types.is_list_like(literal):
element_types = [infer_literal_type(i) for i in literal]
common_type = lcd_type(*element_types)
Expand Down
50 changes: 44 additions & 6 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,15 +906,53 @@ def test_df_to_pandas_batches(scalars_dfs):
assert_pandas_df_equal(pd.concat(filtered_batches), pd_result)


@pytest.mark.parametrize(
    ("literal", "expected_dtype"),
    (
        pytest.param(
            2,
            dtypes.INT_DTYPE,
            id="INT64",
        ),
        # ====================================================================
        # NULL values
        #
        # These are regression tests for b/428999884. It needs to be possible
        # to set a column to NULL with a desired type (not just the pandas
        # default of float64).
        # ====================================================================
        pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"),
        pytest.param(
            pa.scalar(None, type=pa.int64()),
            dtypes.INT_DTYPE,
            # BUG FIX: this id was "NULL-pyarrow-TIMESTAMP", duplicating the
            # id of the timestamp case below and mislabeling this int64 case.
            id="NULL-pyarrow-INT64",
        ),
        pytest.param(
            pa.scalar(None, type=pa.timestamp("us", tz="UTC")),
            dtypes.TIMESTAMP_DTYPE,
            id="NULL-pyarrow-TIMESTAMP",
        ),
        pytest.param(
            pa.scalar(None, type=pa.timestamp("us")),
            dtypes.DATETIME_DTYPE,
            id="NULL-pyarrow-DATETIME",
        ),
    ),
)
def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype):
    """Assigning a scalar literal (including typed pyarrow NULL scalars)
    creates a new column with the expected BigQuery DataFrames dtype.

    Args:
        scalars_dfs: fixture providing a (BigFrames, pandas) DataFrame pair.
        literal: the scalar value to assign to the new column.
        expected_dtype: the dtype the new BigFrames column must have.
    """
    scalars_df, scalars_pandas_df = scalars_dfs
    df = scalars_df.assign(new_col=literal)
    bf_result = df.to_pandas()

    new_col_pd = literal
    if isinstance(literal, pa.Scalar):
        # PyArrow integer scalars aren't yet supported in pandas Int64Dtype.
        new_col_pd = literal.as_py()

    # Pandas might not pick the same dtype as BigFrames, but it should at
    # least be castable to it.
    pd_result = scalars_pandas_df.assign(new_col=new_col_pd)
    pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype)

    assert_pandas_df_equal(bf_result, pd_result)

Expand Down
2 changes: 1 addition & 1 deletion tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@ def test_read_gbq_wildcard(
"query": {
"useQueryCache": True,
"maximumBytesBilled": "1000000000",
"timeoutMs": 10000,
"timeoutMs": 120_000,
}
},
pytest.param(
Expand Down
16 changes: 16 additions & 0 deletions tests/unit/core/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,3 +272,19 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal():
ValueError,
):
bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"})


@pytest.mark.parametrize(
    ("scalar", "expected_dtype"),
    [
        # Non-NULL scalars: the value's Arrow type maps to a BigFrames dtype.
        (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
        (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
        (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE),
        # NULL scalars still carry an Arrow type, which must be preserved.
        (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE),
        (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE),
        (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE),
    ],
)
def test_infer_literal_type_arrow_scalar(scalar, expected_dtype):
    """A typed ``pyarrow.Scalar`` infers to the matching BigFrames dtype."""
    inferred = bigframes.dtypes.infer_literal_type(scalar)
    assert inferred == expected_dtype
5 changes: 5 additions & 0 deletions third_party/bigframes_vendored/ibis/common/temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,8 @@ def _from_numpy_datetime64(value):
raise TypeError("Unable to convert np.datetime64 without pandas")
else:
return pd.Timestamp(value).to_pydatetime()


@normalize_datetime.register("pyarrow.Scalar")
def _from_pyarrow_scalar(value):
    # Unwrap the pyarrow scalar into its plain-Python equivalent so the
    # generic datetime-normalization machinery can handle it.
    return value.as_py()
12 changes: 12 additions & 0 deletions third_party/bigframes_vendored/ibis/expr/datatypes/value.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import bigframes_vendored.ibis.expr.datatypes as dt
from bigframes_vendored.ibis.expr.datatypes.cast import highest_precedence
from public import public
import pyarrow as pa
import toolz


Expand Down Expand Up @@ -71,6 +72,14 @@ def infer_list(values: Sequence[Any]) -> dt.Array:
return dt.Array(highest_precedence(map(infer, values)))


@infer.register("pyarrow.Scalar")
def infer_pyarrow_scalar(value: "pa.Scalar") -> dt.DataType:
    """Infer the ibis type of a PyArrow Scalar value from its Arrow type."""
    # Imported lazily to avoid a circular import at module load time.
    import bigframes_vendored.ibis.formats.pyarrow

    return bigframes_vendored.ibis.formats.pyarrow.PyArrowType.to_ibis(value.type)


@infer.register(datetime.time)
def infer_time(value: datetime.time) -> dt.Time:
    """Infer the ibis ``time`` type for a ``datetime.time`` literal."""
    return dt.time
Expand Down Expand Up @@ -253,6 +262,9 @@ def infer_shapely_multipolygon(value) -> dt.MultiPolygon:
def normalize(typ, value):
"""Ensure that the Python type underlying a literal resolves to a single type."""

if pa is not None and isinstance(value, pa.Scalar):
value = value.as_py()

dtype = dt.dtype(typ)
if value is None:
if not dtype.nullable:
Expand Down
2 changes: 0 additions & 2 deletions third_party/bigframes_vendored/ibis/formats/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
@functools.cache
def _from_pyarrow_types():
import pyarrow as pa
import pyarrow_hotfix # noqa: F401

return {
pa.int8(): dt.Int8,
Expand Down Expand Up @@ -87,7 +86,6 @@ class PyArrowType(TypeMapper):
def to_ibis(cls, typ: pa.DataType, nullable=True) -> dt.DataType:
"""Convert a pyarrow type to an ibis type."""
import pyarrow as pa
import pyarrow_hotfix # noqa: F401

if pa.types.is_null(typ):
return dt.null
Expand Down