Skip to content

feat: allow pandas.cut 'labels' parameter to accept a list of strings #1549

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions bigframes/core/compile/aggregate_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,13 @@ def _(
for this_bin in range(op.bins):
if op.labels is False:
value = compile_ibis_types.literal_to_ibis_scalar(
this_bin, force_dtype=pd.Int64Dtype()
this_bin,
force_dtype=pd.Int64Dtype(),
)
elif isinstance(op.labels, typing.Iterable):
value = compile_ibis_types.literal_to_ibis_scalar(
list(op.labels)[this_bin],
force_dtype=pd.StringDtype(storage="pyarrow"),
)
else:
left_adj = adj if this_bin == 0 and op.right else 0
Expand Down Expand Up @@ -402,7 +408,13 @@ def _(

if op.labels is False:
value = compile_ibis_types.literal_to_ibis_scalar(
this_bin, force_dtype=pd.Int64Dtype()
this_bin,
force_dtype=pd.Int64Dtype(),
)
elif isinstance(op.labels, typing.Iterable):
value = compile_ibis_types.literal_to_ibis_scalar(
list(op.labels)[this_bin],
force_dtype=pd.StringDtype(storage="pyarrow"),
)
else:
if op.right:
Expand Down
43 changes: 35 additions & 8 deletions bigframes/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile
import pandas as pd

import bigframes.constants
import bigframes.core.expression as ex
import bigframes.core.ordering as order
import bigframes.core.utils as utils
Expand All @@ -41,17 +42,37 @@ def cut(
right: typing.Optional[bool] = True,
labels: typing.Union[typing.Iterable[str], bool, None] = None,
) -> bigframes.series.Series:
if labels is not None and labels is not False:
if (
labels is not None
and labels is not False
and not isinstance(labels, typing.Iterable)
):
raise ValueError(
"Bin labels must either be False, None or passed in as a list-like argument"
)
if (
isinstance(labels, typing.Iterable)
and len(list(labels)) > 0
and not isinstance(list(labels)[0], str)
):
raise NotImplementedError(
"The 'labels' parameter must be either False or None. "
"Please provide a valid value for 'labels'."
"When using an iterable for labels, only iterables of strings are supported "
f"but found {type(list(labels)[0])}. {constants.FEEDBACK_LINK}"
)

if x.size == 0:
raise ValueError("Cannot cut empty array.")

if isinstance(bins, int):
if bins <= 0:
raise ValueError("`bins` should be a positive integer.")
if isinstance(labels, typing.Iterable):
labels = tuple(labels)
if len(labels) != bins:
raise ValueError(
f"Bin labels({len(labels)}) must be same as the value of bins({bins})"
)

op = agg_ops.CutOp(bins, right=right, labels=labels)
return x._apply_window_op(op, window_spec=window_specs.unbound())
elif isinstance(bins, typing.Iterable):
Expand All @@ -64,9 +85,6 @@ def cut(
elif len(list(bins)) == 0:
as_index = pd.IntervalIndex.from_tuples(list(bins))
bins = tuple()
# To maintain consistency with pandas' behavior
right = True
labels = None
elif isinstance(list(bins)[0], tuple):
as_index = pd.IntervalIndex.from_tuples(list(bins))
bins = tuple(bins)
Expand All @@ -88,8 +106,17 @@ def cut(
raise ValueError("`bins` iterable should contain tuples or numerics.")

if as_index.is_overlapping:
raise ValueError("Overlapping IntervalIndex is not accepted.")
elif len(as_index) == 0:
raise ValueError("Overlapping IntervalIndex is not accepted.") # TODO: test

if isinstance(labels, typing.Iterable):
labels = tuple(labels)
if len(labels) != len(as_index):
raise ValueError(
f"Bin labels({len(labels)}) must be same as the number of bin edges"
f"({len(as_index)})"
)

if len(as_index) == 0:
dtype = agg_ops.CutOp(bins, right=right, labels=labels).output_type()
return bigframes.series.Series(
[pd.NA] * len(x),
Expand Down
4 changes: 3 additions & 1 deletion bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ class CutOp(UnaryWindowOp):
# TODO: Unintuitive, refactor into multiple ops?
bins: typing.Union[int, Iterable]
right: Optional[bool]
labels: Optional[bool]
labels: typing.Union[bool, Iterable[str], None]

@property
def skips_nulls(self):
Expand All @@ -349,6 +349,8 @@ def skips_nulls(self):
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
if self.labels is False:
return dtypes.INT_DTYPE
elif isinstance(self.labels, Iterable):
return dtypes.STRING_DTYPE
else:
# Assumption: buckets use same numeric type
if isinstance(self.bins, int):
Expand Down
65 changes: 52 additions & 13 deletions tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,14 @@ def _convert_pandas_category(pd_s: pd.Series):
f"Input must be a pandas Series with categorical data: {pd_s.dtype}"
)

if pd.api.types.is_object_dtype(pd_s.cat.categories.dtype):
return pd_s.astype(pd.StringDtype(storage="pyarrow"))

if not isinstance(pd_s.cat.categories.dtype, pd.IntervalDtype):
raise ValueError(
f"Must be a IntervalDtype with categorical data: {pd_s.cat.categories.dtype}"
)

if pd_s.cat.categories.dtype.closed == "left": # type: ignore
left_key = "left_inclusive"
right_key = "right_exclusive"
Expand Down Expand Up @@ -465,6 +473,17 @@ def test_cut_by_int_bins(scalars_dfs, labels, right):
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)


def test_cut_by_int_bins_w_labels(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

labels = ["A", "B", "C", "D", "E"]
pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=labels)
bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels)

pd_result = _convert_pandas_category(pd_result)
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)


@pytest.mark.parametrize(
("breaks", "right", "labels"),
[
Expand Down Expand Up @@ -494,7 +513,7 @@ def test_cut_by_int_bins(scalars_dfs, labels, right):
),
],
)
def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels):
def test_cut_by_numeric_breaks(scalars_dfs, breaks, right, labels):
scalars_df, scalars_pandas_df = scalars_dfs

pd_result = pd.cut(
Expand All @@ -508,6 +527,18 @@ def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels):
pd.testing.assert_series_equal(bf_result, pd_result_converted)


def test_cut_by_numeric_breaks_w_labels(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

bins = [0, 5, 10, 15, 20]
labels = ["A", "B", "C", "D"]
pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels)
bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels)

pd_result = _convert_pandas_category(pd_result)
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)


@pytest.mark.parametrize(
("bins", "right", "labels"),
[
Expand All @@ -534,7 +565,7 @@ def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels):
),
],
)
def test_cut_with_interval(scalars_dfs, bins, right, labels):
def test_cut_by_interval_bins(scalars_dfs, bins, right, labels):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = bpd.cut(
scalars_df["int64_too"], bins, labels=labels, right=right
Expand All @@ -548,22 +579,30 @@ def test_cut_with_interval(scalars_dfs, bins, right, labels):
pd.testing.assert_series_equal(bf_result, pd_result_converted)


def test_cut_by_interval_bins_w_labels(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

bins = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)])
labels = ["A", "B", "C", "D", "E"]
pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels)
bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels)

pd_result = _convert_pandas_category(pd_result)
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)


@pytest.mark.parametrize(
"bins",
("bins", "labels"),
[
pytest.param([], id="empty_breaks"),
pytest.param(
[1], id="single_int_breaks", marks=pytest.mark.skip(reason="b/404338651")
),
pytest.param(pd.IntervalIndex.from_tuples([]), id="empty_interval_index"),
pytest.param([], None, id="empty_breaks"),
pytest.param([1], False, id="single_int_breaks"),
pytest.param(pd.IntervalIndex.from_tuples([]), None, id="empty_interval_index"),
],
)
def test_cut_by_edge_cases_bins(scalars_dfs, bins):
def test_cut_by_edge_cases_bins(scalars_dfs, bins, labels):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas()
if isinstance(bins, list):
bins = pd.IntervalIndex.from_tuples(bins)
pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False)
bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=labels).to_pandas()
pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels)

pd_result_converted = _convert_pandas_category(pd_result)
pd.testing.assert_series_equal(bf_result, pd_result_converted)
Expand Down
57 changes: 51 additions & 6 deletions tests/unit/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,48 @@ def test_method_matches_session(method_name: str):
assert pandas_signature.return_annotation == session_signature.return_annotation


def test_cut_raises_with_labels():
@pytest.mark.parametrize(
("bins", "labels", "error_message"),
[
pytest.param(
5,
True,
"Bin labels must either be False, None or passed in as a list-like argument",
id="true",
),
pytest.param(
5,
1.5,
"Bin labels must either be False, None or passed in as a list-like argument",
id="invalid_types",
),
pytest.param(
2,
["A"],
"must be same as the value of bins",
id="int_bins_mismatch",
),
pytest.param(
[1, 2, 3],
["A"],
"must be same as the number of bin edges",
id="iterator_bins_mismatch",
),
],
)
def test_cut_raises_with_invalid_labels(bins: int, labels, error_message: str):
mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
with pytest.raises(ValueError, match=error_message):
bigframes.pandas.cut(mock_series, bins, labels=labels)


def test_cut_raises_with_unsupported_labels():
mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
labels = [1, 2]
with pytest.raises(
NotImplementedError,
match="The 'labels' parameter must be either False or None.",
NotImplementedError, match=r".*only iterables of strings are supported.*"
):
mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"])
bigframes.pandas.cut(mock_series, 2, labels=labels) # type: ignore


@pytest.mark.parametrize(
Expand All @@ -111,11 +146,21 @@ def test_cut_raises_with_labels():
"`bins` iterable should contain tuples or numerics",
id="iterable_w_wrong_type",
),
pytest.param(
[10, 3],
"left side of interval must be <= right side",
id="decreased_breaks",
),
pytest.param(
[(1, 10), (2, 25)],
"Overlapping IntervalIndex is not accepted.",
id="overlapping_intervals",
),
],
)
def test_cut_raises_with_invalid_bins(bins: int, error_message: str):
mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
with pytest.raises(ValueError, match=error_message):
mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
bigframes.pandas.cut(mock_series, bins, labels=False)


Expand Down
21 changes: 14 additions & 7 deletions third_party/bigframes_vendored/pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ def cut(
age ranges. Supports binning into an equal number of bins, or a
pre-specified array of bins.

``labels=False`` implies you just want the bins back.

**Examples:**

>>> import bigframes.pandas as bpd
Expand All @@ -55,7 +53,16 @@ def cut(
3 {'left_exclusive': 7.5, 'right_inclusive': 10.0}
dtype: struct<left_exclusive: double, right_inclusive: double>[pyarrow]

Cut with an integer (equal-width bins) and labels=False:
Cut with the same bins, but assign them specific labels:

>>> bpd.cut(s, bins=3, labels=["bad", "medium", "good"])
0 bad
1 bad
2 medium
3 good
dtype: string

`labels=False` implies you want the bins back.

>>> bpd.cut(s, bins=4, labels=False)
0 0
Expand All @@ -67,7 +74,6 @@ def cut(
Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex:

>>> import pandas as pd

>>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)])
>>> bpd.cut(s, bins=interval_index)
0 <NA>
Expand Down Expand Up @@ -107,7 +113,7 @@ def cut(
dtype: struct<left_inclusive: int64, right_exclusive: int64>[pyarrow]

Args:
x (Series):
x (bigframes.pandas.Series):
The input Series to be binned. Must be 1-dimensional.
bins (int, pd.IntervalIndex, Iterable):
The criteria to bin by.
Expand All @@ -127,10 +133,11 @@ def cut(
``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
indicate (1,2], (2,3], (3,4]. This argument is ignored when
`bins` is an IntervalIndex.
labels (default None):
labels (bool, Iterable, default None):
Specifies the labels for the returned bins. Must be the same length as
the resulting bins. If False, returns only integer indicators of the
bins. This affects the type of the output container.
bins. This affects the type of the output container. This argument is
ignored when `bins` is an IntervalIndex. If True, raises an error.

Returns:
bigframes.pandas.Series:
Expand Down