Skip to content

feat: support list of numerics in pandas.cut #580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 12, 2024
26 changes: 23 additions & 3 deletions bigframes/core/reshape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from __future__ import annotations

import typing
from typing import Iterable, Literal, Optional, Tuple, Union
from typing import Iterable, Literal, Optional, Union

import pandas as pd

Expand Down Expand Up @@ -113,7 +113,7 @@ def cut(
bins: Union[
int,
pd.IntervalIndex,
Iterable[Tuple[Union[int, float], Union[int, float]]],
Iterable,
],
*,
labels: Optional[bool] = None,
Expand All @@ -125,9 +125,29 @@ def cut(
if isinstance(bins, pd.IntervalIndex):
as_index: pd.IntervalIndex = bins
bins = tuple((bin.left.item(), bin.right.item()) for bin in bins)
else:
elif len(list(bins)) == 0:
raise ValueError("`bins` iterable should have at least one item")
elif isinstance(list(bins)[0], tuple):
as_index = pd.IntervalIndex.from_tuples(list(bins))
bins = tuple(bins)
elif pd.api.types.is_number(list(bins)[0]):
bins_list = list(bins)
if len(bins_list) < 2:
raise ValueError(
"`bins` iterable of numeric breaks should have"
" at least two items"
)
as_index = pd.IntervalIndex.from_breaks(bins_list)
single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list])
numeric_type = type(bins_list[0]) if single_type else float
bins = tuple(
[
(numeric_type(bins_list[i]), numeric_type(bins_list[i + 1]))
for i in range(len(bins_list) - 1)
]
)
else:
raise ValueError("`bins` iterable should contain tuples or numerics")

if as_index.is_overlapping:
raise ValueError("Overlapping IntervalIndex is not accepted.")
Expand Down
6 changes: 3 additions & 3 deletions bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import abc
import dataclasses
import typing
from typing import ClassVar, Hashable, Optional, Tuple
from typing import ClassVar, Iterable, Optional

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -213,7 +213,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
@dataclasses.dataclass(frozen=True)
class CutOp(UnaryWindowOp):
# TODO: Unintuitive, refactor into multiple ops?
bins: typing.Union[int, Tuple[Tuple[Hashable, Hashable], ...]]
bins: typing.Union[int, Iterable]
labels: Optional[bool]

@property
Expand All @@ -232,7 +232,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
interval_dtype = (
pa.float64()
if isinstance(self.bins, int)
else dtypes.infer_literal_arrow_type(self.bins[0][0])
else dtypes.infer_literal_arrow_type(list(self.bins)[0][0])
)
pa_type = pa.struct(
[
Expand Down
52 changes: 52 additions & 0 deletions tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,58 @@ def test_cut_default_labels(scalars_dfs):
)


@pytest.mark.parametrize(
    ("breaks",),
    [
        # All-integer break points.
        ([0, 5, 10, 15, 20, 100, 1000],),
        # All-float break points.
        ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],),
        # Integers and floats mixed in one list.
        ([0, 5, 10.5, 15.5, 20, 100, 1000.5],),
    ],
)
def test_cut_numeric_breaks(scalars_dfs, breaks):
    """Check that `bpd.cut` with a list of numeric breaks agrees with `pd.cut`.

    pandas returns a categorical Series of `Interval` objects, while the
    BigQuery DataFrames result is compared here as rows of
    `{"left_exclusive": ..., "right_inclusive": ...}` dicts, so the pandas
    result is reshaped into that form before the comparison.
    """
    bf_frame, pandas_frame = scalars_dfs

    expected_categorical = pd.cut(pandas_frame["float64_col"], breaks)
    actual = bpd.cut(bf_frame["float64_col"], breaks).to_pandas()

    # Look up one Interval per row from the category codes. Rows that pandas
    # left unbinned are detected with `pd.notna` below and emitted as pd.NA,
    # so whatever interval the lookup yields for them is never read.
    row_intervals = expected_categorical.cat.categories[
        expected_categorical.cat.codes
    ]
    struct_rows = []
    for value, interval in zip(expected_categorical, row_intervals):
        if pd.notna(value):
            struct_rows.append(
                {
                    "left_exclusive": interval.left,
                    "right_inclusive": interval.right,
                }
            )
        else:
            struct_rows.append(pd.NA)
    expected = pd.Series(struct_rows, name=expected_categorical.name)

    # Only values are compared: index and dtype are deliberately ignored.
    pd.testing.assert_series_equal(
        actual, expected, check_index=False, check_dtype=False
    )


@pytest.mark.parametrize(
    ("bins",),
    [
        # Negative integer bins argument.
        (-1,),
        # Empty iterable of bins.
        ([],),
        # Iterable whose items are neither tuples nor numerics.
        (["notabreak"],),
        # Numeric breaks with only one numeric. pandas supports this, but
        # the BigQuery operation does not and no bigframes workaround is
        # available yet. Once supported it should return a column of
        # structs with all NaN values.
        ([1],),
    ],
)
def test_cut_errors(scalars_dfs, bins):
    """Check that `bpd.cut` raises `ValueError` for each invalid `bins` value."""
    bf_frame, _ = scalars_dfs

    with pytest.raises(ValueError):
        float_series = bf_frame["float64_col"]
        bpd.cut(float_series, bins)


@pytest.mark.parametrize(
("bins",),
[
Expand Down
16 changes: 15 additions & 1 deletion third_party/bigframes_vendored/pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,20 @@ def cut(
3 {'left_exclusive': 5, 'right_inclusive': 20}
dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]

Cut with an iterable of ints:

>>> bins_ints = [0, 1, 5, 20]
>>> bpd.cut(s, bins=bins_ints)
0 <NA>
1 {'left_exclusive': 0, 'right_inclusive': 1}
2 {'left_exclusive': 1, 'right_inclusive': 5}
3 {'left_exclusive': 5, 'right_inclusive': 20}
dtype: struct<left_exclusive: int64, right_inclusive: int64>[pyarrow]

Args:
x (Series):
The input Series to be binned. Must be 1-dimensional.
bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]):
bins (int, pd.IntervalIndex, Iterable):
The criteria to bin by.

int: Defines the number of equal-width bins in the range of `x`. The
Expand All @@ -88,6 +98,10 @@ def cut(

pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used.
It's important to ensure that these bins are non-overlapping.

Iterable of numerics: Defines the exact bins by using the interval
between each item and its following item. The items must be monotonically
increasing.
labels (None):
Specifies the labels for the returned bins. Must be the same length as
the resulting bins. If False, returns only integer indicators of the
Expand Down