Skip to content

feat: allow case_when to change dtypes if case list contains the condition True #1311

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,19 @@ def between(self, left, right, inclusive="both"):
)

def case_when(self, caselist) -> Series:
cases = list(itertools.chain(*caselist, (True, self)))
cases = []

for condition, output in itertools.chain(caselist, [(True, self)]):
cases.append(condition)
cases.append(output)
# In pandas, the default value if no case matches is the original value.
# This makes it impossible to change the type of the column, but if
# the condition is always True, we know it will match and no subsequent
# conditions matter (including the fallback to `self`). This break allows
# the type to change (see: internal issue 349926559).
if condition is True:
break

return self._apply_nary_op(
ops.case_when_op,
cases,
Expand Down
36 changes: 36 additions & 0 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2862,6 +2862,42 @@ def test_series_case_when(scalars_dfs_maybe_ordered):
)


def test_series_case_when_change_type(scalars_dfs_maybe_ordered):
    """Check that a trailing ``True`` case lets ``case_when`` change the dtype.

    Runs ``case_when`` on the ``int64_col`` column with replacement values
    that are strings (or ``pd.NA``) and compares the BigQuery DataFrames
    result with the pandas result cast to ``string[pyarrow]``.

    Skipped when the installed pandas is older than 2.2.0, which is the
    release that introduced ``Series.case_when``.
    """
    pytest.importorskip(
        "pandas", minversion="2.2.0", reason="case_when added in pandas 2.2.0"
    )
    bf_df, pd_df = scalars_dfs_maybe_ordered

    bf_ints = bf_df["int64_col"]
    pd_ints = pd_df["int64_col"]

    # TODO(tswast): pandas case_when appears to assume True when a value is
    # null. I suspect this should be considered a bug in pandas. Until that
    # is settled, nulls in every comparison mask are filled with True on both
    # sides so the two engines see the same conditions.
    bf_is_large = (bf_ints > 645).fillna(True)
    bf_is_very_negative = (bf_ints <= -100).fillna(True)
    bf_cases = [
        (bf_is_large, bf_df["string_col"]),
        (bf_is_very_negative, pd.NA),
        # Literal catch-all: this is the case that permits the dtype change.
        (True, "not_found"),
    ]

    pd_is_large = (pd_ints > 645).fillna(True)
    pd_is_very_negative = (pd_ints <= -100).fillna(True)
    row_count = len(pd_ints)
    pd_cases = [
        (pd_is_large, pd_df["string_col"]),
        (pd_is_very_negative, pd.NA),
        # pandas currently fails if both the condition and the value are
        # literals, so the catch-all is spelled out as full-length lists.
        ([True] * row_count, ["not_found"] * row_count),
    ]

    actual = bf_ints.case_when(bf_cases).to_pandas()
    expected = pd_ints.case_when(pd_cases).astype("string[pyarrow]")

    pd.testing.assert_series_equal(actual, expected)


def test_to_frame(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

Expand Down
15 changes: 15 additions & 0 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2648,6 +2648,21 @@ def case_when(
3 2
Name: c, dtype: Int64

If you'd like to change the type, add a case with the condition ``True`` at the end of the case list:

>>> c.case_when(
... caselist=[
... (a.gt(0), 'a'), # condition, replacement
... (b.gt(0), 'b'),
... (True, 'c'),
... ]
... )
0 c
1 b
2 a
3 a
Name: c, dtype: string

**See also:**

- :func:`bigframes.pandas.Series.mask` : Replace values where the condition is True.
Expand Down