Skip to content

Commit 8b8155f

Browse files
feat: Add DataFrame.mask method (#1302)
1 parent 3bee635 commit 8b8155f

File tree

3 files changed

+106
-0
lines changed

3 files changed

+106
-0
lines changed

‎bigframes/dataframe.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2365,6 +2365,9 @@ def where(self, cond, other=None):
23652365
result.columns.names = self.columns.names
23662366
return result
23672367

2368+
def mask(self, cond, other=None):
    # Inverse of ``where``: entries for which ``cond`` is True are replaced
    # with ``other``; entries for which it is False keep their value.
    #
    # A callable condition must be evaluated against this DataFrame before
    # it is negated: ``~`` is not defined for plain functions, so forwarding
    # the callable untouched would raise ``TypeError``.
    if callable(cond):
        cond = cond(self)
    return self.where(~cond, other=other)
2370+
23682371
def dropna(
23692372
self,
23702373
*,

‎tests/system/small/test_dataframe.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,17 @@ def test_where_series_cond(scalars_df_index, scalars_pandas_df_index):
331331
pandas.testing.assert_frame_equal(bf_result, pd_result)
332332

333333

334+
def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index):
    """Check ``DataFrame.mask`` with a boolean Series condition against pandas."""
    selected = ["int64_too", "int64_col", "float64_col"]
    bf_frame = scalars_df_index[selected]
    pd_frame = scalars_pandas_df_index[selected]

    # Row-wise condition built from a single column of each source frame.
    bf_cond = scalars_df_index["int64_col"] > 0
    pd_cond = scalars_pandas_df_index["int64_col"] > 0

    # The replacement is a DataFrame of the same shape (original values + 1).
    actual = bf_frame.mask(bf_cond, bf_frame + 1).to_pandas()
    expected = pd_frame.mask(pd_cond, pd_frame + 1)

    pandas.testing.assert_frame_equal(actual, expected)
343+
344+
334345
def test_where_series_multi_index(scalars_df_index, scalars_pandas_df_index):
335346
# Test when a dataframe has multi-index or multi-columns.
336347
columns = ["int64_col", "float64_col"]

‎third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2048,6 +2048,98 @@ def where(self, cond, other):
20482048
"""
20492049
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
20502050

2051+
def mask(self, cond, other):
    """Replace values where the condition is True.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]})
        >>> df
            a   b
        0  20   0
        1  10  10
        2   0  20
        <BLANKLINE>
        [3 rows x 2 columns]

    You can mask the values in the dataframe based on a condition. The
    values matching the condition would be replaced, and not matching would
    be kept. The default replacement value is ``NA``. For example, when the
    condition is a dataframe:

        >>> df.mask(df > 0)
              a     b
        0  <NA>     0
        1  <NA>  <NA>
        2     0  <NA>
        <BLANKLINE>
        [3 rows x 2 columns]

    You can specify a custom replacement value for matching values.

        >>> df.mask(df > 0, -1)
            a   b
        0  -1   0
        1  -1  -1
        2   0  -1
        <BLANKLINE>
        [3 rows x 2 columns]

    Besides dataframe, the condition can be a series too. For example:

        >>> df.mask(df['a'] > 10, -1)
            a   b
        0  -1  -1
        1  10  10
        2   0  20
        <BLANKLINE>
        [3 rows x 2 columns]

    As for the replacement, it can be a dataframe too. For example:

        >>> df.mask(df > 10, -df)
             a    b
        0  -20    0
        1   10   10
        2    0  -20
        <BLANKLINE>
        [3 rows x 2 columns]

        >>> df.mask(df['a'] > 10, -df)
             a   b
        0  -20   0
        1   10  10
        2    0  20
        <BLANKLINE>
        [3 rows x 2 columns]

    Please note, replacement doesn't support Series for now. In pandas, when
    specifying a Series as replacement, the axis value should be specified
    at the same time, which is not supported in bigframes DataFrame.

    Args:
        cond (bool Series/DataFrame, array-like, or callable):
            Where cond is False, keep the original value. Where True, replace
            with corresponding value from other. If cond is callable, it is
            computed on the Series/DataFrame and returns boolean
            Series/DataFrame or array. The callable must not change input
            Series/DataFrame (though pandas doesn't check it).
        other (scalar, DataFrame, or callable):
            Entries where cond is True are replaced with corresponding value
            from other. If other is callable, it is computed on the
            DataFrame and returns scalar or DataFrame. The callable must not
            change input DataFrame (though pandas doesn't check it). If not
            specified, entries will be filled with the corresponding NULL
            value (np.nan for numpy dtypes, pd.NA for extension dtypes).

    Returns:
        DataFrame: DataFrame after the replacement.
    """
    # Documentation-only stub: calling it directly always raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
2142+
20512143
# ----------------------------------------------------------------------
20522144
# Sorting
20532145

0 commit comments

Comments
 (0)