Skip to content

fix: Use exact median implementation by default #619

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def quantile(
columns: Sequence[str],
qs: Sequence[float],
grouping_column_ids: Sequence[str] = (),
dropna: bool = False,
) -> blocks.Block:
# TODO: handle windowing and more interpolation methods
window = core.WindowSpec(
Expand All @@ -134,7 +135,7 @@ def quantile(
block, results = block.aggregate(
grouping_column_ids,
tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols),
dropna=True,
dropna=dropna,
)
return block.select_columns(results).with_column_labels(labels)

Expand Down
8 changes: 4 additions & 4 deletions bigframes/core/groupby/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame:
self._raise_on_non_numeric("mean")
return self._aggregate_all(agg_ops.mean_op, numeric_only=True)

def median(
self, numeric_only: bool = False, *, exact: bool = False
) -> df.DataFrame:
def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame:
if not numeric_only:
self._raise_on_non_numeric("median")
if exact:
Expand All @@ -138,6 +136,7 @@ def quantile(
q_cols,
qs=tuple(q) if multi_q else (q,), # type: ignore
grouping_column_ids=self._by_col_ids,
dropna=self._dropna,
)
result_df = df.DataFrame(result)
if multi_q:
Expand Down Expand Up @@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series:
def median(
self,
*args,
exact: bool = False,
exact: bool = True,
**kwargs,
) -> series.Series:
if exact:
Expand All @@ -508,6 +507,7 @@ def quantile(
(self._value_column,),
qs=tuple(q) if multi_q else (q,), # type: ignore
grouping_column_ids=self._by_col_ids,
dropna=self._dropna,
)
if multi_q:
return series.Series(result.stack())
Expand Down
10 changes: 4 additions & 6 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1995,18 +1995,16 @@ def mean(
return bigframes.series.Series(block.select_column("values"))

def median(
self, *, numeric_only: bool = False, exact: bool = False
self, *, numeric_only: bool = False, exact: bool = True
) -> bigframes.series.Series:
if exact:
raise NotImplementedError(
f"Only approximate median is supported. {constants.FEEDBACK_LINK}"
)
if not numeric_only:
frame = self._raise_on_non_numeric("median")
else:
frame = self._drop_non_numeric()
if exact:
return self.quantile()
result = frame.quantile()
result.name = None
return result
else:
block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
return bigframes.series.Series(block.select_column("values"))
Expand Down
2 changes: 1 addition & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -966,7 +966,7 @@ def mode(self) -> Series:
def mean(self) -> float:
return typing.cast(float, self._apply_aggregation(agg_ops.mean_op))

def median(self, *, exact: bool = False) -> float:
def median(self, *, exact: bool = True) -> float:
if exact:
return typing.cast(float, self.quantile(0.5))
else:
Expand Down
29 changes: 24 additions & 5 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,10 +1345,9 @@ def test_numeric_literal(scalars_dfs):
scalars_df, _ = scalars_dfs
col_name = "numeric_col"
assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9))
bf_result = scalars_df[col_name] - scalars_df[col_name].median()
bf_result = scalars_df[col_name] + 42
assert bf_result.size == scalars_df[col_name].size
# TODO(b/323387826): The precision increased by 1 unexpectedly.
# assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9))
assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9))


def test_repr(scalars_dfs):
Expand Down Expand Up @@ -1523,12 +1522,32 @@ def test_groupby_mean(scalars_dfs):
)


def test_groupby_median(scalars_dfs):
def test_groupby_median_exact(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_too"
bf_series = (
bf_result = (
scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median()
)
pd_result = (
scalars_pandas_df[col_name]
.groupby(scalars_pandas_df["string_col"], dropna=False)
.median()
)

assert_series_equal(
pd_result,
bf_result.to_pandas(),
)


def test_groupby_median_inexact(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_too"
bf_series = (
scalars_df[col_name]
.groupby(scalars_df["string_col"], dropna=False)
.median(exact=False)
)
pd_max = (
scalars_pandas_df[col_name]
.groupby(scalars_pandas_df["string_col"], dropna=False)
Expand Down
12 changes: 6 additions & 6 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def median(self, *, numeric_only: bool = False, exact: bool = False):
def median(self, *, numeric_only: bool = False, exact: bool = True):
"""Return the median of the values over colunms.

**Examples:**
Expand All @@ -4500,15 +4500,15 @@ def median(self, *, numeric_only: bool = False, exact: bool = False):
Finding the median value of each column.

>>> df.median()
A 1
B 2
dtype: Int64
A 2.0
B 3.0
dtype: Float64

Args:
numeric_only (bool. default False):
Default False. Include only float, int, boolean columns.
exact (bool. default False):
Default False. Get the exact median instead of an approximate
exact (bool. default True):
Default True. Get the exact median instead of an approximate
one.

Returns:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,16 @@ def median(
self,
numeric_only: bool = False,
*,
exact: bool = False,
exact: bool = True,
):
"""
Compute median of groups, excluding missing values.

Args:
numeric_only (bool, default False):
Include only float, int, boolean columns.
exact (bool, default False):
Calculate the exact median instead of an approximation. Note:
``exact=True`` is not supported.
exact (bool, default True):
Calculate the exact median instead of an approximation.

Returns:
pandas.Series or pandas.DataFrame: Median of groups.
Expand Down
8 changes: 4 additions & 4 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3150,13 +3150,13 @@ def mean(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def median(self, *, exact: bool = False):
def median(self, *, exact: bool = True):
"""Return the median of the values over the requested axis.

Args:
exact (bool. default False):
Default False. Get the exact median instead of an approximate
one. Note: ``exact=True`` not yet supported.
exact (bool. default True):
Default True. Get the exact median instead of an approximate
one.

Returns:
scalar: Scalar.
Expand Down