Skip to content

fix: handle multi-level columns for df aggregates properly #305

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 10, 2024
12 changes: 10 additions & 2 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,13 +855,21 @@ def aggregate_all_and_stack(
aggregations = [
(col_id, operation, col_id) for col_id in self.value_columns
]
index_col_ids = [
guid.generate_guid() for i in range(self.column_labels.nlevels)
]
result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot(
row_labels=self.column_labels.to_list(),
index_col_ids=["index"],
index_col_ids=index_col_ids,
unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]),
dtype=dtype,
)
return Block(result_expr, index_columns=["index"], column_labels=[None])
return Block(
result_expr,
index_columns=index_col_ids,
column_labels=[None],
index_labels=self.column_labels.names,
)
else: # axis_n == 1
# using offsets as identity to group on.
# TODO: Allow to promote identity/total_order columns instead for better perf
Expand Down
20 changes: 20 additions & 0 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,26 @@ def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index)
pandas.testing.assert_series_equal(bf_result, pd_result)


@skip_legacy_pandas
def test_column_multi_index_any():
    """Compare ``isna().any()`` on a frame with two-level column labels.

    The same data is loaded into a pandas DataFrame and a ``bpd.DataFrame``;
    the column-wise ``any`` reduction of each is flattened with
    ``reset_index`` and the two resulting frames are compared, ignoring
    dtypes.
    """
    # Note: the first two label tuples are identical, so the column labels
    # contain a duplicate entry.
    multi_columns = pandas.MultiIndex.from_tuples(
        [("col0", "col00"), ("col0", "col00"), ("col1", "col11")]
    )
    rows = [[0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2]]
    expected_source = pandas.DataFrame(rows, columns=multi_columns)
    actual_source = bpd.DataFrame(expected_source)

    expected = expected_source.isna().any()
    actual = actual_source.isna().any().to_pandas()

    # reset_index turns each Series (indexed by the column labels) into a
    # DataFrame so that the index levels are compared as ordinary columns.
    actual_frame = actual.reset_index(drop=False)
    expected_frame = expected.reset_index(drop=False)
    pandas.testing.assert_frame_equal(
        actual_frame,
        expected_frame,
        check_dtype=False,
    )


def test_column_multi_index_agg(scalars_df_index, scalars_pandas_df_index):
columns = ["int64_too", "int64_col", "float64_col"]
multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
Expand Down