Skip to content

feat: show possible correct key(s) in .__getitem__ KeyError message #1097

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Oct 23, 2024
16 changes: 15 additions & 1 deletion bigframes/core/groupby/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import bigframes_vendored.constants as constants
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
import jellyfish
import pandas as pd

from bigframes.core import log_adapter
Expand Down Expand Up @@ -91,8 +92,21 @@ def __getitem__(

bad_keys = [key for key in keys if key not in self._block.column_labels]

# Raise a KeyError message with the possible correct key(s)
if len(bad_keys) > 0:
raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
possible_key = []
for bad_key in bad_keys:
possible_key.append(
min(
self._block.column_labels,
key=lambda item: jellyfish.damerau_levenshtein_distance(
bad_key, item
),
)
)
raise KeyError(
f"Columns not found: {str(bad_keys)[1:-1]}. Did you mean {str(possible_key)[1:-1]}?"
)

columns = [
col_id for col_id, label in self._col_id_labels.items() if label in keys
Expand Down
37 changes: 21 additions & 16 deletions tests/system/small/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,24 +426,12 @@ def test_dataframe_groupby_getitem_error(
scalars_pandas_df_index,
):
col_names = ["float64_col", "int64_col", "bool_col", "string_col"]
with pytest.raises(KeyError, match="\"Columns not found: 'not_in_group'\""):
(
scalars_df_index[col_names]
.groupby("string_col")["not_in_group"]
.min()
.to_pandas()
)


def test_dataframe_groupby_getitem_multiple_columns_error(
scalars_df_index,
scalars_pandas_df_index,
):
col_names = ["float64_col", "int64_col", "bool_col", "string_col"]
with pytest.raises(KeyError, match="\"Columns not found: 'col1', 'col2'\""):
with pytest.raises(
KeyError, match=r"Columns not found: 'not_in_group'. Did you mean 'string_col'?"
):
(
scalars_df_index[col_names]
.groupby("string_col")["col1", "col2"]
.groupby("bool_col")["not_in_group"]
.min()
.to_pandas()
)
Expand All @@ -464,6 +452,23 @@ def test_dataframe_groupby_getitem_list(
pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)


def test_dataframe_groupby_getitem_list_error(
    scalars_df_index,
    scalars_pandas_df_index,
):
    """Check the KeyError raised when several selected columns do not exist.

    Selects two labels ('col1', 'float') that are not among ``col_names``
    from a groupby object and expects a KeyError whose message lists the
    missing labels followed by a "Did you mean ...?" suggestion.
    """
    col_names = ["float64_col", "int64_col", "bool_col", "string_col"]
    # `match` is applied as a regular expression, so the literal '.' and '?'
    # in the message are escaped. Unescaped, the trailing '?' would make the
    # closing quote optional and '.' would match any character.
    expected_message = (
        r"Columns not found: 'col1', 'float'\. "
        r"Did you mean 'bool_col', 'float64_col'\?"
    )
    with pytest.raises(KeyError, match=expected_message):
        (
            scalars_df_index[col_names]
            .groupby("string_col")["col1", "float"]
            .min()
            .to_pandas()
        )


def test_dataframe_groupby_nonnumeric_with_mean():
df = pd.DataFrame(
{
Expand Down