Skip to content

fix: translate labels to col ids when copying dataframes #1372

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 6, 2025
20 changes: 20 additions & 0 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,26 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]:
mapping[label] = (*mapping.get(label, ()), id)
return mapping

def resolve_label_exact(self, label: Label) -> Optional[str]:
    """Return the id of the single column whose label equals ``label``.

    Args:
        label: Column label to look up in ``self.label_to_col_id``.

    Returns:
        The matching column id, or ``None`` if no column has this label.

    Raises:
        ValueError: If more than one column has this label.
    """
    # An empty tuple default avoids allocating a list on every miss.
    matches = self.label_to_col_id.get(label, ())
    if len(matches) > 1:
        raise ValueError(
            f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}"
        )
    if not matches:
        return None
    # Exactly one match remains at this point.
    return matches[0]

def resolve_label_exact_or_error(self, label: Label) -> str:
    """Return the id of the single column whose label equals ``label``.

    Strict variant of ``resolve_label_exact``: a missing label is an error
    instead of a ``None`` result.

    Args:
        label: Column label to resolve.

    Returns:
        The matching column id.

    Raises:
        ValueError: If no column has this label. Any error raised by
            ``resolve_label_exact`` propagates unchanged.
    """
    col_id = self.resolve_label_exact(label)
    if col_id is None:
        raise ValueError(f"Label {label} not found. {constants.FEEDBACK_LINK}")
    return col_id

@functools.cached_property
def col_id_to_index_name(self) -> typing.Mapping[str, Label]:
"""Get column label for value columns, or index name for index columns"""
Expand Down
15 changes: 5 additions & 10 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,10 @@ def __init__(
)
block = block.set_index([r_mapping[idx_col] for idx_col in idx_cols])
if columns:
block = block.select_columns(list(columns)) # type:ignore
column_ids = [
block.resolve_label_exact_or_error(label) for label in list(columns)
]
block = block.select_columns(column_ids) # type:ignore
if dtype:
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
Expand Down Expand Up @@ -238,15 +241,7 @@ def _find_indices(
return [self._block.value_columns.index(col_id) for col_id in col_ids]

def _resolve_label_exact(self, label) -> Optional[str]:
"""Returns the column id matching the label if there is exactly
one such column. If there are multiple columns with the same name,
raises an error. If there is no such column, returns None."""
matches = self._block.label_to_col_id.get(label, [])
if len(matches) > 1:
raise ValueError(
f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}"
)
return matches[0] if len(matches) != 0 else None
return self._block.resolve_label_exact(label)

def _sql_names(
self,
Expand Down
11 changes: 9 additions & 2 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,15 @@
def test_df_construct_copy(scalars_dfs):
    """Check that copy-constructing with ``columns=`` matches pandas.

    The frames are modified before copying so that the result is computed
    from a frame whose ``int64_col`` has been reassigned.
    """
    columns = ["int64_col", "string_col", "float64_col"]
    scalars_df, scalars_pandas_df = scalars_dfs
    # Make the mapping from label to col_id non-trivial
    bf_df = scalars_df.copy()
    bf_df["int64_col"] = bf_df["int64_col"] / 2
    pd_df = scalars_pandas_df.copy()
    pd_df["int64_col"] = pd_df["int64_col"] / 2

    bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas()

    pd_result = pd.DataFrame(pd_df, columns=columns)
    pandas.testing.assert_frame_equal(bf_result, pd_result)


Expand Down