Skip to content

fix: only do row identity based joins when joining by index #356

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 7, 2024
Merged
2 changes: 1 addition & 1 deletion bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ def join(
self,
other: ArrayValue,
join_def: join_def.JoinDefinition,
allow_row_identity_join: bool = True,
allow_row_identity_join: bool = False,
):
return ArrayValue(
nodes.JoinNode(
Expand Down
4 changes: 2 additions & 2 deletions bigframes/core/compile/single_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def join_by_column_ordered(
left: compiled.OrderedIR,
right: compiled.OrderedIR,
join: join_defs.JoinDefinition,
allow_row_identity_join: bool = True,
allow_row_identity_join: bool = False,
) -> compiled.OrderedIR:
"""Join two expressions by column equality.

Expand Down Expand Up @@ -134,7 +134,7 @@ def join_by_column_unordered(
left: compiled.UnorderedIR,
right: compiled.UnorderedIR,
join: join_defs.JoinDefinition,
allow_row_identity_join: bool = True,
allow_row_identity_join: bool = False,
) -> compiled.UnorderedIR:
"""Join two expressions by column equality.

Expand Down
2 changes: 1 addition & 1 deletion bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ class JoinNode(BigFrameNode):
left_child: BigFrameNode
right_child: BigFrameNode
join: JoinDefinition
allow_row_identity_join: bool = True
allow_row_identity_join: bool = False

@property
def row_preserving(self) -> bool:
Expand Down
7 changes: 7 additions & 0 deletions tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,13 @@ def scalars_table_id(test_data_tables) -> str:
return test_data_tables["scalars"]


@pytest.fixture(scope="session")
def baseball_schedules_df(session: bigframes.Session) -> bigframes.dataframe.DataFrame:
    """Load the public ``bigquery-public-data.baseball.schedules`` table.

    The table is read through ``session.read_gbq`` and the resulting
    DataFrame is returned unchanged.
    """
    return session.read_gbq("bigquery-public-data.baseball.schedules")


@pytest.fixture(scope="session")
def hockey_table_id(test_data_tables) -> str:
    """Return the ``"hockey_players"`` entry of ``test_data_tables``."""
    table_id = test_data_tables["hockey_players"]
    return table_id
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import pytest

from tests.system.utils import assert_pandas_df_equal


@pytest.mark.parametrize(
    ("merge_how",),
    [
        ("inner",),
        ("outer",),
        ("left",),
        ("right",),
    ],
)
def test_merge_after_filter(baseball_schedules_df, merge_how):
    """Merge two differently-filtered views of one table and compare to pandas.

    Both sides of the merge are column selections of the same source
    DataFrame, each filtered on a different ``homeTeamName`` value and then
    joined on ``awayTeamName``. The same selection, filtering and merge are
    repeated in pandas, and the two results are compared ignoring row order.

    Args:
        baseball_schedules_df: Fixture providing the source DataFrame.
        merge_how: Join type passed as ``how`` to both merge calls.
    """
    on = ["awayTeamName"]
    left_columns = [
        "gameId",
        "year",
        "homeTeamName",
        "awayTeamName",
        "duration_minutes",
    ]
    right_columns = [
        "gameId",
        "year",
        "homeTeamName",
        "awayTeamName",
        "duration_minutes",
    ]

    left = baseball_schedules_df[left_columns]
    left = left[left["homeTeamName"] == "Rays"]
    # Offset the rows somewhat so that outer join can have an effect.
    right = baseball_schedules_df[right_columns]
    right = right[right["homeTeamName"] == "White Sox"]

    df = left.merge(right, on=on, how=merge_how)
    bf_result = df.to_pandas()

    # Materialize the source table once and reuse it for both sides, rather
    # than downloading the same data twice per parametrized case.
    schedules_pandas = baseball_schedules_df.to_pandas()

    left_pandas = schedules_pandas[left_columns]
    left_pandas = left_pandas[left_pandas["homeTeamName"] == "Rays"]

    right_pandas = schedules_pandas[right_columns]
    right_pandas = right_pandas[right_pandas["homeTeamName"] == "White Sox"]

    # Pass `how` and `on` by keyword so the call does not depend on the
    # positional order of pd.merge's parameters.
    pd_result = pd.merge(
        left_pandas,
        right_pandas,
        how=merge_how,
        on=on,
        sort=True,
    )

    assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)