Skip to content

Commit aeb22ad

Browse files
perf: Automatically squash internal projection nodes and use internal schema system.
1 parent e8e66cf commit aeb22ad

File tree

4 files changed

+17
-32
lines changed

4 files changed

+17
-32
lines changed

‎bigframes/core/__init__.py

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def session(self) -> Session:
107107
@functools.cached_property
108108
def schema(self) -> schemata.ArraySchema:
109109
# TODO: switch to use self.node.schema
110-
return self._compiled_schema
110+
return self.node.schema
111111

112112
@functools.cached_property
113113
def _compiled_schema(self) -> schemata.ArraySchema:
@@ -118,18 +118,6 @@ def _compiled_schema(self) -> schemata.ArraySchema:
118118
)
119119
return schemata.ArraySchema(items)
120120

121-
def validate_schema(self):
122-
tree_derived = self.node.schema
123-
ibis_derived = self._compiled_schema
124-
if tree_derived.names != ibis_derived.names:
125-
raise ValueError(
126-
f"Unexpected names internal {tree_derived.names} vs compiled {ibis_derived.names}"
127-
)
128-
if tree_derived.dtypes != ibis_derived.dtypes:
129-
raise ValueError(
130-
f"Unexpected types internal {tree_derived.dtypes} vs compiled {ibis_derived.dtypes}"
131-
)
132-
133121
def _try_evaluate_local(self):
134122
"""Use only for unit testing paths - not fully featured. Will throw exception if fails."""
135123
import ibis
@@ -196,7 +184,7 @@ def project_to_id(self, expression: ex.Expression, output_id: str):
196184
child=self.node,
197185
assignments=tuple(exprs),
198186
)
199-
)
187+
).rewrite_projection()
200188

201189
def assign(self, source_id: str, destination_id: str) -> ArrayValue:
202190
if destination_id in self.column_ids: # Mutate case
@@ -221,7 +209,7 @@ def assign(self, source_id: str, destination_id: str) -> ArrayValue:
221209
child=self.node,
222210
assignments=tuple(exprs),
223211
)
224-
)
212+
).rewrite_projection()
225213

226214
def assign_constant(
227215
self,
@@ -251,7 +239,7 @@ def assign_constant(
251239
child=self.node,
252240
assignments=tuple(exprs),
253241
)
254-
)
242+
).rewrite_projection()
255243

256244
def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
257245
selections = ((ex.free_var(col_id), col_id) for col_id in column_ids)
@@ -260,7 +248,7 @@ def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
260248
child=self.node,
261249
assignments=tuple(selections),
262250
)
263-
)
251+
).rewrite_projection()
264252

265253
def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
266254
new_projection = (
@@ -273,7 +261,7 @@ def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
273261
child=self.node,
274262
assignments=tuple(new_projection),
275263
)
276-
)
264+
).rewrite_projection()
277265

278266
def aggregate(
279267
self,
@@ -404,3 +392,7 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue:
404392
The row numbers of result is non-deterministic, avoid to use.
405393
"""
406394
return ArrayValue(nodes.RandomSampleNode(self.node, fraction))
395+
396+
def rewrite_projection(self) -> ArrayValue:
397+
rewritten = bigframes.core.rewrite.maybe_squash_projection(self.node)
398+
return ArrayValue(rewritten)

‎bigframes/core/rewrite.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,13 @@ def expand(self) -> nodes.BigFrameNode:
170170
return nodes.ProjectionNode(child=root, assignments=self.columns)
171171

172172

173+
def maybe_squash_projection(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
174+
squashed = SquashedSelect.from_node(node)
175+
if squashed.root not in node.child_nodes:
176+
return squashed.expand()
177+
return node
178+
179+
173180
def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode:
174181
left_side = SquashedSelect.from_node(join_node.left_child)
175182
right_side = SquashedSelect.from_node(join_node.right_child)

‎bigframes/dataframe.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from __future__ import annotations
1818

1919
import datetime
20-
import os
2120
import re
2221
import sys
2322
import textwrap
@@ -174,11 +173,6 @@ def __init__(
174173
self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
175174
self._query_job: Optional[bigquery.QueryJob] = None
176175

177-
# Runs strict validations to ensure internal type predictions and ibis are completely in sync
178-
# Do not execute these validations outside of testing suite.
179-
if "PYTEST_CURRENT_TEST" in os.environ:
180-
self._block.expr.validate_schema()
181-
182176
def __dir__(self):
183177
return dir(type(self)) + [
184178
label

‎tests/system/small/test_dataframe.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
import io
1616
import operator
17-
import sys
1817
import tempfile
1918
import typing
2019
from typing import Tuple
@@ -4034,13 +4033,6 @@ def test_df_dot_operator_series(
40344033
)
40354034

40364035

4037-
# TODO(tswast): We may be able to re-enable this test after we break large
4038-
# queries up in https://github.com/googleapis/python-bigquery-dataframes/pull/427
4039-
@pytest.mark.skipif(
4040-
sys.version_info >= (3, 12),
4041-
# See: https://github.com/python/cpython/issues/112282
4042-
reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
4043-
)
40444036
def test_recursion_limit(scalars_df_index):
40454037
scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]]
40464038
for i in range(400):

0 commit comments

Comments
 (0)