Skip to content

Commit e43d15d

Browse files
authored
feat: df.to_pandas_batches() returns one empty DataFrame if df is empty (#1878)
1 parent 7e8658b commit e43d15d

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

‎bigframes/core/blocks.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -620,15 +620,31 @@ def to_pandas_batches(
620620
ordered=True,
621621
use_explicit_destination=allow_large_results,
622622
)
623+
624+
total_batches = 0
623625
for df in execute_result.to_pandas_batches(
624626
page_size=page_size, max_results=max_results
625627
):
628+
total_batches += 1
626629
self._copy_index_to_pandas(df)
627630
if squeeze:
628631
yield df.squeeze(axis=1)
629632
else:
630633
yield df
631634

635+
# To reduce the number of edge cases to consider when working with the
636+
# results of this, always return at least one DataFrame. See:
637+
# b/428918844.
638+
if total_batches == 0:
639+
df = pd.DataFrame(
640+
{
641+
col: pd.Series([], dtype=self.expr.get_column_type(col))
642+
for col in itertools.chain(self.value_columns, self.index_columns)
643+
}
644+
)
645+
self._copy_index_to_pandas(df)
646+
yield df
647+
632648
def _copy_index_to_pandas(self, df: pd.DataFrame):
633649
"""Set the index on pandas DataFrame to match this block.
634650

‎tests/system/small/test_dataframe_io.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -347,6 +347,28 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
347347
pd.testing.assert_series_equal(actual, expected)
348348

349349

350+
def test_to_pandas_batches_w_empty_dataframe(session):
    """Check that to_pandas_batches() yields exactly one DataFrame for empty input.

    The single batch is expected to carry the same index names, column
    labels, and dtypes as the empty source frame.

    See b/428918844 for additional context.
    """
    source_columns = {
        "idx1": [],
        "idx2": [],
        "col1": pandas.Series([], dtype="string[pyarrow]"),
        "col2": pandas.Series([], dtype="Int64"),
    }
    empty = bpd.DataFrame(source_columns, session=session)
    empty = empty.set_index(["idx1", "idx2"], drop=True)

    # Materialize the iterator so the number of batches can be counted.
    batches = list(empty.to_pandas_batches())
    assert len(batches) == 1

    (only_batch,) = batches
    assert list(only_batch.index.names) == ["idx1", "idx2"]
    assert list(only_batch.columns) == ["col1", "col2"]
    pandas.testing.assert_series_equal(only_batch.dtypes, empty.dtypes)
370+
371+
350372
@pytest.mark.parametrize("allow_large_results", (True, False))
351373
def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
352374
"""Verify to_pandas_batches() APIs returns the expected page size.

0 commit comments

Comments
 (0)