29
29
import random
30
30
import textwrap
31
31
import typing
32
- from typing import Iterable , List , Literal , Mapping , Optional , Sequence , Tuple , Union
32
+ from typing import (
33
+ Iterable ,
34
+ Iterator ,
35
+ List ,
36
+ Literal ,
37
+ Mapping ,
38
+ Optional ,
39
+ Sequence ,
40
+ Tuple ,
41
+ Union ,
42
+ )
33
43
import warnings
34
44
35
45
import bigframes_vendored .constants as constants
87
97
LevelsType = typing .Union [LevelType , typing .Sequence [LevelType ]]
88
98
89
99
90
- class BlockHolder (typing .Protocol ):
100
+ @dataclasses .dataclass
101
+ class PandasBatches (Iterator [pd .DataFrame ]):
91
102
"""Interface for mutable objects with state represented by a block value object."""
92
103
93
- def _set_block (self , block : Block ):
94
- """Set the underlying block value of the object"""
104
+ def __init__ (
105
+ self , pandas_batches : Iterator [pd .DataFrame ], total_rows : Optional [int ] = 0
106
+ ):
107
+ self ._dataframes : Iterator [pd .DataFrame ] = pandas_batches
108
+ self ._total_rows : Optional [int ] = total_rows
109
+
110
+ @property
111
+ def total_rows (self ) -> Optional [int ]:
112
+ return self ._total_rows
95
113
96
- def _get_block (self ) -> Block :
97
- """Get the underlying block value of the object"""
114
+ def __next__ (self ) -> pd . DataFrame :
115
+ return next ( self . _dataframes )
98
116
99
117
100
118
@dataclasses .dataclass ()
@@ -599,8 +617,7 @@ def try_peek(
599
617
self .expr , n , use_explicit_destination = allow_large_results
600
618
)
601
619
df = result .to_pandas ()
602
- self ._copy_index_to_pandas (df )
603
- return df
620
+ return self ._copy_index_to_pandas (df )
604
621
else :
605
622
return None
606
623
@@ -609,8 +626,7 @@ def to_pandas_batches(
609
626
page_size : Optional [int ] = None ,
610
627
max_results : Optional [int ] = None ,
611
628
allow_large_results : Optional [bool ] = None ,
612
- squeeze : Optional [bool ] = False ,
613
- ):
629
+ ) -> Iterator [pd .DataFrame ]:
614
630
"""Download results one message at a time.
615
631
616
632
page_size and max_results determine the size and number of batches,
@@ -621,43 +637,43 @@ def to_pandas_batches(
621
637
use_explicit_destination = allow_large_results ,
622
638
)
623
639
624
- total_batches = 0
625
- for df in execute_result .to_pandas_batches (
626
- page_size = page_size , max_results = max_results
627
- ):
628
- total_batches += 1
629
- self ._copy_index_to_pandas (df )
630
- if squeeze :
631
- yield df .squeeze (axis = 1 )
632
- else :
633
- yield df
634
-
635
640
# To reduce the number of edge cases to consider when working with the
636
641
# results of this, always return at least one DataFrame. See:
637
642
# b/428918844.
638
- if total_batches == 0 :
639
- df = pd .DataFrame (
640
- {
641
- col : pd .Series ([], dtype = self .expr .get_column_type (col ))
642
- for col in itertools .chain (self .value_columns , self .index_columns )
643
- }
644
- )
645
- self ._copy_index_to_pandas (df )
646
- yield df
643
+ empty_val = pd .DataFrame (
644
+ {
645
+ col : pd .Series ([], dtype = self .expr .get_column_type (col ))
646
+ for col in itertools .chain (self .value_columns , self .index_columns )
647
+ }
648
+ )
649
+ dfs = map (
650
+ lambda a : a [0 ],
651
+ itertools .zip_longest (
652
+ execute_result .to_pandas_batches (page_size , max_results ),
653
+ [0 ],
654
+ fillvalue = empty_val ,
655
+ ),
656
+ )
657
+ dfs = iter (map (self ._copy_index_to_pandas , dfs ))
647
658
648
- def _copy_index_to_pandas (self , df : pd .DataFrame ):
649
- """Set the index on pandas DataFrame to match this block.
659
+ total_rows = execute_result .total_rows
660
+ if (total_rows is not None ) and (max_results is not None ):
661
+ total_rows = min (total_rows , max_results )
650
662
651
- Warning: This method modifies ``df`` inplace.
652
- """
663
+ return PandasBatches (dfs , total_rows )
664
+
665
+ def _copy_index_to_pandas (self , df : pd .DataFrame ) -> pd .DataFrame :
666
+ """Set the index on pandas DataFrame to match this block."""
653
667
# Note: If BigQuery DataFrame has null index, a default one will be created for the local materialization.
668
+ new_df = df .copy ()
654
669
if len (self .index_columns ) > 0 :
655
- df .set_index (list (self .index_columns ), inplace = True )
670
+ new_df .set_index (list (self .index_columns ), inplace = True )
656
671
# Pandas names is annotated as list[str] rather than the more
657
672
# general Sequence[Label] that BigQuery DataFrames has.
658
673
# See: https://github.com/pandas-dev/pandas-stubs/issues/804
659
- df .index .names = self .index .names # type: ignore
660
- df .columns = self .column_labels
674
+ new_df .index .names = self .index .names # type: ignore
675
+ new_df .columns = self .column_labels
676
+ return new_df
661
677
662
678
def _materialize_local (
663
679
self , materialize_options : MaterializationOptions = MaterializationOptions ()
@@ -724,9 +740,7 @@ def _materialize_local(
724
740
)
725
741
else :
726
742
df = execute_result .to_pandas ()
727
- self ._copy_index_to_pandas (df )
728
-
729
- return df , execute_result .query_job
743
+ return self ._copy_index_to_pandas (df ), execute_result .query_job
730
744
731
745
def _downsample (
732
746
self , total_rows : int , sampling_method : str , fraction : float , random_state
@@ -1591,8 +1605,7 @@ def retrieve_repr_request_results(
1591
1605
row_count = self .session ._executor .execute (self .expr .row_count ()).to_py_scalar ()
1592
1606
1593
1607
head_df = head_result .to_pandas ()
1594
- self ._copy_index_to_pandas (head_df )
1595
- return head_df , row_count , head_result .query_job
1608
+ return self ._copy_index_to_pandas (head_df ), row_count , head_result .query_job
1596
1609
1597
1610
def promote_offsets (self , label : Label = None ) -> typing .Tuple [Block , str ]:
1598
1611
expr , result_id = self ._expr .promote_offsets ()
0 commit comments