17
17
from __future__ import annotations
18
18
19
19
import typing
20
- from typing import Mapping , Sequence , Tuple , Union
20
+ from typing import Hashable , Mapping , Optional , Sequence , Tuple , Union
21
21
22
+ import google .cloud .bigquery as bigquery
22
23
import numpy as np
23
24
import pandas
24
25
33
34
import bigframes .core .utils as utils
34
35
import bigframes .dtypes
35
36
import bigframes .dtypes as bf_dtypes
37
+ import bigframes .formatting_helpers as formatter
36
38
import bigframes .operations as ops
37
39
import bigframes .operations .aggregations as agg_ops
38
40
import third_party .bigframes_vendored .pandas .core .indexes .base as vendored_pandas_index
39
41
42
+ if typing .TYPE_CHECKING :
43
+ import bigframes .dataframe
44
+ import bigframes .series
45
+
40
46
41
47
class Index (vendored_pandas_index .Index ):
42
48
__doc__ = vendored_pandas_index .Index .__doc__
43
49
44
- def __init__ (self , data : blocks .BlockHolder ):
45
- self ._data = data
50
+ def __init__ (
51
+ self ,
52
+ data = None ,
53
+ dtype = None ,
54
+ * ,
55
+ name = None ,
56
+ ):
57
+ import bigframes .dataframe as df
58
+ import bigframes .series as series
59
+
60
+ if isinstance (data , blocks .Block ):
61
+ block = data .select_columns ([])
62
+ elif isinstance (data , df .DataFrame ):
63
+ raise ValueError ("Cannot construct index from dataframe." )
64
+ elif isinstance (data , series .Series ) or isinstance (data , Index ):
65
+ if isinstance (data , series .Series ):
66
+ block = data ._block
67
+ block = block .set_index (
68
+ col_ids = [data ._value_column ],
69
+ )
70
+ elif isinstance (data , Index ):
71
+ block = data ._block
72
+ index = Index (data = block )
73
+ name = data .name if name is None else name
74
+ if name is not None :
75
+ index .name = name
76
+ if dtype is not None :
77
+ index = index .astype (dtype )
78
+ block = index ._block
79
+ else :
80
+ pd_index = pandas .Index (data = data , dtype = dtype , name = name )
81
+ pd_df = pandas .DataFrame (index = pd_index )
82
+ block = df .DataFrame (pd_df )._block
83
+ self ._query_job = None
84
+ self ._block : blocks .Block = block
85
+
86
+ @classmethod
87
+ def from_frame (
88
+ cls , frame : Union [bigframes .series .Series , bigframes .dataframe .DataFrame ]
89
+ ) -> Index :
90
+ return FrameIndex (frame )
46
91
47
92
@property
48
93
def name (self ) -> blocks .Label :
@@ -55,15 +100,16 @@ def name(self, value: blocks.Label):
55
100
@property
56
101
def names (self ) -> typing .Sequence [blocks .Label ]:
57
102
"""Returns the names of the Index."""
58
- return self ._data . _get_block () ._index_labels
103
+ return self ._block ._index_labels
59
104
60
105
@names .setter
61
106
def names (self , values : typing .Sequence [blocks .Label ]):
62
- return self ._data ._set_block (self ._block .with_index_labels (values ))
107
+ new_block = self ._block .with_index_labels (values )
108
+ self ._block = new_block
63
109
64
110
@property
65
111
def nlevels (self ) -> int :
66
- return len (self ._data . _get_block () .index_columns )
112
+ return len (self ._block .index_columns )
67
113
68
114
@property
69
115
def values (self ) -> np .ndarray :
@@ -75,7 +121,7 @@ def ndim(self) -> int:
75
121
76
122
@property
77
123
def shape (self ) -> typing .Tuple [int ]:
78
- return (self ._data . _get_block () .shape [0 ],)
124
+ return (self ._block .shape [0 ],)
79
125
80
126
@property
81
127
def dtype (self ):
@@ -107,9 +153,7 @@ def is_monotonic_increasing(self) -> bool:
107
153
"""
108
154
return typing .cast (
109
155
bool ,
110
- self ._data ._get_block ().is_monotonic_increasing (
111
- self ._data ._get_block ().index_columns
112
- ),
156
+ self ._block .is_monotonic_increasing (self ._block .index_columns ),
113
157
)
114
158
115
159
@property
@@ -122,9 +166,7 @@ def is_monotonic_decreasing(self) -> bool:
122
166
"""
123
167
return typing .cast (
124
168
bool ,
125
- self ._data ._get_block ().is_monotonic_decreasing (
126
- self ._data ._get_block ().index_columns
127
- ),
169
+ self ._block .is_monotonic_decreasing (self ._block .index_columns ),
128
170
)
129
171
130
172
@property
@@ -149,14 +191,65 @@ def has_duplicates(self) -> bool:
149
191
duplicates_df = df .DataFrame (duplicates_block )
150
192
return duplicates_df ["is_duplicate" ].any ()
151
193
152
- @property
153
- def _block (self ) -> blocks .Block :
154
- return self ._data ._get_block ()
155
-
156
194
@property
157
195
def T (self ) -> Index :
158
196
return self .transpose ()
159
197
198
+ @property
199
+ def query_job (self ) -> Optional [bigquery .QueryJob ]:
200
+ """BigQuery job metadata for the most recent query.
201
+
202
+ Returns:
203
+ The most recent `QueryJob
204
+ <https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob>`_.
205
+ """
206
+ if self ._query_job is None :
207
+ self ._query_job = self ._block ._compute_dry_run ()
208
+ return self ._query_job
209
+
210
+ def __repr__ (self ) -> str :
211
+ # TODO(swast): Add a timeout here? If the query is taking a long time,
212
+ # maybe we just print the job metadata that we have so far?
213
+ # TODO(swast): Avoid downloading the whole series by using job
214
+ # metadata, like we do with DataFrame.
215
+ opts = bigframes .options .display
216
+ max_results = opts .max_rows
217
+ if opts .repr_mode == "deferred" :
218
+ return formatter .repr_query_job (self .query_job )
219
+
220
+ pandas_df , _ , query_job = self ._block .retrieve_repr_request_results (max_results )
221
+ self ._query_job = query_job
222
+ return repr (pandas_df .index )
223
+
224
+ def copy (self , name : Optional [Hashable ] = None ):
225
+ copy_index = Index (self ._block )
226
+ if name is not None :
227
+ copy_index .name = name
228
+ return copy_index
229
+
230
+ def to_series (
231
+ self , index : Optional [Index ] = None , name : Optional [Hashable ] = None
232
+ ) -> bigframes .series .Series :
233
+ if self .nlevels != 1 :
234
+ NotImplementedError (
235
+ f"Converting multi-index to series is not yet supported. { constants .FEEDBACK_LINK } "
236
+ )
237
+
238
+ import bigframes .series
239
+
240
+ name = self .name if name is None else name
241
+ if index is None :
242
+ return bigframes .series .Series (data = self , index = self , name = name )
243
+ else :
244
+ return bigframes .series .Series (data = self , index = Index (index ), name = name )
245
+
246
+ def get_level_values (self , level ) -> Index :
247
+ level_n = level if isinstance (level , int ) else self .names .index (level )
248
+ block = self ._block .drop_levels (
249
+ [self ._block .index_columns [i ] for i in range (self .nlevels ) if i != level_n ]
250
+ )
251
+ return Index (block )
252
+
160
253
def _memory_usage (self ) -> int :
161
254
(n_rows ,) = self .shape
162
255
return sum (
@@ -180,7 +273,7 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
180
273
order .OrderingColumnReference (column , direction = direction , na_last = na_last )
181
274
for column in index_columns
182
275
]
183
- return Index . _from_block (self ._block .order_by (ordering ))
276
+ return Index (self ._block .order_by (ordering ))
184
277
185
278
def astype (
186
279
self ,
@@ -269,7 +362,7 @@ def rename(self, name: Union[str, Sequence[str]]) -> Index:
269
362
names = [name ] if isinstance (name , str ) else list (name )
270
363
if len (names ) != self .nlevels :
271
364
raise ValueError ("'name' must be same length as levels" )
272
- return Index . _from_block (self ._block .with_index_labels (names ))
365
+ return Index (self ._block .with_index_labels (names ))
273
366
274
367
def drop (
275
368
self ,
@@ -291,17 +384,17 @@ def drop(
291
384
)
292
385
block = block .filter (condition_id , keep_null = True )
293
386
block = block .drop_columns ([condition_id ])
294
- return Index . _from_block (block )
387
+ return Index (block )
295
388
296
389
def dropna (self , how : str = "any" ) -> Index :
297
390
if how not in ("any" , "all" ):
298
391
raise ValueError ("'how' must be one of 'any', 'all'" )
299
392
result = block_ops .dropna (self ._block , self ._block .index_columns , how = how ) # type: ignore
300
- return Index . _from_block (result )
393
+ return Index (result )
301
394
302
395
def drop_duplicates (self , * , keep : str = "first" ) -> Index :
303
396
block = block_ops .drop_duplicates (self ._block , self ._block .index_columns , keep )
304
- return Index . _from_block (block )
397
+ return Index (block )
305
398
306
399
def isin (self , values ) -> Index :
307
400
if not utils .is_list_like (values ):
@@ -330,7 +423,7 @@ def _apply_unary_expr(
330
423
result_ids .append (result_id )
331
424
332
425
block = block .set_index (result_ids , index_labels = self ._block .index_labels )
333
- return Index . _from_block (block )
426
+ return Index (block )
334
427
335
428
def _apply_aggregation (self , op : agg_ops .AggregateOp ) -> typing .Any :
336
429
if self .nlevels > 1 :
@@ -344,7 +437,7 @@ def __getitem__(self, key: int) -> typing.Any:
344
437
result_pd_df , _ = self ._block .slice (key , key + 1 , 1 ).to_pandas ()
345
438
else : # special case, want [-1:] instead of [-1:0]
346
439
result_pd_df , _ = self ._block .slice (key ).to_pandas ()
347
- if result_pd_df .empty :
440
+ if result_pd_df .index . empty :
348
441
raise IndexError ("single positional indexer is out-of-bounds" )
349
442
return result_pd_df .index [0 ]
350
443
else :
@@ -367,11 +460,36 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray:
367
460
def __len__ (self ):
368
461
return self .shape [0 ]
369
462
370
- @classmethod
371
- def _from_block (cls , block : blocks .Block ) -> Index :
372
- import bigframes .dataframe as df
373
463
374
- return Index (df .DataFrame (block ))
464
# Index that mutates the originating dataframe/series
class FrameIndex(Index):
    """An Index view linked to a Series or DataFrame: renaming its labels
    writes the change back into the originating frame as well."""

    def __init__(
        self,
        series_or_dataframe: typing.Union[
            bigframes.series.Series, bigframes.dataframe.DataFrame
        ],
    ):
        super().__init__(series_or_dataframe._block)
        self._whole_frame = series_or_dataframe

    @property
    def name(self) -> blocks.Label:
        return self.names[0]

    @name.setter
    def name(self, value: blocks.Label):
        self.names = [value]

    @property
    def names(self) -> typing.Sequence[blocks.Label]:
        """Returns the names of the Index."""
        return self._block._index_labels

    @names.setter
    def names(self, values: typing.Sequence[blocks.Label]):
        # Propagate the relabeled block back into the originating frame so
        # the frame and this index stay in sync.
        relabeled = self._whole_frame._get_block().with_index_labels(values)
        self._whole_frame._set_block(relabeled)
        self._block = relabeled
375
493
376
494
377
495
class IndexValue :
@@ -406,15 +524,6 @@ def dtypes(
406
524
def session (self ) -> core .Session :
407
525
return self ._expr .session
408
526
409
- def __repr__ (self ) -> str :
410
- """Converts an Index to a string."""
411
- # TODO(swast): Add a timeout here? If the query is taking a long time,
412
- # maybe we just print the job metadata that we have so far?
413
- # TODO(swast): Avoid downloading the whole index by using job
414
- # metadata, like we do with DataFrame.
415
- preview = self .to_pandas ()
416
- return repr (preview )
417
-
418
527
def to_pandas (self ) -> pandas .Index :
419
528
"""Executes deferred operations and downloads the results."""
420
529
# Project down to only the index column. So the query can be cached to visualize other data.
0 commit comments