Skip to content

Commit e5d054e

Browse files
feat: Add Index constructor, repr, copy, get_level_values, to_series (#334)
* feat: Add Index constructor, copy, get_level_values, to_series fix mypy error * fix constructor bug * fix error with index name mutation * refactor index to make mutation clearer * fix index bugs * give index custom repr --------- Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com>
1 parent 677f014 commit e5d054e

File tree

11 files changed

+375
-76
lines changed

11 files changed

+375
-76
lines changed

‎bigframes/core/blocks.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -287,15 +287,14 @@ def reset_index(self, drop: bool = True) -> Block:
287287
A new Block because dropping index columns can break references
288288
from Index classes that point to this block.
289289
"""
290-
block = self
291290
new_index_col_id = guid.generate_guid()
292291
expr = self._expr.promote_offsets(new_index_col_id)
293292
if drop:
294293
# Even though the index might be part of the ordering, keep that
295294
# ordering expression as reset_index shouldn't change the row
296295
# order.
297296
expr = expr.drop_columns(self.index_columns)
298-
block = Block(
297+
return Block(
299298
expr,
300299
index_columns=[new_index_col_id],
301300
column_labels=self.column_labels,
@@ -321,13 +320,12 @@ def reset_index(self, drop: bool = True) -> Block:
321320
# See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html
322321
column_labels_modified = column_labels_modified.insert(level, label)
323322

324-
block = Block(
323+
return Block(
325324
expr,
326325
index_columns=[new_index_col_id],
327326
column_labels=column_labels_modified,
328327
index_labels=[None],
329328
)
330-
return block
331329

332330
def set_index(
333331
self,

‎bigframes/core/indexers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def _loc_getitem_series_or_dataframe(
294294
keys_df = keys_df.set_index(temp_name, drop=True)
295295
return _perform_loc_list_join(series_or_dataframe, keys_df)
296296
elif isinstance(key, bigframes.core.indexes.Index):
297-
block = key._data._get_block()
297+
block = key._block
298298
block = block.select_columns(())
299299
keys_df = bigframes.dataframe.DataFrame(block)
300300
return _perform_loc_list_join(series_or_dataframe, keys_df)

‎bigframes/core/indexes/index.py

Lines changed: 146 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717
from __future__ import annotations
1818

1919
import typing
20-
from typing import Mapping, Sequence, Tuple, Union
20+
from typing import Hashable, Mapping, Optional, Sequence, Tuple, Union
2121

22+
import google.cloud.bigquery as bigquery
2223
import numpy as np
2324
import pandas
2425

@@ -33,16 +34,60 @@
3334
import bigframes.core.utils as utils
3435
import bigframes.dtypes
3536
import bigframes.dtypes as bf_dtypes
37+
import bigframes.formatting_helpers as formatter
3638
import bigframes.operations as ops
3739
import bigframes.operations.aggregations as agg_ops
3840
import third_party.bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index
3941

42+
if typing.TYPE_CHECKING:
43+
import bigframes.dataframe
44+
import bigframes.series
45+
4046

4147
class Index(vendored_pandas_index.Index):
4248
__doc__ = vendored_pandas_index.Index.__doc__
4349

44-
def __init__(self, data: blocks.BlockHolder):
45-
self._data = data
50+
def __init__(
    self,
    data=None,
    dtype=None,
    *,
    name=None,
):
    """Build an Index from a Block, a Series, another Index, or local data.

    Args:
        data: Source of the index values. A ``blocks.Block`` is used
            directly (with its value columns deselected); a Series has its
            value column promoted to the index; an Index is re-wrapped;
            anything else is handed to ``pandas.Index`` and uploaded through
            a DataFrame.
        dtype: Optional dtype. Applied through ``astype`` for Series/Index
            input and passed to ``pandas.Index`` for local data. It is not
            consulted when ``data`` is a Block.
        name: Optional name for the index. For Series/Index input the
            source's name is used when this is None. It is not consulted
            when ``data`` is a Block.

    Raises:
        ValueError: If ``data`` is a DataFrame.
    """
    # Imported here rather than at module level; the module only imports
    # these under TYPE_CHECKING (presumably to avoid a circular import).
    import bigframes.dataframe as df
    import bigframes.series as series

    if isinstance(data, blocks.Block):
        block = data.select_columns([])
    elif isinstance(data, df.DataFrame):
        raise ValueError("Cannot construct index from dataframe.")
    elif isinstance(data, (series.Series, Index)):
        if isinstance(data, series.Series):
            # Promote the series' values to be the index of the new block.
            block = data._block.set_index(
                col_ids=[data._value_column],
            )
        else:
            block = data._block
        index = Index(data=block)
        if name is None:
            name = data.name
        if name is not None:
            index.name = name
        if dtype is not None:
            index = index.astype(dtype)
        block = index._block
    else:
        # Local data: let pandas interpret it, then load it via a DataFrame
        # that has only an index and no columns.
        local_index = pandas.Index(data=data, dtype=dtype, name=name)
        block = df.DataFrame(pandas.DataFrame(index=local_index))._block
    self._query_job = None
    self._block: blocks.Block = block
85+
86+
@classmethod
def from_frame(
    cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame]
) -> Index:
    """Return a ``FrameIndex`` wrapping the given Series or DataFrame.

    Args:
        frame: The Series or DataFrame whose index should be exposed.

    Returns:
        A ``FrameIndex`` constructed from ``frame``.
    """
    linked_index = FrameIndex(frame)
    return linked_index
4691

4792
@property
4893
def name(self) -> blocks.Label:
@@ -55,15 +100,16 @@ def name(self, value: blocks.Label):
55100
@property
56101
def names(self) -> typing.Sequence[blocks.Label]:
57102
"""Returns the names of the Index."""
58-
return self._data._get_block()._index_labels
103+
return self._block._index_labels
59104

60105
@names.setter
61106
def names(self, values: typing.Sequence[blocks.Label]):
62-
return self._data._set_block(self._block.with_index_labels(values))
107+
new_block = self._block.with_index_labels(values)
108+
self._block = new_block
63109

64110
@property
65111
def nlevels(self) -> int:
66-
return len(self._data._get_block().index_columns)
112+
return len(self._block.index_columns)
67113

68114
@property
69115
def values(self) -> np.ndarray:
@@ -75,7 +121,7 @@ def ndim(self) -> int:
75121

76122
@property
77123
def shape(self) -> typing.Tuple[int]:
78-
return (self._data._get_block().shape[0],)
124+
return (self._block.shape[0],)
79125

80126
@property
81127
def dtype(self):
@@ -107,9 +153,7 @@ def is_monotonic_increasing(self) -> bool:
107153
"""
108154
return typing.cast(
109155
bool,
110-
self._data._get_block().is_monotonic_increasing(
111-
self._data._get_block().index_columns
112-
),
156+
self._block.is_monotonic_increasing(self._block.index_columns),
113157
)
114158

115159
@property
@@ -122,9 +166,7 @@ def is_monotonic_decreasing(self) -> bool:
122166
"""
123167
return typing.cast(
124168
bool,
125-
self._data._get_block().is_monotonic_decreasing(
126-
self._data._get_block().index_columns
127-
),
169+
self._block.is_monotonic_decreasing(self._block.index_columns),
128170
)
129171

130172
@property
@@ -149,14 +191,65 @@ def has_duplicates(self) -> bool:
149191
duplicates_df = df.DataFrame(duplicates_block)
150192
return duplicates_df["is_duplicate"].any()
151193

152-
@property
153-
def _block(self) -> blocks.Block:
154-
return self._data._get_block()
155-
156194
@property
157195
def T(self) -> Index:
158196
return self.transpose()
159197

198+
@property
def query_job(self) -> Optional[bigquery.QueryJob]:
    """BigQuery job metadata for the most recent query.

    When no job has been recorded yet, the result of the block's
    ``_compute_dry_run()`` is stored and returned instead.

    Returns:
        The most recent `QueryJob
        <https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob>`_.
    """
    cached_job = self._query_job
    if cached_job is not None:
        return cached_job
    # Nothing recorded yet: fall back to a dry run and remember its result.
    self._query_job = self._block._compute_dry_run()
    return self._query_job
209+
210+
def __repr__(self) -> str:
    """Return the repr of the index, or of its query job in deferred mode."""
    # TODO(swast): Add a timeout here? If the query is taking a long time,
    # maybe we just print the job metadata that we have so far?
    # TODO(swast): Avoid downloading the whole index by using job
    # metadata, like we do with DataFrame.
    display_options = bigframes.options.display
    row_limit = display_options.max_rows
    if display_options.repr_mode == "deferred":
        # Deferred mode: describe the query job instead of fetching rows.
        return formatter.repr_query_job(self.query_job)

    preview_df, _, job = self._block.retrieve_repr_request_results(row_limit)
    # Remember the job that produced the preview so `query_job` reports it.
    self._query_job = job
    return repr(preview_df.index)
223+
224+
def copy(self, name: Optional[Hashable] = None):
    """Return a new Index built from this index's block.

    Args:
        name: Name to assign to the copy. When None, no name is assigned
            explicitly to the copy.

    Returns:
        A new ``Index`` instance.
    """
    duplicate = Index(self._block)
    if name is None:
        return duplicate
    duplicate.name = name
    return duplicate
229+
230+
def to_series(
    self, index: Optional[Index] = None, name: Optional[Hashable] = None
) -> bigframes.series.Series:
    """Create a Series whose values are the values of this index.

    Args:
        index: Index for the resulting Series. When None, this index is
            used as the Series' index as well.
        name: Name for the resulting Series. When None, this index's own
            name is used.

    Returns:
        A ``bigframes.series.Series`` built with ``data=self``.

    Raises:
        NotImplementedError: If this index has more than one level.
    """
    if self.nlevels != 1:
        # The exception has to be raised; merely constructing it would let
        # multi-level indexes fall through to the Series constructor.
        raise NotImplementedError(
            f"Converting multi-index to series is not yet supported. {constants.FEEDBACK_LINK}"
        )

    import bigframes.series

    name = self.name if name is None else name
    if index is None:
        return bigframes.series.Series(data=self, index=self, name=name)
    else:
        return bigframes.series.Series(data=self, index=Index(index), name=name)
245+
246+
def get_level_values(self, level) -> Index:
    """Return a single-level Index holding the values of one level.

    Args:
        level: Either the integer position of the level or the level's
            name. Negative integers count from the last level, so ``-1``
            selects the innermost level.

    Returns:
        An ``Index`` built from the block with every other level dropped.

    Note:
        A non-integer ``level`` is resolved with ``self.names.index(level)``;
        presumably this raises ``ValueError`` when the name is absent
        (depends on the concrete sequence type of ``names``).
    """
    if isinstance(level, int):
        # Normalise negative positions; otherwise no level would match
        # below and every level would be dropped.
        level_n = level + self.nlevels if level < 0 else level
    else:
        level_n = self.names.index(level)
    index_columns = self._block.index_columns
    block = self._block.drop_levels(
        [
            column_id
            for position, column_id in enumerate(index_columns)
            if position != level_n
        ]
    )
    return Index(block)
252+
160253
def _memory_usage(self) -> int:
161254
(n_rows,) = self.shape
162255
return sum(
@@ -180,7 +273,7 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
180273
order.OrderingColumnReference(column, direction=direction, na_last=na_last)
181274
for column in index_columns
182275
]
183-
return Index._from_block(self._block.order_by(ordering))
276+
return Index(self._block.order_by(ordering))
184277

185278
def astype(
186279
self,
@@ -269,7 +362,7 @@ def rename(self, name: Union[str, Sequence[str]]) -> Index:
269362
names = [name] if isinstance(name, str) else list(name)
270363
if len(names) != self.nlevels:
271364
raise ValueError("'name' must be same length as levels")
272-
return Index._from_block(self._block.with_index_labels(names))
365+
return Index(self._block.with_index_labels(names))
273366

274367
def drop(
275368
self,
@@ -291,17 +384,17 @@ def drop(
291384
)
292385
block = block.filter(condition_id, keep_null=True)
293386
block = block.drop_columns([condition_id])
294-
return Index._from_block(block)
387+
return Index(block)
295388

296389
def dropna(self, how: str = "any") -> Index:
297390
if how not in ("any", "all"):
298391
raise ValueError("'how' must be one of 'any', 'all'")
299392
result = block_ops.dropna(self._block, self._block.index_columns, how=how) # type: ignore
300-
return Index._from_block(result)
393+
return Index(result)
301394

302395
def drop_duplicates(self, *, keep: str = "first") -> Index:
303396
block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
304-
return Index._from_block(block)
397+
return Index(block)
305398

306399
def isin(self, values) -> Index:
307400
if not utils.is_list_like(values):
@@ -330,7 +423,7 @@ def _apply_unary_expr(
330423
result_ids.append(result_id)
331424

332425
block = block.set_index(result_ids, index_labels=self._block.index_labels)
333-
return Index._from_block(block)
426+
return Index(block)
334427

335428
def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any:
336429
if self.nlevels > 1:
@@ -344,7 +437,7 @@ def __getitem__(self, key: int) -> typing.Any:
344437
result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
345438
else: # special case, want [-1:] instead of [-1:0]
346439
result_pd_df, _ = self._block.slice(key).to_pandas()
347-
if result_pd_df.empty:
440+
if result_pd_df.index.empty:
348441
raise IndexError("single positional indexer is out-of-bounds")
349442
return result_pd_df.index[0]
350443
else:
@@ -367,11 +460,36 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray:
367460
def __len__(self):
368461
return self.shape[0]
369462

370-
@classmethod
371-
def _from_block(cls, block: blocks.Block) -> Index:
372-
import bigframes.dataframe as df
373463

374-
return Index(df.DataFrame(block))
464+
# Index that mutates the originating dataframe/series
465+
class FrameIndex(Index):
    """Index bound to a Series or DataFrame; renaming it renames the frame's index.

    Setting ``name`` or ``names`` on this object writes a relabelled block
    back to the originating frame through ``_set_block`` in addition to
    updating the index itself.
    """

    def __init__(
        self,
        series_or_dataframe: typing.Union[
            bigframes.series.Series, bigframes.dataframe.DataFrame
        ],
    ):
        """Wrap the index of ``series_or_dataframe``.

        Args:
            series_or_dataframe: The frame whose block supplies the index
                and which receives label updates.
        """
        super().__init__(series_or_dataframe._block)
        # Kept so that label changes can be pushed back to the frame.
        self._whole_frame = series_or_dataframe

    @property
    def name(self) -> blocks.Label:
        """Label of the first index level."""
        return self.names[0]

    @name.setter
    def name(self, value: blocks.Label):
        self.names = [value]

    @property
    def names(self) -> typing.Sequence[blocks.Label]:
        """Returns the names of the Index."""
        return self._block._index_labels

    @names.setter
    def names(self, values: typing.Sequence[blocks.Label]):
        new_block = self._whole_frame._get_block().with_index_labels(values)
        self._whole_frame._set_block(new_block)
        # Index.__init__ stores a block with its value columns deselected
        # (select_columns([])); keep that invariant here instead of holding
        # the frame's full block with all of its value columns.
        self._block = new_block.select_columns([])
375493

376494

377495
class IndexValue:
@@ -406,15 +524,6 @@ def dtypes(
406524
def session(self) -> core.Session:
407525
return self._expr.session
408526

409-
def __repr__(self) -> str:
410-
"""Converts an Index to a string."""
411-
# TODO(swast): Add a timeout here? If the query is taking a long time,
412-
# maybe we just print the job metadata that we have so far?
413-
# TODO(swast): Avoid downloading the whole index by using job
414-
# metadata, like we do with DataFrame.
415-
preview = self.to_pandas()
416-
return repr(preview)
417-
418527
def to_pandas(self) -> pandas.Index:
419528
"""Executes deferred operations and downloads the results."""
420529
# Project down to only the index column. So the query can be cached to visualize other data.

0 commit comments

Comments
 (0)