Skip to content

Commit 7e59b20

Browse files
committed
add docstring for index_col_in_columns and fix tests
1 parent 645316d commit 7e59b20

File tree

3 files changed

+89
-18
lines changed

3 files changed

+89
-18
lines changed

‎bigframes/session/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1174,7 +1174,7 @@ def _read_csv_w_bigquery_engine(
11741174
index_col=index_col,
11751175
columns=columns,
11761176
names=names,
1177-
is_index_in_columns=True,
1177+
index_col_in_columns=True,
11781178
)
11791179

11801180
if dtype is not None:

‎bigframes/session/loader.py

Lines changed: 88 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,30 @@ def _to_index_cols(
9797

9898

9999
def _check_column_duplicates(
100-
index_cols: Iterable[str], columns: Iterable[str], is_index_in_columns: bool
100+
index_cols: Iterable[str], columns: Iterable[str], index_col_in_columns: bool
101101
) -> Iterable[str]:
102+
"""Validates and processes index and data columns for duplicates and overlap.
103+
104+
This function performs two main tasks:
105+
1. Ensures there are no duplicate column names within the `index_cols` list
106+
or within the `columns` list.
107+
2. Based on the `index_col_in_columns` flag, it validates the relationship
108+
between `index_cols` and `columns`.
109+
110+
Args:
111+
index_cols (Iterable[str]):
112+
An iterable of column names designated as the index.
113+
columns (Iterable[str]):
114+
An iterable of column names designated as the data columns.
115+
index_col_in_columns (bool):
116+
A flag indicating how to handle overlap between `index_cols` and
117+
`columns`.
118+
- If `False`, the two lists must be disjoint (contain no common
119+
elements). An error is raised if any overlap is found.
120+
- If `True`, `index_cols` is expected to be a subset of
121+
`columns`. A `ValueError` is raised if an index column is not found
122+
in the `columns` list.
123+
"""
102124
index_cols_list = list(index_cols) if index_cols is not None else []
103125
columns_list = list(columns) if columns is not None else []
104126
set_index = set(index_cols_list)
@@ -119,7 +141,7 @@ def _check_column_duplicates(
119141
"All column names specified in 'columns' must be unique."
120142
)
121143

122-
if is_index_in_columns:
144+
if index_col_in_columns:
123145
if not set_index.issubset(set_columns):
124146
raise ValueError(
125147
f"The specified index column(s) were not found: {set_index - set_columns}. "
@@ -405,7 +427,7 @@ def read_gbq_table( # type: ignore[overload-overlap]
405427
dry_run: Literal[False] = ...,
406428
force_total_order: Optional[bool] = ...,
407429
n_rows: Optional[int] = None,
408-
is_index_in_columns: bool = False,
430+
index_col_in_columns: bool = False,
409431
) -> dataframe.DataFrame:
410432
...
411433

@@ -428,7 +450,7 @@ def read_gbq_table(
428450
dry_run: Literal[True] = ...,
429451
force_total_order: Optional[bool] = ...,
430452
n_rows: Optional[int] = None,
431-
is_index_in_columns: bool = False,
453+
index_col_in_columns: bool = False,
432454
) -> pandas.Series:
433455
...
434456

@@ -450,8 +472,67 @@ def read_gbq_table(
450472
dry_run: bool = False,
451473
force_total_order: Optional[bool] = None,
452474
n_rows: Optional[int] = None,
453-
is_index_in_columns: bool = False,
475+
index_col_in_columns: bool = False,
454476
) -> dataframe.DataFrame | pandas.Series:
477+
"""Read a BigQuery table into a BigQuery DataFrames DataFrame.
478+
479+
This method allows you to create a DataFrame from a BigQuery table.
480+
You can specify the columns to load, an index column, and apply
481+
filters.
482+
483+
Args:
484+
table_id (str):
485+
The identifier of the BigQuery table to read.
486+
index_col (Iterable[str] | str | Iterable[int] | int | bigframes.enums.DefaultIndexKind, optional):
487+
The column(s) to use as the index for the DataFrame. This can be
488+
a single column name or a list of column names. If not provided,
489+
a default index will be used based on the session's
490+
``default_index_type``.
491+
columns (Iterable[str], optional):
492+
The columns to read from the table. If not specified, all
493+
columns will be read.
494+
names (Optional[Iterable[str]], optional):
495+
A list of column names to use for the resulting DataFrame. This
496+
is useful if you want to rename the columns as you read the
497+
data.
498+
max_results (Optional[int], optional):
499+
The maximum number of rows to retrieve from the table. If not
500+
specified, all rows will be loaded.
501+
use_cache (bool, optional):
502+
Whether to use cached results for the query. Defaults to True.
503+
Setting this to False will force a re-execution of the query.
504+
filters (third_party_pandas_gbq.FiltersType, optional):
505+
A list of filters to apply to the data. Filters are specified
506+
as a list of tuples, where each tuple contains a column name,
507+
an operator (e.g., '==', '!='), and a value.
508+
enable_snapshot (bool, optional):
509+
If True, a snapshot of the table is used to ensure that the
510+
DataFrame is deterministic, even if the underlying table
511+
changes. Defaults to True.
512+
dry_run (bool, optional):
513+
If True, the function will not actually execute the query but
514+
will instead return a ``pandas.Series`` of statistics about the table. Defaults to False.
515+
force_total_order (Optional[bool], optional):
516+
If True, a total ordering is enforced on the DataFrame, which
517+
can be useful for operations that require a stable row order.
518+
If None, the session's default behavior is used.
519+
n_rows (Optional[int], optional):
520+
The number of rows to consider for type inference and other
521+
metadata operations. This does not limit the number of rows
522+
in the final DataFrame.
523+
index_col_in_columns (bool, optional):
524+
Specifies if the ``index_col`` is also present in the ``columns``
525+
list. Defaults to ``False``.
526+
527+
* If ``False``, ``index_col`` and ``columns`` must specify
528+
distinct sets of columns. An error will be raised if any
529+
column is found in both.
530+
* If ``True``, the column(s) in ``index_col`` are expected to
531+
also be present in the ``columns`` list. This is useful
532+
when the index is selected from the data columns (e.g., in a
533+
``read_csv`` scenario). The column will be used as the
534+
DataFrame's index and removed from the list of value columns.
535+
"""
455536
import bigframes._tools.strings
456537
import bigframes.dataframe as dataframe
457538

@@ -534,7 +615,7 @@ def read_gbq_table(
534615
names=names,
535616
)
536617
columns = list(
537-
_check_column_duplicates(index_cols, columns, is_index_in_columns)
618+
_check_column_duplicates(index_cols, columns, index_col_in_columns)
538619
)
539620

540621
for key in index_cols:
@@ -818,7 +899,7 @@ def read_gbq_query(
818899

819900
index_cols = _to_index_cols(index_col)
820901
columns = _check_column_duplicates(
821-
index_cols, columns, is_index_in_columns=False
902+
index_cols, columns, index_col_in_columns=False
822903
)
823904

824905
filters_copy1, filters_copy2 = itertools.tee(filters)

‎tests/system/small/test_session.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1320,10 +1320,6 @@ def test_read_csv_for_names_less_than_columns(session, df_and_gcs_csv_for_two_co
13201320
assert bf_df.shape == pd_df.shape
13211321
assert bf_df.columns.tolist() == pd_df.columns.tolist()
13221322

1323-
# BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs
1324-
# (b/280889935) or guarantee row ordering.
1325-
bf_df = bf_df.sort_index()
1326-
13271323
# Pandas's index name is None, while BigFrames's index name is "rowindex".
13281324
pd_df.index.name = "rowindex"
13291325
pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
@@ -1527,9 +1523,6 @@ def test_read_csv_w_usecols_and_indexcol(session, df_and_local_csv):
15271523
assert bf_df.shape == pd_df.shape
15281524
assert bf_df.columns.tolist() == pd_df.columns.tolist()
15291525

1530-
# BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs
1531-
# (b/280889935) or guarantee row ordering.
1532-
bf_df = bf_df.sort_index()
15331526
pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
15341527

15351528

@@ -1585,9 +1578,6 @@ def test_read_csv_local_w_encoding(session, penguins_pandas_df_default_index):
15851578
bf_df = session.read_csv(
15861579
path, engine="bigquery", index_col="rowindex", encoding="ISO-8859-1"
15871580
)
1588-
# BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs
1589-
# (b/280889935) or guarantee row ordering.
1590-
bf_df = bf_df.sort_index()
15911581
pd.testing.assert_frame_equal(
15921582
bf_df.to_pandas(), penguins_pandas_df_default_index
15931583
)

0 commit comments

Comments
 (0)