Skip to content

fix: Properly identify non-unique index in non-pk tables #699

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 17, 2024
Merged
2 changes: 1 addition & 1 deletion bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,7 +897,7 @@ def to_sql(
output_columns = [
col_id_overrides.get(col, col) for col in baked_ir.column_ids
]
sql = bigframes.core.sql.select_from(output_columns, sql)
sql = bigframes.core.sql.select_from_subquery(output_columns, sql)

# Single row frames may not have any ordering columns
if len(baked_ir._ordering.all_ordering_columns) > 0:
Expand Down
19 changes: 15 additions & 4 deletions bigframes/core/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def infix_op(opname: str, left_arg: str, right_arg: str):


### Writing SELECT expressions
def select_from(columns: Iterable[str], subquery: str, distinct: bool = False):
def select_from_subquery(columns: Iterable[str], subquery: str, distinct: bool = False):
selection = ", ".join(map(identifier, columns))
distinct_clause = "DISTINCT " if distinct else ""

Expand All @@ -120,16 +120,27 @@ def select_from(columns: Iterable[str], subquery: str, distinct: bool = False):
)


def select_from_table_ref(
    columns: Iterable[str], table_ref: bigquery.TableReference, distinct: bool = False
):
    """Build a SELECT statement that reads *columns* from a BigQuery table.

    Args:
        columns: column names to project; each is escaped via ``identifier``.
        table_ref: fully-qualified table to read from.
        distinct: when True, emit ``SELECT DISTINCT`` instead of ``SELECT``.

    Returns:
        The SQL text of the SELECT statement.
    """
    projected = ", ".join(identifier(col) for col in columns)
    select_keyword = "SELECT DISTINCT" if distinct else "SELECT"

    return textwrap.dedent(
        f"{select_keyword} {projected}\nFROM {table_reference(table_ref)}"
    )


def select_table(table_ref: bigquery.TableReference):
    """Build a ``SELECT *`` statement over the given table reference."""
    sql = f"SELECT * FROM {table_reference(table_ref)}"
    return textwrap.dedent(sql)


def is_distinct_sql(columns: Iterable[str], table_sql: str) -> str:
def is_distinct_sql(columns: Iterable[str], table_ref: bigquery.TableReference) -> str:
is_unique_sql = f"""WITH full_table AS (
{select_from(columns, table_sql)}
{select_from_table_ref(columns, table_ref)}
),
distinct_table AS (
{select_from(columns, table_sql, distinct=True)}
{select_from_table_ref(columns, table_ref, distinct=True)}
)

SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
Expand Down
1 change: 0 additions & 1 deletion bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,7 +779,6 @@ def _read_gbq_table(
# check.
is_index_unique = bf_read_gbq_table.are_index_cols_unique(
bqclient=self.bqclient,
ibis_client=self.ibis_client,
table=table,
index_cols=index_cols,
api_name=api_name,
Expand Down
8 changes: 4 additions & 4 deletions bigframes/session/_io/bigquery/read_gbq_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,21 +162,21 @@ def get_ibis_time_travel_table(

def are_index_cols_unique(
bqclient: bigquery.Client,
ibis_client: ibis.BaseBackend,
table: bigquery.table.Table,
index_cols: List[str],
api_name: str,
) -> bool:
if len(index_cols) == 0:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add some unit tests specifically for this method with index_cols = [] and primary_keys = []; index_cols = ['some_col']

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added some unit tests

return False
# If index_cols contain the primary_keys, the query engine assumes they
# provide a unique index.
primary_keys = frozenset(_get_primary_keys(table))
if primary_keys <= frozenset(index_cols):
if (len(primary_keys) > 0) and primary_keys <= frozenset(index_cols):
return True

# TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring
# table_expression only selects just index_cols.
table_sql = ibis_client.compile(table)
is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table_sql)
is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference)
job_config = bigquery.QueryJobConfig()
job_config.labels["bigframes-api"] = api_name
results = bqclient.query_and_wait(is_unique_sql, job_config=job_config)
Expand Down
71 changes: 71 additions & 0 deletions tests/unit/session/test_read_gbq_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
"""Unit tests for read_gbq_table helper functions."""

import datetime
import unittest.mock as mock

import google.cloud.bigquery
import google.cloud.bigquery as bigquery
import pytest

import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table

Expand Down Expand Up @@ -45,3 +48,71 @@ def test_get_ibis_time_travel_table_doesnt_timetravel_anonymous_datasets():

# Need fully-qualified table name.
assert "my-test-project" in sql


@pytest.mark.parametrize(
    ("index_cols", "primary_keys", "values_distinct", "expected"),
    (
        (["col1", "col2"], ["col1", "col2", "col3"], False, False),
        (["col1", "col2", "col3"], ["col1", "col2", "col3"], True, True),
        (["col2", "col3", "col1"], ["col3", "col2"], True, True),
        (["col1", "col2"], [], False, False),
        ([], ["col1", "col2", "col3"], False, False),
        ([], [], False, False),
    ),
)
def test_are_index_cols_unique(index_cols, primary_keys, values_distinct, expected):
    """Uniqueness detection for index columns under various primary-key setups.

    Covers index columns that are a subset or superset of the primary key,
    an empty primary key, and empty index columns.

    See internal issue 335727141.
    """
    table_resource = {
        "tableReference": {
            "projectId": "my-project",
            "datasetId": "my_dataset",
            "tableId": "my_table",
        },
        "clustering": {"fields": ["col1", "col2"]},
    }
    table = google.cloud.bigquery.Table.from_api_repr(table_resource)
    table.schema = (
        google.cloud.bigquery.SchemaField("col1", "INT64"),
        google.cloud.bigquery.SchemaField("col2", "INT64"),
        google.cloud.bigquery.SchemaField("col3", "INT64"),
        google.cloud.bigquery.SchemaField("col4", "INT64"),
    )

    # TODO(b/305264153): use setter for table_constraints in client library
    # when available.
    table._properties["tableConstraints"] = {
        "primaryKey": {"columns": primary_keys},
    }

    bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
    bqclient.project = "test-project"
    bqclient.get_table.return_value = table

    # Single-row query result: counts match only when values are distinct.
    distinct_count = 3 if values_distinct else 2
    bqclient.query_and_wait.return_value = (
        {"total_count": 3, "distinct_count": distinct_count},
    )

    session = resources.create_bigquery_session(
        bqclient=bqclient, table_schema=table.schema
    )
    table._properties["location"] = session._location

    result = bf_read_gbq_table.are_index_cols_unique(bqclient, table, index_cols, "")

    assert result == expected
5 changes: 5 additions & 0 deletions tests/unit/session/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ def get_table_mock(table_ref):
return table

session.bqclient.get_table = get_table_mock
session.bqclient.query_and_wait.return_value = (
{"total_count": 3, "distinct_count": 2},
)

with pytest.warns(UserWarning, match=re.escape("use_cache=False")):
df = session.read_gbq("my-project.my_dataset.my_table")
Expand All @@ -200,6 +203,7 @@ def test_default_index_warning_raised_by_read_gbq(table):
bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
bqclient.project = "test-project"
bqclient.get_table.return_value = table
bqclient.query_and_wait.return_value = ({"total_count": 3, "distinct_count": 2},)
session = resources.create_bigquery_session(bqclient=bqclient)
table._properties["location"] = session._location

Expand All @@ -222,6 +226,7 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64
bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
bqclient.project = "test-project"
bqclient.get_table.return_value = table
bqclient.query_and_wait.return_value = ({"total_count": 4, "distinct_count": 3},)
session = resources.create_bigquery_session(bqclient=bqclient)
table._properties["location"] = session._location

Expand Down