Skip to content

docs: add sample for getting started with BQML #141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
c1573f0
docs: add sample for getting started with BQML
DevStephanie Oct 25, 2023
0b69c57
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Oct 25, 2023
4e7d81c
Creating clarifying comments
DevStephanie Oct 25, 2023
7e2094f
Merging comments with this branch
DevStephanie Oct 25, 2023
a22640b
Correcting comments, merging with first branch bqml_tutorial from bqm…
DevStephanie Oct 25, 2023
0fc8d09
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Oct 25, 2023
8751f50
corrections on comments
DevStephanie Oct 25, 2023
a67068e
Correcting code comments from BQ docs
DevStephanie Oct 26, 2023
9ec139e
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Oct 26, 2023
ec651b1
Fixing code comments to reflect BQML documentation
DevStephanie Nov 1, 2023
be95c76
Correcting comments to reflect BQML documentation
DevStephanie Nov 1, 2023
fbbe32b
Correcting code comments
DevStephanie Nov 6, 2023
d4591c8
Merge branch 'main' into bqml_tutorial
DevStephanie Nov 6, 2023
a2b7f2f
Correcting documentation code
DevStephanie Nov 7, 2023
c899565
Correcting documentation errors
DevStephanie Nov 7, 2023
8364454
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Nov 7, 2023
509c1f4
Correcting documentation comments and correcting features
DevStephanie Nov 7, 2023
16d4f18
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Nov 7, 2023
34eb65a
Correcting documentation comments for code samples
DevStephanie Nov 7, 2023
9aa6e7a
Merge branch 'bqml_tutorial' of https://github.com/googleapis/python-…
DevStephanie Nov 7, 2023
0a9f06d
Merge branch 'main' into bqml_tutorial
DevStephanie Nov 10, 2023
c4a3b55
Merge branch 'bqml_tutorial' of https://github.com/googleapis/python-…
DevStephanie Nov 10, 2023
16c6fb0
Apply suggestions from code review
DevStephanie Nov 10, 2023
77c22b9
Correcting documentation comments
DevStephanie Nov 13, 2023
bcdc9e2
Merge branch 'bqml_tutorial' of https://github.com/googleapis/python-…
DevStephanie Nov 13, 2023
93f911d
Correcting documentation comments
DevStephanie Nov 13, 2023
f3aee5d
Apply suggestions from code review
tswast Nov 16, 2023
1ac855d
Apply suggestions from code review
tswast Nov 16, 2023
b25bb26
Merge branch 'main' into bqml_tutorial
tswast Nov 16, 2023
1f0910a
Merge branch 'main' into bqml_tutorial
tswast Dec 11, 2023
5494f46
Fixtures for temporary resources
DevStephanie Dec 12, 2023
a47a777
Merge remote-tracking branch 'origin' into bqml_tutorial
DevStephanie Dec 12, 2023
016e81c
Merge remote-tracking branch 'origin/bqml_tutorial' into bqml_tutorial
DevStephanie Dec 12, 2023
7a04299
Deleting files
DevStephanie Dec 12, 2023
fbf2527
Merge branch 'main' into bqml_tutorial
tswast Dec 12, 2023
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions samples/snippets/bqml_getting_started_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def test_bqml_getting_started(random_model_id):
    """Train a logistic regression model on the Google Analytics sample
    data and save it under the model ID supplied by ``random_model_id``.
    """
    your_model_id = random_model_id

    # [START bigquery_dataframes_bqml_getting_started_tutorial]
    from bigframes.ml.linear_model import LogisticRegression
    import bigframes.pandas as bpd

    # Select the training data first. `read_gbq` takes either a table ID or
    # a SQL query; this example reads from several tables through a
    # wildcard, so the data is defined with SQL. Follow
    # https://github.com/googleapis/python-bigquery-dataframes/issues/169
    # for updates on wildcard table support in `read_gbq`.
    df = bpd.read_gbq(
        """
-- Since the order of rows isn't useful for the model training,
-- generate a random ID to use as the index for the DataFrame.
SELECT GENERATE_UUID() AS rowindex, *
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
""",
        index_col="rowindex",
    )

    # The totals and device columns are STRUCT data types, so their fields
    # are read with Series.struct.field("<name>"). See the reference
    # documentation below:
    # https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field
    totals = df["totals"]
    device = df["device"]

    # Total number of transactions within the Google Analytics session.
    transactions = totals.struct.field("transactions")

    # The "label" values represent the outcome the model predicts: whether
    # the Google Analytics session contains any ecommerce transactions.
    # A NULL transaction count becomes label 0; anything else becomes 1.
    has_transactions = transactions.notnull()
    label = has_transactions.map({True: 1, False: 0})

    # Operating system of the visitor's device (missing values become "").
    operating_system = device.struct.field("operatingSystem").fillna("")

    # Whether the visitor's device is a mobile device.
    is_mobile = device.struct.field("isMobile")

    # Country the session originated from, based on the IP address.
    country_raw = df["geoNetwork"].struct.field("country")
    country = country_raw.fillna("")

    # Total number of page views within the session (missing values become 0).
    pageviews = totals.struct.field("pageviews").fillna(0)

    # Gather every feature column into one DataFrame to use as training data.
    feature_columns = {
        "os": operating_system,
        "is_mobile": is_mobile,
        "country": country,
        "pageviews": pageviews,
    }
    features = bpd.DataFrame(feature_columns)

    # A logistic regression model splits data into two classes, giving a
    # confidence score that the data is in one of the classes.
    model = LogisticRegression()
    model.fit(features, label)

    # The model.fit() call above created a temporary model.
    # Use the to_gbq() method to write to a permanent location.
    model.to_gbq(
        your_model_id,  # For example: "bqml_tutorial.sample_model",
        replace=True,
    )
    # [END bigquery_dataframes_bqml_getting_started_tutorial]
66 changes: 66 additions & 0 deletions samples/snippets/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Iterator

from google.cloud import bigquery
import pytest
import test_utils.prefixer

# Shared name generator for the resources created by the fixtures in this
# module: create_prefix() names new datasets/models and should_cleanup()
# identifies leftovers that the cleanup fixture may delete.
prefixer = test_utils.prefixer.Prefixer(
    "python-bigquery-dataframes",
    "samples/snippets",
)


@pytest.fixture(scope="session", autouse=True)
def cleanup_datasets(bigquery_client: bigquery.Client) -> None:
    """Delete every dataset that the prefixer marks as eligible for cleanup.

    Runs automatically once per test session. Datasets are removed together
    with their contents, and a dataset that no longer exists is ignored.
    """
    for listed in bigquery_client.list_datasets():
        if not prefixer.should_cleanup(listed.dataset_id):
            continue
        bigquery_client.delete_dataset(
            listed,
            delete_contents=True,
            not_found_ok=True,
        )


@pytest.fixture(scope="session")
def bigquery_client() -> bigquery.Client:
    """Provide one BigQuery client shared by the whole test session."""
    return bigquery.Client()


@pytest.fixture(scope="session")
def project_id(bigquery_client: bigquery.Client) -> str:
    """Return the project that the session's BigQuery client is bound to."""
    client_project = bigquery_client.project
    return client_project


@pytest.fixture(scope="session")
def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]:
    """Create a freshly named dataset for the session and yield its ID.

    The yielded value is the bare dataset ID (without the project). After
    the session finishes, the dataset and its contents are deleted.
    """
    new_id = prefixer.create_prefix()
    temp_dataset = bigquery.Dataset(f"{project_id}.{new_id}")
    bigquery_client.create_dataset(temp_dataset)

    yield new_id

    # Teardown: drop the dataset along with anything the tests left in it.
    bigquery_client.delete_dataset(
        temp_dataset,
        delete_contents=True,
        not_found_ok=True,
    )


@pytest.fixture
def random_model_id(
    bigquery_client: bigquery.Client, project_id: str, dataset_id: str
) -> Iterator[str]:
    """Yield a new fully qualified model ID for each test.

    The ID has the form ``project.dataset.model``. The fixture itself does
    not create the model; once the test finishes, the model with that ID is
    deleted if it exists.
    """
    model_name = prefixer.create_prefix()
    qualified_id = ".".join([project_id, dataset_id, model_name])

    yield qualified_id

    # Teardown: remove whatever model the test saved under this ID.
    bigquery_client.delete_model(qualified_id, not_found_ok=True)