 import re
 import typing
 from typing import List, Optional
+import warnings

 import numpy as np
@@ -39,6 +40,7 @@ def agg(
         model,
         cluster_column: typing.Optional[str] = None,
         max_agg_rows: int = 10,
+        ground_with_google_search: bool = False,
     ):
         """
         Performs an aggregation over all rows of the table.
@@ -90,6 +92,14 @@ def agg(
             max_agg_rows (int, default 10):
                 The maximum number of rows to be aggregated at a time.

+            ground_with_google_search (bool, default False):
+                Enables Grounding with Google Search for the GeminiTextGenerator model.
+                When set to True, the model incorporates relevant information from Google
+                Search results into its responses, enhancing their accuracy and factualness.
+                Note: Using this feature may impact billing costs. Refer to the pricing
+                page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
+                The default is `False`.
+
         Returns:
             bigframes.dataframe.DataFrame: A new DataFrame with the aggregated answers.

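For context (not part of this diff), a rough usage sketch of the new flag on `agg`. The experiment option name, the `model_name` value, and the positional placement of the instruction argument are assumptions; only the parameter names shown in this hunk come from the diff.

```python
import bigframes.pandas as bpd
from bigframes.ml import llm

# Assumed setup; adjust the flag and model name to your environment.
bpd.options.experiments.semantic_operators = True
model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")

df = bpd.DataFrame({"movies": ["Titanic", "Inception", "Gravity"]})

# Aggregate all rows into one answer; grounding is opt-in via the new keyword.
result = df.semantics.agg(
    "Summarize the common theme of all {movies}. One short answer.",
    model=model,
    max_agg_rows=10,
    ground_with_google_search=True,
)
```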
@@ -119,6 +129,12 @@ def agg(
         )
         column = columns[0]

+        if ground_with_google_search:
+            warnings.warn(
+                "Enabling Grounding with Google Search may impact billing cost. See pricing "
+                "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
+            )
+
         if max_agg_rows <= 1:
             raise ValueError(
                 f"Invalid value for `max_agg_rows`: {max_agg_rows}."
@@ -191,7 +207,12 @@ def agg(

             # Run model
             predict_df = typing.cast(
-                bigframes.dataframe.DataFrame, model.predict(prompt_s, temperature=0.0)
+                bigframes.dataframe.DataFrame,
+                model.predict(
+                    prompt_s,
+                    temperature=0.0,
+                    ground_with_google_search=ground_with_google_search,
+                ),
             )
             agg_df[column] = predict_df["ml_generate_text_llm_result"].combine_first(
                 single_row_df
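The `combine_first` call above keeps the model's aggregated answers where they exist and falls back to the single-row pass-through values elsewhere. A plain-pandas illustration of that fallback (toy data, not from the library):

```python
import pandas as pd

aggregated = pd.Series([None, "fruits", None], dtype="string")    # model output per batch
single_row = pd.Series(["bread", None, "cheese"], dtype="string")  # rows passed through as-is

# Missing aggregated answers are filled from the single-row series.
print(aggregated.combine_first(single_row).tolist())  # ['bread', 'fruits', 'cheese']
```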
@@ -284,7 +305,7 @@ def cluster_by(
         df[output_column] = clustered_result["CENTROID_ID"]
         return df

-    def filter(self, instruction: str, model):
+    def filter(self, instruction: str, model, ground_with_google_search: bool = False):
         """
         Filters the DataFrame with the semantics of the user instruction.

@@ -305,18 +326,26 @@ def filter(self, instruction: str, model):
         [1 rows x 2 columns]

         Args:
-            instruction:
+            instruction (str):
                 An instruction on how to filter the data. This value must contain
                 column references by name, which should be wrapped in a pair of braces.
                 For example, if you have a column "food", you can refer to this column
                 in the instructions like:
                 "The {food} is healthy."

-            model:
+            model (bigframes.ml.llm.GeminiTextGenerator):
                 A GeminiTextGenerator provided by Bigframes ML package.

+            ground_with_google_search (bool, default False):
+                Enables Grounding with Google Search for the GeminiTextGenerator model.
+                When set to True, the model incorporates relevant information from Google
+                Search results into its responses, enhancing their accuracy and factualness.
+                Note: Using this feature may impact billing costs. Refer to the pricing
+                page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
+                The default is `False`.
+
         Returns:
-            DataFrame filtered by the instruction.
+            bigframes.pandas.DataFrame: DataFrame filtered by the instruction.

         Raises:
             NotImplementedError: when the semantic operator experiment is off.
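A minimal end-to-end sketch of `filter` with the new flag (not part of this diff; the experiment option and model name are assumptions, and the `{food}` column mirrors the docstring example above):

```python
import bigframes.pandas as bpd
from bigframes.ml import llm

bpd.options.experiments.semantic_operators = True  # assumed flag name
model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")

df = bpd.DataFrame({"food": ["apple", "chocolate cake"]})

# Keep rows the model judges to satisfy the instruction; passing
# ground_with_google_search=True triggers the billing warning added in this PR.
healthy = df.semantics.filter(
    "The {food} is healthy.",
    model,
    ground_with_google_search=True,
)
```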
@@ -332,6 +361,12 @@ def filter(self, instruction: str, model):
             if column not in self._df.columns:
                 raise ValueError(f"Column {column} not found.")

+        if ground_with_google_search:
+            warnings.warn(
+                "Enabling Grounding with Google Search may impact billing cost. See pricing "
+                "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
+            )
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         for column in columns:
             if df[column].dtype != dtypes.STRING_DTYPE:
@@ -345,14 +380,21 @@ def filter(self, instruction: str, model):
             model.predict(
                 self._make_prompt(df, columns, user_instruction, output_instruction),
                 temperature=0.0,
+                ground_with_google_search=ground_with_google_search,
             ),
         )

         return self._df[
             results["ml_generate_text_llm_result"].str.lower().str.contains("true")
         ]

-    def map(self, instruction: str, output_column: str, model):
+    def map(
+        self,
+        instruction: str,
+        output_column: str,
+        model,
+        ground_with_google_search: bool = False,
+    ):
         """
         Maps the DataFrame with the semantics of the user instruction.

@@ -376,21 +418,29 @@ def map(self, instruction: str, output_column: str, model):
         [2 rows x 3 columns]

         Args:
-            instruction:
+            instruction (str):
                 An instruction on how to map the data. This value must contain
                 column references by name, which should be wrapped in a pair of braces.
                 For example, if you have a column "food", you can refer to this column
                 in the instructions like:
                 "Get the ingredients of {food}."

-            output_column:
+            output_column (str):
                 The column name of the mapping result.

-            model:
+            model (bigframes.ml.llm.GeminiTextGenerator):
                 A GeminiTextGenerator provided by Bigframes ML package.

+            ground_with_google_search (bool, default False):
+                Enables Grounding with Google Search for the GeminiTextGenerator model.
+                When set to True, the model incorporates relevant information from Google
+                Search results into its responses, enhancing their accuracy and factualness.
+                Note: Using this feature may impact billing costs. Refer to the pricing
+                page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
+                The default is `False`.
+
         Returns:
-            DataFrame with attached mapping results.
+            bigframes.pandas.DataFrame: DataFrame with attached mapping results.

         Raises:
             NotImplementedError: when the semantic operator experiment is off.
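A companion sketch for `map`, reusing the `df` and `model` setup from the `filter` example above; the keyword-argument call style is just for clarity:

```python
# Attach a new column with the model's answer for each row; grounding stays opt-in.
with_ingredients = df.semantics.map(
    "Get the ingredients of {food}.",
    output_column="ingredients",
    model=model,
    ground_with_google_search=True,
)
```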
@@ -406,6 +456,12 @@ def map(self, instruction: str, output_column: str, model):
             if column not in self._df.columns:
                 raise ValueError(f"Column {column} not found.")

+        if ground_with_google_search:
+            warnings.warn(
+                "Enabling Grounding with Google Search may impact billing cost. See pricing "
+                "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
+            )
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         for column in columns:
             if df[column].dtype != dtypes.STRING_DTYPE:
@@ -421,14 +477,22 @@ def map(self, instruction: str, output_column: str, model):
             model.predict(
                 self._make_prompt(df, columns, user_instruction, output_instruction),
                 temperature=0.0,
+                ground_with_google_search=ground_with_google_search,
             )["ml_generate_text_llm_result"],
         )

         from bigframes.core.reshape import concat

         return concat([self._df, results.rename(output_column)], axis=1)

-    def join(self, other, instruction: str, model, max_rows: int = 1000):
+    def join(
+        self,
+        other,
+        instruction: str,
+        model,
+        max_rows: int = 1000,
+        ground_with_google_search: bool = False,
+    ):
         """
         Joins two dataframes by applying the instruction over each pair of rows from
         the left and right table.
@@ -455,10 +519,10 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
         [4 rows x 2 columns]

         Args:
-            other:
+            other (bigframes.pandas.DataFrame):
                 The other dataframe.

-            instruction:
+            instruction (str):
                 An instruction on how left and right rows can be joined. This value must contain
                 column references by name, which should be wrapped in a pair of braces.
                 For example: "The {city} belongs to the {country}".
@@ -467,22 +531,36 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
             self joins. For example: "The {left.employee_name} reports to {right.employee_name}"
             For unique column names, this prefix is optional.

-            model:
+            model (bigframes.ml.llm.GeminiTextGenerator):
                 A GeminiTextGenerator provided by Bigframes ML package.

-            max_rows:
+            max_rows (int, default 1000):
                 The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method
                 call will end early with an error.

+            ground_with_google_search (bool, default False):
+                Enables Grounding with Google Search for the GeminiTextGenerator model.
+                When set to True, the model incorporates relevant information from Google
+                Search results into its responses, enhancing their accuracy and factualness.
+                Note: Using this feature may impact billing costs. Refer to the pricing
+                page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
+                The default is `False`.
+
         Returns:
-            The joined dataframe.
+            bigframes.pandas.DataFrame: The joined dataframe.

         Raises:
             ValueError if the amount of data that will be sent for LLM processing is larger than max_rows.
         """
         self._validate_model(model)
         columns = self._parse_columns(instruction)

+        if ground_with_google_search:
+            warnings.warn(
+                "Enabling Grounding with Google Search may impact billing cost. See pricing "
+                "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
+            )
+
         joined_table_rows = len(self._df) * len(other)

         if joined_table_rows > max_rows:
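The guard at the end of this hunk compares the cross-join size against `max_rows` before any LLM call is made. A small sketch of the arithmetic with hypothetical sizes (the error text is illustrative, not the library's exact message):

```python
left_rows, right_rows = 50, 30
joined_table_rows = left_rows * right_rows  # 1,500 candidate pairs to score
max_rows = 1000

if joined_table_rows > max_rows:
    # join() fails early here rather than sending 1,500 prompts to the model.
    raise ValueError(
        f"Cross join produces {joined_table_rows} rows, exceeding max_rows={max_rows}."
    )
```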
@@ -545,7 +623,9 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):

         joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))

-        return joined_df.semantics.filter(instruction, model).reset_index(drop=True)
+        return joined_df.semantics.filter(
+            instruction, model, ground_with_google_search=ground_with_google_search
+        ).reset_index(drop=True)

     def search(
         self,
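Design note: `join` builds the full cross product with `merge(how="cross")` and then delegates to the semantic `filter`, which is why forwarding `ground_with_google_search` here is all the change that is needed. Roughly (a sketch, not the library's exact code), assuming `df_left` and `df_right` with `city` and `country` columns and the `model` from earlier examples:

```python
candidates = df_left.merge(df_right, how="cross", suffixes=("_left", "_right"))
joined = candidates.semantics.filter(
    "The {city} belongs to the {country}.",
    model,
    ground_with_google_search=True,
).reset_index(drop=True)
```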
@@ -644,7 +724,13 @@ def search(

         return typing.cast(bigframes.dataframe.DataFrame, search_result)

-    def top_k(self, instruction: str, model, k=10):
+    def top_k(
+        self,
+        instruction: str,
+        model,
+        k: int = 10,
+        ground_with_google_search: bool = False,
+    ):
         """
         Ranks each tuple and returns the k best according to the instruction.

@@ -682,6 +768,14 @@ def top_k(self, instruction: str, model, k=10):
             k (int, default 10):
                 The number of rows to return.

+            ground_with_google_search (bool, default False):
+                Enables Grounding with Google Search for the GeminiTextGenerator model.
+                When set to True, the model incorporates relevant information from Google
+                Search results into its responses, enhancing their accuracy and factualness.
+                Note: Using this feature may impact billing costs. Refer to the pricing
+                page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
+                The default is `False`.
+
         Returns:
             bigframes.dataframe.DataFrame: A new DataFrame with the top k rows.

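A short `top_k` sketch with the new flag (same assumed setup as the earlier examples; the instruction is applied pairwise, so it should read as a question about a single `{column}`):

```python
# Ask for the 2 best rows according to the instruction, with grounding enabled.
best_two = df.semantics.top_k(
    "Which {food} is the healthier choice?",
    model,
    k=2,
    ground_with_google_search=True,
)
```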
@@ -703,6 +797,12 @@ def top_k(self, instruction: str, model, k=10):
                 "Semantic aggregations are limited to a single column."
             )

+        if ground_with_google_search:
+            warnings.warn(
+                "Enabling Grounding with Google Search may impact billing cost. See pricing "
+                "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
+            )
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         column = columns[0]
         if df[column].dtype != dtypes.STRING_DTYPE:
@@ -743,6 +843,7 @@ def top_k(self, instruction: str, model, k=10):
                 user_instruction,
                 model,
                 k - num_selected,
+                ground_with_google_search,
             )
             num_selected += num_new_selected

@@ -757,7 +858,13 @@ def top_k(self, instruction: str, model, k=10):

     @staticmethod
     def _topk_partition(
-        df, column: str, status_column: str, user_instruction: str, model, k
+        df,
+        column: str,
+        status_column: str,
+        user_instruction: str,
+        model,
+        k: int,
+        ground_with_google_search: bool,
     ):
         output_instruction = (
             "Given a question and two documents, choose the document that best answers "
@@ -784,7 +891,12 @@ def _topk_partition(
         import bigframes.dataframe

         predict_df = typing.cast(
-            bigframes.dataframe.DataFrame, model.predict(prompt_s, temperature=0.0)
+            bigframes.dataframe.DataFrame,
+            model.predict(
+                prompt_s,
+                temperature=0.0,
+                ground_with_google_search=ground_with_google_search,
+            ),
         )

         marks = predict_df["ml_generate_text_llm_result"].str.contains("2")
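For readers of `_topk_partition`: the model is prompted to pick between two documents, and any answer containing "2" marks the right-hand candidate as the winner of that pairwise comparison. A plain-pandas illustration of the `str.contains("2")` parsing (toy answers, not model output):

```python
import pandas as pd

answers = pd.Series(["Document 2", "Document 1", "document 2 is better"])
marks = answers.str.contains("2")
print(marks.tolist())  # [True, False, True]
```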