
feat: Support axis=1 in df.apply for scalar outputs #629


Merged May 10, 2024 (47 commits)

Commits
4d3200c
feat: Support `axis=1` in `df.apply` for scalar outputs
shobsi Apr 22, 2024
ceb7dbc
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 22, 2024
22b7f32
avoid mixing other changes in the input_types param
shobsi Apr 22, 2024
5049170
use guid instead of hard coded column name
shobsi Apr 23, 2024
4f28f56
check_exact=False to avoid failing system_prerelease
shobsi Apr 23, 2024
fb92f34
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 23, 2024
2e2f32c
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 24, 2024
99e3e7b
handle index in remote function, add large system tests
shobsi Apr 24, 2024
85efbb7
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 25, 2024
7153db8
make the test case more robust
shobsi Apr 25, 2024
13c2e62
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 26, 2024
c3dddd8
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 27, 2024
5fb8148
handle non-string column names, add unsupported dtype tests
shobsi Apr 29, 2024
edbac1b
fix import
shobsi Apr 29, 2024
74aeaea
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 29, 2024
d3c07e9
use `_cached` in df.apply to catch any rf execution errors early
shobsi Apr 29, 2024
ed03d28
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 29, 2024
7122b8a
add test for row aggregates
shobsi Apr 29, 2024
8957e10
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi Apr 30, 2024
9f9b61e
add row dtype information, also test
shobsi Apr 30, 2024
6fdd282
preserve the order of input in the output
shobsi May 1, 2024
37906ee
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi May 1, 2024
2d137ca
absorb to_numpy() disparity in prerelease tests
shobsi May 1, 2024
3e45f78
add tests for column multiindex and non remote function
shobsi May 2, 2024
e31a09d
add preview note for row processing
shobsi May 2, 2024
b828860
add warning for input_types="row" and axis=1
shobsi May 2, 2024
eb383f3
introduce early check on the supported dtypes
shobsi May 2, 2024
d520337
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi May 2, 2024
7a3aa5f
adjust test after early dtype handling
shobsi May 2, 2024
4d39204
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi May 2, 2024
7383faf
address review comments
shobsi May 4, 2024
a8f036a
Merge remote-tracking branch 'refs/remotes/github/main'
shobsi May 4, 2024
84d719c
use NameError for column name parsing issue, address test coverage f…
shobsi May 4, 2024
4e96b96
address nan return handling in the gcf code
shobsi May 7, 2024
4ce3cc9
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi May 7, 2024
612055d
handle (nan, inf, -inf)
shobsi May 7, 2024
1c58ded
replace "row" by bpd.Series for input types
shobsi May 7, 2024
bede078
make the bq parity assert more readable
shobsi May 7, 2024
56a8236
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi May 7, 2024
3409bc3
fix the series name before assert
shobsi May 8, 2024
ea7e28e
fix docstring for args
shobsi May 8, 2024
14602f8
move more low level string logic in sql module
shobsi May 8, 2024
7f5f2a3
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi May 8, 2024
3bf5bee
raise explicit error when a column name cannot be supported
shobsi May 10, 2024
0149d59
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi May 10, 2024
b5f3232
keep literal_eval check on the serialization side to match
shobsi May 10, 2024
bad6df6
Merge remote-tracking branch 'refs/remotes/github/main' into shobs-rf…
shobsi May 10, 2024
101 changes: 94 additions & 7 deletions bigframes/core/blocks.py
@@ -21,11 +21,13 @@

from __future__ import annotations

import ast
import dataclasses
import functools
import itertools
import os
import random
import textwrap
import typing
from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union
import warnings
@@ -44,8 +46,8 @@
import bigframes.core.join_def as join_defs
import bigframes.core.ordering as ordering
import bigframes.core.schema as bf_schema
import bigframes.core.sql as sql
import bigframes.core.tree_properties as tree_properties
import bigframes.core.utils
import bigframes.core.utils as utils
import bigframes.core.window_spec as window_specs
import bigframes.dtypes
@@ -1437,9 +1439,7 @@ def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
)

def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
axis_number = bigframes.core.utils.get_axis_number(
"rows" if (axis is None) else axis
)
axis_number = utils.get_axis_number("rows" if (axis is None) else axis)
if axis_number == 0:
expr = self._expr
for index_col in self._index_columns:
@@ -1460,9 +1460,7 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
return self.rename(columns=lambda label: f"{prefix}{label}")

def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block:
axis_number = bigframes.core.utils.get_axis_number(
"rows" if (axis is None) else axis
)
axis_number = utils.get_axis_number("rows" if (axis is None) else axis)
if axis_number == 0:
expr = self._expr
for index_col in self._index_columns:
@@ -2072,6 +2070,95 @@ def _is_monotonic(
self._stats_cache[column_name].update({op_name: result})
return result

def _get_rows_as_json_values(self) -> Block:
# We want to preserve any ordering currently present before turning to
# direct SQL manipulation. We will restore the ordering when we rebuild
# expression.
# TODO(shobs): Replace direct SQL manipulation by structured expression
# manipulation
ordering_column_name = guid.generate_guid()
expr = self.session._cache_with_offsets(self.expr)
expr = expr.promote_offsets(ordering_column_name)
expr_sql = self.session._to_sql(expr)

# Names of the columns to serialize for the row.
# We will use the repr-eval pattern to serialize a value here and
# deserialize in the cloud function. Let's make sure that would work.
column_names = []
for col in list(self.index_columns) + [col for col in self.column_labels]:
serialized_column_name = repr(col)
try:
ast.literal_eval(serialized_column_name)
except Exception:
raise NameError(
f"Column name type '{type(col).__name__}' is not supported for row serialization."
" Please consider using a name for which literal_eval(repr(name)) works."
)

column_names.append(serialized_column_name)
column_names_csv = sql.csv(column_names, quoted=True)

# index columns count
index_columns_count = len(self.index_columns)

# column references to form the array of values for the row
column_references_csv = sql.csv(
[sql.cast_as_string(col) for col in self.expr.column_ids]
)

# types of the columns to serialize for the row
column_types = list(self.index.dtypes) + list(self.dtypes)
column_types_csv = sql.csv([str(typ) for typ in column_types], quoted=True)

# row dtype to use for deserializing the row as pandas series
pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types)
if pandas_row_dtype is None:
pandas_row_dtype = "object"
pandas_row_dtype = sql.quote(str(pandas_row_dtype))

# create a json column representing row through SQL manipulation
row_json_column_name = guid.generate_guid()
select_columns = (
[ordering_column_name] + list(self.index_columns) + [row_json_column_name]
)
select_columns_csv = sql.csv(
[sql.column_reference(col) for col in select_columns]
)
json_sql = f"""\
With T0 AS (
{textwrap.indent(expr_sql, " ")}
),
T1 AS (
SELECT *,
JSON_OBJECT(
"names", [{column_names_csv}],
"types", [{column_types_csv}],
"values", [{column_references_csv}],
"indexlength", {index_columns_count},
"dtype", {pandas_row_dtype}
) AS {row_json_column_name} FROM T0
)
SELECT {select_columns_csv} FROM T1
"""
ibis_table = self.session.ibis_client.sql(json_sql)
order_for_ibis_table = ordering.ExpressionOrdering.from_offset_col(
ordering_column_name
)
expr = core.ArrayValue.from_ibis(
self.session,
ibis_table,
[ibis_table[col] for col in select_columns if col != ordering_column_name],
hidden_ordering_columns=[ibis_table[ordering_column_name]],
ordering=order_for_ibis_table,
)
block = Block(
expr,
index_columns=self.index_columns,
column_labels=[row_json_column_name],
index_labels=self._index_labels,
)
return block


class BlockIndexProperties:
"""Accessor for the index-related block properties."""
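The serialization contract in `_get_rows_as_json_values` can be sketched in isolation: column names go through a `repr`/`literal_eval` round-trip, and the row itself travels as a JSON object with `names`, `types`, `values`, `indexlength`, and `dtype` keys. The payload below is illustrative only, not the exact wire format produced by the `JSON_OBJECT` SQL:

```python
import ast
import json

# Column names are serialized with repr(); the receiving side recovers them
# with ast.literal_eval(). Names failing that round-trip are rejected early.
def serialize_name(col):
    serialized = repr(col)
    try:
        ast.literal_eval(serialized)
    except Exception:
        raise NameError(
            f"Column name type '{type(col).__name__}' is not supported"
            " for row serialization."
        )
    return serialized

# Illustrative payload mirroring the JSON_OBJECT built in the SQL above:
row_json = json.dumps({
    "names": [serialize_name("idx"), serialize_name("a"), serialize_name(1)],
    "types": ["Int64", "Int64", "Float64"],
    "values": ["0", "10", "2.5"],  # each value is CAST(... AS STRING)
    "indexlength": 1,
    "dtype": "Float64",
})

# Deserialization side (a sketch of what the generated cloud function might do):
payload = json.loads(row_json)
names = [ast.literal_eval(name) for name in payload["names"]]
n_index = payload["indexlength"]
data_names = names[n_index:]                  # non-index labels
data_values = payload["values"][n_index:]     # non-index values, as strings
```

Note how a non-literal name (e.g. an arbitrary object) fails `literal_eval` and is refused up front, matching the explicit `NameError` raised in the block code.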
59 changes: 59 additions & 0 deletions bigframes/core/sql.py
@@ -0,0 +1,59 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Utility functions for SQL construction.
"""

from typing import Iterable


def quote(value: str):
"""Return quoted input string."""

# Let's use repr which also escapes any special characters
#
# >>> for val in [
# ... "123",
# ... "str with no special chars",
# ... "str with special chars.,'\"/\\"
# ... ]:
# ... print(f"{val} -> {repr(val)}")
# ...
# 123 -> '123'
# str with no special chars -> 'str with no special chars'
# str with special chars.,'"/\ -> 'str with special chars.,\'"/\\'

return repr(value)


def column_reference(column_name: str):
"""Return a string representing column reference in a SQL."""

return f"`{column_name}`"


def cast_as_string(column_name: str):
"""Return a string representing string casting of a column."""

return f"CAST({column_reference(column_name)} AS STRING)"


def csv(values: Iterable[str], quoted=False):
"""Return a string of comma separated values."""

if quoted:
values = [quote(val) for val in values]

return ", ".join(values)
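For illustration, the four helpers compose as follows; they are re-declared here verbatim from the diff so the snippet is self-contained:

```python
from typing import Iterable

def quote(value: str):
    """Return the input string quoted (and escaped) via repr."""
    return repr(value)

def column_reference(column_name: str):
    """Return a backtick-quoted SQL column reference."""
    return f"`{column_name}`"

def cast_as_string(column_name: str):
    """Return SQL casting a column to STRING."""
    return f"CAST({column_reference(column_name)} AS STRING)"

def csv(values: Iterable[str], quoted=False):
    """Return a comma-separated string, optionally quoting each value."""
    if quoted:
        values = [quote(val) for val in values]
    return ", ".join(values)

# Quoted names feed the "names"/"types" arrays; cast references feed "values".
names_csv = csv(["a", "b"], quoted=True)
refs_csv = csv(cast_as_string(c) for c in ("x", "y"))
```

Using `repr` for quoting means special characters (quotes, backslashes) are escaped for free, at the cost of producing Python-style rather than SQL-standard string literals.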
56 changes: 55 additions & 1 deletion bigframes/dataframe.py
@@ -34,6 +34,7 @@
Tuple,
Union,
)
import warnings

import bigframes_vendored.pandas.core.frame as vendored_pandas_frame
import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing
Expand Down Expand Up @@ -61,6 +62,7 @@
import bigframes.core.window
import bigframes.core.window_spec as window_spec
import bigframes.dtypes
import bigframes.exceptions
import bigframes.formatting_helpers as formatter
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
@@ -3308,7 +3310,59 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
ops.RemoteFunctionOp(func=func, apply_on_null=(na_action is None))
)

def apply(self, func, *, args: typing.Tuple = (), **kwargs):
def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
if utils.get_axis_number(axis) == 1:
warnings.warn(
"axis=1 scenario is in preview.",
category=bigframes.exceptions.PreviewWarning,
)

# Early check whether the dataframe dtypes are currently supported
# in the remote function
# NOTE: Keep in sync with the value converters used in the gcf code
# generated in generate_cloud_function_main_code in remote_function.py
remote_function_supported_dtypes = (
Collaborator: Ideally we'd document these limitations, too, right?

Contributor Author: Adding in #800


bigframes.dtypes.INT_DTYPE,
bigframes.dtypes.FLOAT_DTYPE,
bigframes.dtypes.BOOL_DTYPE,
bigframes.dtypes.STRING_DTYPE,
)
supported_dtypes_types = tuple(
type(dtype) for dtype in remote_function_supported_dtypes
)
supported_dtypes_hints = tuple(
str(dtype) for dtype in remote_function_supported_dtypes
)

for dtype in self.dtypes:
if not isinstance(dtype, supported_dtypes_types):
raise NotImplementedError(
f"DataFrame has a column of dtype '{dtype}' which is not supported with axis=1."
f" Supported dtypes are {supported_dtypes_hints}."
)

# Check if the function is a remote function
if not hasattr(func, "bigframes_remote_function"):
raise ValueError("For axis=1 a remote function must be used.")

# Serialize the rows as json values
block = self._get_block()
rows_as_json_series = bigframes.series.Series(
block._get_rows_as_json_values()
)

# Apply the function
result_series = rows_as_json_series._apply_unary_op(
ops.RemoteFunctionOp(func=func, apply_on_null=True)
)
result_series.name = None

# Return Series with materialized result so that any error in the remote
# function is caught early
materialized_series = result_series.cache()
return materialized_series

# Per-column apply
results = {name: func(col, *args, **kwargs) for name, col in self.items()}
if all(
[
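The early dtype gate added to `apply` can be sketched independently of BigQuery DataFrames. The sketch below compares dtype names as strings rather than with `isinstance` as the real code does, and the dtype spellings are assumptions based on the `Int64`/`Float64`/`boolean`/`string` extension dtypes:

```python
# Hypothetical stand-in for the early dtype check in DataFrame.apply(axis=1).
SUPPORTED_DTYPES = ("Int64", "Float64", "boolean", "string")

def check_axis1_dtypes(dtypes):
    """Raise early if any column dtype is unsupported for axis=1 apply."""
    for dtype in dtypes:
        if str(dtype) not in SUPPORTED_DTYPES:
            raise NotImplementedError(
                f"DataFrame has a column of dtype '{dtype}' which is not"
                f" supported with axis=1. Supported dtypes are"
                f" {SUPPORTED_DTYPES}."
            )

check_axis1_dtypes(["Int64", "string"])  # passes silently
```

Failing fast here avoids deploying a cloud function that would only error at execution time; the note in the diff says this list must stay in sync with the value converters generated in `remote_function.py`.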
4 changes: 4 additions & 0 deletions bigframes/exceptions.py
@@ -33,3 +33,7 @@ class CleanupFailedWarning(Warning):

class DefaultIndexWarning(Warning):
"""Default index may cause unexpected costs."""


class PreviewWarning(Warning):
"""The feature is in preview."""
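Because `PreviewWarning` subclasses `Warning`, the standard `warnings` filters apply to it; a small self-contained sketch (the class is re-declared here so the snippet runs standalone):

```python
import warnings

# Stand-in matching the PreviewWarning added in bigframes/exceptions.py.
class PreviewWarning(Warning):
    """The feature is in preview."""

# Capture (or, via filterwarnings, silence) the preview notice by category:
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warnings.warn(
        "axis=1 scenario is in preview.", category=PreviewWarning
    )
```

Users who accept the preview status can suppress just this category with `warnings.filterwarnings("ignore", category=PreviewWarning)` without muting other warnings.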