Skip to content

feat: implement bigframes.bigquery.json_extract_array #910

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 23, 2024
Merged
32 changes: 32 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,38 @@ def json_extract(
return series._apply_unary_op(ops.JSONExtract(json_path=json_path))


def json_extract_array(
    series: series.Series,
    json_path: str = "$",
) -> series.Series:
    """Extracts a JSON array and converts it to a SQL array of JSON-formatted
    `STRING` or `JSON` values.

    Single quotes and brackets are used to escape invalid JSONPath characters
    in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
        >>> bbq.json_extract_array(s)
        0    ['1' '2' '3']
        1        ['4' '5']
        dtype: list<item: string>[pyarrow]

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path (str):
            The JSON path identifying the array that you want to obtain from
            the input. Defaults to ``"$"``, the JSONPath root.

    Returns:
        bigframes.series.Series: A new Series in which each value is an array
        of the JSON-formatted `STRING` or `JSON` elements found at
        ``json_path``.
    """
    # Build the op first, then apply it element-wise to the input Series.
    extract_array_op = ops.JSONExtractArray(json_path=json_path)
    return series._apply_unary_op(extract_array_op)


# Search functions defined from
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions

Expand Down
12 changes: 12 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -947,6 +947,11 @@ def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
return json_extract(json_obj=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
    """Compile ``ops.JSONExtractArray`` for the ibis value ``x``.

    Registered with ``pass_op=True`` so that the op instance is passed in and
    its ``json_path`` can be forwarded to the ``json_extract_array`` builtin.
    """
    path = op.json_path
    return json_extract_array(json_obj=x, json_path=path)


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
Expand Down Expand Up @@ -1581,6 +1586,13 @@ def json_extract(
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""


@ibis.udf.scalar.builtin(name="json_extract_array")
def json_extract_array(
    json_obj: ibis_dtypes.JSON,
    json_path: ibis_dtypes.str,
) -> ibis_dtypes.Array[ibis_dtypes.String]:
    """Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted
    STRINGs or JSON values.

    This is only a declaration of the builtin scalar function named
    ``json_extract_array``; the Python body is intentionally empty.

    Args:
        json_obj: The JSON value to read the array from.
        json_path: The JSON path locating the array inside ``json_obj``.
    """


@ibis.udf.scalar.builtin(name="ML.DISTANCE")
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
"""Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""
17 changes: 17 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,23 @@ def output_type(self, *input_types):
return input_type


@dataclasses.dataclass(frozen=True)
class JSONExtractArray(UnaryOp):
    """Unary op that extracts the JSON array found at ``json_path``."""

    name: typing.ClassVar[str] = "json_extract_array"
    # JSON path of the array to extract from each input value.
    json_path: str

    def output_type(self, *input_types):
        """Return the result dtype: an Arrow-backed list of strings.

        Only the first entry of ``input_types`` is inspected.

        Raises:
            TypeError: If the first input type is not JSON-like according to
                ``dtypes.is_json_like``.
        """
        source_dtype = input_types[0]
        if dtypes.is_json_like(source_dtype):
            element_type = dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)
            return pd.ArrowDtype(pa.list_(element_type))
        raise TypeError(
            "Input type must be an valid JSON object or JSON-formatted string type."
            f" Received type: {source_dtype}"
        )


# Binary Ops
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
Expand Down
25 changes: 25 additions & 0 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,28 @@ def test_json_extract_from_string():
def test_json_extract_w_invalid_series_type():
    """json_extract must raise TypeError for a Series of integers."""
    with pytest.raises(TypeError):
        # Construction stays inside the context, exactly as in the one-liner form.
        integer_series = bpd.Series([1, 2])
        bbq.json_extract(integer_series, "$.a")


def test_json_extract_array_from_json_strings():
    """Arrays nested under key "a" are extracted as lists of strings."""
    json_objects = bpd.Series(['{"a": [1, 2, 3]}', '{"a": []}', '{"a": [4,5]}'])

    got = bbq.json_extract_array(json_objects, "$.a").to_pandas()
    want = bpd.Series([["1", "2", "3"], [], ["4", "5"]]).to_pandas()

    pd.testing.assert_series_equal(got, want)


def test_json_extract_array_from_array_strings():
    """Top-level JSON array strings are extracted with the default json_path."""
    json_arrays = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])

    got = bbq.json_extract_array(json_arrays).to_pandas()
    want = bpd.Series([["1", "2", "3"], [], ["4", "5"]]).to_pandas()

    pd.testing.assert_series_equal(got, want)


def test_json_extract_array_w_invalid_series_type():
    """json_extract_array must raise TypeError for a Series of integers."""
    with pytest.raises(TypeError):
        # Construction stays inside the context, exactly as in the one-liner form.
        integer_series = bpd.Series([1, 2])
        bbq.json_extract_array(integer_series)