Skip to content

feat: support datetime related casting in (Series|DataFrame|Index).astype #442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 15, 2024
59 changes: 46 additions & 13 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,11 +634,56 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
return struct_value[name].name(name)


def numeric_to_datatime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
    """Interpret a numeric ibis value as an epoch offset in ``unit`` and
    return it as a UTC timestamp expression.

    NOTE(review): the function name contains a typo ("datatime"); it is kept
    as-is because existing call sites in this module reference it.

    Args:
        x: An integer or floating ibis value holding an epoch offset.
        unit: Time unit of ``x``; must be a key of UNIT_TO_US_CONVERSION_FACTORS.

    Returns:
        A timestamp expression with UTC timezone.

    Raises:
        TypeError: if ``x`` is not an integer or floating ibis value.
        ValueError: if ``unit`` is not a supported time unit.
    """
    if not isinstance(x, (ibis_types.IntegerValue, ibis_types.FloatingValue)):
        raise TypeError("Non-numerical types are not supposed to reach this function.")

    factor = UNIT_TO_US_CONVERSION_FACTORS.get(unit)
    if factor is None:
        raise ValueError(f"Cannot convert input with unit '{unit}'.")
    microseconds = (x * factor).cast(ibis_dtypes.int64)

    # Note: Due to an issue where casting directly to a timestamp
    # without a timezone does not work, we first cast to UTC. This
    # approach appears to bypass a potential bug in Ibis's cast function,
    # allowing for subsequent casting to a timestamp type without timezone
    # information. Further investigation is needed to confirm this behavior.
    return microseconds.to_timestamp(unit="us").cast(
        ibis_dtypes.Timestamp(timezone="UTC")
    )


@scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True)
def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
    """Compile ops.AsTypeOp: cast ``x`` to the ibis type matching ``op.to_type``.

    Handles the datetime-related conversions (DATETIME/TIME -> INT and
    INT -> TIMESTAMP/DATETIME/TIME) explicitly; everything else is delegated
    to bigframes.dtypes.cast_ibis_value.
    """
    to_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(op.to_type)
    if isinstance(x, ibis_types.NullScalar):
        return ibis_types.null().cast(to_type)

    # When casting DATETIME column into INT column, we need to convert the column into TIMESTAMP first.
    if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.timestamp:
        x_converted = x.cast(ibis_dtypes.Timestamp(timezone="UTC"))
        return bigframes.dtypes.cast_ibis_value(x_converted, to_type)

    if to_type == ibis_dtypes.int64 and x.type() == ibis_dtypes.time:
        # The conversion unit is set to "us" (microseconds) for consistency
        # with pandas converting time64[us][pyarrow] to int64[pyarrow].
        return x.delta(ibis.time("00:00:00"), part="microsecond")

    if x.type() == ibis_dtypes.int64:
        # The conversion unit is set to "us" (microseconds) for consistency
        # with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
        # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
        # Only build the conversion expression when the target is actually a
        # datetime-like type; other targets fall through to the generic cast
        # below (the original code constructed the conversion unconditionally).
        unit = "us"
        if to_type == ibis_dtypes.timestamp:
            return numeric_to_datatime(x, unit).cast(ibis_dtypes.Timestamp())
        if to_type == ibis_dtypes.Timestamp(timezone="UTC"):
            return numeric_to_datatime(x, unit)
        if to_type == ibis_dtypes.time:
            return numeric_to_datatime(x, unit).time()

    return bigframes.dtypes.cast_ibis_value(x, to_type)


Expand Down Expand Up @@ -677,19 +722,7 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
# The default unit is set to "ns" (nanoseconds) for consistency
# with pandas, where "ns" is the default unit for datetime operations.
unit = op.unit or "ns"
if unit not in UNIT_TO_US_CONVERSION_FACTORS:
raise ValueError(f"Cannot convert input with unit '{unit}'.")
x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit]
x_converted = x_converted.cast(ibis_dtypes.int64)

# Note: Due to an issue where casting directly to a timestamp
# without a timezone does not work, we first cast to UTC. This
# approach appears to bypass a potential bug in Ibis's cast function,
# allowing for subsequent casting to a timestamp type without timezone
# information. Further investigation is needed to confirm this behavior.
x = x_converted.to_timestamp(unit="us").cast(
ibis_dtypes.Timestamp(timezone="UTC")
)
x = numeric_to_datatime(x, unit)

return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None))

Expand Down
48 changes: 39 additions & 9 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
"boolean",
"Float64",
"Int64",
"int64[pyarrow]",
"string",
"string[pyarrow]",
"timestamp[us, tz=UTC][pyarrow]",
Expand Down Expand Up @@ -173,6 +174,9 @@
# "string" and "string[pyarrow]" are accepted
BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow")

# special case - both "Int64" and "int64[pyarrow]" are accepted
BIGFRAMES_STRING_TO_BIGFRAMES["int64[pyarrow]"] = pd.Int64Dtype()

# For the purposes of dataframe.memory_usage
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
DTYPE_BYTE_SIZES = {
Expand Down Expand Up @@ -310,11 +314,12 @@ def bigframes_dtype_to_ibis_dtype(
textwrap.dedent(
f"""
Unexpected data type {bigframes_dtype}. The following
str dtypes are supppted: 'boolean','Float64','Int64', 'string',
'string[pyarrow]','timestamp[us, tz=UTC][pyarrow]',
'timestamp[us][pyarrow]','date32[day][pyarrow]',
'time64[us][pyarrow]'. The following pandas.ExtensionDtype are
supported: pandas.BooleanDtype(), pandas.Float64Dtype(),
str dtypes are supppted: 'boolean','Float64','Int64',
'int64[pyarrow]','string','string[pyarrow]',
'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
'date32[day][pyarrow]','time64[us][pyarrow]'.
The following pandas.ExtensionDtype are supported:
pandas.BooleanDtype(), pandas.Float64Dtype(),
pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
pd.ArrowDtype(pa.timestamp("us")),
Expand Down Expand Up @@ -434,6 +439,9 @@ def cast_ibis_value(
ibis_dtypes.string,
ibis_dtypes.Decimal(precision=38, scale=9),
ibis_dtypes.Decimal(precision=76, scale=38),
ibis_dtypes.time,
ibis_dtypes.timestamp,
ibis_dtypes.Timestamp(timezone="UTC"),
),
ibis_dtypes.float64: (
ibis_dtypes.string,
Expand All @@ -447,8 +455,15 @@ def cast_ibis_value(
ibis_dtypes.Decimal(precision=38, scale=9),
ibis_dtypes.Decimal(precision=76, scale=38),
ibis_dtypes.binary,
ibis_dtypes.date,
ibis_dtypes.timestamp,
ibis_dtypes.Timestamp(timezone="UTC"),
),
ibis_dtypes.date: (
ibis_dtypes.string,
ibis_dtypes.timestamp,
ibis_dtypes.Timestamp(timezone="UTC"),
),
ibis_dtypes.date: (ibis_dtypes.string,),
ibis_dtypes.Decimal(precision=38, scale=9): (
ibis_dtypes.float64,
ibis_dtypes.Decimal(precision=76, scale=38),
Expand All @@ -457,9 +472,24 @@ def cast_ibis_value(
ibis_dtypes.float64,
ibis_dtypes.Decimal(precision=38, scale=9),
),
ibis_dtypes.time: (),
ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),),
ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,),
ibis_dtypes.time: (
ibis_dtypes.int64,
ibis_dtypes.string,
),
ibis_dtypes.timestamp: (
ibis_dtypes.date,
ibis_dtypes.int64,
ibis_dtypes.string,
ibis_dtypes.time,
ibis_dtypes.Timestamp(timezone="UTC"),
),
ibis_dtypes.Timestamp(timezone="UTC"): (
ibis_dtypes.date,
ibis_dtypes.int64,
ibis_dtypes.string,
ibis_dtypes.time,
ibis_dtypes.timestamp,
),
ibis_dtypes.binary: (ibis_dtypes.string,),
}

Expand Down
98 changes: 98 additions & 0 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2625,6 +2625,9 @@ def foo(x):
("int64_col", "boolean"),
("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))),
("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))),
("int64_col", pd.ArrowDtype(pa.timestamp("us"))),
("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
("int64_col", "time64[us][pyarrow]"),
("bool_col", "Int64"),
("bool_col", "string[pyarrow]"),
("string_col", "binary[pyarrow]"),
Expand All @@ -2633,9 +2636,17 @@ def foo(x):
# raises a deprecation warning to use tz_localize/tz_convert instead,
# but BigQuery always stores values as UTC and doesn't have to deal
# with timezone conversions, so we'll allow it.
("timestamp_col", "date32[day][pyarrow]"),
("timestamp_col", "time64[us][pyarrow]"),
("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))),
("datetime_col", "date32[day][pyarrow]"),
("datetime_col", "string[pyarrow]"),
("datetime_col", "time64[us][pyarrow]"),
("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
("date_col", "string[pyarrow]"),
("date_col", pd.ArrowDtype(pa.timestamp("us"))),
("date_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
("time_col", "string[pyarrow]"),
# TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int
# ("float64_col", "Int64"),
# TODO(bmil): decide whether to fix Ibis bug: BigQuery backend
Expand All @@ -2653,6 +2664,24 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type):
pd.testing.assert_series_equal(bf_result, pd_result)


@pytest.mark.parametrize(
    ("column", "to_type"),
    [
        ("timestamp_col", "int64[pyarrow]"),
        ("datetime_col", "int64[pyarrow]"),
        ("time_col", "int64[pyarrow]"),
    ],
)
@skip_legacy_pandas
def test_date_time_astype_int(
    scalars_df_index, scalars_pandas_df_index, column, to_type
):
    """Casting datetime-like columns to int matches pandas values."""
    bf_result = scalars_df_index[column].astype(to_type).to_pandas()
    pd_result = scalars_pandas_df_index[column].astype(to_type)
    # check_dtype=False: the resulting dtype may differ from pandas'
    # int64[pyarrow] (presumably BigFrames yields Int64 — see the dtype
    # assertion added below in review); only the values must match here.
    pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add another assert to make sure the type of bf_result is expected, though it may be different as pd_result?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea!

assert bf_result.dtype == "Int64"


def test_string_astype_int():
pd_series = pd.Series(["4", "-7", "0", " -03"])
bf_series = series.Series(pd_series)
Expand All @@ -2676,6 +2705,75 @@ def test_string_astype_float():
pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_string_astype_date():
    """Casting a pyarrow-string Series of ISO dates to date32 matches pandas."""
    date_strings = ["2014-08-15", "2215-08-15", "2016-02-29"]
    pd_series = pd.Series(date_strings).astype(pd.ArrowDtype(pa.string()))
    bf_series = series.Series(pd_series)

    target_dtype = "date32[day][pyarrow]"
    bf_result = bf_series.astype(target_dtype).to_pandas()
    pd_result = pd_series.astype(target_dtype)

    pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_string_astype_datetime():
    """Casting a pyarrow-string Series of naive datetimes to timestamp[us]
    matches pandas."""
    datetime_strings = [
        "2014-08-15 08:15:12",
        "2015-08-15 08:15:12.654754",
        "2016-02-29 00:00:00",
    ]
    pd_series = pd.Series(datetime_strings).astype(pd.ArrowDtype(pa.string()))
    bf_series = series.Series(pd_series)

    target_dtype = pd.ArrowDtype(pa.timestamp("us"))
    bf_result = bf_series.astype(target_dtype).to_pandas()
    pd_result = pd_series.astype(target_dtype)

    pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_string_astype_timestamp():
    """Casting a pyarrow-string Series of offset-aware datetimes to a UTC
    timestamp matches pandas."""
    timestamp_strings = [
        "2014-08-15 08:15:12+00:00",
        "2015-08-15 08:15:12.654754+05:00",
        "2016-02-29 00:00:00+08:00",
    ]
    pd_series = pd.Series(timestamp_strings).astype(pd.ArrowDtype(pa.string()))
    bf_series = series.Series(pd_series)

    target_dtype = pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
    bf_result = bf_series.astype(target_dtype).to_pandas()
    pd_result = pd_series.astype(target_dtype)

    pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_timestamp_astype_string():
    """Casting a UTC-timestamp Series to string yields UTC-normalized text
    with a pyarrow string dtype."""
    timestamp_strings = [
        "2014-08-15 08:15:12+00:00",
        "2015-08-15 08:15:12.654754+05:00",
        "2016-02-29 00:00:00+08:00",
    ]
    bf_series = series.Series(timestamp_strings).astype(
        pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
    )

    bf_result = bf_series.astype(pa.string()).to_pandas()

    # Values are normalized to UTC and rendered with a "+00" offset suffix.
    expected_result = pd.Series(
        [
            "2014-08-15 08:15:12+00",
            "2015-08-15 03:15:12.654754+00",
            "2016-02-28 16:00:00+00",
        ]
    )
    pd.testing.assert_series_equal(
        bf_result, expected_result, check_index_type=False, check_dtype=False
    )
    assert bf_result.dtype == "string[pyarrow]"


@pytest.mark.parametrize(
"index",
[0, 5, -2],
Expand Down