"""
**File:** ``coercion.py``
**Region:** ``ds_provider_azure_py_lib/serde/coercion``
Coercion functions to convert between pandas/numpy/pyarrow types and Azure Table Storage-compatible types.
"""
import base64
import contextlib
import json
from datetime import date, datetime, time, timezone
from typing import Any
from uuid import UUID
import pandas as pd
from azure.data.tables import EdmType
_INT32_MIN = -(2**31)
_INT32_MAX = 2**31 - 1
def _coerce_for_json(value: Any) -> Any: # noqa: PLR0912
"""
Recursively coerce a value into a JSON-serializable form.
This helper is intended for preparing values to be passed to ``json.dumps``.
It preserves the overall structure of the input while converting unsupported
types into JSON-friendly representations. Containers (lists, tuples, and
dicts) are processed recursively.
Args:
value: Any Python, pandas, NumPy, or PyArrow value to coerce. May be a
scalar (e.g. ``int``, ``str``, ``pd.Timestamp``), a container
(``list``, ``tuple``, ``dict``), or a library-specific scalar
(e.g. ``numpy.scalar``, ``pyarrow.Scalar``, ``pd.Timedelta``).
Returns:
A JSON-serializable value with the same logical content as ``value``,
where:
* ``None`` and pandas/NumPy/pyarrow missing values are returned as
``None``.
* ``list`` and ``tuple`` inputs become lists whose elements have been
recursively coerced.
* ``dict`` inputs become dicts whose values have been recursively
coerced.
* ``pyarrow`` scalars are converted via ``.as_py()`` and then coerced
again.
* ``numpy`` scalars are converted via ``.item()`` and then coerced
again.
* ``pd.Timestamp`` values are converted to ISO 8601 strings, with
timezone-naive timestamps localized to UTC before conversion.
* ``pd.Timedelta`` values are converted to ISO 8601 duration strings
of the form ``"PT<seconds>S"`` (including a leading ``-`` for
negative values).
* ``uuid.UUID``, :class:`datetime.date`, :class:`datetime.datetime`,
and :class:`datetime.time` values are converted to their ISO format
string representations.
* ``bytes`` values are base64-encoded and returned as UTF-8 strings.
* Integer values outside the 32-bit signed range are returned as
``[value, "EdmType.INT64"]`` to preserve precision when encoded as
JSON.
* All other values are returned unchanged.
Examples:
Basic scalar coercion:
>>> _coerce_for_json(42)
42
>>> _coerce_for_json(None)
None
Large integer (outside 32-bit range) is wrapped for INT64 handling:
>>> _coerce_for_json(2**40)
[1099511627776, "EdmType.INT64"]
Timestamp and date/time coercion:
>>> _coerce_for_json(pd.Timestamp("2024-01-01T00:00:00Z"))
'2024-01-01T00:00:00+00:00'
>>> _coerce_for_json(datetime(2024, 1, 1, 12, 0, 0))
'2024-01-01T12:00:00'
Bytes and nested structures:
>>> _coerce_for_json({"data": b"hello", "ids": (1, 2, 3)})
{'data': 'aGVsbG8=', 'ids': [1, 2, 3]}
"""
if value is None:
return None
# Handle lists
if isinstance(value, list):
return [_coerce_for_json(item) for item in value]
# Handle tuples (convert to list for JSON)
if isinstance(value, tuple):
return [_coerce_for_json(item) for item in value]
# Handle dicts
if isinstance(value, dict):
return {k: _coerce_for_json(v) for k, v in value.items()}
# For other types, apply the standard coercion logic
# (but skip the top-level list/tuple/dict checks to avoid recursion)
# PyArrow scalar → native Python type via .as_py()
if hasattr(value, "as_py") and hasattr(value, "type"):
with contextlib.suppress(TypeError, ValueError, AttributeError):
return _coerce_for_json(value.as_py())
try:
na_result = pd.isna(value)
# Handle both scalar and array-like results
# For array-like, check size to avoid ambiguity warning
if hasattr(na_result, "size") and na_result.size > 0:
# For single-element arrays, check the value
if na_result.size == 1 and na_result.item():
return None
elif isinstance(na_result, bool) and na_result:
# For scalar bool results
return None
except (ValueError, TypeError):
pass
# pd.Timestamp → ISO format string (tz-naive gets localized to UTC)
if isinstance(value, pd.Timestamp):
if value.tzinfo is None:
value = value.tz_localize("UTC")
return value.isoformat()
# pd.Timedelta
if isinstance(value, pd.Timedelta):
seconds = value.total_seconds()
sign = "-" if seconds < 0 else ""
seconds = abs(seconds)
seconds_str = str(int(seconds)) if seconds.is_integer() else str(seconds)
return f"{sign}PT{seconds_str}S"
# UUID
if isinstance(value, UUID):
return str(value)
# date (but not datetime)
if isinstance(value, date) and not isinstance(value, datetime):
return value.isoformat()
# datetime → ISO format string
if isinstance(value, datetime):
return value.isoformat()
# time
if isinstance(value, time):
return value.isoformat()
# bytes
if isinstance(value, bytes):
return base64.b64encode(value).decode("utf-8")
# numpy/pyarrow scalar
if hasattr(value, "item") and not isinstance(value, (bytes, memoryview)):
return _coerce_for_json(value.item())
# Large ints
if isinstance(value, int) and not isinstance(value, bool) and (value < _INT32_MIN or value > _INT32_MAX):
# Return as list for JSON serialization (will be wrapped as tuple at top-level)
return [value, "EdmType.INT64"]
return value
def _coerce_value(value: Any) -> Any: # noqa: PLR0912
"""
Convert a pandas / numpy / pyarrow scalar to a type the Azure Table SDK accepts.
Args:
value: The value to coerce.
Returns:
A value that the Azure Table SDK can serialize.
"""
# PyArrow scalar → native Python type via .as_py()
# Check for PyArrow scalars first (they have type and as_py attributes)
if hasattr(value, "as_py") and hasattr(value, "type"):
with contextlib.suppress(TypeError, ValueError, AttributeError):
value = value.as_py()
# numpy / pyarrow scalar → native Python type via .item()
# Do this FIRST before any other checks to ensure proper type handling
if hasattr(value, "item") and not isinstance(value, (bytes, memoryview)):
value = value.item()
# NA-like values (NaT, NaN, pd.NA) → None (property omitted from entity)
try:
na_result = pd.isna(value)
# Handle both scalar and array-like results
# For array-like, check size to avoid ambiguity warning
if hasattr(na_result, "size") and na_result.size > 0:
# For single-element arrays, check the value
if na_result.size == 1 and na_result.item():
return None
elif isinstance(na_result, bool) and na_result:
# For scalar bool results
return None
except (ValueError, TypeError):
pass
# Sequences (lists, tuples) → JSON string
if isinstance(value, (list, tuple)):
coerced = _coerce_for_json(value)
return json.dumps(coerced)
# Nested dicts → JSON string
if isinstance(value, dict):
coerced = _coerce_for_json(value)
return json.dumps(coerced)
# pd.Timestamp → native datetime.datetime (tz-naive gets localized to UTC)
if isinstance(value, pd.Timestamp):
if value.tzinfo is None:
value = value.tz_localize("UTC")
return value.to_pydatetime()
# pd.Timedelta → ISO 8601 duration string (e.g., "PT86400S" for 1 day)
if isinstance(value, pd.Timedelta):
seconds = value.total_seconds()
sign = "-" if seconds < 0 else ""
seconds = abs(seconds)
seconds_str = str(int(seconds)) if seconds.is_integer() else str(seconds)
return f"{sign}PT{seconds_str}S"
# UUID → string representation
if isinstance(value, UUID):
return str(value)
# date (but not datetime) → ISO 8601 date string
if isinstance(value, date) and not isinstance(value, datetime):
return value.isoformat()
# time → ISO 8601 time string
if isinstance(value, time):
return value.isoformat()
# bytes → base64 string (standard for binary data in APIs)
if isinstance(value, bytes):
return base64.b64encode(value).decode("utf-8")
# After scalar unboxing, handle datetime objects (e.g., from numpy.datetime64)
if isinstance(value, datetime) and value.tzinfo is None:
# Localize naive datetime to UTC
return value.replace(tzinfo=timezone.utc)
# Large ints that overflow Azure Table's default Int32 → explicit Int64
if isinstance(value, int) and not isinstance(value, bool) and (value < _INT32_MIN or value > _INT32_MAX):
return (value, EdmType.INT64)
return value