my.core.serialize: orjson with additional default and _serialize hook (#140)
basic orjson serialize, json.dumps fallback
Lots of surrounding changes from this discussion:
0593c69056
This commit is contained in:
parent
02a9fb5e8f
commit
eb26cf8633
8 changed files with 224 additions and 24 deletions
|
@ -531,7 +531,13 @@ def test_guess_datetime() -> None:
|
||||||
# TODO test @property?
|
# TODO test @property?
|
||||||
|
|
||||||
|
|
||||||
def asdict(thing) -> Json:
|
def is_namedtuple(thing: Any) -> bool:
    """
    Basic duck-typed check to see if ``thing`` is namedtuple-like.

    Returns True when ``thing`` exposes a callable ``_asdict`` attribute and
    False otherwise. It does not verify that ``thing`` is an actual tuple.
    """
    # callable(None) is False, so a missing attribute yields a real bool here;
    # the previous `_asdict and callable(_asdict)` leaked None in that case,
    # contradicting the declared return type
    return callable(getattr(thing, '_asdict', None))
|
||||||
|
|
||||||
|
|
||||||
|
def asdict(thing: Any) -> Json:
|
||||||
# todo primitive?
|
# todo primitive?
|
||||||
# todo exception?
|
# todo exception?
|
||||||
if isinstance(thing, dict):
|
if isinstance(thing, dict):
|
||||||
|
@ -539,19 +545,11 @@ def asdict(thing) -> Json:
|
||||||
import dataclasses as D
|
import dataclasses as D
|
||||||
if D.is_dataclass(thing):
|
if D.is_dataclass(thing):
|
||||||
return D.asdict(thing)
|
return D.asdict(thing)
|
||||||
# must be a NT otherwise?
|
if is_namedtuple(thing):
|
||||||
# todo add a proper check.. ()
|
|
||||||
return thing._asdict()
|
return thing._asdict()
|
||||||
|
raise TypeError(f'Could not convert object {thing} to dict')
|
||||||
|
|
||||||
|
|
||||||
# todo not sure about naming
|
|
||||||
def to_jsons(it) -> Iterable[Json]:
|
|
||||||
from .error import error_to_json # prevent circular import
|
|
||||||
for r in it:
|
|
||||||
if isinstance(r, Exception):
|
|
||||||
yield error_to_json(r)
|
|
||||||
else:
|
|
||||||
yield asdict(r)
|
|
||||||
|
|
||||||
|
|
||||||
datetime_naive = datetime
|
datetime_naive = datetime
|
||||||
|
|
|
@ -145,15 +145,9 @@ def extract_error_datetime(e: Exception) -> Optional[datetime]:
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
from .common import Json
|
from .common import Json
|
||||||
def error_to_json(e: Exception, *, dt_col: str='dt', tz=None) -> Json:
|
def error_to_json(e: Exception) -> Json:
|
||||||
edt = extract_error_datetime(e)
|
|
||||||
if edt is not None and edt.tzinfo is None and tz is not None:
|
|
||||||
edt = edt.replace(tzinfo=tz)
|
|
||||||
estr = ''.join(traceback.format_exception(Exception, e, e.__traceback__))
|
estr = ''.join(traceback.format_exception(Exception, e, e.__traceback__))
|
||||||
return {
|
return {'error': estr}
|
||||||
'error': estr,
|
|
||||||
dt_col : edt,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def test_datetime_errors() -> None:
|
def test_datetime_errors() -> None:
|
||||||
|
|
|
@ -7,7 +7,7 @@ from datetime import datetime
|
||||||
from pprint import pformat
|
from pprint import pformat
|
||||||
from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, List, Dict
|
from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, List, Dict
|
||||||
from . import warnings, Res
|
from . import warnings, Res
|
||||||
from .common import LazyLogger
|
from .common import LazyLogger, Json, asdict
|
||||||
|
|
||||||
logger = LazyLogger(__name__)
|
logger = LazyLogger(__name__)
|
||||||
|
|
||||||
|
@ -97,8 +97,23 @@ def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing',
|
||||||
# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?
|
# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?
|
||||||
|
|
||||||
|
|
||||||
from .error import error_to_json
|
def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Json:
|
||||||
error_to_row = error_to_json # todo deprecate?
|
from .error import error_to_json, extract_error_datetime
|
||||||
|
edt = extract_error_datetime(e)
|
||||||
|
if edt is not None and edt.tzinfo is None and tz is not None:
|
||||||
|
edt = edt.replace(tzinfo=tz)
|
||||||
|
err_dict: Json = error_to_json(e)
|
||||||
|
err_dict[dt_col] = edt
|
||||||
|
return err_dict
|
||||||
|
|
||||||
|
|
||||||
|
# todo not sure about naming
|
||||||
|
def to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
    """
    Lazily convert every item of ``it`` into a JSON-like dict.

    Exceptions are converted with error_to_row (using its default arguments),
    anything else is converted with asdict.
    """
    for item in it:
        convert = error_to_row if isinstance(item, Exception) else asdict
        yield convert(item)
|
||||||
|
|
||||||
|
|
||||||
# mm. https://github.com/python/mypy/issues/8564
|
# mm. https://github.com/python/mypy/issues/8564
|
||||||
|
@ -111,6 +126,7 @@ def _as_columns(s: Schema) -> Dict[str, Type]:
|
||||||
if D.is_dataclass(s):
|
if D.is_dataclass(s):
|
||||||
return {f.name: f.type for f in D.fields(s)}
|
return {f.name: f.type for f in D.fields(s)}
|
||||||
# else must be NamedTuple??
|
# else must be NamedTuple??
|
||||||
|
# todo assert my.core.common.is_namedtuple?
|
||||||
return getattr(s, '_field_types')
|
return getattr(s, '_field_types')
|
||||||
|
|
||||||
|
|
||||||
|
@ -124,7 +140,6 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataF
|
||||||
# https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
|
# https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
|
||||||
# same for NamedTuple -- seems that it takes whatever schema the first NT has
|
# same for NamedTuple -- seems that it takes whatever schema the first NT has
|
||||||
# so we need to convert each individually... sigh
|
# so we need to convert each individually... sigh
|
||||||
from .common import to_jsons
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
columns = None if schema is None else list(_as_columns(schema).keys())
|
columns = None if schema is None else list(_as_columns(schema).keys())
|
||||||
return pd.DataFrame(to_jsons(it), columns=columns)
|
return pd.DataFrame(to_jsons(it), columns=columns)
|
||||||
|
|
189
my/core/serialize.py
Normal file
189
my/core/serialize.py
Normal file
|
@ -0,0 +1,189 @@
|
||||||
|
import datetime
|
||||||
|
from typing import Any, Optional, Callable
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
from .common import is_namedtuple
|
||||||
|
from .error import error_to_json
|
||||||
|
|
||||||
|
# note: it would be nice to combine the 'asdict' and _default_encode to some function
|
||||||
|
# that takes a complex python object and returns JSON-compatible fields, while still
|
||||||
|
# being a dictionary.
|
||||||
|
# a workaround is to encode with dumps below and then json.loads it immediately
|
||||||
|
|
||||||
|
|
||||||
|
DefaultEncoder = Callable[[Any], Any]
|
||||||
|
|
||||||
|
|
||||||
|
def _default_encode(obj: Any) -> Any:
    """
    Fallback encoder: turns complex python objects into simpler
    representations before they are serialized to a JSON string.

    Handled, in this order:
    - namedtuple-like objects -> result of their ``_asdict()``
    - datetime.timedelta      -> total number of seconds
    - Exception               -> result of error_to_json
    - objects with a callable ``_serialize`` attribute -> result of ``_serialize()``

    Raises TypeError for anything else.
    """
    if is_namedtuple(obj):
        # orjson doesn't serialize namedtuples to avoid serializing
        # them as tuples (arrays), since they're technically a subclass
        encoded = obj._asdict()
    elif isinstance(obj, datetime.timedelta):
        encoded = obj.total_seconds()
    elif isinstance(obj, Exception):
        encoded = error_to_json(obj)
    elif callable(getattr(obj, '_serialize', None)):
        # note: _serialize would only be called for items which aren't already
        # serialized as a dataclass or namedtuple
        # discussion: https://github.com/karlicoss/HPI/issues/138#issuecomment-801704929
        encoded = obj._serialize()
    else:
        raise TypeError(f"Could not serialize object of type {type(obj).__name__}")
    return encoded
|
||||||
|
|
||||||
|
|
||||||
|
# could possibly run multiple times/raise warning if you provide different 'default'
|
||||||
|
# functions or change the kwargs? The alternative is to maintain all of this at the module
|
||||||
|
# level, which is just as annoying
|
||||||
|
@lru_cache(maxsize=None)
def _dumps_factory(**kwargs) -> Callable[[Any], str]:
    """
    Build (and cache per distinct kwargs) a function that dumps an object to a JSON string.

    The 'default' kwarg is always replaced: if the caller supplied a callable,
    it is tried first and _default_encode is used when it raises TypeError;
    otherwise _default_encode is used on its own.

    The remaining kwargs are forwarded as-is to orjson.dumps, or to json.dumps
    when orjson can't be imported (a warning is emitted in that case).
    """
    user_default: Optional[DefaultEncoder] = kwargs.get("default")

    def _chained_default(obj: Any) -> Any:
        # only installed below when the user passed a callable 'default':
        # try using that to serialize before _default_encode
        try:
            return user_default(obj)  # type: ignore[misc]
        except TypeError:
            # expected TypeError, signifies couldn't be encoded by custom
            # serializer function. Try _default_encode from here
            return _default_encode(obj)

    has_user_default = user_default is not None and callable(user_default)
    kwargs["default"] = _chained_default if has_user_default else _default_encode

    try:
        import orjson
    except ModuleNotFoundError:
        import json
        import warnings

        warnings.warn("You might want to install 'orjson' to support serialization for lots more types!")

        def _stdlib_dumps(obj: Any) -> str:
            return json.dumps(obj, **kwargs)

        return _stdlib_dumps

    # todo: add orjson.OPT_NON_STR_KEYS? would require some bitwise ops
    # most keys are typically attributes from a NT/Dataclass,
    # so most seem to work: https://github.com/ijl/orjson#opt_non_str_keys
    def _orjson_dumps(obj: Any) -> str:
        # orjson returns json as bytes, so decode it to a string
        raw: bytes = orjson.dumps(obj, **kwargs)
        return raw.decode('utf-8')

    return _orjson_dumps
|
||||||
|
|
||||||
|
|
||||||
|
def dumps(
    obj: Any,
    default: Optional[DefaultEncoder] = None,
    **kwargs,
) -> str:
    """
    Serialize obj to a JSON string.

    Any additional keyword arguments are forwarded -- either to orjson.dumps,
    or to json.dumps if orjson is not installed. Since the underlying factory
    is cached with lru_cache, 'default' and the extra kwargs need to be hashable.

    You can pass the 'option' kwarg to orjson, see here for possible options:
    https://github.com/ijl/orjson#option

    Any class/instance can implement a `_serialize` function, which is used
    to convert it to a JSON-compatible representation.
    If present, it is called during _default_encode

    'default' is called before _default_encode, and should raise a TypeError if
    it's not able to serialize the type. As an example:

    from my.core.serialize import dumps

    class MyClass:
        def __init__(self, x):
            self.x = x

    def serialize_default(o: Any) -> Any:
        if isinstance(o, MyClass):
            return {"x": o.x}
        raise TypeError("Could not serialize...")

    dumps({"info": MyClass(5)}, default=serialize_default)
    """
    serializer = _dumps_factory(default=default, **kwargs)
    return serializer(obj)
|
||||||
|
|
||||||
|
|
||||||
|
def test_serialize_fallback() -> None:
    """
    Check that a list of mixed objects (including a timedelta, which is
    handled by _default_encode) round-trips through dumps.
    """
    import json as jsn  # dont cause possible conflicts with module code
    import warnings

    # cant use a namedtuple here, since the default json.dump serializer
    # serializes namedtuples as tuples, which become arrays
    # just test with an array of mixed objects
    mixed = [5, datetime.timedelta(seconds=5.0)]

    # ignore warnings. depending on test order,
    # the lru_cache'd warning may have already been sent,
    # so checking may be nondeterministic?
    # note: pytest.warns(None) used to be used for this, but it's deprecated
    # since pytest 7 and is an error in pytest 8, so use the stdlib instead
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        res = jsn.loads(dumps(mixed))
    assert res == [5, 5.0]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_nt_serialize() -> None:
    """
    Check that NamedTuples are serialized as JSON objects, and that
    the orjson 'option' kwarg is forwarded by dumps.
    """
    import json as jsn  # dont cause possible conflicts with module code
    import orjson  # import to make sure this is installed

    from typing import NamedTuple

    class Point(NamedTuple):
        x: int
        y: float

    encoded: str = dumps(Point(x=1, y=2.0))
    assert encoded == '{"x":1,"y":2.0}'

    # test orjson option kwarg
    epoch = datetime.date(year=1970, month=1, day=1)
    serialized = dumps({epoch: 5}, option=orjson.OPT_NON_STR_KEYS)
    assert jsn.loads(serialized) == {'1970-01-01': 5}
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_serializer() -> None:
    """
    Check the three ways a custom type can go through dumps: not supported at
    all (TypeError), via a `_serialize` method, and via an extra 'default' function.
    """
    import pytest
    import json as jsn  # dont cause possible conflicts with module code

    class Unserializable:
        def __init__(self, x: int):
            self.x = x
            # add something handled by the _default_encode function
            self.y = datetime.timedelta(seconds=float(x))

    # no _serialize hook and no custom 'default' -- can't be serialized
    with pytest.raises(TypeError):
        dumps(Unserializable(5))

    class WithUnderscoreSerialize(Unserializable):
        def _serialize(self) -> Any:
            return {"x": self.x, "y": self.y}

    via_hook = jsn.loads(dumps(WithUnderscoreSerialize(6)))
    assert via_hook == {"x": 6, "y": 6.0}

    # test passing additional 'default' func
    def _serialize_with_default(o: Any) -> Any:
        if not isinstance(o, Unserializable):
            raise TypeError("Couldnt serialize")
        return {"x": o.x, "y": o.y}

    # this serializes both Unserializable, which is a custom type otherwise
    # not handled, and timedelta, which is handled by the '_default_encode'
    # in the 'wrapped_default' function
    via_default = jsn.loads(dumps(Unserializable(10), default=_serialize_with_default))
    assert via_default == {"x": 10, "y": 10.0}
|
1
setup.py
1
setup.py
|
@ -55,6 +55,7 @@ def main():
|
||||||
'optional': [
|
'optional': [
|
||||||
# todo document these?
|
# todo document these?
|
||||||
'logzero',
|
'logzero',
|
||||||
|
'orjson',
|
||||||
'cachew>=0.8.0',
|
'cachew>=0.8.0',
|
||||||
'mypy', # used for config checks
|
'mypy', # used for config checks
|
||||||
],
|
],
|
||||||
|
|
|
@ -18,3 +18,4 @@ from my.core.util import *
|
||||||
from my.core.discovery_pure import *
|
from my.core.discovery_pure import *
|
||||||
from my.core.types import *
|
from my.core.types import *
|
||||||
from my.core.stats import *
|
from my.core.stats import *
|
||||||
|
from my.core.serialize import test_serialize_fallback
|
||||||
|
|
1
tests/serialize.py
Normal file
1
tests/serialize.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
from my.core.serialize import *
|
1
tox.ini
1
tox.ini
|
@ -23,6 +23,7 @@ setenv = MY_CONFIG = nonexistent
|
||||||
commands =
|
commands =
|
||||||
pip install -e .[testing]
|
pip install -e .[testing]
|
||||||
pip install cachew
|
pip install cachew
|
||||||
|
pip install orjson
|
||||||
|
|
||||||
hpi module install my.location.google
|
hpi module install my.location.google
|
||||||
pip install ijson # optional dependency
|
pip install ijson # optional dependency
|
||||||
|
|
Loading…
Add table
Reference in a new issue