'''
Various pandas helpers and convenience functions
'''

from __future__ import annotations

# todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
# NOTE: this file is meant to be importable without Pandas installed
import dataclasses
from collections.abc import Iterable, Iterator
from datetime import datetime, timezone
from pprint import pformat
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    TypeVar,
)

from decorator import decorator

from . import warnings
from .error import Res, error_to_json, extract_error_datetime
from .logging import make_logger
from .types import Json, asdict

logger = make_logger(__name__)


if TYPE_CHECKING:
    import pandas as pd

    DataFrameT = pd.DataFrame
    SeriesT = pd.Series
    from pandas._typing import S1  # meh

    FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
    # huh interesting -- with from __future__ import annotations we don't even need an else clause here?
    # but if other modules import these names, we still need some fake runtime types here..
else:
    from typing import Optional

    DataFrameT = Any
    SeriesT = Optional  # just some type with one argument
    S1 = Any


def _check_dateish(s: SeriesT[S1]) -> Iterable[str]:
    import pandas as pd  # noqa: F811 not actually a redefinition

    ctype = s.dtype
    if str(ctype).startswith('datetime64'):
        return
    s = s.dropna()
    if len(s) == 0:
        return
    all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all()
    if not all_timestamps:
        return  # not sure why it would happen, but ok
    tzs = s.map(lambda x: x.tzinfo).drop_duplicates()  # type: ignore[union-attr, var-annotated, arg-type, return-value, unused-ignore]
    examples = s[tzs.index]
    # todo not so sure this warning is that useful... except for stuff without tz
    yield f'''
    All values are timestamp-like, but dtype is not datetime. Most likely, you have mixed timezones:
    {pformat(list(zip(examples, tzs)))}
    '''.strip()


def test_check_dateish() -> None:
    import pandas as pd

    from .compat import fromisoformat

    # empty series shouldn't warn
    assert list(_check_dateish(pd.Series([]))) == []

    # if no datetimes, shouldn't return any warnings
    assert list(_check_dateish(pd.Series([1, 2, 3]))) == []

    # all values are datetimes, shouldn't warn
    # fmt: off
    assert list(_check_dateish(pd.Series([
        fromisoformat('2024-08-19T01:02:03'),
        fromisoformat('2024-08-19T03:04:05'),
    ]))) == []
    # fmt: on

    # mixture of timezones -- should warn
    # fmt: off
    assert len(list(_check_dateish(pd.Series([
        fromisoformat('2024-08-19T01:02:03'),
        fromisoformat('2024-08-19T03:04:05Z'),
    ])))) == 1
    # fmt: on

    # TODO hmm. maybe this should actually warn?
    # fmt: off
    assert len(list(_check_dateish(pd.Series([
        'whatever',
        fromisoformat('2024-08-19T01:02:03'),
    ])))) == 0
    # fmt: on


# fmt: off
ErrorColPolicy = Literal[
    'add_if_missing',  # add error column if it's missing
    'warn'          ,  # warn, but do not modify
    'ignore'        ,  # no warnings
]
# fmt: on
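
# note: this policy feeds check_dataframe below via its error_col_policy parameter
# (default 'add_if_missing'); the plain @check_dataframe form is what's actually used in this
# file (see as_dataframe). With the decorator library it should presumably also accept the
# factory form, e.g. @check_dataframe(error_col_policy='warn') -- treat that variant as a
# hedged sketch rather than a guarantee.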


def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
    if 'error' in df:
        return
    if policy == 'ignore':
        return

    wmsg = '''
    No 'error' column detected. You probably forgot to handle errors defensively, which means a single bad entry might bring the whole dataframe down.
    '''.strip()
    if policy == 'add_if_missing':
        # todo maybe just add the warnings text as well?
        df['error'] = None
        wmsg += "\nAdding empty 'error' column (see 'error_col_policy' if you want to change this behaviour)"

    yield wmsg


# TODO ugh. typing this is a mess... perhaps should use .compat.ParamSpec?
@decorator
def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing', *args, **kwargs) -> DataFrameT:
    df: DataFrameT = f(*args, **kwargs)
    tag = f'{f.__module__}:{f.__name__}'
    # makes sense to keep super defensive
    try:
        for col, data in df.reset_index().items():
            for w in _check_dateish(data):
                warnings.low(f"{tag}, column '{col}': {w}")
    except Exception as e:
        logger.exception(e)
    try:
        for w in check_error_column(df, policy=error_col_policy):
            warnings.low(f"{tag}, {w}")
    except Exception as e:
        logger.exception(e)
    return df


# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?
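
# Illustrative sketch (hypothetical provider, not used anywhere in this module): this is
# roughly how a data module would wrap its dataframe helper so the checks above run on the
# result. With the default 'add_if_missing' policy an empty 'error' column is added and a
# low-priority warning is emitted.
def _example_check_dataframe_usage() -> DataFrameT:
    import pandas as pd

    @check_dataframe
    def my_dataframe() -> DataFrameT:
        # deliberately has no 'error' column, so the decorator will add one
        return pd.DataFrame({'value': [1, 2, 3]})

    return my_dataframe()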


def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None) -> Json:
    edt = extract_error_datetime(e)
    if edt is not None and edt.tzinfo is None and tz is not None:
        edt = edt.replace(tzinfo=tz)
    err_dict: Json = error_to_json(e)
    err_dict[dt_col] = edt
    return err_dict
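
# e.g. error_to_row(RuntimeError('i failed')) gives something like
# {'error': 'RuntimeError: i failed\n', 'dt': None} -- the exact 'error' text comes from
# error_to_json, and 'dt' is only populated when the exception carries a datetime
# (cf. test_as_dataframe below)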


def _to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
    for r in it:
        if isinstance(r, Exception):
            yield error_to_row(r)
        else:
            yield asdict(r)


# mm. https://github.com/python/mypy/issues/8564
# no type for dataclass?
Schema = Any


def _as_columns(s: Schema) -> dict[str, type]:
    # todo would be nice to extract properties; add tests for this as well
    if dataclasses.is_dataclass(s):
        return {f.name: f.type for f in dataclasses.fields(s)}  # type: ignore[misc]  # ugh, why mypy thinks f.type can return str??
    # else must be NamedTuple??
    # todo assert my.core.common.is_namedtuple?
    return getattr(s, '_field_types')
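
# e.g. for the _X dataclass defined further down, _as_columns(_X) returns a single 'x' entry
# (note that with `from __future__ import annotations` f.type may actually be the string 'int'
# rather than the int type); as_dataframe below only uses the keys, i.e. the column names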


# todo add proper types
@check_dataframe
def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFrameT:
    # todo warn if schema isn't specified?
    # ok nice -- pandas supports dataclass/NT input natively
    # https://github.com/pandas-dev/pandas/pull/27999
    # but it dispatches dataclass based on the first entry...
    # https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
    # same for NamedTuple -- seems that it takes whatever schema the first NT has
    # so we need to convert each individually... sigh
    import pandas as pd  # noqa: F811 not actually a redefinition

    columns = None if schema is None else list(_as_columns(schema).keys())
    return pd.DataFrame(_to_jsons(it), columns=columns)
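
# Minimal usage sketch (hypothetical names; see test_as_dataframe below for real examples):
#   df = as_dataframe(my_provider(), schema=MyDataclass)
# where my_provider yields either dataclass instances or Exceptions; exceptions become rows
# with the 'error' (and possibly 'dt') columns populated.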


# ugh. in principle this could be inside the test
# might be due to use of from __future__ import annotations
# can quickly reproduce by running pytest tests/tz.py tests/core/test_pandas.py
# possibly will be resolved after fix in pytest?
# see https://github.com/pytest-dev/pytest/issues/7856
@dataclasses.dataclass
class _X:
    # FIXME try moving inside?
    x: int


def test_as_dataframe() -> None:
    import numpy as np
    import pandas as pd
    import pytest
    from pandas.testing import assert_frame_equal

    from .compat import fromisoformat

    it = ({'i': i, 's': f'str{i}'} for i in range(5))
    with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings:  # noqa: F841
        df: DataFrameT = as_dataframe(it)
        # todo test other error col policies

    # fmt: off
    assert_frame_equal(
        df,
        pd.DataFrame({
            'i'    : [0     , 1     , 2     , 3     , 4     ],
            's'    : ['str0', 'str1', 'str2', 'str3', 'str4'],
            # NOTE: error column is always added
            'error': [None  , None  , None  , None  , None  ],
        }),
    )
    # fmt: on
    assert_frame_equal(as_dataframe([]), pd.DataFrame(columns=['error']))

    df2: DataFrameT = as_dataframe([], schema=_X)
    assert_frame_equal(
        df2,
        # FIXME hmm. x column type should be an int?? and error should be string (or object??)
        pd.DataFrame(columns=['x', 'error']),
    )

    @dataclasses.dataclass
    class S:
        value: str

    def it2() -> Iterator[Res[S]]:
        yield S(value='test')
        yield RuntimeError('i failed')

    df = as_dataframe(it2())
    # fmt: off
    assert_frame_equal(
        df,
        pd.DataFrame(data={
            'value': ['test', np.nan                    ],
            'error': [np.nan, 'RuntimeError: i failed\n'],
            'dt'   : [np.nan, np.nan                    ],
        }).astype(dtype={'dt': 'float'}),  # FIXME should be datetime64 as below
    )
    # fmt: on

    def it3() -> Iterator[Res[S]]:
        yield S(value='aba')
        yield RuntimeError('whoops')
        yield S(value='cde')
        yield RuntimeError('exception with datetime', fromisoformat('2024-08-19T22:47:01Z'))

    df = as_dataframe(it3())

    # fmt: off
    assert_frame_equal(df, pd.DataFrame(data={
        'value': ['aba' , np.nan                  , 'cde' , np.nan                                                                                                                  ],
        'error': [np.nan, 'RuntimeError: whoops\n', np.nan, "RuntimeError: ('exception with datetime', datetime.datetime(2024, 8, 19, 22, 47, 1, tzinfo=datetime.timezone.utc))\n"],
        # note: dt column is added even if errors don't have an associated datetime
        'dt'   : [np.nan, np.nan                  , np.nan, '2024-08-19 22:47:01+00:00'                                                                                            ],
    }).astype(dtype={'dt': 'datetime64[ns, UTC]'}))
    # fmt: on