my.core.pandas: rely on typing annotations from types-pandas

Dima Gerasimov 2023-05-18 02:17:59 +01:00 committed by karlicoss
parent fe88380499
commit a98bc6daca

@@ -1,32 +1,46 @@
 '''
 Various pandas helpers and convenience functions
 '''
+from __future__ import annotations
+
 # todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
 # NOTE: this file is meant to be importable without Pandas installed
-from datetime import datetime
+import dataclasses
+from datetime import datetime, timezone
 from pprint import pformat
-from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, Dict, Literal
+from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, TypeVar
+
+from decorator import decorator
 
 from . import warnings, Res
 from .common import LazyLogger, Json, asdict
+from .error import error_to_json, extract_error_datetime
 
 logger = LazyLogger(__name__)
 
 if TYPE_CHECKING:
-    # this is kinda pointless at the moment, but handy to annotate DF returning methods now
-    # later will be unignored when they implement type annotations
     import pandas as pd
-    # DataFrameT = pd.DataFrame
-    # TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions..
-    # ideally want to use a type that's like Any but doesn't allow arbitrary coercions??
-    DataFrameT = Any
+
+    DataFrameT = pd.DataFrame
+    SeriesT = pd.Series
+    from pandas._typing import S1  # meh
+
+    FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
+    # huh interesting -- with from __future__ import annotations don't even need else clause here?
+    # but still if other modules import these we do need some fake runtime types here..
 else:
-    # in runtime, make it defensive so it works without pandas
+    from typing import Optional
+
     DataFrameT = Any
+    SeriesT = Optional  # just some type with one argument
+    S1 = Any
 
 
-def check_dateish(s) -> Iterable[str]:
+def check_dateish(s: SeriesT[S1]) -> Iterable[str]:
     import pandas as pd  # noqa: F811 not actually a redefinition
     ctype = s.dtype
     if str(ctype).startswith('datetime64'):
         return
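
This hunk adopts the standard TYPE_CHECKING pattern: the type checker resolves the real pandas types, while at runtime the module stays importable even when pandas isn't installed. A minimal self-contained sketch of the pattern (not part of the commit; empty_frame is invented for illustration):

    from __future__ import annotations

    from typing import TYPE_CHECKING, Any

    if TYPE_CHECKING:
        # only evaluated by the type checker, so pandas isn't required at runtime
        import pandas as pd
        DataFrameT = pd.DataFrame
    else:
        DataFrameT = Any  # runtime fallback

    def empty_frame() -> DataFrameT:
        # thanks to the __future__ import the annotation above stays an unevaluated
        # string, so importing this module never touches pd.DataFrame
        import pandas as pd
        return pd.DataFrame()
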
@@ -35,7 +49,7 @@ def check_dateish(s) -> Iterable[str]:
         return
     all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all()
     if not all_timestamps:
         return  # not sure why it would happen, but ok
     tzs = s.map(lambda x: x.tzinfo).drop_duplicates()
     examples = s[tzs.index]
     # todo not so sure this warning is that useful... except for stuff without tz
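
For context (not part of the commit): check_dateish fires on object-dtype columns holding datetimes, which is exactly what pandas produces when naive and tz-aware values are mixed. A small illustration, assuming pandas is installed:

    from datetime import datetime, timezone

    import pandas as pd

    # mixing naive and tz-aware datetimes prevents a proper datetime64 dtype
    s = pd.Series([datetime(2023, 5, 18), datetime(2023, 5, 18, tzinfo=timezone.utc)])
    assert str(s.dtype) == 'object'
    print(s.map(lambda x: x.tzinfo).drop_duplicates())  # one row per distinct tzinfo: None, utc
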
@ -45,11 +59,22 @@ def check_dateish(s) -> Iterable[str]:
'''.strip() '''.strip()
def test_check_dateish() -> None:
import pandas as pd
# todo just a dummy test to check it doesn't crash, need something meaningful
s1 = pd.Series([1, 2, 3])
list(check_dateish(s1))
# fmt: off
ErrorColPolicy = Literal[ ErrorColPolicy = Literal[
'add_if_missing', # add error column if it's missing 'add_if_missing', # add error column if it's missing
'warn' , # warn, but do not modify 'warn' , # warn, but do not modify
'ignore' , # no warnings 'ignore' , # no warnings
] ]
# fmt: on
def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]: def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
if 'error' in df: if 'error' in df:
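
Since ErrorColPolicy is a Literal, a type checker rejects invalid policy strings statically. A hypothetical usage sketch:

    import pandas as pd

    df = pd.DataFrame([{'x': 1}])  # note: no 'error' column
    for msg in check_error_column(df, policy='warn'):
        print(msg)  # the "No 'error' column detected" message yielded above
    # check_error_column(df, policy='explode')  # mypy error: not a valid Literal value
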
@@ -69,18 +94,14 @@ No 'error' column detected. You probably forgot to handle errors defensively, wh
         yield wmsg
 
 
-from typing import Any, Callable, TypeVar
-FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
-
-# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what??
-from decorator import decorator
+# TODO ugh. typing this is a mess... perhaps should use .compat.ParamSpec?
 @decorator
-def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT:
-    df = f(*args, **kwargs)
+def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing', *args, **kwargs) -> DataFrameT:
+    df: DataFrameT = f(*args, **kwargs)
     tag = '{f.__module__}:{f.__name__}'
     # makes sense to keep super defensive
     try:
-        for col, data in df.reset_index().iteritems():
+        for col, data in df.reset_index().items():
             for w in check_dateish(data):
                 warnings.low(f"{tag}, column '{col}': {w}")
     except Exception as e:
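
The iteritems -> items change is not just cosmetic: DataFrame.iteritems was deprecated in pandas 1.5 and removed in pandas 2.0, and items is the drop-in replacement yielding the same (column name, Series) pairs:

    import pandas as pd

    df = pd.DataFrame({'a': [1], 'b': ['x']})
    for col, data in df.items():  # same pairs iteritems() used to yield
        print(col, data.dtype)
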
@@ -92,11 +113,11 @@ def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing',
         logger.exception(e)
     return df
 
 
 # todo doctor: could have a suggesion to wrap dataframes with it?? discover by return type?
-def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Json:
-    from .error import error_to_json, extract_error_datetime
+def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None) -> Json:
     edt = extract_error_datetime(e)
     if edt is not None and edt.tzinfo is None and tz is not None:
         edt = edt.replace(tzinfo=tz)
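
The new tz: timezone | None annotation uses PEP 604 union syntax; on Python versions before 3.10 this only works because the from __future__ import annotations added at the top keeps annotations as unevaluated strings. A hypothetical usage sketch:

    from datetime import timezone

    # attach a fallback timezone to errors whose extracted datetime is naive
    row = error_to_row(RuntimeError('failed to parse entry'), tz=timezone.utc)
    print(row['error'])
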
@@ -118,11 +139,11 @@ def to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
 
 # no type for dataclass?
 Schema = Any
 
 
 def _as_columns(s: Schema) -> Dict[str, Type]:
     # todo would be nice to extract properties; add tests for this as well
-    import dataclasses as D
-    if D.is_dataclass(s):
-        return {f.name: f.type for f in D.fields(s)}
+    if dataclasses.is_dataclass(s):
+        return {f.name: f.type for f in dataclasses.fields(s)}
     # else must be NamedTuple??
     # todo assert my.core.common.is_namedtuple?
     return getattr(s, '_field_types')
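
A small illustration of _as_columns (not from the commit). One caveat worth knowing: in modules that themselves use from __future__ import annotations, field.type is the annotation string (e.g. 'int') rather than the class, so the Dict[str, Type] return type is somewhat optimistic:

    import dataclasses

    @dataclasses.dataclass
    class Item:
        x: int
        s: str

    print(_as_columns(Item))  # {'x': int, 's': str}, or {'x': 'int', ...} under lazy annotations
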
@@ -130,7 +151,7 @@ def _as_columns(s: Schema) -> Dict[str, Type]:
 
 # todo add proper types
 @check_dataframe
-def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataFrameT:
+def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFrameT:
     # todo warn if schema isn't specified?
     # ok nice supports dataframe/NT natively
     # https://github.com/pandas-dev/pandas/pull/27999
@@ -138,27 +159,27 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataF
     # https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
     # same for NamedTuple -- seems that it takes whatever schema the first NT has
     # so we need to convert each individually... sigh
     import pandas as pd  # noqa: F811 not actually a redefinition
     columns = None if schema is None else list(_as_columns(schema).keys())
     return pd.DataFrame(to_jsons(it), columns=columns)
 
 
 def test_as_dataframe() -> None:
     import pytest
 
     it = (dict(i=i, s=f'str{i}') for i in range(10))
     with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings:  # noqa: F841
-        df = as_dataframe(it)
+        df: DataFrameT = as_dataframe(it)
         # todo test other error col policies
         assert list(df.columns) == ['i', 's', 'error']
 
     assert len(as_dataframe([])) == 0
 
-    from dataclasses import dataclass
-    @dataclass
+    @dataclasses.dataclass
     class X:
         x: int
 
     # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable
-    df = as_dataframe([], schema=X)
-    assert list(df.columns) == ['x', 'error']
+    df2: DataFrameT = as_dataframe([], schema=X)
+    assert list(df2.columns) == ['x', 'error']
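
Putting the helpers together, a hypothetical end-to-end sketch (Visit and visits are invented for illustration): exceptions in the input become rows instead of propagating, and check_dataframe guarantees an 'error' column either way.

    import dataclasses

    @dataclasses.dataclass
    class Visit:
        url: str

    def visits():
        yield Visit(url='https://example.com')
        yield RuntimeError('failed to parse entry')  # handled defensively, not raised

    df = as_dataframe(visits())
    # good rows get NaN in 'error'; the exception's message fills it for the bad row
    print(df.columns.tolist())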