my.core.pandas: rely on typing annotations from types-pandas
This commit is contained in:
parent
fe88380499
commit
a98bc6daca
1 changed files with 56 additions and 35 deletions
|
@ -1,32 +1,46 @@
|
|||
'''
|
||||
Various pandas helpers and convenience functions
|
||||
'''
|
||||
from __future__ import annotations
|
||||
|
||||
# todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
|
||||
# NOTE: this file is meant to be importable without Pandas installed
|
||||
from datetime import datetime
|
||||
import dataclasses
|
||||
from datetime import datetime, timezone
|
||||
from pprint import pformat
|
||||
from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, Dict, Literal
|
||||
from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, TypeVar
|
||||
|
||||
from decorator import decorator
|
||||
|
||||
from . import warnings, Res
|
||||
from .common import LazyLogger, Json, asdict
|
||||
from .error import error_to_json, extract_error_datetime
|
||||
|
||||
|
||||
logger = LazyLogger(__name__)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# this is kinda pointless at the moment, but handy to annotate DF returning methods now
|
||||
# later will be unignored when they implement type annotations
|
||||
import pandas as pd
|
||||
# DataFrameT = pd.DataFrame
|
||||
# TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions..
|
||||
# ideally want to use a type that's like Any but doesn't allow arbitrary coercions??
|
||||
DataFrameT = Any
|
||||
|
||||
DataFrameT = pd.DataFrame
|
||||
SeriesT = pd.Series
|
||||
from pandas._typing import S1 # meh
|
||||
|
||||
FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
|
||||
# huh interesting -- with from __future__ import annotations don't even need else clause here?
|
||||
# but still if other modules import these we do need some fake runtime types here..
|
||||
else:
|
||||
# in runtime, make it defensive so it works without pandas
|
||||
from typing import Optional
|
||||
|
||||
DataFrameT = Any
|
||||
SeriesT = Optional # just some type with one argument
|
||||
S1 = Any
|
||||
|
||||
|
||||
def check_dateish(s) -> Iterable[str]:
|
||||
def check_dateish(s: SeriesT[S1]) -> Iterable[str]:
|
||||
import pandas as pd # noqa: F811 not actually a redefinition
|
||||
|
||||
ctype = s.dtype
|
||||
if str(ctype).startswith('datetime64'):
|
||||
return
|
||||
|
@ -35,7 +49,7 @@ def check_dateish(s) -> Iterable[str]:
|
|||
return
|
||||
all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all()
|
||||
if not all_timestamps:
|
||||
return # not sure why it would happen, but ok
|
||||
return # not sure why it would happen, but ok
|
||||
tzs = s.map(lambda x: x.tzinfo).drop_duplicates()
|
||||
examples = s[tzs.index]
|
||||
# todo not so sure this warning is that useful... except for stuff without tz
|
||||
|
@ -45,11 +59,22 @@ def check_dateish(s) -> Iterable[str]:
|
|||
'''.strip()
|
||||
|
||||
|
||||
def test_check_dateish() -> None:
    """Smoke test: run check_dateish over a plain integer series and make sure nothing raises."""
    import pandas as pd

    # todo just a dummy test to check it doesn't crash, need something meaningful
    ints = pd.Series([1, 2, 3])
    # exhaust the returned iterable (presumably lazy) so the checks actually execute
    for _ in check_dateish(ints):
        pass
|
||||
|
||||
|
||||
# Policy for handling a dataframe that lacks an 'error' column.
# Each member is listed exactly once.
# fmt: off
ErrorColPolicy = Literal[
    'add_if_missing',  # add error column if it's missing
    'warn'          ,  # warn, but do not modify
    'ignore'        ,  # no warnings
]
# fmt: on
|
||||
|
||||
|
||||
def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
|
||||
if 'error' in df:
|
||||
|
@ -69,18 +94,14 @@ No 'error' column detected. You probably forgot to handle errors defensively, wh
|
|||
yield wmsg
|
||||
|
||||
|
||||
from typing import Any, Callable, TypeVar
|
||||
FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
|
||||
|
||||
# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what??
|
||||
from decorator import decorator
|
||||
# TODO ugh. typing this is a mess... perhaps should use .compat.ParamSpec?
|
||||
@decorator
|
||||
def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT:
|
||||
df = f(*args, **kwargs)
|
||||
def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing', *args, **kwargs) -> DataFrameT:
|
||||
df: DataFrameT = f(*args, **kwargs)
|
||||
tag = '{f.__module__}:{f.__name__}'
|
||||
# makes sense to keep super defensive
|
||||
try:
|
||||
for col, data in df.reset_index().iteritems():
|
||||
for col, data in df.reset_index().items():
|
||||
for w in check_dateish(data):
|
||||
warnings.low(f"{tag}, column '{col}': {w}")
|
||||
except Exception as e:
|
||||
|
@ -92,11 +113,11 @@ def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing',
|
|||
logger.exception(e)
|
||||
return df
|
||||
|
||||
|
||||
# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?
|
||||
|
||||
|
||||
def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Json:
|
||||
from .error import error_to_json, extract_error_datetime
|
||||
def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None) -> Json:
|
||||
edt = extract_error_datetime(e)
|
||||
if edt is not None and edt.tzinfo is None and tz is not None:
|
||||
edt = edt.replace(tzinfo=tz)
|
||||
|
@ -118,11 +139,11 @@ def to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
|
|||
# no type for dataclass?
Schema = Any


def _as_columns(s: Schema) -> Dict[str, Type]:
    """
    Return a mapping of field name to declared field type for a schema class.

    Dataclasses are inspected through ``dataclasses.fields``. Anything else is
    assumed to be a ``typing.NamedTuple`` class.

    Raises AttributeError if ``s`` is neither a dataclass nor exposes
    ``_field_types`` / ``__annotations__``.
    """
    # todo would be nice to extract properties; add tests for this as well
    if dataclasses.is_dataclass(s):
        return {f.name: f.type for f in dataclasses.fields(s)}
    # else must be NamedTuple??
    # todo assert my.core.common.is_namedtuple?
    # NamedTuple._field_types was removed in Python 3.9; __annotations__ holds the
    # same name -> type mapping, so use it when the legacy attribute is absent.
    field_types = getattr(s, '_field_types', None)
    if field_types is None:
        field_types = getattr(s, '__annotations__')
    return field_types
|
||||
|
@ -130,7 +151,7 @@ def _as_columns(s: Schema) -> Dict[str, Type]:
|
|||
|
||||
# todo add proper types
|
||||
@check_dataframe
|
||||
def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataFrameT:
|
||||
def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFrameT:
|
||||
# todo warn if schema isn't specified?
|
||||
# ok nice supports dataframe/NT natively
|
||||
# https://github.com/pandas-dev/pandas/pull/27999
|
||||
|
@ -138,27 +159,27 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataF
|
|||
# https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
|
||||
# same for NamedTuple -- seems that it takes whatever schema the first NT has
|
||||
# so we need to convert each individually... sigh
|
||||
import pandas as pd # noqa: F811 not actually a redefinition
|
||||
import pandas as pd # noqa: F811 not actually a redefinition
|
||||
|
||||
columns = None if schema is None else list(_as_columns(schema).keys())
|
||||
return pd.DataFrame(to_jsons(it), columns=columns)
|
||||
|
||||
|
||||
def test_as_dataframe() -> None:
    """
    Check as_dataframe on three inputs: a generator of dicts, an empty iterable,
    and an empty iterable with an explicit dataclass schema.
    """
    import pytest

    # one-shot generator: it must be passed to as_dataframe exactly once
    it = (dict(i=i, s=f'str{i}') for i in range(10))
    with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings:  # noqa: F841
        df: DataFrameT = as_dataframe(it)
        # todo test other error col policies
    assert list(df.columns) == ['i', 's', 'error']

    assert len(as_dataframe([])) == 0

    @dataclasses.dataclass
    class X:
        x: int

    # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable
    df2: DataFrameT = as_dataframe([], schema=X)
    assert list(df2.columns) == ['x', 'error']
|
||||
|
|
Loading…
Add table
Reference in a new issue