diff --git a/my/core/pandas.py b/my/core/pandas.py
index 4ce62fe..efd8c48 100644
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@@ -1,32 +1,46 @@
 '''
 Various pandas helpers and convenience functions
 '''
+from __future__ import annotations
+
 # todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
 # NOTE: this file is meant to be importable without Pandas installed
-from datetime import datetime
+import dataclasses
+from datetime import datetime, timezone
 from pprint import pformat
-from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, Dict, Literal
+from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, TypeVar
+
+from decorator import decorator
+
 from . import warnings, Res
 from .common import LazyLogger, Json, asdict
+from .error import error_to_json, extract_error_datetime
+
 
 logger = LazyLogger(__name__)
 
 
 if TYPE_CHECKING:
-    # this is kinda pointless at the moment, but handy to annotate DF returning methods now
-    # later will be unignored when they implement type annotations
     import pandas as pd
-    # DataFrameT = pd.DataFrame
-    # TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions..
-    # ideally want to use a type that's like Any but doesn't allow arbitrary coercions??
-    DataFrameT = Any
+
+    DataFrameT = pd.DataFrame
+    SeriesT = pd.Series
+    from pandas._typing import S1  # meh
+
+    FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
+    # huh interesting -- with from __future__ import annotations don't even need else clause here?
+    # but still if other modules import these we do need some fake runtime types here..
 else:
-    # in runtime, make it defensive so it works without pandas
+    from typing import Optional
+
     DataFrameT = Any
+    SeriesT = Optional  # just some type with one argument
+    S1 = Any
 
 
-def check_dateish(s) -> Iterable[str]:
+def check_dateish(s: SeriesT[S1]) -> Iterable[str]:
     import pandas as pd # noqa: F811 not actually a redefinition
+
     ctype = s.dtype
     if str(ctype).startswith('datetime64'):
         return
@@ -35,7 +49,7 @@ def check_dateish(s) -> Iterable[str]:
         return
     all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all()
     if not all_timestamps:
-        return # not sure why it would happen, but ok
+        return  # not sure why it would happen, but ok
     tzs = s.map(lambda x: x.tzinfo).drop_duplicates()
     examples = s[tzs.index]
     # todo not so sure this warning is that useful... except for stuff without tz
@@ -45,11 +59,22 @@
     '''.strip()
 
 
+def test_check_dateish() -> None:
+    import pandas as pd
+
+    # todo just a dummy test to check it doesn't crash, need something meaningful
+    s1 = pd.Series([1, 2, 3])
+    list(check_dateish(s1))
+
+
+# fmt: off
 ErrorColPolicy = Literal[
-    'add_if_missing', # add error column if it's missing
-    'warn'          , # warn, but do not modify
-    'ignore'        , # no warnings
+    'add_if_missing',  # add error column if it's missing
+    'warn'          ,  # warn, but do not modify
+    'ignore'        ,  # no warnings
 ]
+# fmt: on
+
 
 def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
     if 'error' in df:
@@ -69,18 +94,14 @@ No 'error' column detected. You probably forgot to handle errors defensively, wh
     yield wmsg
 
 
-from typing import Any, Callable, TypeVar
-FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
-
-# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what??
-from decorator import decorator
+# TODO ugh. typing this is a mess... perhaps should use .compat.ParamSpec?
 @decorator
-def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT:
-    df = f(*args, **kwargs)
+def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing', *args, **kwargs) -> DataFrameT:
+    df: DataFrameT = f(*args, **kwargs)
     tag = '{f.__module__}:{f.__name__}'
     # makes sense to keep super defensive
     try:
-        for col, data in df.reset_index().iteritems():
+        for col, data in df.reset_index().items():
             for w in check_dateish(data):
                 warnings.low(f"{tag}, column '{col}': {w}")
     except Exception as e:
@@ -92,11 +113,11 @@ def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing',
         logger.exception(e)
     return df
 
+
 # todo doctor: could have a suggesion to wrap dataframes with it?? discover by return type?
 
 
-def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Json:
-    from .error import error_to_json, extract_error_datetime
+def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None) -> Json:
     edt = extract_error_datetime(e)
     if edt is not None and edt.tzinfo is None and tz is not None:
         edt = edt.replace(tzinfo=tz)
@@ -118,11 +139,11 @@ def to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
 # no type for dataclass?
 Schema = Any
 
+
 def _as_columns(s: Schema) -> Dict[str, Type]:
     # todo would be nice to extract properties; add tests for this as well
-    import dataclasses as D
-    if D.is_dataclass(s):
-        return {f.name: f.type for f in D.fields(s)}
+    if dataclasses.is_dataclass(s):
+        return {f.name: f.type for f in dataclasses.fields(s)}
     # else must be NamedTuple??
     # todo assert my.core.common.is_namedtuple?
     return getattr(s, '_field_types')
@@ -130,7 +151,7 @@ def _as_columns(s: Schema) -> Dict[str, Type]:
 
 # todo add proper types
 @check_dataframe
-def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataFrameT:
+def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFrameT:
     # todo warn if schema isn't specified?
     # ok nice supports dataframe/NT natively
     # https://github.com/pandas-dev/pandas/pull/27999
@@ -138,27 +159,27 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataF
     # https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
     # same for NamedTuple -- seems that it takes whatever schema the first NT has
     # so we need to convert each individually... sigh
-    import pandas as pd # noqa: F811 not actually a redefinition
+    import pandas as pd  # noqa: F811 not actually a redefinition
+
     columns = None if schema is None else list(_as_columns(schema).keys())
     return pd.DataFrame(to_jsons(it), columns=columns)
 
 
 def test_as_dataframe() -> None:
     import pytest
+
     it = (dict(i=i, s=f'str{i}') for i in range(10))
     with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings:  # noqa: F841
-        df = as_dataframe(it)
+        df: DataFrameT = as_dataframe(it)
         # todo test other error col policies
     assert list(df.columns) == ['i', 's', 'error']
 
     assert len(as_dataframe([])) == 0
 
-    from dataclasses import dataclass
-
-    @dataclass
+    @dataclasses.dataclass
     class X:
         x: int
 
     # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable
-    df = as_dataframe([], schema=X)
-    assert list(df.columns) == ['x', 'error']
+    df2: DataFrameT = as_dataframe([], schema=X)
+    assert list(df2.columns) == ['x', 'error']
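
Usage sketch (illustrative only, not part of the diff): one way a downstream module might use the updated as_dataframe / check_dataframe API, assuming this repo's package layout (my.core.pandas). Item and dataframe are placeholder names introduced here for the example.

    from dataclasses import dataclass

    from my.core.pandas import DataFrameT, as_dataframe, check_dataframe


    @dataclass
    class Item:  # hypothetical schema, for illustration only
        i: int
        s: str


    @check_dataframe  # checks datetime columns and applies the 'error' column policy
    def dataframe() -> DataFrameT:
        items = (Item(i=i, s=f'str{i}') for i in range(3))
        # passing schema= keeps the columns stable even if the iterable is empty
        return as_dataframe(items, schema=Item)

Since as_dataframe is itself wrapped in check_dataframe, the outer decorator here is mostly redundant; it is shown only to illustrate wrapping a user-defined DataFrame-returning function, along the lines of the 'todo doctor' comment in the diff.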