my.core.pandas: rely on typing annotations from types-pandas

2023-05-18 02:17:59 +01:00 · 2023-05-18 02:17:59 +01:00 · a98bc6daca
commit a98bc6daca
parent fe88380499
1 changed files with 56 additions and 35 deletions
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@ -1,32 +1,46 @@
 '''
 Various pandas helpers and convenience functions
 '''
+from __future__ import annotations
+
 # todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
 # NOTE: this file is meant to be importable without Pandas installed
-from datetime import datetime
+import dataclasses
+from datetime import datetime, timezone
 from pprint import pformat
-from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, Dict, Literal
+from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, TypeVar
+
+from decorator import decorator
+
 from . import warnings, Res
 from .common import LazyLogger, Json, asdict
+from .error import error_to_json, extract_error_datetime
+

 logger = LazyLogger(__name__)


 if TYPE_CHECKING:
-    # this is kinda pointless at the moment, but handy to annotate DF returning methods now
-    # later will be unignored when they implement type annotations
    import pandas as pd
-    # DataFrameT = pd.DataFrame
-    # TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions..
-    # ideally want to use a type that's like Any but doesn't allow arbitrary coercions??
-    DataFrameT = Any
+
+    DataFrameT = pd.DataFrame
+    SeriesT = pd.Series
+    from pandas._typing import S1  # meh
+
+    FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
+    # huh interesting -- with from __future__ import annotations don't even need else clause here?
+    # but still if other modules import these we do need some fake runtime types here..
 else:
-    # in runtime, make it defensive so it works without pandas
+    from typing import Optional
+
    DataFrameT = Any
+    SeriesT = Optional  # just some type with one argument
+    S1 = Any


-def check_dateish(s) -> Iterable[str]:
+def check_dateish(s: SeriesT[S1]) -> Iterable[str]:
    import pandas as pd  # noqa: F811 not actually a redefinition
+
    ctype = s.dtype
    if str(ctype).startswith('datetime64'):
        return
@ -45,11 +59,22 @@ def check_dateish(s) -> Iterable[str]:
    '''.strip()


+def test_check_dateish() -> None:
+    import pandas as pd
+
+    # todo just a dummy test to check it doesn't crash, need something meaningful
+    s1 = pd.Series([1, 2, 3])
+    list(check_dateish(s1))
+
+
+# fmt: off
 ErrorColPolicy = Literal[
    'add_if_missing',  # add error column if it's missing
    'warn'          ,  # warn, but do not modify
    'ignore'        ,  # no warnings
 ]
+# fmt: on
+

 def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
    if 'error' in df:
@ -69,18 +94,14 @@ No 'error' column detected. You probably forgot to handle errors defensively, wh
    yield wmsg


-from typing import Any, Callable, TypeVar
-FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
-
-# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what??
-from decorator import decorator
+# TODO ugh. typing this is a mess... perhaps should use .compat.ParamSpec?
@decorator
-def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT:
-    df = f(*args, **kwargs)
+def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing', *args, **kwargs) -> DataFrameT:
+    df: DataFrameT = f(*args, **kwargs)
    tag = '{f.__module__}:{f.__name__}'
    # makes sense to keep super defensive
    try:
-        for col, data in df.reset_index().iteritems():
+        for col, data in df.reset_index().items():
            for w in check_dateish(data):
                warnings.low(f"{tag}, column '{col}': {w}")
    except Exception as e:
@ -92,11 +113,11 @@ def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing',
        logger.exception(e)
    return df

+
 # todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?


-def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Json:
-    from .error import error_to_json, extract_error_datetime
+def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None) -> Json:
    edt = extract_error_datetime(e)
    if edt is not None and edt.tzinfo is None and tz is not None:
        edt = edt.replace(tzinfo=tz)
@ -118,11 +139,11 @@ def to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
 # no type for dataclass?
 Schema = Any

+
 def _as_columns(s: Schema) -> Dict[str, Type]:
    # todo would be nice to extract properties; add tests for this as well
-    import dataclasses as D
-    if D.is_dataclass(s):
-        return {f.name: f.type for f in D.fields(s)}
+    if dataclasses.is_dataclass(s):
+        return {f.name: f.type for f in dataclasses.fields(s)}
    # else must be NamedTuple??
    # todo assert my.core.common.is_namedtuple?
    return getattr(s, '_field_types')
@ -130,7 +151,7 @@ def _as_columns(s: Schema) -> Dict[str, Type]:

 # todo add proper types
@check_dataframe
-def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataFrameT:
+def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFrameT:
    # todo warn if schema isn't specified?
    # ok nice supports dataframe/NT natively
    # https://github.com/pandas-dev/pandas/pull/27999
@ -139,26 +160,26 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataF
    # same for NamedTuple -- seems that it takes whatever schema the first NT has
    # so we need to convert each individually... sigh
    import pandas as pd  # noqa: F811 not actually a redefinition
+
    columns = None if schema is None else list(_as_columns(schema).keys())
    return pd.DataFrame(to_jsons(it), columns=columns)


 def test_as_dataframe() -> None:
    import pytest
+
    it = (dict(i=i, s=f'str{i}') for i in range(10))
    with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings:  # noqa: F841
-        df = as_dataframe(it)
+        df: DataFrameT = as_dataframe(it)
        # todo test other error col policies
    assert list(df.columns) == ['i', 's', 'error']

    assert len(as_dataframe([])) == 0

-    from dataclasses import dataclass
-
-    @dataclass
+    @dataclasses.dataclass
    class X:
        x: int

    # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable
-    df = as_dataframe([], schema=X)
-    assert list(df.columns) == ['x', 'error']
+    df2: DataFrameT = as_dataframe([], schema=X)
+    assert list(df2.columns) == ['x', 'error']