core.pandas: add check for 'error' column + add empty one by default

2020-12-17 04:47:05 +00:00 · 2020-12-17 04:47:05 +00:00 · df9a7f7390
commit df9a7f7390
parent 3a1e21635a
7 changed files with 82 additions and 30 deletions
--- a/my/core/main.py
+++ b/my/core/main.py
@ -168,7 +168,7 @@ def config_ok(args) -> bool:
        sys.exit(1)
    cfg_path = cfg.__file__# todo might be better to use __path__?
-    info(f"config file: {cfg_path}")
+    info(f"config file : {cfg_path}")
    import my.core as core
    try:
@ -195,7 +195,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module
    if mres is not None: # has mypy
        rc = mres.returncode
        if rc == 0:
-            info('mypy check: success')
+            info('mypy check  : success')
        else:
            error('mypy check: failed')
            errors.append(RuntimeError('mypy failed'))
--- a/my/core/common.py
+++ b/my/core/common.py
@ -292,16 +292,6 @@ else:
    from .py37 import fromisoformat
 if sys.version_info[:2] >= (3, 8):
    from typing import Literal
 else:
    if TYPE_CHECKING:
        from typing_extensions import Literal
    else:
        # erm.. I guess as long as it's not crashing, whatever...
        Literal = Union
 # TODO doctests?
 def isoparse(s: str) -> tzdatetime:
    """
@ -313,6 +303,8 @@ def isoparse(s: str) -> tzdatetime:
    s = s[:-1] + '+00:00'
    return fromisoformat(s)
 from .compat import Literal
 import re
 # https://stackoverflow.com/a/295466/706389
--- a/my/core/compat.py
+++ b/my/core/compat.py
@ -47,3 +47,16 @@ def _get_dal(cfg, module_name: str):
        from importlib import import_module
        return import_module(f'my.config.repos.{module_name}.dal')
 import sys
 from typing import TYPE_CHECKING
 if sys.version_info[:2] >= (3, 8):
    from typing import Literal
 else:
    if TYPE_CHECKING:
        from typing_extensions import Literal
    else:
        from typing import Union
        # erm.. I guess as long as it's not crashing, whatever...
        Literal = Union
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@ -7,6 +7,9 @@ from datetime import datetime
 from pprint import pformat
 from typing import Optional, TYPE_CHECKING, Any, Iterable
 from . import warnings
 from .common import LazyLogger
 logger = LazyLogger(__name__)
 if TYPE_CHECKING:
@ -14,6 +17,8 @@ if TYPE_CHECKING:
    # later will be unignored when they implement type annotations
    import pandas as pd # type: ignore
    # DataFrameT = pd.DataFrame
    # TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions..
    # ideally want to use a type that's like Any but doesn't allow arbitrary coercions??
    DataFrameT = Any
 else:
    # in runtime, make it defensive so it works without pandas
@ -40,21 +45,54 @@ def check_dateish(s) -> Iterable[str]:
    '''.strip()
 from .compat import Literal
 ErrorColPolicy = Literal[
    'add_if_missing', # add error column if it's missing
    'warn'          , # warn, but do not modify
    'ignore'        , # no warnings
 ]
 def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
    """
    Yield at most one warning about a missing 'error' column in ``df``.
    Nothing is yielded when the column is present or when ``policy`` is 'ignore'.
    With 'add_if_missing' an empty 'error' column is added to ``df`` in place and
    the warning says so; with any other policy ``df`` is left untouched.
    """
    # nothing to report: the column is already there, or the caller opted out
    if 'error' in df or policy == 'ignore':
        return
    message = (
        "No 'error' column detected. You probably forgot to handle errors defensively, "
        "which means a single bad entry might bring the whole dataframe down."
    )
    if policy == 'add_if_missing':
        # todo maybe just add the warnings text as well?
        df['error'] = None
        message = message + "\nAdding empty 'error' column (see 'error_col_policy' if you want to change this behaviour)"
    yield message
 from typing import Any, Callable, TypeVar
 FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
-def check_dataframe(f: FuncT) -> FuncT:
+# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what??
-    from functools import wraps
+from decorator import decorator
-    @wraps(f)
+@decorator
-    def wrapper(*args, **kwargs) -> DataFrameT:
+def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT:
+    """
+    Decorator: call ``f``, run sanity checks on the dataframe it returns, and pass the dataframe through.
+
+    Every column of ``df.reset_index()`` goes through ``check_dateish`` and the frame goes through
+    ``check_error_column`` (governed by ``error_col_policy``); findings are reported via ``warnings.low``.
+    Exceptions raised by the checks themselves are logged and swallowed, so a failing check never hides the dataframe.
+    """
    df = f(*args, **kwargs)
-        # todo make super defensive?
+    # must be an f-string, otherwise the placeholders end up verbatim in every warning
+    tag = f'{f.__module__}:{f.__name__}'
    # makes sense to keep super defensive
    try:
        for col, data in df.reset_index().iteritems():
            for w in check_dateish(data):
-                warnings.low(f"{f.__module__}:{f.__name__}, column '{col}': {w}")
+                warnings.low(f"{tag}, column '{col}': {w}")
    except Exception as e:
        logger.exception(e)
    try:
        for w in check_error_column(df, policy=error_col_policy):
            warnings.low(f"{tag}, {w}")
    except Exception as e:
        logger.exception(e)
    return df
    # https://github.com/python/mypy/issues/1927
    return wrapper # type: ignore[return-value]
 # todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?
--- a/my/core/warnings.py
+++ b/my/core/warnings.py
@ -36,7 +36,7 @@ def _warn(message: str, *args, color=None, **kwargs) -> None:
 def low(message: str, *args, **kwargs) -> None:
-    kwargs['color'] = 'grey'
+    # kwargs['color'] = 'grey' # eh, grey is way too pale
    _warn(message, *args, **kwargs)
--- a/my/endomondo.py
+++ b/my/endomondo.py
@ -11,8 +11,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Sequence, Iterable
-from .core.common import Paths, get_files
+from .core import Paths, get_files
 from .core.error import Res
 from my.config import endomondo as user_config
@ -35,13 +34,17 @@ import endoexport.dal as dal
 from endoexport.dal import Point, Workout
 from .core import Res
 # todo cachew?
 def workouts() -> Iterable[Res[Workout]]:
    _dal = dal.DAL(inputs())
    yield from _dal.workouts()
-def dataframe(defensive=True):
+from .core.pandas import check_dataframe, DataFrameT
@check_dataframe
 def dataframe(defensive: bool=True) -> DataFrameT:
    def it():
        for w in workouts():
            if isinstance(w, Exception):
@ -67,13 +70,18 @@ def dataframe(defensive=True):
    df = pd.DataFrame(it())
    # pandas guesses integer, which is pointless for this field (might get coerced to float too)
    df['id'] = df['id'].astype(str)
    if 'error' not in df:
        df['error'] = None
    return df
-
+from .core import stat, Stats
-def stats():
+def stats() -> Stats:
-    from .core import stat
+    return {
-    return stat(workouts)
+        # todo pretty print stats?
        **stat(workouts),
        **stat(dataframe),
    }
 # TODO make sure it's possible to 'advise' functions and override stuff
--- a/setup.py
+++ b/setup.py
@ -7,6 +7,7 @@ INSTALL_REQUIRES = [
    'pytz',           # even though it's not needed by the core, it's so common anyway...
    'appdirs',        # very common, and makes it portable
    'more-itertools', # it's just too useful and very common anyway
    'decorator'     , # less pain in writing correct decorators. very mature and stable, so worth keeping in core
 ]