core.pandas: add check for 'error' column + add empty one by default

This commit is contained in:
Dima Gerasimov 2020-12-17 04:47:05 +00:00 committed by karlicoss
parent 3a1e21635a
commit df9a7f7390
7 changed files with 82 additions and 30 deletions

View file

@ -168,7 +168,7 @@ def config_ok(args) -> bool:
sys.exit(1) sys.exit(1)
cfg_path = cfg.__file__# todo might be better to use __path__? cfg_path = cfg.__file__# todo might be better to use __path__?
info(f"config file: {cfg_path}") info(f"config file : {cfg_path}")
import my.core as core import my.core as core
try: try:
@ -195,7 +195,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module
if mres is not None: # has mypy if mres is not None: # has mypy
rc = mres.returncode rc = mres.returncode
if rc == 0: if rc == 0:
info('mypy check: success') info('mypy check : success')
else: else:
error('mypy check: failed') error('mypy check: failed')
errors.append(RuntimeError('mypy failed')) errors.append(RuntimeError('mypy failed'))

View file

@ -292,16 +292,6 @@ else:
from .py37 import fromisoformat from .py37 import fromisoformat
if sys.version_info[:2] >= (3, 8):
from typing import Literal
else:
if TYPE_CHECKING:
from typing_extensions import Literal
else:
# erm.. I guess as long as it's not crashing, whatever...
Literal = Union
# TODO doctests? # TODO doctests?
def isoparse(s: str) -> tzdatetime: def isoparse(s: str) -> tzdatetime:
""" """
@ -313,6 +303,8 @@ def isoparse(s: str) -> tzdatetime:
s = s[:-1] + '+00:00' s = s[:-1] + '+00:00'
return fromisoformat(s) return fromisoformat(s)
from .compat import Literal
import re import re
# https://stackoverflow.com/a/295466/706389 # https://stackoverflow.com/a/295466/706389

View file

@ -47,3 +47,16 @@ def _get_dal(cfg, module_name: str):
from importlib import import_module from importlib import import_module
return import_module(f'my.config.repos.{module_name}.dal') return import_module(f'my.config.repos.{module_name}.dal')
import sys
from typing import TYPE_CHECKING

# Compatibility shim exposing ``Literal`` regardless of the interpreter version.
if sys.version_info[:2] >= (3, 8):
    # available in the standard library from 3.8 onwards
    from typing import Literal
elif TYPE_CHECKING:
    # older interpreters: only the type checker needs the real thing
    from typing_extensions import Literal
else:
    from typing import Union
    # erm.. I guess as long as it's not crashing, whatever...
    # (runtime stand-in only; it is subscriptable, which is all that's needed here)
    Literal = Union

View file

@ -7,6 +7,9 @@ from datetime import datetime
from pprint import pformat from pprint import pformat
from typing import Optional, TYPE_CHECKING, Any, Iterable from typing import Optional, TYPE_CHECKING, Any, Iterable
from . import warnings from . import warnings
from .common import LazyLogger
logger = LazyLogger(__name__)
if TYPE_CHECKING: if TYPE_CHECKING:
@ -14,6 +17,8 @@ if TYPE_CHECKING:
# later will be unignored when they implement type annotations # later will be unignored when they implement type annotations
import pandas as pd # type: ignore import pandas as pd # type: ignore
# DataFrameT = pd.DataFrame # DataFrameT = pd.DataFrame
# TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions..
# ideally want to use a type that's like Any but doesn't allow arbitrary coercions??
DataFrameT = Any DataFrameT = Any
else: else:
# in runtime, make it defensive so it works without pandas # in runtime, make it defensive so it works without pandas
@ -40,21 +45,54 @@ def check_dateish(s) -> Iterable[str]:
'''.strip() '''.strip()
from .compat import Literal

# Accepted values for the 'policy' argument of check_error_column
# (and the 'error_col_policy' argument of check_dataframe).
ErrorColPolicy = Literal[
    'add_if_missing',  # add error column if it's missing
    'warn',            # warn, but do not modify
    'ignore',          # no warnings
]
def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
    """
    Yield a warning message if ``df`` has no 'error' column.

    Nothing is yielded when the column is already present or when ``policy``
    is 'ignore'. With 'add_if_missing' an 'error' column set to None is added
    to ``df`` in place and the message mentions that; with 'warn' the
    dataframe is left untouched and only the message is produced.
    """
    # membership is tested first, exactly as before, so a broken 'df' still
    # surfaces an error even under the 'ignore' policy
    if 'error' in df or policy == 'ignore':
        return

    parts = [
        "No 'error' column detected. You probably forgot to handle errors defensively, which means a single bad entry might bring the whole dataframe down.",
    ]
    if policy == 'add_if_missing':
        # todo maybe just add the warnings text as well?
        df['error'] = None
        parts.append("Adding empty 'error' column (see 'error_col_policy' if you want to change this behaviour)")
    yield '\n'.join(parts)
from typing import Any, Callable, TypeVar from typing import Any, Callable, TypeVar
FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT]) FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what??
from decorator import decorator

@decorator
def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT:
    """
    Call ``f`` and run sanity checks on the dataframe it returns.

    Two checks are performed, each reported through ``warnings.low``:

    - every column (including the index, via ``reset_index``) is passed to
      ``check_dateish``;
    - ``check_error_column`` is run with ``error_col_policy``, which by
      default adds an empty 'error' column when it is missing.

    The checks are best-effort: an exception raised while checking is logged
    and swallowed, so the checks never prevent the dataframe from being
    returned. Exceptions raised by ``f`` itself propagate unchanged.
    """
    df = f(*args, **kwargs)
    # prefix identifying the decorated function in every warning
    # (must be an f-string, otherwise the placeholders are printed literally)
    tag = f'{f.__module__}:{f.__name__}'
    # makes sense to keep super defensive
    try:
        # .items() rather than .iteritems(): the latter was removed in pandas 2.0
        for col, data in df.reset_index().items():
            for w in check_dateish(data):
                warnings.low(f"{tag}, column '{col}': {w}")
    except Exception as e:
        logger.exception(e)
    try:
        for w in check_error_column(df, policy=error_col_policy):
            warnings.low(f"{tag}, {w}")
    except Exception as e:
        logger.exception(e)
    return df
# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type? # todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?

View file

@ -36,7 +36,7 @@ def _warn(message: str, *args, color=None, **kwargs) -> None:
def low(message: str, *args, **kwargs) -> None: def low(message: str, *args, **kwargs) -> None:
kwargs['color'] = 'grey' # kwargs['color'] = 'grey' # eh, grey is way too pale
_warn(message, *args, **kwargs) _warn(message, *args, **kwargs)

View file

@ -11,8 +11,7 @@ from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Sequence, Iterable from typing import Sequence, Iterable
from .core.common import Paths, get_files from .core import Paths, get_files
from .core.error import Res
from my.config import endomondo as user_config from my.config import endomondo as user_config
@ -35,13 +34,17 @@ import endoexport.dal as dal
from endoexport.dal import Point, Workout from endoexport.dal import Point, Workout
from .core import Res
# todo cachew? # todo cachew?
def workouts() -> Iterable[Res[Workout]]:
    """Yield everything produced by the DAL's ``workouts()`` for the configured inputs."""
    # NOTE(review): items are presumably either a Workout or an Exception (per Res) -- confirm against the DAL
    yield from dal.DAL(inputs()).workouts()
def dataframe(defensive=True): from .core.pandas import check_dataframe, DataFrameT
@check_dataframe
def dataframe(defensive: bool=True) -> DataFrameT:
def it(): def it():
for w in workouts(): for w in workouts():
if isinstance(w, Exception): if isinstance(w, Exception):
@ -67,13 +70,18 @@ def dataframe(defensive=True):
df = pd.DataFrame(it()) df = pd.DataFrame(it())
# pandas guesses integer, which is pointless for this field (might get coerced to float too) # pandas guesses integer, which is pointless for this field (might get coerced to float too)
df['id'] = df['id'].astype(str) df['id'] = df['id'].astype(str)
if 'error' not in df:
df['error'] = None
return df return df
from .core import stat, Stats
def stats() -> Stats:
    """Return the combined ``stat`` results for ``workouts`` and ``dataframe``."""
    # todo pretty print stats?
    workout_stats = stat(workouts)
    dataframe_stats = stat(dataframe)
    # later entries win on key clashes, same as the original dict-unpacking order
    return {**workout_stats, **dataframe_stats}
# TODO make sure it's possible to 'advise' functions and override stuff # TODO make sure it's possible to 'advise' functions and override stuff

View file

@ -7,6 +7,7 @@ INSTALL_REQUIRES = [
'pytz', # even though it's not needed by the core, it's so common anyway... 'pytz', # even though it's not needed by the core, it's so common anyway...
'appdirs', # very common, and makes it portable 'appdirs', # very common, and makes it portable
'more-itertools', # it's just too useful and very common anyway 'more-itertools', # it's just too useful and very common anyway
'decorator' , # less pain in writing correct decorators. very mature and stable, so worth keeping in core
] ]