core.pandas: add check for 'error' column + add empty one by default
This commit is contained in:
parent
3a1e21635a
commit
df9a7f7390
7 changed files with 82 additions and 30 deletions
|
@ -292,16 +292,6 @@ else:
|
||||||
from .py37 import fromisoformat
|
from .py37 import fromisoformat
|
||||||
|
|
||||||
|
|
||||||
if sys.version_info[:2] >= (3, 8):
|
|
||||||
from typing import Literal
|
|
||||||
else:
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from typing_extensions import Literal
|
|
||||||
else:
|
|
||||||
# erm.. I guess as long as it's not crashing, whatever...
|
|
||||||
Literal = Union
|
|
||||||
|
|
||||||
|
|
||||||
# TODO doctests?
|
# TODO doctests?
|
||||||
def isoparse(s: str) -> tzdatetime:
|
def isoparse(s: str) -> tzdatetime:
|
||||||
"""
|
"""
|
||||||
|
@ -313,6 +303,8 @@ def isoparse(s: str) -> tzdatetime:
|
||||||
s = s[:-1] + '+00:00'
|
s = s[:-1] + '+00:00'
|
||||||
return fromisoformat(s)
|
return fromisoformat(s)
|
||||||
|
|
||||||
|
from .compat import Literal
|
||||||
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
# https://stackoverflow.com/a/295466/706389
|
# https://stackoverflow.com/a/295466/706389
|
||||||
|
|
|
@ -47,3 +47,16 @@ def _get_dal(cfg, module_name: str):
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
return import_module(f'my.config.repos.{module_name}.dal')
|
return import_module(f'my.config.repos.{module_name}.dal')
|
||||||
|
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
# Expose a `Literal` name regardless of the interpreter version.
if sys.version_info[:2] >= (3, 8):
    # available in the standard library from 3.8 onwards
    from typing import Literal
elif TYPE_CHECKING:
    # only the type checker needs the real thing on older interpreters
    from typing_extensions import Literal
else:
    from typing import Union
    # erm.. I guess as long as it's not crashing, whatever...
    Literal = Union
|
||||||
|
|
|
@ -7,6 +7,9 @@ from datetime import datetime
|
||||||
from pprint import pformat
|
from pprint import pformat
|
||||||
from typing import Optional, TYPE_CHECKING, Any, Iterable
|
from typing import Optional, TYPE_CHECKING, Any, Iterable
|
||||||
from . import warnings
|
from . import warnings
|
||||||
|
from .common import LazyLogger
|
||||||
|
|
||||||
|
logger = LazyLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
@ -14,6 +17,8 @@ if TYPE_CHECKING:
|
||||||
# later will be unignored when they implement type annotations
|
# later will be unignored when they implement type annotations
|
||||||
import pandas as pd # type: ignore
|
import pandas as pd # type: ignore
|
||||||
# DataFrameT = pd.DataFrame
|
# DataFrameT = pd.DataFrame
|
||||||
|
# TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions..
|
||||||
|
# ideally want to use a type that's like Any but doesn't allow arbitrary coercions??
|
||||||
DataFrameT = Any
|
DataFrameT = Any
|
||||||
else:
|
else:
|
||||||
# in runtime, make it defensive so it works without pandas
|
# in runtime, make it defensive so it works without pandas
|
||||||
|
@ -40,21 +45,54 @@ def check_dateish(s) -> Iterable[str]:
|
||||||
'''.strip()
|
'''.strip()
|
||||||
|
|
||||||
|
|
||||||
|
from .compat import Literal
|
||||||
|
|
||||||
|
# Accepted values for the 'error' column policy:
#   'add_if_missing' -- add error column if it's missing
#   'warn'           -- warn, but do not modify
#   'ignore'         -- no warnings
ErrorColPolicy = Literal['add_if_missing', 'warn', 'ignore']
|
||||||
|
|
||||||
|
def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
    """
    Yield a warning message if ``df`` has no 'error' column.

    Nothing is yielded when the column is present or when ``policy`` is 'ignore'.
    With ``policy='add_if_missing'`` an 'error' column filled with ``None`` is
    also added to ``df`` in place, and the message mentions that.
    """
    if 'error' in df or policy == 'ignore':
        return

    parts = [
        "No 'error' column detected. You probably forgot to handle errors defensively, which means a single bad entry might bring the whole dataframe down.",
    ]
    if policy == 'add_if_missing':
        # todo maybe just add the warnings text as well?
        df['error'] = None
        parts.append("Adding empty 'error' column (see 'error_col_policy' if you want to change this behaviour)")

    yield '\n'.join(parts)
|
||||||
|
|
||||||
|
|
||||||
from typing import Any, Callable, TypeVar
|
from typing import Any, Callable, TypeVar
|
||||||
FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
|
FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
|
||||||
|
|
||||||
# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what??
from decorator import decorator
@decorator
def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT:
    """
    Call ``f`` and run sanity checks on the dataframe it returns.

    Two checks are performed, each one reporting through ``warnings.low``:
    - ``check_dateish`` on every column (after ``reset_index()``);
    - ``check_error_column`` with ``policy=error_col_policy``.

    Any exception raised by a check is logged and swallowed, so the checks
    never prevent the dataframe from being returned.

    NOTE(review): the extra ``error_col_policy`` parameter presumably relies on
    the 'decorator factory' feature of the ``decorator`` package -- confirm.
    """
    df = f(*args, **kwargs)
    # prefix for all warnings, identifies the function that produced the dataframe
    tag = f'{f.__module__}:{f.__name__}'
    # makes sense to keep super defensive
    try:
        # .items() is the non-deprecated spelling of .iteritems()
        for col, data in df.reset_index().items():
            for w in check_dateish(data):
                warnings.low(f"{tag}, column '{col}': {w}")
    except Exception as e:
        logger.exception(e)
    try:
        for w in check_error_column(df, policy=error_col_policy):
            warnings.low(f"{tag}, {w}")
    except Exception as e:
        logger.exception(e)
    return df
|
|
||||||
|
|
||||||
# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?
|
# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ def _warn(message: str, *args, color=None, **kwargs) -> None:
|
||||||
|
|
||||||
|
|
||||||
def low(message: str, *args, **kwargs) -> None:
    """Forward ``message`` and all remaining arguments to ``_warn`` unchanged."""
    # kwargs['color'] = 'grey' # eh, grey is way too pale
    _warn(message, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -11,8 +11,7 @@ from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Sequence, Iterable
|
from typing import Sequence, Iterable
|
||||||
|
|
||||||
from .core.common import Paths, get_files
|
from .core import Paths, get_files
|
||||||
from .core.error import Res
|
|
||||||
|
|
||||||
from my.config import endomondo as user_config
|
from my.config import endomondo as user_config
|
||||||
|
|
||||||
|
@ -35,13 +34,17 @@ import endoexport.dal as dal
|
||||||
from endoexport.dal import Point, Workout
|
from endoexport.dal import Point, Workout
|
||||||
|
|
||||||
|
|
||||||
|
from .core import Res
|
||||||
# todo cachew?
|
# todo cachew?
|
||||||
def workouts() -> Iterable[Res[Workout]]:
    """Yield everything produced by ``workouts()`` of a ``dal.DAL`` built from ``inputs()``."""
    yield from dal.DAL(inputs()).workouts()
|
||||||
|
|
||||||
|
|
||||||
def dataframe(defensive=True):
|
from .core.pandas import check_dataframe, DataFrameT
|
||||||
|
|
||||||
|
@check_dataframe
|
||||||
|
def dataframe(defensive: bool=True) -> DataFrameT:
|
||||||
def it():
|
def it():
|
||||||
for w in workouts():
|
for w in workouts():
|
||||||
if isinstance(w, Exception):
|
if isinstance(w, Exception):
|
||||||
|
@ -67,13 +70,18 @@ def dataframe(defensive=True):
|
||||||
df = pd.DataFrame(it())
|
df = pd.DataFrame(it())
|
||||||
# pandas guesses integer, which is pointless for this field (might get coerced to float too)
|
# pandas guesses integer, which is pointless for this field (might get coerced to float too)
|
||||||
df['id'] = df['id'].astype(str)
|
df['id'] = df['id'].astype(str)
|
||||||
|
if 'error' not in df:
|
||||||
|
df['error'] = None
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
from .core import stat, Stats
|
||||||
def stats() -> Stats:
    """Return the merged results of ``stat(workouts)`` and ``stat(dataframe)``."""
    # todo pretty print stats?
    merged = dict(stat(workouts))
    # on a key clash the later (dataframe) entry wins, same as dict unpacking
    merged.update(stat(dataframe))
    return merged
|
||||||
|
|
||||||
|
|
||||||
# TODO make sure it's possible to 'advise' functions and override stuff
|
# TODO make sure it's possible to 'advise' functions and override stuff
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -7,6 +7,7 @@ INSTALL_REQUIRES = [
|
||||||
'pytz', # even though it's not needed by the core, it's so common anyway...
|
'pytz', # even though it's not needed by the core, it's so common anyway...
|
||||||
'appdirs', # very common, and makes it portable
|
'appdirs', # very common, and makes it portable
|
||||||
'more-itertools', # it's just too useful and very common anyway
|
'more-itertools', # it's just too useful and very common anyway
|
||||||
|
'decorator' , # less pain in writing correct decorators. very mature and stable, so worth keeping in core
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue