core: add pandas utils

This commit is contained in:
Dima Gerasimov 2020-09-14 20:42:56 +01:00 committed by karlicoss
parent 63b848087d
commit 132db1dc0c
2 changed files with 45 additions and 2 deletions

View file

@ -126,9 +126,9 @@ def config_check(args):
return return
rc = mres.returncode rc = mres.returncode
if rc == 0: if rc == 0:
info('mypy check: success') info('mypy config check: success')
else: else:
error('mypy check: failed') error('mypy config check: failed')
sys.stderr.write(indent(mres.stderr.decode('utf8'))) sys.stderr.write(indent(mres.stderr.decode('utf8')))
sys.stderr.write(indent(mres.stdout.decode('utf8'))) sys.stderr.write(indent(mres.stdout.decode('utf8')))

43
my/core/pandas.py Normal file
View file

@ -0,0 +1,43 @@
'''
Various pandas helpers and convenience functions
'''
# todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
from typing import Optional
import warnings
# FIXME need to make sure check_dataframe decorator can be used without actually importing pandas
# so need to move this import from top level
import pandas as pd # type: ignore
# todo special warning type?
def check_dateish(s) -> Optional[str]:
    """Return a warning message if *s* looks like dates stored under a non-datetime dtype.

    Returns ``None`` when the dtype name starts with ``datetime64``, when no
    non-null values remain after ``dropna()``, or when at least one remaining
    value is not a ``pd.Timestamp``. Otherwise returns a message describing
    the problem.
    """
    dtype_name = str(s.dtype)
    if dtype_name.startswith('datetime64'):
        # already a proper datetime column, nothing to report
        return None

    present = s.dropna()
    if len(present) == 0:
        # only nulls (or nothing at all), so there is nothing to inspect
        return None

    def _is_timestamp(value) -> bool:
        return isinstance(value, pd.Timestamp)

    if not present.apply(_is_timestamp).all():
        return None
    # every value is a Timestamp, yet the dtype name is not datetime64*
    return 'All values are pd.Timestamp, but dtype is not datetime. Most likely, you have mixed timezones'
def check_dataframe(f):
    """Decorator that checks every column of the DataFrame returned by *f*.

    The wrapped function is called with the original arguments. Each column
    of its result is passed to ``check_dateish``, and every non-``None``
    result is emitted through ``warnings.warn`` together with the function
    name and the column label. The DataFrame itself is returned unchanged.
    """
    from functools import wraps

    @wraps(f)
    def wrapper(*args, **kwargs) -> pd.DataFrame:
        df = f(*args, **kwargs)
        # todo make super defensive?
        # TODO check index as well?
        # DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
        # items() yields the same (column label, Series) pairs.
        for col, data in df.items():
            res = check_dateish(data)
            if res is not None:
                warnings.warn(f"{f.__name__}, column '{col}': {res}")
        return df

    return wrapper
# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?