# Recovered from the patch "core: add pandas utils"
# (commit 132db1dc0c457430bd75b4e829d89fdd25a657ee, Dima Gerasimov,
# Mon, 14 Sep 2020 20:42:56 +0100), which created my/core/pandas.py.
# The same patch also reworded two log messages in my/core/__main__.py
# (config_check): 'mypy check: success' -> 'mypy config check: success' and
# 'mypy check: failed' -> 'mypy config check: failed'.
'''
Various pandas helpers and convenience functions
'''
# todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
from typing import Optional
import warnings


# FIXME need to make sure check_dataframe decorator can be used without actually importing pandas
# so need to move this import from top level
import pandas as pd  # type: ignore

# todo special warning type?


def check_dateish(s) -> Optional[str]:
    '''
    Check whether a column holds timestamps without having a datetime dtype.

    :param s: a pandas Series (one dataframe column)
    :return: a human readable description of the problem, or None if the
        column looks fine
    '''
    ctype = s.dtype
    if str(ctype).startswith('datetime64'):
        # proper datetime column (naive or tz-aware), nothing to report
        return None
    # missing values must not influence the 'all values are timestamps' verdict
    s = s.dropna()
    if len(s) == 0:
        # nothing left to inspect; also avoids .all() being vacuously true
        return None
    all_timestamps = s.apply(lambda x: isinstance(x, pd.Timestamp)).all()
    if all_timestamps:
        return 'All values are pd.Timestamp, but dtype is not datetime. Most likely, you have mixed timezones'
    return None


def check_dataframe(f):
    '''
    Decorator for functions returning a dataframe.

    After calling ``f``, every column of the result is passed through
    :func:`check_dateish`; each reported problem is emitted via
    ``warnings.warn``. The dataframe itself is returned unchanged.

    :param f: function returning a ``pd.DataFrame``
    :return: wrapped function with the same signature as ``f``
    '''
    from functools import wraps

    @wraps(f)
    def wrapper(*args, **kwargs) -> pd.DataFrame:
        df = f(*args, **kwargs)
        # todo make super defensive?
        # TODO check index as well?
        # NOTE: DataFrame.iteritems() was removed in pandas 2.0; items() yields
        # the same (column name, Series) pairs and exists in older versions too
        for col, data in df.items():
            res = check_dateish(data)
            if res is not None:
                warnings.warn(f"{f.__name__}, column '{col}': {res}")
        return df
    return wrapper

# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?