From 07f901e1e5fb2bd3009561c84cc4efd311c94733 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 13 Feb 2021 17:13:05 +0000 Subject: [PATCH] core: helpers for automatic dataframes from sequences of NamedTuple/dataclass also use in my.rescuetime --- misc/rescuetime_cleanup.py | 36 ++++++++++++++++++++++++++++++++++++ my/core/common.py | 13 +++++++++++++ my/core/logging.py | 4 +++- my/core/pandas.py | 18 +++++++++++++++++- my/rescuetime.py | 20 ++++++++------------ 5 files changed, 77 insertions(+), 14 deletions(-) create mode 100644 misc/rescuetime_cleanup.py diff --git a/misc/rescuetime_cleanup.py b/misc/rescuetime_cleanup.py new file mode 100644 index 0000000..356211e --- /dev/null +++ b/misc/rescuetime_cleanup.py @@ -0,0 +1,36 @@ +# M-x run-python (raise window so it doesn't hide) +# ?? python-shell-send-defun +# C-c C-r python-shell-send-region +# shit, it isn't autoscrolling?? +# maybe add hook +# (setq comint-move-point-for-output t) ;; https://github.com/jorgenschaefer/elpy/issues/1641#issuecomment-528355368 +# +from importlib import reload +import sys + +# todo function to reload hpi? +todel = [m for m in sys.modules if m.startswith('my.')] +# for m in todel: del sys.modules[m] + +import my +import my.rescuetime as M + +from itertools import islice, groupby +from more_itertools import ilen, bucket + +print(M.dataframe()) + +e = M.entries() +e = list(islice(e, 0, 10)) + + +key = lambda x: 'ERROR' if isinstance(x, Exception) else x.activity + +# TODO move to errors module? how to preserve type signature? +# b = bucket(e, key=key) +# for k in b: +# g = b[k] # meh? 
should maybe sort +# print(k, ilen(g)) + +from collections import Counter +print(Counter(key(x) for x in e)) diff --git a/my/core/common.py b/my/core/common.py index 17d0db9..27e9d9b 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -458,3 +458,16 @@ def guess_datetime(x: Any) -> Optional[datetime]: if isinstance(v, datetime): return v return None + + +def asdict(thing) -> Json: + # todo primitive? + # todo exception? + if isinstance(thing, dict): + return thing + import dataclasses as D + if D.is_dataclass(thing): + return D.asdict(thing) + # must be a NT otherwise? + # todo add a proper check.. () + return thing._asdict() diff --git a/my/core/logging.py b/my/core/logging.py index bc10dc0..8e382e5 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 ''' -Default logger is a bit, see 'test'/run this file for a demo +Default logger is a bit meh, see 'test'/run this file for a demo +TODO name 'klogging' to avoid possible conflict with default 'logging' module +TODO shit. too late already? maybe use fallback & deprecate ''' def test() -> None: diff --git a/my/core/pandas.py b/my/core/pandas.py index f58a894..f1d4c5c 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -6,7 +6,7 @@ Various pandas helpers and convenience functions from datetime import datetime from pprint import pformat from typing import Optional, TYPE_CHECKING, Any, Iterable -from . import warnings +from . import warnings, Res from .common import LazyLogger logger = LazyLogger(__name__) @@ -109,3 +109,19 @@ def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Dict[str, Any]: 'error': estr, dt_col : edt, } + + +# todo add proper types +@check_dataframe +def as_dataframe(it: Iterable[Res[Any]]) -> DataFrameT: + # ok nice, pandas supports dataclass/NT natively + # https://github.com/pandas-dev/pandas/pull/27999 + # but it dispatches dataclass based on the first entry... 
+ # https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562 + # same for NamedTuple -- seems that it takes whatever schema the first NT has + # so we need to convert each individually... sigh + from .common import asdict + ie = (error_to_row(r) if isinstance(r, Exception) else asdict(r) for r in it) + # TODO just add tests for it? + import pandas as pd + return pd.DataFrame(ie) diff --git a/my/rescuetime.py b/my/rescuetime.py index 45f8f58..a616c33 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -12,7 +12,6 @@ from typing import Sequence, Iterable from .core import get_files, LazyLogger from .core.common import mcachew from .core.error import Res, split_errors -from .core.pandas import check_dataframe as cdf, DataFrameT from my.config import rescuetime as config @@ -29,7 +28,7 @@ DAL = dal.DAL Entry = dal.Entry -@mcachew(hashf=lambda: inputs()) +@mcachew(depends_on=lambda: inputs()) def entries() -> Iterable[Res[Entry]]: dal = DAL(inputs()) it = dal.entries() @@ -44,17 +43,10 @@ def groups(gap: timedelta=timedelta(hours=3)) -> Iterable[Res[Sequence[Entry]]]: yield from split_when(vit, lambda a, b: (b.dt - a.dt) > gap) -@cdf +# todo automatic dataframe interface? +from .core.pandas import DataFrameT, as_dataframe def dataframe() -> DataFrameT: - import pandas as pd # type: ignore - # type: ignore[call-arg, attr-defined] - def it(): - for e in entries(): - if isinstance(e, Exception): - yield dict(error=str(e)) - else: - yield e._asdict() - return pd.DataFrame(it()) + return as_dataframe(entries()) from .core import stat, Stats @@ -89,6 +81,8 @@ def fake_data(rows: int=1000) -> Iterator[None]: # todo not sure if I want to keep these here? vvv +# guess should move to core? or to 'ext' module, i.e. interfaces? 
+# make automatic def fill_influxdb(): from influxdb import InfluxDBClient # type: ignore client = InfluxDBClient() @@ -106,3 +100,5 @@ def fill_influxdb(): } for e in vit] client.write_points(jsons, database=db) # TODO?? + +# TODO lots of garbage in dir()? maybe need to del the imports...