core: helpers for automatic dataframes from sequences of NamedTuple/dataclass

also use in my.rescuetime
2021-02-13 17:13:05 +00:00 · 2021-02-13 17:13:05 +00:00 · 07f901e1e5
commit 07f901e1e5
parent df9a7f7390
5 changed files with 77 additions and 14 deletions
--- a/misc/rescuetime_cleanup.py
+++ b/misc/rescuetime_cleanup.py
@ -0,0 +1,36 @@
 # M-x run-python (raise window so it doesn't hide)
 # ?? python-shell-send-defun
 # C-c C-r python-shell-send-region
 # shit, it isn't autoscrolling??
 #    maybe add hook
 #    (setq comint-move-point-for-output t) ;; https://github.com/jorgenschaefer/elpy/issues/1641#issuecomment-528355368
 #
 # Scratch session for poking at my.rescuetime data. Judging by the editor notes
 # at the top of this file it is presumably meant to be sent to a Python shell
 # region by region rather than imported as a module.
 # NOTE(review): reload, groupby, ilen and bucket are imported but not used by
 # any live statement here (ilen/bucket only appear in the commented-out
 # bucket experiment further down).
 from importlib import reload
 import sys
 # todo function to reload hpi?
 # Names of the already-imported modules whose name starts with 'my.'.
 # The line that would actually drop them from sys.modules is disabled.
 todel = [m for m in sys.modules if m.startswith('my.')]
 # for m in todel: del sys.modules[m]
 import my
 import my.rescuetime as M
 from itertools import islice, groupby
 from more_itertools import ilen, bucket
 # Print the dataframe produced by the module.
 print(M.dataframe())
 e = M.entries()
 # Keep only the first 10 items of the entries iterable, materialised as a list.
 e = list(islice(e, 0, 10))
 # Grouping key: every Exception maps to the single label 'ERROR',
 # anything else maps to its .activity attribute.
 key = lambda x: 'ERROR' if isinstance(x, Exception) else x.activity
 # TODO move to errors module? how to preserve type signature?
 # b = bucket(e, key=key)
 # for k in b:
 #     g = b[k] # meh? should maybe sort
 #     print(k, ilen(g))
 from collections import Counter
 # Print how many of the sampled items fall under each key.
 print(Counter(key(x) for x in e))
--- a/my/core/common.py
+++ b/my/core/common.py
@ -458,3 +458,16 @@ def guess_datetime(x: Any) -> Optional[datetime]:
        if isinstance(v, datetime):
            return v
    return None
 def asdict(thing) -> Json:
    '''
    Convert a single record into a dict.

    - a dict is handed back as is (the very same object, no copy)
    - a dataclass goes through dataclasses.asdict
    - anything else is assumed to be a NamedTuple and goes through its _asdict() method
    '''
    # todo primitive?
    # todo exception?
    import dataclasses
    if isinstance(thing, dict):
        return thing
    if not dataclasses.is_dataclass(thing):
        # must be a NT otherwise?
        # todo add a proper check.. ()
        return thing._asdict()
    return dataclasses.asdict(thing)
--- a/my/core/logging.py
+++ b/my/core/logging.py
@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 '''
-Default logger is a bit, see 'test'/run this file for a demo
+Default logger is a bit meh, see 'test'/run this file for a demo
 TODO name 'klogging' to avoid possible conflict with default 'logging' module
 TODO shit. too late already? maybe use fallback & deprecate
 '''
 def test() -> None:
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@ -6,7 +6,7 @@ Various pandas helpers and convenience functions
 from datetime import datetime
 from pprint import pformat
 from typing import Optional, TYPE_CHECKING, Any, Iterable
-from . import warnings
+from . import warnings, Res
 from .common import LazyLogger
 logger = LazyLogger(__name__)
@ -109,3 +109,19 @@ def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Dict[str, Any]:
        'error': estr,
        dt_col : edt,
    }
 # todo add proper types
@check_dataframe
def as_dataframe(it: Iterable[Res[Any]]) -> DataFrameT:
    '''
    Build a dataframe from an iterable of records and/or exceptions.

    Each item becomes one row: exceptions go through error_to_row,
    everything else goes through asdict.

    pandas does support dataclass/NamedTuple natively
    (https://github.com/pandas-dev/pandas/pull/27999), but it dispatches
    on the first entry, i.e. it takes whatever schema the first item has:
    https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
    So every item is converted to a dict individually... sigh
    '''
    from .common import asdict

    def rows():
        for item in it:
            if isinstance(item, Exception):
                yield error_to_row(item)
            else:
                yield asdict(item)

    # TODO just add tests for it?
    import pandas as pd
    return pd.DataFrame(rows())
--- a/my/rescuetime.py
+++ b/my/rescuetime.py
@ -12,7 +12,6 @@ from typing import Sequence, Iterable
 from .core import get_files, LazyLogger
 from .core.common import mcachew
 from .core.error import Res, split_errors
 from .core.pandas import check_dataframe as cdf, DataFrameT
 from my.config import rescuetime as config
@ -29,7 +28,7 @@ DAL = dal.DAL
 Entry = dal.Entry
-@mcachew(hashf=lambda: inputs())
+@mcachew(depends_on=lambda: inputs())
 def entries() -> Iterable[Res[Entry]]:
    dal = DAL(inputs())
    it = dal.entries()
@ -44,17 +43,10 @@ def groups(gap: timedelta=timedelta(hours=3)) -> Iterable[Res[Sequence[Entry]]]:
    yield from split_when(vit, lambda a, b: (b.dt - a.dt) > gap)
-@cdf
+# todo automatic dataframe interface?
 from .core.pandas import DataFrameT, as_dataframe
 def dataframe() -> DataFrameT:
-    import pandas as pd # type: ignore
+    return as_dataframe(entries())
    # type: ignore[call-arg, attr-defined]
    def it():
        for e in entries():
            if isinstance(e, Exception):
                yield dict(error=str(e))
            else:
                yield e._asdict()
    return pd.DataFrame(it())
 from .core import stat, Stats
@ -89,6 +81,8 @@ def fake_data(rows: int=1000) -> Iterator[None]:
 # todo not sure if I want to keep these here? vvv
 # guess should move to core? or to 'ext' module, i.e. interfaces?
 # make automatic
 def fill_influxdb():
    from influxdb import InfluxDBClient # type: ignore
    client = InfluxDBClient()
@ -106,3 +100,5 @@ def fill_influxdb():
    } for e in vit]
    client.write_points(jsons, database=db) # TODO??
 # TODO lots of garbage in dir()? maybe need to del the imports...