From 07f901e1e5fb2bd3009561c84cc4efd311c94733 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Sat, 13 Feb 2021 17:13:05 +0000
Subject: [PATCH] core: helpers for automatic dataframes from sequences of
 NamedTuple/dataclass

also use in my.rescuetime
---
 misc/rescuetime_cleanup.py | 36 ++++++++++++++++++++++++++++++++++++
 my/core/common.py          | 13 +++++++++++++
 my/core/logging.py         |  4 +++-
 my/core/pandas.py          | 18 +++++++++++++++++-
 my/rescuetime.py           | 20 ++++++++------------
 5 files changed, 77 insertions(+), 14 deletions(-)
 create mode 100644 misc/rescuetime_cleanup.py

diff --git a/misc/rescuetime_cleanup.py b/misc/rescuetime_cleanup.py
new file mode 100644
index 0000000..356211e
--- /dev/null
+++ b/misc/rescuetime_cleanup.py
@@ -0,0 +1,36 @@
+# Emacs workflow notes: M-x run-python (raise window so it doesn't hide)
+# ?? python-shell-send-defun
+# C-c C-r python-shell-send-region
+# annoyingly, it (the python shell window?) isn't autoscrolling??
+#    maybe add hook
+#    (setq comint-move-point-for-output t) ;; https://github.com/jorgenschaefer/elpy/issues/1641#issuecomment-528355368
+#
+from importlib import reload
+import sys
+
+# todo function to reload hpi? (todel is collected, but the actual deletion below is disabled)
+todel = [m for m in sys.modules if m.startswith('my.')]
+# for m in todel: del sys.modules[m]
+
+import my
+import my.rescuetime as M
+
+from itertools import islice, groupby
+from more_itertools import ilen, bucket
+
+print(M.dataframe())
+
+e = M.entries()
+e = list(islice(e, 0, 10))
+
+
+key = lambda x: 'ERROR' if isinstance(x, Exception) else x.activity
+
+# TODO move to errors module? how to preserve type signature?
+# b = bucket(e, key=key)
+# for k in b:
+#     g = b[k] # meh? should maybe sort
+#     print(k, ilen(g))
+
+from collections import Counter
+print(Counter(key(x) for x in e))
diff --git a/my/core/common.py b/my/core/common.py
index 17d0db9..27e9d9b 100644
--- a/my/core/common.py
+++ b/my/core/common.py
@@ -458,3 +458,16 @@ def guess_datetime(x: Any) -> Optional[datetime]:
         if isinstance(v, datetime):
             return v
     return None
+
+
+def asdict(thing) -> Json:
+    # Convert a dict / dataclass instance / NamedTuple to a dict. todo: what about primitives?
+    # todo: raise a clear exception for unsupported types?
+    if isinstance(thing, dict):
+        return thing
+    import dataclasses as D
+    if D.is_dataclass(thing):
+        return D.asdict(thing)
+    # anything else is assumed to be a NamedTuple (relies on its _asdict method)
+    # todo add a proper check; objects without _asdict fail here with AttributeError
+    return thing._asdict()
diff --git a/my/core/logging.py b/my/core/logging.py
index bc10dc0..8e382e5 100644
--- a/my/core/logging.py
+++ b/my/core/logging.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 '''
-Default logger is a bit, see 'test'/run this file for a demo
+Default logger is a bit meh, see 'test'/run this file for a demo
+TODO rename to 'klogging' to avoid a possible conflict with the standard 'logging' module
+TODO hmm, might be too late for that already? maybe use a fallback & deprecate
 '''
 
 def test() -> None:
diff --git a/my/core/pandas.py b/my/core/pandas.py
index f58a894..f1d4c5c 100644
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@@ -6,7 +6,7 @@ Various pandas helpers and convenience functions
 from datetime import datetime
 from pprint import pformat
 from typing import Optional, TYPE_CHECKING, Any, Iterable
-from . import warnings
+from . import warnings, Res
 from .common import LazyLogger
 
 logger = LazyLogger(__name__)
@@ -109,3 +109,19 @@ def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Dict[str, Any]:
         'error': estr,
         dt_col : edt,
     }
+
+
+# todo add proper types
+@check_dataframe
+def as_dataframe(it: Iterable[Res[Any]]) -> DataFrameT:
+    # ok nice, pandas supports dataclass/NT natively
+    # https://github.com/pandas-dev/pandas/pull/27999
+    #    but it dispatches dataclass based on the first entry...
+    #    https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
+    # same for NamedTuple -- seems that it takes whatever schema the first NT has
+    # so we convert each item to a dict individually (errors become rows via error_to_row)... sigh
+    from .common import asdict
+    ie = (error_to_row(r) if isinstance(r, Exception) else asdict(r) for r in it)
+    # TODO just add tests for it?
+    import pandas as pd
+    return pd.DataFrame(ie)
diff --git a/my/rescuetime.py b/my/rescuetime.py
index 45f8f58..a616c33 100644
--- a/my/rescuetime.py
+++ b/my/rescuetime.py
@@ -12,7 +12,6 @@ from typing import Sequence, Iterable
 from .core import get_files, LazyLogger
 from .core.common import mcachew
 from .core.error import Res, split_errors
-from .core.pandas import check_dataframe as cdf, DataFrameT
 
 from my.config import rescuetime as config
 
@@ -29,7 +28,7 @@ DAL = dal.DAL
 Entry = dal.Entry
 
 
-@mcachew(hashf=lambda: inputs())
+@mcachew(depends_on=lambda: inputs())
 def entries() -> Iterable[Res[Entry]]:
     dal = DAL(inputs())
     it = dal.entries()
@@ -44,17 +43,10 @@ def groups(gap: timedelta=timedelta(hours=3)) -> Iterable[Res[Sequence[Entry]]]:
     yield from split_when(vit, lambda a, b: (b.dt - a.dt) > gap)
 
 
-@cdf
+# todo automatic dataframe interface?
+from .core.pandas import DataFrameT, as_dataframe
 def dataframe() -> DataFrameT:
-    import pandas as pd # type: ignore
-    # type: ignore[call-arg, attr-defined]
-    def it():
-        for e in entries():
-            if isinstance(e, Exception):
-                yield dict(error=str(e))
-            else:
-                yield e._asdict()
-    return pd.DataFrame(it())
+    return as_dataframe(entries())
 
 
 from .core import stat, Stats
@@ -89,6 +81,8 @@ def fake_data(rows: int=1000) -> Iterator[None]:
 
 # todo not sure if I want to keep these here? vvv
 
+# I guess this should move to core? or to an 'ext' module, i.e. interfaces?
+# and make it automatic
 def fill_influxdb():
     from influxdb import InfluxDBClient # type: ignore
     client = InfluxDBClient()
@@ -106,3 +100,5 @@ def fill_influxdb():
     } for e in vit]
     client.write_points(jsons, database=db) # TODO??
 
+
+# TODO lots of garbage in dir()? maybe need to del the imports...