core: helpers for automatic dataframes from sequences of NamedTuple/dataclass

also use in my.rescuetime
This commit is contained in:
Dima Gerasimov 2021-02-13 17:13:05 +00:00 committed by karlicoss
parent df9a7f7390
commit 07f901e1e5
5 changed files with 77 additions and 14 deletions

View file

@ -0,0 +1,36 @@
# Scratch session for poking at HPI modules from an Emacs inferior Python shell.
#   M-x run-python   (raise window so it doesn't hide)
#   ??               python-shell-send-defun
#   C-c C-r          python-shell-send-region
# The shell output isn't autoscrolling; maybe add a hook:
#   (setq comint-move-point-for-output t) ;; https://github.com/jorgenschaefer/elpy/issues/1641#issuecomment-528355368
#
from importlib import reload
import sys

# todo function to reload hpi?
# Names of the already imported 'my.*' submodules (candidates for eviction before a re-import).
todel = [name for name in sys.modules if name.startswith('my.')]
# for m in todel: del sys.modules[m]

import my
import my.rescuetime as M

from collections import Counter
from itertools import islice, groupby
from more_itertools import ilen, bucket

print(M.dataframe())

# Only look at the first ten entries to keep the experiment quick.
e = list(islice(M.entries(), 10))


def key(x):
    # Errors are grouped under a single 'ERROR' label, everything else by its activity.
    if isinstance(x, Exception):
        return 'ERROR'
    return x.activity


# TODO move to errors module? how to preserve type signature?
# b = bucket(e, key=key)
# for k in b:
#     g = b[k]  # meh? should maybe sort
#     print(k, ilen(g))
print(Counter(map(key, e)))

View file

@ -458,3 +458,16 @@ def guess_datetime(x: Any) -> Optional[datetime]:
if isinstance(v, datetime): if isinstance(v, datetime):
return v return v
return None return None
def asdict(thing: Any) -> Json:
    """
    Convert a record-like object into a dict.

    Supported inputs:
    - dict: returned as is (the very same object, not a copy)
    - dataclass instance: converted via dataclasses.asdict
    - anything exposing a callable ``_asdict`` (e.g. a NamedTuple): result of ``_asdict()``

    Raises TypeError for anything else, instead of failing with an opaque
    AttributeError on the missing ``_asdict`` attribute.
    """
    # todo primitive?
    # todo exception?
    if isinstance(thing, dict):
        return thing
    import dataclasses as D
    # is_dataclass() is also true for dataclass *classes*, which D.asdict() rejects,
    # so only take this branch for actual instances
    if D.is_dataclass(thing) and not isinstance(thing, type):
        return D.asdict(thing)
    # duck-typed check rather than isinstance(thing, tuple), so that any object
    # which previously worked through its _asdict() method keeps working
    to_dict = getattr(thing, '_asdict', None)
    if callable(to_dict):
        return to_dict()
    raise TypeError(f'Could not convert object {thing!r} to dict')

View file

@ -1,6 +1,8 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
''' '''
Default logger is a bit, see 'test'/run this file for a demo Default logger is a bit meh, see 'test'/run this file for a demo
TODO name 'klogging' to avoid possible conflict with default 'logging' module
TODO shit. too late already? maybe use fallback & deprecate
''' '''
def test() -> None: def test() -> None:

View file

@ -6,7 +6,7 @@ Various pandas helpers and convenience functions
from datetime import datetime from datetime import datetime
from pprint import pformat from pprint import pformat
from typing import Optional, TYPE_CHECKING, Any, Iterable from typing import Optional, TYPE_CHECKING, Any, Iterable
from . import warnings from . import warnings, Res
from .common import LazyLogger from .common import LazyLogger
logger = LazyLogger(__name__) logger = LazyLogger(__name__)
@ -109,3 +109,19 @@ def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Dict[str, Any]:
'error': estr, 'error': estr,
dt_col : edt, dt_col : edt,
} }
# todo add proper types
@check_dataframe
def as_dataframe(it: Iterable[Res[Any]]) -> DataFrameT:
    """
    Build a dataframe from an iterable of results, one row per item.

    Items that are Exception instances become rows via error_to_row;
    every other item is converted with asdict.
    """
    # pandas supports dataclass/NT natively
    #   https://github.com/pandas-dev/pandas/pull/27999
    # but it dispatches dataclass based on the first entry...
    #   https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
    # same for NamedTuple -- seems that it takes whatever schema the first NT has
    # so each item is converted individually... sigh
    from .common import asdict

    def rows():
        for item in it:
            if isinstance(item, Exception):
                yield error_to_row(item)
            else:
                yield asdict(item)

    # TODO just add tests for it?
    import pandas as pd
    return pd.DataFrame(rows())

View file

@ -12,7 +12,6 @@ from typing import Sequence, Iterable
from .core import get_files, LazyLogger from .core import get_files, LazyLogger
from .core.common import mcachew from .core.common import mcachew
from .core.error import Res, split_errors from .core.error import Res, split_errors
from .core.pandas import check_dataframe as cdf, DataFrameT
from my.config import rescuetime as config from my.config import rescuetime as config
@ -29,7 +28,7 @@ DAL = dal.DAL
Entry = dal.Entry Entry = dal.Entry
@mcachew(hashf=lambda: inputs()) @mcachew(depends_on=lambda: inputs())
def entries() -> Iterable[Res[Entry]]: def entries() -> Iterable[Res[Entry]]:
dal = DAL(inputs()) dal = DAL(inputs())
it = dal.entries() it = dal.entries()
@ -44,17 +43,10 @@ def groups(gap: timedelta=timedelta(hours=3)) -> Iterable[Res[Sequence[Entry]]]:
yield from split_when(vit, lambda a, b: (b.dt - a.dt) > gap) yield from split_when(vit, lambda a, b: (b.dt - a.dt) > gap)
@cdf # todo automatic dataframe interface?
from .core.pandas import DataFrameT, as_dataframe
def dataframe() -> DataFrameT: def dataframe() -> DataFrameT:
import pandas as pd # type: ignore return as_dataframe(entries())
# type: ignore[call-arg, attr-defined]
def it():
for e in entries():
if isinstance(e, Exception):
yield dict(error=str(e))
else:
yield e._asdict()
return pd.DataFrame(it())
from .core import stat, Stats from .core import stat, Stats
@ -89,6 +81,8 @@ def fake_data(rows: int=1000) -> Iterator[None]:
# todo not sure if I want to keep these here? vvv # todo not sure if I want to keep these here? vvv
# guess should move to core? or to 'ext' module, i.e. interfaces?
# make automatic
def fill_influxdb(): def fill_influxdb():
from influxdb import InfluxDBClient # type: ignore from influxdb import InfluxDBClient # type: ignore
client = InfluxDBClient() client = InfluxDBClient()
@ -106,3 +100,5 @@ def fill_influxdb():
} for e in vit] } for e in vit]
client.write_points(jsons, database=db) # TODO?? client.write_points(jsons, database=db) # TODO??
# TODO lots of garbage in dir()? maybe need to del the imports...