influxdb: add helper to core + use it in bluemaestro/lastfm/rescuetime

This commit is contained in:
Dima Gerasimov 2021-02-21 20:59:03 +00:00 committed by karlicoss
parent 271cd7feef
commit bfec6b975f
5 changed files with 95 additions and 75 deletions

View file

@ -192,32 +192,8 @@ def dataframe() -> DataFrameT:
def fill_influxdb() -> None: def fill_influxdb() -> None:
from itertools import islice from .core import influxdb
from .core.common import asdict influxdb.fill(measurements(), measurement=__name__)
from influxdb import InfluxDBClient # type: ignore
client = InfluxDBClient()
db = 'db'
mname = __name__.replace('.', '_')
client.delete_series(database=db, measurement=mname)
def dissoc(d, k):
del d[k]
return d # meh
jsons = ({
'measurement': mname,
# todo maybe good idea to tags with database file/name? to inspect inconsistencies etc..
# 'tags': {'activity': e.activity},
'time': e.dt.isoformat(),
'fields': dissoc(asdict(e), 'dt'),
} for e in measurements())
from more_itertools import chunked
# "The optimal batch size is 5000 lines of line protocol."
# some chunking is def necessary, otherwise it fails
for chunk in chunked(jsons, n=5000):
cl = list(chunk)
logger.debug('writing next chunk %s', cl[-1])
client.write_points(cl, database=db)
# todo "Specify timestamp precision when writing to InfluxDB."?
def check() -> None: def check() -> None:

66
my/core/influxdb.py Normal file
View file

@ -0,0 +1,66 @@
'''
TODO doesn't really belong to 'core' morally, but can think of moving out later
'''
from typing import Iterable, Any, Optional
from .common import LazyLogger, asdict, Json
logger = LazyLogger(__name__)
class config:
    # name of the InfluxDB database that fill() writes points into
    db = 'db'
def fill(it: Iterable[Any], *, measurement: str, reset: bool=False) -> None:
    '''
    Write items from *it* into InfluxDB as points under *measurement*.

    Each item is converted with asdict(); its 'dt' attribute supplies the
    point timestamp (isoformat), an optional 'tags' attribute supplies the
    InfluxDB tags, and every remaining key becomes a field.
    If *reset* is True, the existing series for the measurement is dropped
    before writing.
    '''
    # todo infer dt column automatically, reuse in stat?
    # influx doesn't like dots in measurement names, ends up a syntax error
    measurement = measurement.replace('.', '_')
    # todo autoinfer measurement?

    db = config.db

    from influxdb import InfluxDBClient  # type: ignore
    client = InfluxDBClient()
    # todo maybe create if not exists?
    # client.create_database(db)

    # todo should it be env variable?
    if reset:
        client.delete_series(database=db, measurement=measurement)

    def dit() -> Iterable[Json]:
        for i in it:
            d = asdict(i)
            # pop so a 'tags' key never leaks into the fields dict,
            # even when its value is None (the old get+del kept it then)
            # meh... handle in a more robust manner
            tags: Optional[Json] = d.pop('tags', None)
            # TODO what to do with exceptions??
            # todo handle errors.. not sure how? maybe add tag for 'error' and fill with empty data?
            dt = d.pop('dt').isoformat()
            yield dict(
                measurement=measurement,
                # TODO maybe good idea to tag with database file/name? to inspect inconsistencies etc..
                # hmm, so tags are autoindexed and might be faster?
                # not sure what's the big difference though
                # "fields are data and tags are metadata"
                tags=tags,
                time=dt,
                fields=d,  # everything left over after dt/tags were popped
            )

    from more_itertools import chunked
    # "The optimal batch size is 5000 lines of line protocol."
    # some chunking is def necessary, otherwise it fails
    for chi in chunked(dit(), n=5000):
        chl = list(chi)
        logger.debug('writing next chunk %s', chl[-1])
        client.write_points(chl, database=db)
    # todo "Specify timestamp precision when writing to InfluxDB."?

View file

@ -2,8 +2,7 @@
Last.fm scrobbles Last.fm scrobbles
''' '''
from ..core.common import Paths from .core import Paths, dataclass
from dataclasses import dataclass
from my.config import lastfm as user_config from my.config import lastfm as user_config
@dataclass @dataclass
@ -14,7 +13,7 @@ class lastfm(user_config):
export_path: Paths export_path: Paths
from ..core.cfg import make_config from .core.cfg import make_config
config = make_config(lastfm) config = make_config(lastfm)
@ -25,7 +24,8 @@ from typing import NamedTuple, Any, Sequence, Iterable
import pytz import pytz
from ..core.common import mcachew, Json, get_files from .core.common import mcachew, Json, get_files
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
return get_files(config.export_path) return get_files(config.export_path)
@ -63,10 +63,25 @@ class Scrobble(NamedTuple):
# TODO could also be nice to make generic? maybe even depending on eagerness # TODO could also be nice to make generic? maybe even depending on eagerness
@mcachew(hashf=lambda: inputs()) @mcachew(depends_on=inputs)
def scrobbles() -> Iterable[Scrobble]: def scrobbles() -> Iterable[Scrobble]:
last = max(inputs()) last = max(inputs())
j = json.loads(last.read_text()) j = json.loads(last.read_text())
for raw in reversed(j): for raw in reversed(j):
yield Scrobble(raw=raw) yield Scrobble(raw=raw)
from .core import stat, Stats
def stats() -> Stats:
return stat(scrobbles)
def fill_influxdb() -> None:
from .core import influxdb
# todo needs to be more automatic
sd = (dict(
dt=x.dt,
track=x.track,
) for x in scrobbles())
influxdb.fill(sd, measurement=__name__)

View file

@ -1,17 +0,0 @@
#!/usr/bin/env python3
# pip install influxdb
from influxdb import InfluxDBClient # type: ignore
from my.lastfm import scrobbles
def main() -> None:
scrobbles = scrobbles()
client = InfluxDBClient()
# TODO client.create_database('lastfm')
jsons = [{"measurement": 'scrobble', "tags": {}, "time": str(sc.dt), "fields": {"name": sc.track}} for sc in scrobbles]
client.write_points(jsons, database='lastfm')
if __name__ == '__main__':
main()

View file

@ -79,34 +79,14 @@ def fake_data(rows: int=1000) -> Iterator[None]:
# todo would be kinda nice if doctor could run against the fake data, to have a basic health check of the module? # todo would be kinda nice if doctor could run against the fake data, to have a basic health check of the module?
# todo not sure if I want to keep these here? vvv
# guess should move to core? or to 'ext' module, i.e. interfaces?
# make automatic
def fill_influxdb() -> None: def fill_influxdb() -> None:
from .core.common import asdict from .core import influxdb
it = (dict(
from influxdb import InfluxDBClient # type: ignore dt=e.dt,
client = InfluxDBClient() duration_d=e.duration_s,
db = 'db' tags=dict(activity=e.activity),
measurement = __name__.replace('.', '_') ) for e in entries() if isinstance(e, Entry)) # TODO handle errors in core.influxdb
client.delete_series(database=db, measurement=measurement) influxdb.fill(it, measurement=__name__)
# client.drop_database(db)
# todo create if not exists?
# client.create_database(db)
# todo handle errors.. not sure how? maybe add tag for 'error' and fill with empty data?
vit = (e for e in entries() if isinstance(e, Entry))
jsons = ({
'measurement': measurement, # hmm, influx doesn't like dots?
# hmm, so tags are autoindexed and might be faster?
# not sure what's the big difference though
# "fields are data and tags are metadata"
'tags': {'activity': e.activity},
'time': e.dt.isoformat(),
'fields': {'duration_s': e.duration_s},
# todo asdict(e),
} for e in vit)
# todo do we need to batch?
client.write_points(jsons, database=db)
# TODO lots of garbage in dir()? maybe need to del the imports... # TODO lots of garbage in dir()? maybe need to del the imports...