influxdb: add helper to core + use it in bluemaestro/lastfm/rescuetime
This commit is contained in:
parent
271cd7feef
commit
bfec6b975f
5 changed files with 95 additions and 75 deletions
66
my/core/influxdb.py
Normal file
66
my/core/influxdb.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
'''
|
||||
TODO doesn't really belong to 'core' morally, but can think of moving out later
|
||||
'''
|
||||
from typing import Iterable, Any, Optional
|
||||
|
||||
|
||||
from .common import LazyLogger, asdict, Json
|
||||
|
||||
|
||||
logger = LazyLogger(__name__)
|
||||
|
||||
|
||||
class config:
    # Name of the InfluxDB database that fill() writes points into.
    # NOTE(review): presumably intended to be overridden by user config —
    # confirm against how other modules configure this helper.
    db = 'db'
|
||||
|
||||
|
||||
def fill(it: Iterable[Any], *, measurement: str, reset: bool=False) -> None:
    '''
    Write every item of *it* as a point into the InfluxDB database named by config.db.

    Each item is converted with asdict() and must contain a 'dt' entry with an
    .isoformat() method (used as the point timestamp). An optional 'tags' entry,
    when not None, is sent as InfluxDB tags; all remaining entries become fields.

    :param it: iterable of objects convertible via asdict()
    :param measurement: InfluxDB measurement name; dots are replaced with
                        underscores since they trigger syntax errors server-side
    :param reset: when True, drop the existing series for this measurement first
    '''
    # todo infer dt column automatically, reuse in stat?
    # it doesn't like dots, ends up some syntax error?
    measurement = measurement.replace('.', '_')
    # todo autoinfer measurement?

    db = config.db

    from influxdb import InfluxDBClient  # type: ignore
    client = InfluxDBClient()
    # todo maybe create if not exists?
    # client.create_database(db)

    # todo should it be an env variable?
    if reset:
        client.delete_series(database=db, measurement=measurement)

    def dit() -> Iterable[Json]:
        for i in it:
            d = asdict(i)
            # pull 'tags' out of the fields when present and non-None
            tags: Optional[Json] = d.get('tags')  # meh... handle in a more robust manner
            if tags is not None:
                del d['tags']

            # TODO what to do with exceptions??
            # todo handle errors.. not sure how? maybe add tag for 'error' and fill with empty data?
            dt = d.pop('dt').isoformat()
            yield dict(
                measurement=measurement,
                # TODO maybe good idea to tag with database file/name? to inspect inconsistencies etc..
                # hmm, so tags are autoindexed and might be faster?
                # not sure what's the big difference though
                # "fields are data and tags are metadata"
                tags=tags,
                time=dt,
                fields=d,
            )

    from more_itertools import chunked
    # "The optimal batch size is 5000 lines of line protocol."
    # some chunking is def necessary, otherwise it fails
    for chi in chunked(dit(), n=5000):
        chl = list(chi)
        # chunked never yields an empty chunk, so chl[-1] is safe here
        logger.debug('writing next chunk %s', chl[-1])
        client.write_points(chl, database=db)
    # todo "Specify timestamp precision when writing to InfluxDB."?
|
Loading…
Add table
Add a link
Reference in a new issue