bluemaestro: investigation of data quality + more sanity checks

This commit is contained in:
Dima Gerasimov 2021-02-14 21:38:26 +00:00 committed by karlicoss
parent 746c3da0ca
commit 1899b006de
3 changed files with 77 additions and 37 deletions

25
misc/rescuetime_cleanup.py → misc/repl.py Normal file → Executable file
View file

@ -1,3 +1,5 @@
#!/usr/bin/env python3
# M-x run-python (raise window so it doesn't hide) # M-x run-python (raise window so it doesn't hide)
# ?? python-shell-send-defun # ?? python-shell-send-defun
# C-c C-r python-shell-send-region # C-c C-r python-shell-send-region
@ -5,6 +7,9 @@
# maybe add hook # maybe add hook
# (setq comint-move-point-for-output t) ;; https://github.com/jorgenschaefer/elpy/issues/1641#issuecomment-528355368 # (setq comint-move-point-for-output t) ;; https://github.com/jorgenschaefer/elpy/issues/1641#issuecomment-528355368
# #
from itertools import islice, groupby
from more_itertools import ilen, bucket
from importlib import reload from importlib import reload
import sys import sys
@ -15,6 +20,21 @@ for m in todel: del sys.modules[m]
import my import my
# todo add to doc? # todo add to doc?
from my.core import get_files from my.core import get_files
import my.bluemaestro as M
from my.config import bluemaestro as BC
BC.export_path = get_files(BC.export_path)
# print(list(M.measurements())[:10])
print(M.dataframe())
ffwf
#
from my.config import rescuetime as RC from my.config import rescuetime as RC
# todo ugh. doesn't work?? # todo ugh. doesn't work??
@ -26,11 +46,6 @@ import my.rescuetime as M
# print(len(list(M.entries()))) # print(len(list(M.entries())))
M.fill_influxdb() M.fill_influxdb()
ffwf
from itertools import islice, groupby
from more_itertools import ilen, bucket
print(M.dataframe()) print(M.dataframe())
e = M.entries() e = M.entries()

View file

@ -3,33 +3,38 @@
[[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor [[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor
""" """
# todo eh, most of it belongs to DAL # todo most of it belongs to DAL... but considering so few people use it I didn't bother for now
from datetime import datetime, timedelta from datetime import datetime, timedelta
from pathlib import Path from pathlib import Path
import re import re
import sqlite3 import sqlite3
from typing import Iterable, NamedTuple, Sequence, Set, Optional from typing import Iterable, Sequence, Set, Optional
from .core import get_files, LazyLogger, dataclass
from .core.common import mcachew, LazyLogger, get_files
from .core.cachew import cache_dir
from my.config import bluemaestro as config from my.config import bluemaestro as config
logger = LazyLogger('bluemaestro', level='debug') # todo control level via env variable?
# i.e. HPI_LOGGING_MY_BLUEMAESTRO_LEVEL=debug
logger = LazyLogger(__name__, level='debug')
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
return get_files(config.export_path) return get_files(config.export_path)
class Measurement(NamedTuple): Celsius = float
dt: datetime Percent = float
temp : float # Celsius mBar = float
humidity: float # percent
pressure: float # mBar @dataclass
dewpoint: float # Celsius class Measurement:
dt: datetime # todo aware/naive
temp : Celsius
humidity: Percent
pressure: mBar
dewpoint: Celsius
# fixme: later, rely on the timezone provider # fixme: later, rely on the timezone provider
@ -39,8 +44,19 @@ tz = pytz.timezone('Europe/London')
# TODO when I change tz, check the diff # TODO when I change tz, check the diff
@mcachew(cache_path=cache_dir() / 'bluemaestro.cache') def is_bad_table(name: str) -> bool:
def measurements(dbs=inputs()) -> Iterable[Measurement]: # todo hmm would be nice to have a hook that can patch any module up to
delegate = getattr(config, 'is_bad_table', None)
return False if delegate is None else delegate(name)
from .core.cachew import cache_dir
from .core.common import mcachew
@mcachew(depends_on=lambda: inputs(), cache_path=cache_dir() / 'bluemaestro.cache')
def measurements() -> Iterable[Measurement]:
# todo ideally this would be via arguments... but needs to be lazy
dbs = inputs()
last: Optional[datetime] = None last: Optional[datetime] = None
# tables are immutable, so can save on processing.. # tables are immutable, so can save on processing..
@ -51,9 +67,17 @@ def measurements(dbs=inputs()) -> Iterable[Measurement]:
new = 0 new = 0
# todo assert increasing timestamp? # todo assert increasing timestamp?
with sqlite3.connect(f'file:{f}?immutable=1', uri=True) as db: with sqlite3.connect(f'file:{f}?immutable=1', uri=True) as db:
db_dt: Optional[datetime] = None
try: try:
datas = db.execute(f'SELECT "{f.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index') datas = db.execute(f'SELECT "{f.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index')
oldfmt = True oldfmt = True
db_dts = list(db.execute(f'SELECT last_download FROM info'))[0][0]
if db_dts == 'N/A':
# ??? happens for 20180923-20180928
continue
if db_dts.endswith(':'):
db_dts += '00' # wtf.. happens on some day
db_dt = tz.localize(datetime.strptime(db_dts, '%Y-%m-%d %H:%M:%S'))
except sqlite3.OperationalError: except sqlite3.OperationalError:
# Right, this looks really bad. # Right, this looks really bad.
# The device doesn't have internal time & what it does is: # The device doesn't have internal time & what it does is:
@ -94,24 +118,34 @@ def measurements(dbs=inputs()) -> Iterable[Measurement]:
query = f'SELECT * FROM ({query}) ORDER BY name, unix' query = f'SELECT * FROM ({query}) ORDER BY name, unix'
datas = db.execute(query) datas = db.execute(query)
oldfmt = False oldfmt = False
db_dt = None
for i, (name, tsc, temp, hum, pres, dewp) in enumerate(datas): for i, (name, tsc, temp, hum, pres, dewp) in enumerate(datas):
if is_bad_table(name):
continue
# note: bluemaestro keeps local datetime # note: bluemaestro keeps local datetime
if oldfmt: if oldfmt:
tss = tsc.replace('Juli', 'Jul').replace('Aug.', 'Aug') tss = tsc.replace('Juli', 'Jul').replace('Aug.', 'Aug')
dt = datetime.strptime(tss, '%Y-%b-%d %H:%M') dt = datetime.strptime(tss, '%Y-%b-%d %H:%M')
dt = tz.localize(dt) dt = tz.localize(dt)
assert db_dt is not None
else: else:
# todo cache?
m = re.search(r'_(\d+)_', name) m = re.search(r'_(\d+)_', name)
assert m is not None assert m is not None
export_ts = int(m.group(1)) export_ts = int(m.group(1))
edt = datetime.fromtimestamp(export_ts / 1000, tz=tz) db_dt = datetime.fromtimestamp(export_ts / 1000, tz=tz)
dt = datetime.fromtimestamp(tsc / 1000, tz=tz) dt = datetime.fromtimestamp(tsc / 1000, tz=tz)
## sanity checks (todo make defensive/configurable?) ## sanity checks (todo make defensive/configurable?)
# not sure how that happens.. but basically they'd better be excluded # not sure how that happens.. but basically they'd better be excluded
assert dt.year >= 2015, (f, name, dt) lower = timedelta(days=6000 / 24) # ugh some time ago I only did it once in an hour.. in theory can detect from meta?
upper = timedelta(days=10) # kinda arbitrary
if not (db_dt - lower < dt < db_dt + timedelta(days=10)):
# todo could be more defenive??
raise RuntimeError('timestamp too far out', f, name, db_dt, dt)
assert -60 <= temp <= 60, (f, dt, temp) assert -60 <= temp <= 60, (f, dt, temp)
## ##
@ -131,7 +165,6 @@ def measurements(dbs=inputs()) -> Iterable[Measurement]:
yield p yield p
logger.debug('%s: new %d/%d', f, new, tot) logger.debug('%s: new %d/%d', f, new, tot)
# logger.info('total items: %d', len(merged)) # logger.info('total items: %d', len(merged))
# TODO assert frequency?
# for k, v in merged.items(): # for k, v in merged.items():
# # TODO shit. quite a few of them have varying values... how is that freaking possible???? # # TODO shit. quite a few of them have varying values... how is that freaking possible????
# # most of them are within 0.5 degree though... so just ignore? # # most of them are within 0.5 degree though... so just ignore?
@ -145,27 +178,18 @@ def stats() -> Stats:
return stat(measurements) return stat(measurements)
from .core.pandas import DataFrameT, check_dataframe as cdf from .core.pandas import DataFrameT, as_dataframe
@cdf
def dataframe() -> DataFrameT: def dataframe() -> DataFrameT:
""" """
%matplotlib gtk %matplotlib gtk
from my.bluemaestro import dataframe from my.bluemaestro import dataframe
dataframe().plot() dataframe().plot()
""" """
# todo not sure why x axis time ticks are weird... df[:6269] works, whereas df[:6269] breaks... df = as_dataframe(measurements(), schema=Measurement)
# either way, plot is not the best representation for the temperature I guess.. maybe also use bokeh?
import pandas as pd # type: ignore
df = pd.DataFrame(
(p._asdict() for p in measurements()),
# todo meh. otherwise fails on empty inputs...
columns=list(Measurement._fields),
)
# todo not sure how it would handle mixed timezones?? # todo not sure how it would handle mixed timezones??
# todo hmm, not sure about setting the index
return df.set_index('dt') return df.set_index('dt')
# todo test against an older db?
def check() -> None: def check() -> None:
temps = list(measurements()) temps = list(measurements())
@ -187,6 +211,5 @@ def check() -> None:
NOW = datetime.now() NOW = datetime.now()
assert NOW - last < timedelta(hours=HOURS_STORED / 2), f'old backup! {last}' assert NOW - last < timedelta(hours=HOURS_STORED / 2), f'old backup! {last}'
assert last - prev < timedelta(minutes=3), f'bad interval! {last - prev}' assert last - prev < timedelta(minutes=3), f'bad interval! {last - prev}'
single = (last - prev).seconds single = (last - prev).seconds

View file

@ -2,6 +2,8 @@ import os
import pytest import pytest
V = 'HPI_TESTS_KARLICOSS'
skip_if_not_karlicoss = pytest.mark.skipif( skip_if_not_karlicoss = pytest.mark.skipif(
'HPI_TESTS_KARLICOSS' not in os.environ, reason='test only works on @karlicoss data for now', V not in os.environ, reason=f'test only works on @karlicoss data for now. Set evn variable {V}=true to override.',
) )