"""
[[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor
"""

# todo: most of this belongs in the DAL... but considering how few people use it, I didn't bother for now
import re
import sqlite3
from abc import abstractmethod
from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Iterable, Optional, Protocol, Sequence, Set

import pytz

from my.core import (
Paths,
Res,
Stats,
get_files,
make_logger,
stat,
unwrap,
)
from my.core.cachew import mcachew
from my.core.pandas import DataFrameT, as_dataframe
from my.core.sqlite import sqlite_connect_immutable


class config(Protocol):
@property
@abstractmethod
def export_path(self) -> Paths:
raise NotImplementedError

    @property
def tz(self) -> pytz.BaseTzInfo:
# fixme: later, rely on the timezone provider
# NOTE: the timezone should be set with respect to the export date!!!
return pytz.timezone('Europe/London')
# TODO when I change tz, check the diff


def make_config() -> config:
from my.config import bluemaestro as user_config
class combined_config(user_config, config): ...
return combined_config()


logger = make_logger(__name__)


def inputs() -> Sequence[Path]:
cfg = make_config()
return get_files(cfg.export_path)


Celsius = float
Percent = float
mBar = float


@dataclass
class Measurement:
dt: datetime # todo aware/naive
temp: Celsius
humidity: Percent
pressure: mBar
dewpoint: Celsius


def is_bad_table(name: str) -> bool:
# todo hmm would be nice to have a hook that can patch any module up to
    # look the hook up on the user config instance -- the Protocol class itself never defines it
    delegate = getattr(make_config(), 'is_bad_table', None)
return False if delegate is None else delegate(name)
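
# For illustration, a user config for this module might look something like the
# following (a sketch -- the path and table name are hypothetical; see
# https://github.com/karlicoss/HPI for how my.config is set up):
#
#     # in my/config.py
#     class bluemaestro:
#         export_path = '/backups/bluemaestro/*.db'
#
#         @staticmethod
#         def is_bad_table(name: str) -> bool:
#             # exclude a table known to contain garbage readings
#             return name == 'sensor_1598580000000_log'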


@mcachew(depends_on=inputs)
def measurements() -> Iterable[Res[Measurement]]:
cfg = make_config()
tz = cfg.tz
# todo ideally this would be via arguments... but needs to be lazy
paths = inputs()
total = len(paths)
width = len(str(total))
last: Optional[datetime] = None
    # tables are immutable, so we can save on processing by remembering which ones we've already seen
processed_tables: Set[str] = set()
for idx, path in enumerate(paths):
logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
tot = 0
new = 0
# todo assert increasing timestamp?
with sqlite_connect_immutable(path) as db:
db_dt: Optional[datetime] = None
try:
datas = db.execute(
f'SELECT "{path.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index'
)
oldfmt = True
[(db_dts,)] = db.execute('SELECT last_download FROM info')
if db_dts == 'N/A':
# ??? happens for 20180923-20180928
continue
if db_dts.endswith(':'):
                    db_dts += '00'  # wtf.. happens on some days
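                    # i.e. '2018-09-28 17:15:' -> '2018-09-28 17:15:00'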
db_dt = tz.localize(datetime.strptime(db_dts, '%Y-%m-%d %H:%M:%S'))
except sqlite3.OperationalError:
                # Right, this looks really bad.
                # The device doesn't keep proper internal time & what it does is:
                # 1. every X seconds, record a datapoint, store it in the internal memory
                # 2. on sync, take the phone's datetime ('now') and then ASSIGN the timestamps to the collected data
                #    as now, now - X, now - 2X, etc
                #
                # that basically means that, for example, hourly timestamps are completely useless, since their error can be up to 1h
                # yep, confirmed on some historic exports. seriously, what the fuck???
                #
                # The device _does_ have an internal clock, but it's basically reset to 0 every time you update settings
                # So, e.g. if at 17:15 you set the interval to 3600, the 'real' timestamps would be
                # 17:15, 18:15, 19:15, etc
                # But depending on when you export, you might get
                # 17:35, 18:35, 19:35; or 17:55, 18:55, 19:55, etc
                # basically all you're guaranteed is that the 'correct' timestamp is within one interval of the reported one
                # it doesn't seem to keep the reference time in the database
                #
                # UPD: fucking hell, so you *can* set the reference date in the settings (calcReferenceUnix field in the meta db),
                # but it's not set by default.
log_tables = [c[0] for c in db.execute('SELECT name FROM sqlite_sequence WHERE name LIKE "%_log"')]
log_tables = [t for t in log_tables if t not in processed_tables]
processed_tables |= set(log_tables)
# todo use later?
frequencies = [list(db.execute(f'SELECT interval from {t.replace("_log", "_meta")}'))[0][0] for t in log_tables] # noqa: RUF015
# todo could just filter out the older datapoints?? dunno.
# eh. a bit horrible, but seems the easiest way to do it?
                # note: for some reason everything in the new tables is multiplied by 10
query = ' UNION '.join(
f'SELECT "{t}" AS name, unix, tempReadings / 10.0, humiReadings / 10.0, pressReadings / 10.0, dewpReadings / 10.0 FROM {t}'
for t in log_tables
)
if len(log_tables) > 0: # ugh. otherwise end up with syntax error..
query = f'SELECT * FROM ({query}) ORDER BY name, unix'
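                    # e.g. with hypothetical tables x_1598580000000_log and y_1598666400000_log this builds:
                    #   SELECT * FROM (
                    #     SELECT "x_1598580000000_log" AS name, unix, ... FROM x_1598580000000_log
                    #     UNION
                    #     SELECT "y_1598666400000_log" AS name, unix, ... FROM y_1598666400000_log
                    #   ) ORDER BY name, unix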
datas = db.execute(query)
oldfmt = False
db_dt = None
for (name, tsc, temp, hum, pres, dewp) in datas:
if is_bad_table(name):
continue
# note: bluemaestro keeps local datetime
if oldfmt:
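                    # month names are sometimes spelled oddly (e.g. 'Juli', 'Aug.'), presumably
                    # depending on the phone's locale -- normalize them before parsing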
tss = tsc.replace('Juli', 'Jul').replace('Aug.', 'Aug')
dt = datetime.strptime(tss, '%Y-%b-%d %H:%M')
dt = tz.localize(dt)
assert db_dt is not None
else:
# todo cache?
m = re.search(r'_(\d+)_', name)
assert m is not None
export_ts = int(m.group(1))
db_dt = datetime.fromtimestamp(export_ts / 1000, tz=tz)
dt = datetime.fromtimestamp(tsc / 1000, tz=tz)
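                    # (both are unix epoch in milliseconds, e.g. 1598580000000 -> 2020-08-28 03:00:00+01:00 in Europe/London)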
## sanity checks (todo make defensive/configurable?)
# not sure how that happens.. but basically they'd better be excluded
                lower = timedelta(days=6000 / 24)  # ugh, at some point I only recorded once an hour.. in theory could detect this from meta?
                upper = timedelta(days=10)  # kinda arbitrary
                if not (db_dt - lower < dt < db_dt + upper):
                    # todo could be more defensive??
                    yield RuntimeError('timestamp too far out', path, name, db_dt, dt)
continue
# err.. sometimes my values are just interleaved with these for no apparent reason???
if (temp, hum, pres, dewp) == (-144.1, 100.0, 1152.5, -144.1):
yield RuntimeError('the weird sensor bug')
continue
assert -60 <= temp <= 60, (path, dt, temp)
##
tot += 1
if last is not None and last >= dt:
continue
# todo for performance, pass 'last' to sqlite instead?
last = dt
new += 1
p = Measurement(
dt=dt,
temp=temp,
pressure=pres,
humidity=hum,
dewpoint=dewp,
)
yield p
logger.debug(f'{path}: new {new}/{tot}')
# logger.info('total items: %d', len(merged))
# for k, v in merged.items():
# # TODO shit. quite a few of them have varying values... how is that freaking possible????
# # most of them are within 0.5 degree though... so just ignore?
# if isinstance(v, set) and len(v) > 1:
# print(k, v)
# for k, v in merged.items():
# yield Point(dt=k, temp=v) # meh?


def stats() -> Stats:
return stat(measurements)


def dataframe() -> DataFrameT:
"""
%matplotlib gtk
from my.bluemaestro import dataframe
dataframe().plot()
"""
df = as_dataframe(measurements(), schema=Measurement)
# todo not sure how it would handle mixed timezones??
# todo hmm, not sure about setting the index
return df.set_index('dt')


def fill_influxdb() -> None:
from my.core import influxdb
influxdb.fill(measurements(), measurement=__name__)


def check() -> None:
temps = list(measurements())
    latest = temps[-2:]  # the two most recent measurements
    prev = unwrap(latest[-2]).dt
    last = unwrap(latest[-1]).dt
# todo stat should expose a dataclass?
# TODO ugh. might need to warn about points past 'now'??
# the default shouldn't allow points in the future...
#
# TODO also needs to be filtered out on processing, should be rejected on the basis of export date?
POINTS_STORED = 6000 # on device?
FREQ_SEC = 60
SECS_STORED = POINTS_STORED * FREQ_SEC
HOURS_STORED = POINTS_STORED / (60 * 60 / FREQ_SEC) # around 4 days
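    # i.e. 6000 points / 60 points per hour = 100 hours, so the assert below allows ~50 hours of staleness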
    NOW = datetime.now(tz=last.tzinfo)  # needs to be tz-aware to compare against the measurements
    assert NOW - last < timedelta(hours=HOURS_STORED / 2), f'old backup! {last}'
assert last - prev < timedelta(minutes=3), f'bad interval! {last - prev}'
single = (last - prev).seconds
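

# A minimal smoke-test sketch (hypothetical -- normally you'd consume this module
# via `hpi query my.bluemaestro` or the functions above):
if __name__ == '__main__':
    for m in measurements():
        if isinstance(m, Exception):
            logger.warning('skipping errored datapoint: %s', m)
            continue
        print(m.dt, m.temp, m.humidity)
        break  # just show the first valid measurement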