HPI/my/body/exercise/cross_trainer.py

'''
My cross trainer exercise data, arbitrated from different sources (mainly, Endomondo and manual text notes)

This is probably too specific to my needs, so later I will move it away to a personal 'layer'.
For now it's worth keeping it here as an example and perhaps utility functions might be useful for other HPI modules.
'''

from datetime import datetime, timedelta
from typing import Optional

from ...core.pandas import DataFrameT, check_dataframe as cdf
from ...core.orgmode import collect, Table, parse_org_datetime, TypedTable

from my.config import exercise as config


import pytz
# FIXME how to attach it properly?
tz = pytz.timezone('Europe/London')

def tzify(d: datetime) -> datetime:
    assert d.tzinfo is None, d
    return tz.localize(d)


# todo predataframe?? entries??
def cross_trainer_data():
    # FIXME some manual entries in python
    # I guess just convert them to org
    import orgparse
    # todo should use all org notes and just query from them?
    wlog = orgparse.load(config.workout_log)

    [table] = collect(
        wlog,
        lambda n: [] if n.heading != 'Cross training' else [x for x in n.body_rich if isinstance(x, Table)]
    )
    cross_table = TypedTable(table)

    def maybe(f):
        def parse(s):
            if len(s) == 0:
                return None
            return f(s)
        return parse

    def parse_mm_ss(x: str) -> timedelta:
        hs, ms = x.split(':')
        return timedelta(seconds=int(hs) * 60 + int(ms))

    # todo eh. not sure if there is a way of getting around writing code...
    # I guess would be nice to have a means of specifying type in the column? maybe multirow column names??
    # need to look up org-mode standard..
    mappers = {
        'duration': lambda s: parse_mm_ss(s),
        'date'    : lambda s: tzify(parse_org_datetime(s)),
        'comment' : str,
    }
    for row in cross_table.as_dicts:
        # todo make more defensive, fallback on nan for individual fields??
        try:
            d = {}
            for k, v in row.items():
                # todo have something smarter... e.g. allow pandas to infer the type??
                mapper = mappers.get(k, maybe(float))
                d[k] = mapper(v) # type: ignore[operator]
            yield d
        except Exception as e:
            # todo add parsing context
            yield {'error': str(e)}

    # todo hmm, converting an org table directly to pandas kinda makes sense?
    # could have a '.dataframe' method in orgparse, optional dependency


@cdf
def cross_trainer_manual_dataframe() -> DataFrameT:
    '''
    Only manual org-mode entries
    '''
    import pandas as pd
    df = pd.DataFrame(cross_trainer_data())
    return df

# this should be enough?..
_DELTA = timedelta(hours=10)

# todo check error handling by introducing typos (e.g. especially dates) in org-mode
@cdf
def dataframe() -> DataFrameT:
    '''
    Attaches manually logged data (which Endomondo can't capture) and attaches it to Endomondo
    '''
    import pandas as pd

    from ...endomondo import dataframe as EDF
    edf = EDF()
    edf = edf[edf['sport'].str.contains('Cross training')]

    mdf = cross_trainer_manual_dataframe()
    # TODO shit. need to always remember to split errors???
    # on the other hand, dfs are always untyped. so it's not too bad??
    # now for each manual entry, find a 'close enough' endomondo entry
    # ideally it's a 1-1 (or 0-1) relationship, but there might be errors
    rows = []
    idxs = [] # type: ignore[var-annotated]
    NO_ENDOMONDO = 'no endomondo matches'
    for i, row in mdf.iterrows():
        rd = row.to_dict()
        mdate = row['date']
        if pd.isna(mdate):
            # todo error handling got to be easier. seriously, mypy friendly dataframes would be amazing
            idxs.append(None)
            rows.append(rd) # presumably has an error set
            continue

        idx: Optional[int]
        close = edf[edf['start_time'].apply(lambda t: pd_date_diff(t, mdate)).abs() < _DELTA]
        if len(close) == 0:
            idx = None
            d = {
                **rd,
                'error': NO_ENDOMONDO,
            }
        elif len(close) > 1:
            idx = None
            d = {
                **rd,
                'error': f'one manual, many endomondo: {close}',
            }
        else:
            idx = close.index[0]
            d = rd

            if idx in idxs:
                # todo might be a good idea to remove the original match as well?
                idx = None
                d = {
                    **rd,
                    'error': 'one endomondo, many manual',
                }
        idxs.append(idx)
        rows.append(d)
    mdf = pd.DataFrame(rows, index=idxs)

    # todo careful about 'how'? we need it to preserve the errors
    # maybe pd.merge is better suited for this??
    df = edf.join(mdf, how='outer', rsuffix='_manual')
    # todo reindex? so we don't have Nan leftovers

    # todo set date anyway? maybe just squeeze into the index??
    noendo = df['error'] == NO_ENDOMONDO
    # meh. otherwise the column type ends up object
    tz = df[noendo]['start_time'].dtype.tz
    df.loc[noendo, 'start_time'    ] = df[noendo]['date'           ].dt.tz_convert(tz)
    df.loc[noendo, 'duration'      ] = df[noendo]['duration_manual']
    df.loc[noendo, 'heart_rate_avg'] = df[noendo]['hr_avg'         ]

    # todo set sport?? set source?
    return df
# TODO arbitrate kcal, duration, avg hr
# compare power and hr? add 'quality' function??
# TODO wtf?? where is speed coming from??


from ...core import stat, Stats
def stats() -> Stats:
    return stat(cross_trainer_data)


def compare_manual() -> None:
    df = dataframe()
    df = df.set_index('start_time')

    df = df[[
        'kcal'    , 'kcal_manual',
        'duration', 'duration_manual',
    ]].dropna()
    print(df.to_string())


def pd_date_diff(a, b) -> timedelta:
    # ugh. pandas complains when we subtract timestamps in different timezones
    assert a.tzinfo is not None, a
    assert b.tzinfo is not None, b
    return a.to_pydatetime() - b.to_pydatetime()