my.body.exercise: cleanup & error handling for merging cross trainer stuff
This commit is contained in:
parent
0b947e7d14
commit
1ca2d116ec
1 changed files with 99 additions and 39 deletions
|
@ -6,45 +6,71 @@ For now it's worth keeping it here as an example and perhaps utility functions m
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from my.config import exercise as config
|
from my.config import exercise as config
|
||||||
|
|
||||||
|
|
||||||
# todo predataframe?? entries??
def cross_trainer_data():
    """Yield the raw rows of the 'Cross training' table from the org workout log."""
    # FIXME manual entries
    from porg import Org

    # TODO FIXME should use all org notes and just query from them?
    workout_log = Org.from_file(config.workout_log)
    table = workout_log.xpath('//org[heading="Cross training"]//table')
    return table.lines

# todo hmm, converting an org table directly to pandas kinda makes sense?
# could have a '.dataframe' method in orgparse, optional dependency
|
|
||||||
|
|
||||||
|
|
||||||
import pytz

# FIXME how to attach it properly?
# NOTE(review): hardcoded tz — presumably the author's home timezone; confirm.
tz = pytz.timezone('Europe/London')


def tzify(d: datetime) -> datetime:
    """Localize a naive datetime to the module timezone.

    Rejects already-aware datetimes: double-localizing would silently
    shift the instant, so we insist on naive input.
    """
    assert d.tzinfo is None, d
    return tz.localize(d)
||||||
|
|
||||||
|
|
||||||
|
# todo predataframe?? entries??
def cross_trainer_data():
    """Parse the 'Cross training' org-mode table into dicts, one per row.

    Yields a dict per table row with values coerced by column name
    ('duration' -> timedelta, 'date' -> tz-aware datetime, everything
    else -> float or None for empty cells).  A row that fails to parse
    yields ``{'error': <message>}`` instead of raising, so one bad row
    doesn't kill the whole stream.
    """
    # FIXME some manual entries in python
    # I guess just convert them to org
    from porg import Org

    # FIXME should use all org notes and just query from them?
    wlog = Org.from_file(config.workout_log)
    cross_table = wlog.xpath('//org[heading="Cross training"]//table')

    def maybe(f):
        # Wrap a parser so that empty cells map to None instead of erroring.
        def parse(s):
            if len(s) == 0:
                return None
            return f(s)
        return parse

    def parse_mm_ss(x: str) -> timedelta:
        # "MM:SS" string -> timedelta
        hs, ms = x.split(':')
        return timedelta(seconds=int(hs) * 60 + int(ms))

    # todo eh. not sure if there is a way of getting around writing code...
    # I guess would be nice to have a means of specifying type in the column? maybe multirow column names??
    # need to look up org-mode standard..
    from ..core.orgmode import parse_org_datetime
    mappers = {
        # fix: parse_mm_ss is already a unary callable — the lambda wrapper was redundant
        'duration': parse_mm_ss,
        'date'    : lambda s: tzify(parse_org_datetime(s)),
    }
    for row in cross_table.lines:
        # todo make more defensive, fallback on nan for individual fields??
        try:
            d = {}
            for k, v in row.items():
                # columns without an explicit mapper are treated as optional floats
                mapper = mappers.get(k, maybe(float))
                d[k] = mapper(v)
            yield d
        except Exception as e:
            # todo add parsing context
            yield {'error': str(e)}

# todo hmm, converting an org table directly to pandas kinda makes sense?
# could have a '.dataframe' method in orgparse, optional dependency
|
||||||
|
|
||||||
|
|
||||||
def cross_trainer_manual_dataframe():
    '''
    Only manual org-mode entries
    '''
    import pandas as pd
    # rows are the dicts produced by cross_trainer_data (including error rows)
    return pd.DataFrame(cross_trainer_data())
|
||||||
|
|
||||||
|
|
||||||
|
@ -58,15 +84,19 @@ def cross_trainer_dataframe():
|
||||||
edf = EDF()
|
edf = EDF()
|
||||||
edf = edf[edf['sport'].str.contains('Cross training')]
|
edf = edf[edf['sport'].str.contains('Cross training')]
|
||||||
|
|
||||||
|
|
||||||
# Normalise and assume single bout of exercise per day
|
# Normalise and assume single bout of exercise per day
|
||||||
# TODO this could be useful for other providers..
|
# TODO this could be useful for other providers..
|
||||||
|
# todo hmm maybe this bit is not really that necessary for this function??
|
||||||
|
# just let it fail further down
|
||||||
grouped = edf.set_index('start_time').groupby(lambda t: t.date())
|
grouped = edf.set_index('start_time').groupby(lambda t: t.date())
|
||||||
singles = []
|
singles = []
|
||||||
for day, grp in grouped:
|
for day, grp in grouped:
|
||||||
if len(grp) != 1:
|
if len(grp) != 1:
|
||||||
# FIXME yield runtimeerror
|
# FIXME yield runtimeerror
|
||||||
continue
|
continue
|
||||||
singles.append(grp)
|
else:
|
||||||
|
singles.append(grp)
|
||||||
edf = pd.concat(singles)
|
edf = pd.concat(singles)
|
||||||
edf = edf.reset_index()
|
edf = edf.reset_index()
|
||||||
|
|
||||||
|
@ -75,25 +105,44 @@ def cross_trainer_dataframe():
|
||||||
rows = []
|
rows = []
|
||||||
idxs = []
|
idxs = []
|
||||||
for i, row in mdf.iterrows():
|
for i, row in mdf.iterrows():
|
||||||
# todo rename 'date'??
|
|
||||||
mdate = row['date']
|
mdate = row['date']
|
||||||
close = edf[edf['start_time'].apply(lambda t: pd_date_diff(t, mdate)).abs() < timedelta(hours=3)]
|
close = edf[edf['start_time'].apply(lambda t: pd_date_diff(t, mdate)).abs() < timedelta(hours=3)]
|
||||||
|
idx: Optional[int]
|
||||||
|
rd = row.to_dict()
|
||||||
|
# todo in case of error, 'start date' becomes 'date'??
|
||||||
if len(close) == 0:
|
if len(close) == 0:
|
||||||
# FIXME emit warning -- nothing matched
|
idx = None
|
||||||
continue
|
d = {
|
||||||
if len(close) > 1:
|
**rd,
|
||||||
# FIXME emit warning
|
'error': 'no endomondo matches',
|
||||||
continue
|
}
|
||||||
loc = close.index[0]
|
elif len(close) > 1:
|
||||||
# FIXME check and make defensive
|
idx = None
|
||||||
# assert loc not in idxs, (loc, row)
|
d = {
|
||||||
idxs.append(loc)
|
**rd,
|
||||||
rows.append(row)
|
'error': 'multiple endomondo matches',
|
||||||
|
# todo add info on which exactly??
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
idx = close.index[0]
|
||||||
|
d = rd
|
||||||
|
|
||||||
|
if idx in idxs:
|
||||||
|
# todo might be a good idea to remove the original match as well?
|
||||||
|
idx = None
|
||||||
|
d = {
|
||||||
|
**rd,
|
||||||
|
'error': 'manual entry matched multiple times',
|
||||||
|
}
|
||||||
|
idxs.append(idx)
|
||||||
|
rows.append(d)
|
||||||
mdf = pd.DataFrame(rows, index=idxs)
|
mdf = pd.DataFrame(rows, index=idxs)
|
||||||
|
|
||||||
df = edf.join(mdf, rsuffix='_manual')
|
# todo careful about 'how'? we need it to preserve the errors
|
||||||
|
# maybe pd.merge is better suited for this??
|
||||||
|
df = edf.join(mdf, how='outer', rsuffix='_manual')
|
||||||
# TODO arbitrate kcal, duration, avg hr
|
# TODO arbitrate kcal, duration, avg hr
|
||||||
# compare power and hr?
|
# compare power and hr? add 'quality' function??
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
@ -102,6 +151,17 @@ def stats():
|
||||||
return stat(cross_trainer_data())
|
return stat(cross_trainer_data())
|
||||||
|
|
||||||
|
|
||||||
|
def compare_manual():
    """Print manual vs. endomondo kcal/duration side by side for eyeballing."""
    df = cross_trainer_dataframe()
    columns = [
        'kcal', 'kcal_manual',
        'duration', 'duration_manual',
    ]
    # only keep rows where both sources have values, indexed by workout start
    comparison = df.set_index('start_time')[columns].dropna()
    print(comparison.to_string())
|
||||||
|
|
||||||
|
|
||||||
def pd_date_diff(a, b) -> timedelta:
|
def pd_date_diff(a, b) -> timedelta:
|
||||||
# ugh. pandas complains when we subtract timestamps in different timezones
|
# ugh. pandas complains when we subtract timestamps in different timezones
|
||||||
assert a.tzinfo is not None, a
|
assert a.tzinfo is not None, a
|
||||||
|
|
Loading…
Add table
Reference in a new issue