my.body.exercise: cleanup & error handling for merging cross trainer stuff

2020-09-13 23:02:17 +01:00 · 2020-09-13 23:02:17 +01:00 · 1ca2d116ec
commit 1ca2d116ec
parent 0b947e7d14
1 changed files with 99 additions and 39 deletions
--- a/my/body/exercise.py
+++ b/my/body/exercise.py
@ -6,45 +6,71 @@ For now it's worth keeping it here as an example and perhaps utility functions m
 '''

 from datetime import datetime, timedelta
+from typing import Optional

 from my.config import exercise as config


-# todo predataframe?? entries??
-def cross_trainer_data():
-    # FIXME manual entries
-
-    from porg import Org
-    # TODO FIXME should use all org notes and just query from them?
-    wlog = Org.from_file(config.workout_log)
-    cross_table = wlog.xpath('//org[heading="Cross training"]//table')
-    return cross_table.lines
-    # todo hmm, converting an org table directly to pandas kinda makes sense?
-    # could have a '.dataframe' method in orgparse, optional dependency
-
-
 import pytz
 # FIXME how to attach it properly?
 tz = pytz.timezone('Europe/London')

+def tzify(d: datetime) -> datetime:
+    assert d.tzinfo is None, d
+    return tz.localize(d)
+
+
+# todo predataframe?? entries??
+def cross_trainer_data():
+    # FIXME some manual entries in python
+    # I guess just convert them to org
+
+    from porg import Org
+    # FIXME should use all org notes and just query from them?
+    wlog = Org.from_file(config.workout_log)
+    cross_table = wlog.xpath('//org[heading="Cross training"]//table')
+
+    def maybe(f):
+        def parse(s):
+            if len(s) == 0:
+                return None
+            return f(s)
+        return parse
+
+    def parse_mm_ss(x: str) -> timedelta:
+        hs, ms = x.split(':')
+        return timedelta(seconds=int(hs) * 60 + int(ms))
+
+    # todo eh. not sure if there is a way of getting around writing code...
+    # I guess would be nice to have a means of specifying type in the column? maybe multirow column names??
+    # need to look up org-mode standard..
+    from ..core.orgmode import parse_org_datetime
+    mappers = {
+        'duration': lambda s: parse_mm_ss(s),
+        'date'    : lambda s: tzify(parse_org_datetime(s)),
+    }
+    for row in cross_table.lines:
+        # todo make more defensive, fallback on nan for individual fields??
+        try:
+            d = {}
+            for k, v in row.items():
+                mapper = mappers.get(k, maybe(float))
+                d[k] = mapper(v)
+            yield d
+        except Exception as e:
+            # todo add parsing context
+            yield {'error': str(e)}
+
+    # todo hmm, converting an org table directly to pandas kinda makes sense?
+    # could have a '.dataframe' method in orgparse, optional dependency
+
+
 def cross_trainer_manual_dataframe():
    '''
    Only manual org-mode entries
    '''
    import pandas as pd
    df = pd.DataFrame(cross_trainer_data())
-
-    from ..core.orgmode import parse_org_datetime
-    df['date'] = df['date'].apply(parse_org_datetime)
-
-    def tzify(d: datetime) -> datetime:
-        assert d.tzinfo is None, d
-        return tz.localize(d)
-
-    df['date'] = df['date'].apply(tzify)
-
-    # TODO convert duration as well
-    #
    return df


@ -58,15 +84,19 @@ def cross_trainer_dataframe():
    edf = EDF()
    edf = edf[edf['sport'].str.contains('Cross training')]

+
    # Normalise and assume single bout of exercise per day
    # TODO this could be useful for other providers..
+    # todo hmm maybe this bit is not really that necessary for this function??
+    # just let it fail further down
    grouped = edf.set_index('start_time').groupby(lambda t: t.date())
    singles = []
    for day, grp in grouped:
        if len(grp) != 1:
            # FIXME yield runtimeerror
            continue
-        singles.append(grp)
+        else:
+            singles.append(grp)
    edf = pd.concat(singles)
    edf = edf.reset_index()

@ -75,25 +105,44 @@ def cross_trainer_dataframe():
    rows = []
    idxs = []
    for i, row in mdf.iterrows():
-        # todo rename 'date'??
        mdate = row['date']
        close = edf[edf['start_time'].apply(lambda t: pd_date_diff(t, mdate)).abs() < timedelta(hours=3)]
+        idx: Optional[int]
+        rd = row.to_dict()
+        # todo in case of error, 'start date' becomes 'date'??
        if len(close) == 0:
-            # FIXME emit warning -- nothing matched
-            continue
-        if len(close) > 1:
-            # FIXME emit warning
-            continue
-        loc = close.index[0]
-        # FIXME check and make defensive
-        # assert loc not in idxs, (loc, row)
-        idxs.append(loc)
-        rows.append(row)
+            idx = None
+            d = {
+                **rd,
+                'error': 'no endomondo matches',
+            }
+        elif len(close) > 1:
+            idx = None
+            d = {
+                **rd,
+                'error': 'multiple endomondo matches',
+                # todo add info on which exactly??
+            }
+        else:
+            idx = close.index[0]
+            d = rd
+
+            if idx in idxs:
+                # todo might be a good idea to remove the original match as well?
+                idx = None
+                d = {
+                    **rd,
+                    'error': 'manual entry matched multiple times',
+                }
+        idxs.append(idx)
+        rows.append(d)
    mdf = pd.DataFrame(rows, index=idxs)

-    df = edf.join(mdf, rsuffix='_manual')
+    # todo careful about 'how'? we need it to preserve the errors
+    # maybe pd.merge is better suited for this??
+    df = edf.join(mdf, how='outer', rsuffix='_manual')
    # TODO arbitrate kcal, duration, avg hr
-    # compare power and hr?
+    # compare power and hr? add 'quality' function??
    return df


@ -102,6 +151,17 @@ def stats():
    return stat(cross_trainer_data())


+def compare_manual():
+    df = cross_trainer_dataframe()
+    df = df.set_index('start_time')
+
+    df = df[[
+        'kcal'    , 'kcal_manual',
+        'duration', 'duration_manual',
+    ]].dropna()
+    print(df.to_string())
+
+
 def pd_date_diff(a, b) -> timedelta:
    # ugh. pandas complains when we subtract timestamps in different timezones
    assert a.tzinfo is not None, a