From afce09d1d44ff736129cb89357f0284bd65fbddf Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 14 Sep 2020 21:09:28 +0100 Subject: [PATCH] my.body.exercise: more consistent merging for cross trainer data --- my/body/exercise.py | 46 ++++++++++++++++++++++----------------------- my/endomondo.py | 5 ++++- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/my/body/exercise.py b/my/body/exercise.py index fed6b8b..51de626 100644 --- a/my/body/exercise.py +++ b/my/body/exercise.py @@ -8,6 +8,8 @@ For now it's worth keeping it here as an example and perhaps utility functions m from datetime import datetime, timedelta from typing import Optional +from ..core.pandas import check_dataframe as cdf + from my.config import exercise as config @@ -65,6 +67,7 @@ def cross_trainer_data(): # could have a '.dataframe' method in orgparse, optional dependency +@cdf def cross_trainer_manual_dataframe(): ''' Only manual org-mode entries @@ -74,6 +77,7 @@ def cross_trainer_manual_dataframe(): return df +@cdf def cross_trainer_dataframe(): ''' Attaches manually logged data (which Endomondo can't capture) and attaches it to Endomondo @@ -84,44 +88,28 @@ def cross_trainer_dataframe(): edf = EDF() edf = edf[edf['sport'].str.contains('Cross training')] - - # Normalise and assume single bout of exercise per day - # TODO this could be useful for other providers.. - # todo hmm maybe this bit is not really that necessary for this function?? - # just let it fail further down - grouped = edf.set_index('start_time').groupby(lambda t: t.date()) - singles = [] - for day, grp in grouped: - if len(grp) != 1: - # FIXME yield runtimeerror - continue - else: - singles.append(grp) - edf = pd.concat(singles) - edf = edf.reset_index() - mdf = cross_trainer_manual_dataframe() # now for each manual entry, find a 'close enough' endomondo entry + # ideally it's a 1-1 (or 0-1) relationship, but there might be errors rows = [] idxs = [] + NO_ENDOMONDO = 'no endomondo matches' for i, row in mdf.iterrows(): mdate = row['date'] close = edf[edf['start_time'].apply(lambda t: pd_date_diff(t, mdate)).abs() < timedelta(hours=3)] idx: Optional[int] rd = row.to_dict() - # todo in case of error, 'start date' becomes 'date'?? if len(close) == 0: idx = None d = { **rd, - 'error': 'no endomondo matches', + 'error': NO_ENDOMONDO, } elif len(close) > 1: idx = None d = { **rd, - 'error': 'multiple endomondo matches', - # todo add info on which exactly?? + 'error': f'one manual, many endomondo: {close}', } else: idx = close.index[0] @@ -132,7 +120,7 @@ def cross_trainer_dataframe(): idx = None d = { **rd, - 'error': 'manual entry matched multiple times', + 'error': 'one endomondo, many manual', } idxs.append(idx) rows.append(d) @@ -141,9 +129,21 @@ def cross_trainer_dataframe(): # todo careful about 'how'? we need it to preserve the errors # maybe pd.merge is better suited for this?? df = edf.join(mdf, how='outer', rsuffix='_manual') - # TODO arbitrate kcal, duration, avg hr - # compare power and hr? add 'quality' function?? + # todo reindex? so we dont' have Nan leftovers + + # todo set date anyway? maybe just squeeze into the index?? + noendo = df['error'] == NO_ENDOMONDO + # meh. otherwise the column type ends up object + tz = df[noendo]['start_time'].dtype.tz + df.loc[noendo, 'start_time' ] = df[noendo]['date' ].dt.tz_convert(tz) + df.loc[noendo, 'duration' ] = df[noendo]['duration_manual'] + df.loc[noendo, 'heart_rate_avg'] = df[noendo]['hr_avg' ] + + # todo set sport?? set source? return df +# TODO arbitrate kcal, duration, avg hr +# compare power and hr? add 'quality' function?? +# TODO wtf?? where is speed coming from?? def stats(): diff --git a/my/endomondo.py b/my/endomondo.py index 53ffcdb..df470b6 100644 --- a/my/endomondo.py +++ b/my/endomondo.py @@ -58,7 +58,10 @@ def dataframe(defensive=True): d = {'error': f'{e} {w}'} yield d import pandas as pd # type: ignore - return pd.DataFrame(it()) + df = pd.DataFrame(it()) + # pandas guesses integer, which is pointless for this field (might get coerced to float too) + df['id'] = df['id'].astype(str) + return df