From 60ccca52ad7ed1a64d7e051c3b9a1faa9f9fc45f Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Fri, 24 Apr 2020 15:57:44 +0100
Subject: [PATCH] more takeout tweaks and comments

---
 my/location/takeout.py |  2 +-
 my/media/youtube.py    |  5 ++++-
 my/takeout.py          | 21 ++++++++++++---------
 tests/takeout.py       | 18 ++++++++++++++++++
 4 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 tests/takeout.py

diff --git a/my/location/takeout.py b/my/location/takeout.py
index 3441f73..79ad25c 100644
--- a/my/location/takeout.py
+++ b/my/location/takeout.py
@@ -27,7 +27,7 @@ from ..takeout import get_last_takeout
 from ..kython import kompress
 
 
-logger = LazyLogger(__package__)
+logger = LazyLogger(__name__)
 
 
 def cache_path(*args, **kwargs):
diff --git a/my/media/youtube.py b/my/media/youtube.py
index 2050be3..4e23f5b 100755
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@@ -20,7 +20,9 @@ class Watched(NamedTuple):
 
 
 def get_watched():
-    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    # TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
+    path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
+    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
     last = get_last_takeout(path=path)
 
     watches: List[Watched] = []
@@ -33,6 +35,7 @@ def get_watched():
         dd = fo.read().decode('utf8')
         parser.feed(dd)
 
+    # TODO hmm they already come sorted.. wonder if should just rely on it..
     return list(sorted(watches, key=lambda e: e.when))
 
 
diff --git a/my/takeout.py b/my/takeout.py
index 592f439..26404eb 100644
--- a/my/takeout.py
+++ b/my/takeout.py
@@ -1,24 +1,27 @@
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Iterable
 
 from .common import get_files
 from .kython.kompress import kopen, kexists
 
 from my.config import google as config
 
 
-def get_last_takeout(*, path: Optional[str]=None) -> Path:
+def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
     """
-    Ok, sometimes google splits takeout into two zip archives
-    I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine..
+    Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
     """
     # TODO FIXME zip is not great..
     # allow a lambda expression? that way the user could restrict it
-    for takeout in reversed(get_files(config.takeout_path, glob='*.zip')):
+    for takeout in get_files(config.takeout_path, glob='*.zip'):
         if path is None or kexists(takeout, path):
-            return takeout
-        else:
-            continue
-    raise RuntimeError(f'Not found: {path}')
+            yield takeout
+
+
+def get_last_takeout(*, path: Optional[str]=None) -> Path:
+    # TODO more_itertools?
+    matching = list(get_takeouts(path=path))
+    return matching[-1]
+
 # TODO might be a good idea to merge across multiple takeouts...
 # perhaps even a special takeout module that deals with all of this automatically?
diff --git a/tests/takeout.py b/tests/takeout.py
new file mode 100644
index 0000000..d7bd3ca
--- /dev/null
+++ b/tests/takeout.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+from itertools import islice
+
+from my.core.cachew import disable_cachew
+disable_cachew()
+
+import my.location.takeout as LT
+
+
+def ilen(it):
+    # TODO more_itertools?
+    return len(list(it))
+
+
+def test_location_perf():
+    # 2.80 s for 10 iterations and 10K points
+    # TODO try switching to jq and see how it goes? not sure..
+    print(ilen(islice(LT.iter_locations(), 0, 10000)))
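
Not part of the patch, just a rough usage sketch of the reworked my.takeout API above; it assumes my.config.google.takeout_path is configured and that at least one of the zips there contains the requested file:

    from my.takeout import get_takeouts, get_last_takeout

    # the same path the youtube module asks for
    path = 'Takeout/My Activity/YouTube/MyActivity.html'

    # every archive containing the file, in the order get_files() returns them
    takeouts = list(get_takeouts(path=path))

    # the last matching archive in that order (equivalent to the old behaviour
    # as long as get_files yields them sorted); note that an empty match now
    # surfaces as IndexError from matching[-1] instead of the old RuntimeError
    last = get_last_takeout(path=path)
    print(last)

Splitting the lookup into a generator plus a thin wrapper keeps the old single-archive behaviour while letting callers see every matching archive, which the trailing TODO about merging across takeouts would need.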