From 21e82f0cd66767fcaae3206f172049fb459eccd5 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 24 Apr 2020 15:19:31 +0100 Subject: [PATCH 1/6] add disable_cachew helper --- my/core/cachew.py | 29 +++++++++++++++++++++++++++++ my/core/time.py | 2 +- my/takeout.py | 17 ++++++----------- 3 files changed, 36 insertions(+), 12 deletions(-) create mode 100644 my/core/cachew.py diff --git a/my/core/cachew.py b/my/core/cachew.py new file mode 100644 index 0000000..551527a --- /dev/null +++ b/my/core/cachew.py @@ -0,0 +1,29 @@ +''' +# TODO this probably belongs to cachew? or cachew.experimental +''' +from contextlib import contextmanager + + +def disable_cachew(): + ''' + NOTE: you need to use it before importing any function using @cachew.cachew + ''' + # TODO not sure... maybe it should instead use some hook.. it's a bit ugly do + import cachew + + @cachew.doublewrap + def cachew_off(func=None, *args, **kwargs): + return func + old = cachew.cachew + cachew.cachew = cachew_off + return old + + +@contextmanager +def disabled_cachew(): + import cachew + old = disable_cachew() + try: + yield + finally: + cachew.cachew = old diff --git a/my/core/time.py b/my/core/time.py index d34ebf8..2c642d6 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -11,6 +11,6 @@ tz_lookup = { tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu... -@lru_cache(-1) +@lru_cache(None) def abbr_to_timezone(abbr: str): return tz_lookup[abbr] diff --git a/my/takeout.py b/my/takeout.py index 64dbcda..592f439 100644 --- a/my/takeout.py +++ b/my/takeout.py @@ -2,30 +2,25 @@ from pathlib import Path from typing import Optional from .common import get_files +from .kython.kompress import kopen, kexists from my.config import google as config -from .kython.kompress import kopen - def get_last_takeout(*, path: Optional[str]=None) -> Path: """ Ok, sometimes google splits takeout into two zip archives I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine.. 
""" + # TODO FIXME zip is not great.. + # allow a lambda expression? that way the user could restrict it for takeout in reversed(get_files(config.takeout_path, glob='*.zip')): - if path is None: + if path is None or kexists(takeout, path): return takeout else: - try: - kopen(takeout, path) - return takeout - except: - # TODO eh, a bit horrible, but works for now.. - # TODO move ot kompress? 'kexists'? - continue + continue raise RuntimeError(f'Not found: {path}') -# TODO might be a good idea to merge across multiple taekouts... +# TODO might be a good idea to merge across multiple takeouts... # perhaps even a special takeout module that deals with all of this automatically? # e.g. accumulate, filter and maybe report useless takeouts? From 60ccca52ad7ed1a64d7e051c3b9a1faa9f9fc45f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 24 Apr 2020 15:57:44 +0100 Subject: [PATCH 2/6] more takeout tweaks and comments --- my/location/takeout.py | 2 +- my/media/youtube.py | 5 ++++- my/takeout.py | 21 ++++++++++++--------- tests/takeout.py | 18 ++++++++++++++++++ 4 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 tests/takeout.py diff --git a/my/location/takeout.py b/my/location/takeout.py index 3441f73..79ad25c 100644 --- a/my/location/takeout.py +++ b/my/location/takeout.py @@ -27,7 +27,7 @@ from ..takeout import get_last_takeout from ..kython import kompress -logger = LazyLogger(__package__) +logger = LazyLogger(__name__) def cache_path(*args, **kwargs): diff --git a/my/media/youtube.py b/my/media/youtube.py index 2050be3..4e23f5b 100755 --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -20,7 +20,9 @@ class Watched(NamedTuple): def get_watched(): - path = 'Takeout/My Activity/YouTube/MyActivity.html' + # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/ + path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? 
so enough to use the last + # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json last = get_last_takeout(path=path) watches: List[Watched] = [] @@ -33,6 +35,7 @@ def get_watched(): dd = fo.read().decode('utf8') parser.feed(dd) + # TODO hmm they already come sorted.. wonder if should just rely on it.. return list(sorted(watches, key=lambda e: e.when)) diff --git a/my/takeout.py b/my/takeout.py index 592f439..26404eb 100644 --- a/my/takeout.py +++ b/my/takeout.py @@ -1,24 +1,27 @@ from pathlib import Path -from typing import Optional +from typing import Optional, Iterable from .common import get_files from .kython.kompress import kopen, kexists from my.config import google as config -def get_last_takeout(*, path: Optional[str]=None) -> Path: +def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]: """ - Ok, sometimes google splits takeout into two zip archives - I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine.. + Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need """ # TODO FIXME zip is not great.. # allow a lambda expression? that way the user could restrict it - for takeout in reversed(get_files(config.takeout_path, glob='*.zip')): + for takeout in get_files(config.takeout_path, glob='*.zip'): if path is None or kexists(takeout, path): - return takeout - else: - continue - raise RuntimeError(f'Not found: {path}') + yield takeout + + +def get_last_takeout(*, path: Optional[str]=None) -> Path: + # TODO more_itertools? + matching = list(get_takeouts(path=path)) + return matching[-1] + # TODO might be a good idea to merge across multiple takeouts... # perhaps even a special takeout module that deals with all of this automatically? 
diff --git a/tests/takeout.py b/tests/takeout.py new file mode 100644 index 0000000..d7bd3ca --- /dev/null +++ b/tests/takeout.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +from itertools import islice + +from my.core.cachew import disable_cachew +disable_cachew() + +import my.location.takeout as LT + + +def ilen(it): + # TODO more_itertools? + return len(list(it)) + + +def test_location_perf(): + # 2.80 s for 10 iterations and 10K points + # TODO try switching to jq and see how it goes? not sure.. + print(ilen(islice(LT.iter_locations(), 0, 10000))) From adadffef16263585dade3752a3d1fd33a7955690 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 24 Apr 2020 16:11:19 +0100 Subject: [PATCH 3/6] add takeout parser test --- tests/takeout.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/takeout.py b/tests/takeout.py index d7bd3ca..bbe6271 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -5,6 +5,7 @@ from my.core.cachew import disable_cachew disable_cachew() import my.location.takeout as LT +from my.kython.kompress import kopen def ilen(it): @@ -16,3 +17,23 @@ def test_location_perf(): # 2.80 s for 10 iterations and 10K points # TODO try switching to jq and see how it goes? not sure.. 
print(ilen(islice(LT.iter_locations(), 0, 10000))) + + +def test_parser(): + from my.kython.ktakeout import TakeoutHTMLParser + from my.takeout import get_last_takeout + + # 4s for parsing with HTMLParser (30K results) + path = 'Takeout/My Activity/Chrome/MyActivity.html' + tpath = get_last_takeout(path=path) + + results = [] + def cb(dt, url, title): + results.append((dt, url, title)) + + parser = TakeoutHTMLParser(cb) + + with kopen(tpath, path) as fo: + dd = fo.read().decode('utf8') + parser.feed(dd) + print(len(results)) From 810fe218393491749121fe29f4dc798f7ca9a117 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 24 Apr 2020 16:35:20 +0100 Subject: [PATCH 4/6] attempt to use xmllint to speed up takeout parsing --- tests/takeout.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/takeout.py b/tests/takeout.py index bbe6271..6f7c8d8 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -37,3 +37,24 @@ def test_parser(): dd = fo.read().decode('utf8') parser.feed(dd) print(len(results)) + + +def parse_takeout_xmllint(data: str): + # without xmllint (splitting by '
Date: Fri, 24 Apr 2020 18:10:33 +0100 Subject: [PATCH 6/6] more takeout to a separate subpackage --- my/{kython/ktakeout.py => google/takeout/html.py} | 4 ++-- my/{takeout.py => google/takeout/paths.py} | 5 ++--- my/location/takeout.py | 2 +- my/media/youtube.py | 4 ++-- tests/takeout.py | 4 ++-- 5 files changed, 9 insertions(+), 10 deletions(-) rename my/{kython/ktakeout.py => google/takeout/html.py} (98%) rename my/{takeout.py => google/takeout/paths.py} (89%) diff --git a/my/kython/ktakeout.py b/my/google/takeout/html.py similarity index 98% rename from my/kython/ktakeout.py rename to my/google/takeout/html.py index 30688e3..2fccee9 100644 --- a/my/kython/ktakeout.py +++ b/my/google/takeout/html.py @@ -8,7 +8,7 @@ from collections import OrderedDict from urllib.parse import unquote import pytz -from ..core.time import abbr_to_timezone +from ...core.time import abbr_to_timezone # Mar 8, 2018, 5:14:40 PM _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p" @@ -126,7 +126,7 @@ class TakeoutHTMLParser(HTMLParser): def read_html(tpath: Path, file: str) -> Iterable[Parsed]: - from .kompress import kopen + from ...kython.kompress import kopen results: List[Parsed] = [] def cb(dt: datetime, url: Url, title: Title) -> None: results.append((dt, url, title)) diff --git a/my/takeout.py b/my/google/takeout/paths.py similarity index 89% rename from my/takeout.py rename to my/google/takeout/paths.py index e38e493..312e2f4 100644 --- a/my/takeout.py +++ b/my/google/takeout/paths.py @@ -1,9 +1,8 @@ from pathlib import Path from typing import Optional, Iterable -from .common import get_files -from .kython.kompress import kopen, kexists -from .kython.ktakeout import read_html +from ...common import get_files +from ...kython.kompress import kopen, kexists from my.config import google as config diff --git a/my/location/takeout.py b/my/location/takeout.py index 79ad25c..da53664 100644 --- a/my/location/takeout.py +++ b/my/location/takeout.py @@ -23,7 +23,7 @@ except: import ijson # type: ignore 
from ..common import get_files, LazyLogger, mcachew -from ..takeout import get_last_takeout +from ..google.takeout.paths import get_last_takeout from ..kython import kompress diff --git a/my/media/youtube.py b/my/media/youtube.py index 6331190..ffe2740 100755 --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -2,8 +2,8 @@ from datetime import datetime from typing import NamedTuple, List -from ..kython.ktakeout import read_html -from ..takeout import get_last_takeout +from ..google.takeout.html import read_html +from ..google.takeout.paths import get_last_takeout class Watched(NamedTuple): diff --git a/tests/takeout.py b/tests/takeout.py index 6acca9b..918582f 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -32,8 +32,8 @@ import pytest # type: ignore ) def test_parser(path: str): path = 'Takeout/' + path - from my.kython.ktakeout import read_html - from my.takeout import get_last_takeout + from my.google.takeout.html import read_html + from my.google.takeout.paths import get_last_takeout tpath = get_last_takeout(path=path)