diff --git a/my/core/cachew.py b/my/core/cachew.py
new file mode 100644
index 0000000..551527a
--- /dev/null
+++ b/my/core/cachew.py
@@ -0,0 +1,29 @@
+'''
+# TODO this probably belongs to cachew? or cachew.experimental
+'''
+from contextlib import contextmanager
+
+
+def disable_cachew():
+    '''
+    NOTE: you need to use it before importing any function using @cachew.cachew
+    '''
+    # TODO not sure... maybe it should instead use some hook.. it's a bit ugly though
+    import cachew
+
+    @cachew.doublewrap
+    def cachew_off(func=None, *args, **kwargs):
+        return func
+    old = cachew.cachew
+    cachew.cachew = cachew_off
+    return old
+
+
+@contextmanager
+def disabled_cachew():
+    import cachew
+    old = disable_cachew()
+    try:
+        yield
+    finally:
+        cachew.cachew = old
diff --git a/my/core/time.py b/my/core/time.py
index d34ebf8..2c642d6 100644
--- a/my/core/time.py
+++ b/my/core/time.py
@@ -11,6 +11,6 @@ tz_lookup = {
 tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
 
 
-@lru_cache(-1)
+@lru_cache(None)
 def abbr_to_timezone(abbr: str):
     return tz_lookup[abbr]
diff --git a/my/kython/ktakeout.py b/my/google/takeout/html.py
similarity index 83%
rename from my/kython/ktakeout.py
rename to my/google/takeout/html.py
index 96a3f58..2fccee9 100644
--- a/my/kython/ktakeout.py
+++ b/my/google/takeout/html.py
@@ -3,12 +3,12 @@ import re
 from pathlib import Path
 from datetime import datetime
 from html.parser import HTMLParser
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
 from collections import OrderedDict
 from urllib.parse import unquote
 
 import pytz
 
-from ..core.time import abbr_to_timezone
+from ...core.time import abbr_to_timezone
 
 # Mar 8, 2018, 5:14:40 PM
 _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
@@ -49,10 +49,15 @@ class State(Enum):
     PARSING_DATE = 3
 
 
+Url = str
+Title = str
+Parsed = Tuple[datetime, Url, Title]
+Callback = Callable[[datetime, Url, Title], None]
+
 # would be easier to use beautiful soup, but ends up in a big memory footprint..
 class TakeoutHTMLParser(HTMLParser):
-    def __init__(self, callback) -> None:
+    def __init__(self, callback: Callback) -> None:
         super().__init__()
 
         self.state: State = State.OUTSIDE
@@ -118,3 +123,16 @@
             self.state = State.OUTSIDE
             return
+
+
+def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
+    from ...kython.kompress import kopen
+    results: List[Parsed] = []
+    def cb(dt: datetime, url: Url, title: Title) -> None:
+        results.append((dt, url, title))
+    parser = TakeoutHTMLParser(callback=cb)
+    with kopen(tpath, file) as fo:
+        # TODO careful, what if it's a string already? make an asutf method?
+        data = fo.read().decode('utf8')
+        parser.feed(data)
+    return results
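
For reference, a minimal usage sketch of the new read_html helper: it opens `file` inside the takeout archive (via kopen) and yields (datetime, url, title) tuples. The archive location below is hypothetical; the inner path is one of the ones used elsewhere in this change.

    from pathlib import Path
    from my.google.takeout.html import read_html

    # hypothetical archive location -- adjust to wherever your takeout zips live
    takeout = Path('~/data/takeout-20200101.zip').expanduser()
    for dt, url, title in read_html(takeout, 'Takeout/My Activity/YouTube/MyActivity.html'):
        print(dt, url, title)
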
diff --git a/my/google/takeout/paths.py b/my/google/takeout/paths.py
new file mode 100644
index 0000000..312e2f4
--- /dev/null
+++ b/my/google/takeout/paths.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+from typing import Optional, Iterable
+
+from ...common import get_files
+from ...kython.kompress import kopen, kexists
+
+from my.config import google as config
+
+def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
+    """
+    Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
+    """
+    # TODO FIXME zip is not great..
+    # allow a lambda expression? that way the user could restrict it
+    for takeout in get_files(config.takeout_path, glob='*.zip'):
+        if path is None or kexists(takeout, path):
+            yield takeout
+
+
+def get_last_takeout(*, path: Optional[str]=None) -> Path:
+    # TODO more_itertools?
+    matching = list(get_takeouts(path=path))
+    return matching[-1]
+
+
+# TODO might be a good idea to merge across multiple takeouts...
+# perhaps even a special takeout module that deals with all of this automatically?
+# e.g. accumulate, filter and maybe report useless takeouts?
+
diff --git a/my/location/takeout.py b/my/location/takeout.py
index 3441f73..da53664 100644
--- a/my/location/takeout.py
+++ b/my/location/takeout.py
@@ -23,11 +23,11 @@ except:
     import ijson # type: ignore
 
 from ..common import get_files, LazyLogger, mcachew
-from ..takeout import get_last_takeout
+from ..google.takeout.paths import get_last_takeout
 from ..kython import kompress
 
 
-logger = LazyLogger(__package__)
+logger = LazyLogger(__name__)
 
 
 def cache_path(*args, **kwargs):
diff --git a/my/media/youtube.py b/my/media/youtube.py
index 2050be3..ffe2740 100755
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@@ -2,11 +2,8 @@
 from datetime import datetime
 from typing import NamedTuple, List
 
-# TODO ugh. reuse it in mypkg/releaste takeout parser separately?
-from ..kython.ktakeout import TakeoutHTMLParser
-
-from ..kython.kompress import kopen
-from ..takeout import get_last_takeout
+from ..google.takeout.html import read_html
+from ..google.takeout.paths import get_last_takeout
 
 
 class Watched(NamedTuple):
@@ -20,19 +17,16 @@ class Watched(NamedTuple):
 
 
 def get_watched():
-    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    # TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
+    path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
+    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
     last = get_last_takeout(path=path)
 
     watches: List[Watched] = []
-    def cb(dt, url, title):
+    for dt, url, title in read_html(last, path):
         watches.append(Watched(url=url, title=title, when=dt))
 
-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(last, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
-
+    # TODO hmm they already come sorted.. wonder if should just rely on it..
     return list(sorted(watches, key=lambda e: e.when))
diff --git a/my/takeout.py b/my/takeout.py
deleted file mode 100644
index 64dbcda..0000000
--- a/my/takeout.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from pathlib import Path
-from typing import Optional
-
-from .common import get_files
-
-from my.config import google as config
-
-from .kython.kompress import kopen
-
-def get_last_takeout(*, path: Optional[str]=None) -> Path:
-    """
-    Ok, sometimes google splits takeout into two zip archives
-    I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine..
-    """
-    for takeout in reversed(get_files(config.takeout_path, glob='*.zip')):
-        if path is None:
-            return takeout
-        else:
-            try:
-                kopen(takeout, path)
-                return takeout
-            except:
-                # TODO eh, a bit horrible, but works for now..
-                # TODO move ot kompress? 'kexists'?
-                continue
-    raise RuntimeError(f'Not found: {path}')
-
-# TODO might be a good idea to merge across multiple taekouts...
-# perhaps even a special takeout module that deals with all of this automatically?
-# e.g. accumulate, filter and maybe report useless takeouts?
-
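
For reference, a minimal sketch of the two cachew helpers added in my/core/cachew.py above; as the docstring notes, disabling must happen before importing anything decorated with @cachew.cachew (the tests below do exactly this):

    from my.core.cachew import disable_cachew, disabled_cachew

    # permanent switch-off: must run before the decorated modules are imported
    disable_cachew()
    import my.location.takeout as LT

    # scoped switch-off: the original cachew.cachew is restored on exit
    with disabled_cachew():
        import my.media.youtube as Y
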
diff --git a/tests/takeout.py b/tests/takeout.py
new file mode 100644
index 0000000..918582f
--- /dev/null
+++ b/tests/takeout.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+from itertools import islice
+
+from my.core.cachew import disable_cachew
+disable_cachew()
+
+import my.location.takeout as LT
+from my.kython.kompress import kopen
+
+
+def ilen(it):
+    # TODO more_itertools?
+    return len(list(it))
+
+
+def test_location_perf():
+    # 2.80 s for 10 iterations and 10K points
+    # TODO try switching to jq and see how it goes? not sure..
+    print(ilen(islice(LT.iter_locations(), 0, 10000)))
+
+
+# in theory should support any HTML takeout file?
+# although IIRC bookmarks and search-history.html weren't working
+import pytest # type: ignore
+@pytest.mark.parametrize(
+    'path', [
+        'YouTube/history/watch-history.html',
+        'My Activity/YouTube/MyActivity.html',
+        'My Activity/Chrome/MyActivity.html',
+        'My Activity/Search/MyActivity.html',
+    ]
+)
+def test_parser(path: str):
+    path = 'Takeout/' + path
+    from my.google.takeout.html import read_html
+    from my.google.takeout.paths import get_last_takeout
+
+    tpath = get_last_takeout(path=path)
+
+    results = []
+    for res in read_html(tpath, path):
+        results.append(res)
+
+    print(len(results))
+
+
+def parse_takeout_xmllint(data: str):
+    # without xmllint (splitting by '