commit 0d4bcc1d7c
8 changed files with 154 additions and 50 deletions
my/core/cachew.py (new file, 29 lines)
@@ -0,0 +1,29 @@
+'''
+# TODO this probably belongs to cachew? or cachew.experimental
+'''
+from contextlib import contextmanager
+
+
+def disable_cachew():
+    '''
+    NOTE: you need to use it before importing any function using @cachew.cachew
+    '''
+    # TODO not sure... maybe it should instead use some hook.. it's a bit ugly
+    import cachew
+
+    @cachew.doublewrap
+    def cachew_off(func=None, *args, **kwargs):
+        return func
+    old = cachew.cachew
+    cachew.cachew = cachew_off
+    return old
+
+
+@contextmanager
+def disabled_cachew():
+    import cachew
+    old = disable_cachew()
+    try:
+        yield
+    finally:
+        cachew.cachew = old
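For reference, a minimal usage sketch of the new helpers (it mirrors tests/takeout.py further down and assumes the cachew package is installed):

from my.core.cachew import disable_cachew, disabled_cachew

disable_cachew()  # must run before importing modules that use @cachew.cachew
import my.location.takeout as LT  # imported with caching monkey-patched away

with disabled_cachew():
    # imports of @cachew.cachew-using modules would go here;
    # the original cachew.cachew decorator is restored on exit
    pass
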
my/core/time.py

@@ -11,6 +11,6 @@ tz_lookup = {
 tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...


-@lru_cache(-1)
+@lru_cache(None)
 def abbr_to_timezone(abbr: str):
     return tz_lookup[abbr]
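For context, functools.lru_cache(None) (i.e. maxsize=None) is the documented way to get an unbounded cache, which suits a fixed abbreviation-to-timezone table; a tiny self-contained illustration, not code from this repo:

from functools import lru_cache

calls = 0

@lru_cache(None)  # maxsize=None: the cache can grow without bound
def square(x: int) -> int:
    global calls
    calls += 1
    return x * x

square(3)
square(3)
assert calls == 1  # the second call is served from the cache
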
my/google/takeout/html.py (renamed from my/kython/ktakeout.py)

@@ -3,12 +3,12 @@ import re
 from pathlib import Path
 from datetime import datetime
 from html.parser import HTMLParser
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
 from collections import OrderedDict
 from urllib.parse import unquote
 import pytz

-from ..core.time import abbr_to_timezone
+from ...core.time import abbr_to_timezone

 # Mar 8, 2018, 5:14:40 PM
 _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"

@@ -49,10 +49,15 @@ class State(Enum):
     PARSING_DATE = 3


+Url = str
+Title = str
+Parsed = Tuple[datetime, Url, Title]
+Callback = Callable[[datetime, Url, Title], None]
+
 # would be easier to use beautiful soup, but ends up in a big memory footprint..
 class TakeoutHTMLParser(HTMLParser):
-    def __init__(self, callback) -> None:
+    def __init__(self, callback: Callback) -> None:
         super().__init__()
         self.state: State = State.OUTSIDE


@@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):

             self.state = State.OUTSIDE
             return
+
+
+def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
+    from ...kython.kompress import kopen
+    results: List[Parsed] = []
+    def cb(dt: datetime, url: Url, title: Title) -> None:
+        results.append((dt, url, title))
+    parser = TakeoutHTMLParser(callback=cb)
+    with kopen(tpath, file) as fo:
+        # TODO careful, wht if it's a string already? make asutf method?
+        data = fo.read().decode('utf8')
+        parser.feed(data)
+    return results
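A short usage sketch of the new read_html helper (the path below is one of those exercised by tests/takeout.py; get_last_takeout comes from the new my/google/takeout/paths.py):

from my.google.takeout.html import read_html
from my.google.takeout.paths import get_last_takeout

path = 'Takeout/My Activity/Chrome/MyActivity.html'
takeout = get_last_takeout(path=path)
for dt, url, title in read_html(takeout, path):
    print(dt.isoformat(), url, title)
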
my/google/takeout/paths.py (new file, 29 lines)
@@ -0,0 +1,29 @@
+from pathlib import Path
+from typing import Optional, Iterable
+
+from ...common import get_files
+from ...kython.kompress import kopen, kexists
+
+from my.config import google as config
+
+
+def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
+    """
+    Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
+    """
+    # TODO FIXME zip is not great..
+    # allow a lambda expression? that way the user could restrict it
+    for takeout in get_files(config.takeout_path, glob='*.zip'):
+        if path is None or kexists(takeout, path):
+            yield takeout
+
+
+def get_last_takeout(*, path: Optional[str]=None) -> Path:
+    # TODO more_itertools?
+    matching = list(get_takeouts(path=path))
+    return matching[-1]
+
+
+# TODO might be a good idea to merge across multiple takeouts...
+# perhaps even a special takeout module that deals with all of this automatically?
+# e.g. accumulate, filter and maybe report useless takeouts?
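Roughly how these helpers compose (this assumes my.config.google.takeout_path points at the directory holding the *.zip takeout archives):

from my.google.takeout.paths import get_takeouts, get_last_takeout

activity = 'Takeout/My Activity/YouTube/MyActivity.html'

# every archive that actually contains the file we care about
for takeout in get_takeouts(path=activity):
    print(takeout)

# or just the most recent matching archive
latest = get_last_takeout(path=activity)
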
my/location/takeout.py

@@ -23,11 +23,11 @@ except:
     import ijson # type: ignore

 from ..common import get_files, LazyLogger, mcachew
-from ..takeout import get_last_takeout
+from ..google.takeout.paths import get_last_takeout
 from ..kython import kompress


-logger = LazyLogger(__package__)
+logger = LazyLogger(__name__)


 def cache_path(*args, **kwargs):
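The LazyLogger(__package__) to LazyLogger(__name__) switch follows the usual stdlib convention; a generic illustration with plain logging (not this repo's LazyLogger wrapper):

import logging

# __name__ is the full dotted module name (e.g. 'my.location.takeout'),
# so log records can be filtered per module rather than per package
logger = logging.getLogger(__name__)
logger.info('initialised %s', __name__)
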
my/media/youtube.py

@@ -2,11 +2,8 @@
 from datetime import datetime
 from typing import NamedTuple, List

-# TODO ugh. reuse it in mypkg/releaste takeout parser separately?
-from ..kython.ktakeout import TakeoutHTMLParser
-
-from ..kython.kompress import kopen
-from ..takeout import get_last_takeout
+from ..google.takeout.html import read_html
+from ..google.takeout.paths import get_last_takeout


 class Watched(NamedTuple):

@@ -20,19 +17,16 @@ class Watched(NamedTuple):


 def get_watched():
-    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/
+    path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
+    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
     last = get_last_takeout(path=path)

     watches: List[Watched] = []
-    def cb(dt, url, title):
+    for dt, url, title in read_html(last, path):
         watches.append(Watched(url=url, title=title, when=dt))

-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(last, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
+    # TODO hmm they already come sorted.. wonder if should just rely on it..

     return list(sorted(watches, key=lambda e: e.when))
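A small usage sketch (the module path my.media.youtube is an assumption here, since the file name is not shown in this view):

import my.media.youtube as yt  # assumed module path

watches = yt.get_watched()  # List[Watched], sorted by watch time
print(len(watches))
print(watches[-1].when, watches[-1].title, watches[-1].url)
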
my/takeout.py (deleted, 31 lines)

@@ -1,31 +0,0 @@
-from pathlib import Path
-from typing import Optional
-
-from .common import get_files
-
-from my.config import google as config
-
-from .kython.kompress import kopen
-
-def get_last_takeout(*, path: Optional[str]=None) -> Path:
-    """
-    Ok, sometimes google splits takeout into two zip archives
-    I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine..
-    """
-    for takeout in reversed(get_files(config.takeout_path, glob='*.zip')):
-        if path is None:
-            return takeout
-        else:
-            try:
-                kopen(takeout, path)
-                return takeout
-            except:
-                # TODO eh, a bit horrible, but works for now..
-                # TODO move ot kompress? 'kexists'?
-                continue
-    raise RuntimeError(f'Not found: {path}')
-
-# TODO might be a good idea to merge across multiple taekouts...
-# perhaps even a special takeout module that deals with all of this automatically?
-# e.g. accumulate, filter and maybe report useless takeouts?
-
tests/takeout.py (new file, 65 lines)
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+from itertools import islice
+
+from my.core.cachew import disable_cachew
+disable_cachew()
+
+import my.location.takeout as LT
+from my.kython.kompress import kopen
+
+
+def ilen(it):
+    # TODO more_itertools?
+    return len(list(it))
+
+
+def test_location_perf():
+    # 2.80 s for 10 iterations and 10K points
+    # TODO try switching to jq and see how it goes? not sure..
+    print(ilen(islice(LT.iter_locations(), 0, 10000)))
+
+
+# in theory should support any HTML takeout file?
+# although IIRC bookmakrs and search-history.html weren't working
+import pytest # type: ignore
+@pytest.mark.parametrize(
+    'path', [
+        'YouTube/history/watch-history.html',
+        'My Activity/YouTube/MyActivity.html',
+        'My Activity/Chrome/MyActivity.html',
+        'My Activity/Search/MyActivity.html',
+    ]
+)
+def test_parser(path: str):
+    path = 'Takeout/' + path
+    from my.google.takeout.html import read_html
+    from my.google.takeout.paths import get_last_takeout
+
+    tpath = get_last_takeout(path=path)
+
+    results = []
+    for res in read_html(tpath, path):
+        results.append(res)
+
+    print(len(results))
+
+
+def parse_takeout_xmllint(data: str):
+    # without xmllint (splitting by '<div class="content-cell' -- 0.68 secs)
+    # with xmllint -- 2 seconds
+    # using html.parser -- 4 seconds (+ all the parsing etc), 30K results
+    # not *that* much opportunity to speedup I guess
+    # the only downside is that html.parser isn't iterative.. might be able to hack with some iternal hacks?
+    # wonder what's the bottleneck..
+    #
+    from subprocess import Popen, PIPE, run
+    from more_itertools import split_before
+    res = run(
+        ['xmllint', '--html', '--xpath', '//div[contains(@class, "content-cell")]', '-'],
+        input=data.encode('utf8'),
+        check=True,
+        stdout=PIPE,
+    )
+    out = res.stdout.decode('utf8')
+    # out = data
+    return out.split('<div class="content-cell')
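Note that these tests need real takeout archives available via my.config.google.takeout_path; a hedged sketch of running just the parser tests programmatically rather than from the shell:

import pytest

# equivalent to `pytest -k test_parser tests/takeout.py`
raise SystemExit(pytest.main(['-k', 'test_parser', 'tests/takeout.py']))
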