Merge pull request #33 from karlicoss/updates

Google takeout updates
commit 0d4bcc1d7c by karlicoss, 2020-04-24 18:55:54 +01:00 (committed by GitHub)
8 changed files with 154 additions and 50 deletions

my/core/cachew.py (new file, 29 lines)

@@ -0,0 +1,29 @@
'''
# TODO this probably belongs to cachew? or cachew.experimental
'''
from contextlib import contextmanager
def disable_cachew():
'''
NOTE: this needs to be called before importing any module that uses @cachew.cachew
'''
# TODO not sure... maybe it should instead use some hook.. it's a bit ugly though
import cachew
@cachew.doublewrap
def cachew_off(func=None, *args, **kwargs):
return func
old = cachew.cachew
cachew.cachew = cachew_off
return old
@contextmanager
def disabled_cachew():
import cachew
old = disable_cachew()
try:
yield
finally:
cachew.cachew = old
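For context, a minimal usage sketch of the new helpers (not part of the diff). It mirrors what tests/takeout.py below does: the cachew decorator has to be swapped out before importing any module that applies @cachew.cachew.

from my.core.cachew import disabled_cachew

with disabled_cachew():
    # modules imported inside the block see the no-op decorator instead of cachew.cachew
    import my.location.takeout as LT

# the decorator was already applied (as a no-op) at import time, so this runs uncached
locations = LT.iter_locations()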

View file

@@ -11,6 +11,6 @@ tz_lookup = {
tz_lookup['UTC'] = pytz.utc # ugh. otherwise it's Zulu...
@lru_cache(-1)
@lru_cache(None)
def abbr_to_timezone(abbr: str):
return tz_lookup[abbr]
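A hypothetical illustration (not part of the commit) of the helper this hunk touches; the 'UTC' entry is the explicit override above, and the module path is inferred from the relative imports below.

from datetime import datetime
from my.core.time import abbr_to_timezone

tz = abbr_to_timezone('UTC')  # -> pytz.utc, via the explicit override
# localize a parsed timestamp, e.g. the 'Mar 8, 2018, 5:14:40 PM' sample used by the parser below
dt = tz.localize(datetime(2018, 3, 8, 17, 14, 40))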

View file

@@ -3,12 +3,12 @@ import re
from pathlib import Path
from datetime import datetime
from html.parser import HTMLParser
from typing import List, Dict, Optional, Any
from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
from collections import OrderedDict
from urllib.parse import unquote
import pytz
from ..core.time import abbr_to_timezone
from ...core.time import abbr_to_timezone
# Mar 8, 2018, 5:14:40 PM
_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
@@ -49,10 +49,15 @@ class State(Enum):
PARSING_DATE = 3
Url = str
Title = str
Parsed = Tuple[datetime, Url, Title]
Callback = Callable[[datetime, Url, Title], None]
# would be easier to use beautiful soup, but ends up in a big memory footprint..
class TakeoutHTMLParser(HTMLParser):
def __init__(self, callback) -> None:
def __init__(self, callback: Callback) -> None:
super().__init__()
self.state: State = State.OUTSIDE
@@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):
self.state = State.OUTSIDE
return
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
from ...kython.kompress import kopen
results: List[Parsed] = []
def cb(dt: datetime, url: Url, title: Title) -> None:
results.append((dt, url, title))
parser = TakeoutHTMLParser(callback=cb)
with kopen(tpath, file) as fo:
# TODO careful, what if it's a string already? make an asutf method?
data = fo.read().decode('utf8')
parser.feed(data)
return results
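A minimal consumption sketch of the new read_html helper (not part of the diff); it is essentially what the YouTube module and tests/takeout.py below do.

from my.google.takeout.html import read_html
from my.google.takeout.paths import get_last_takeout

path = 'Takeout/My Activity/YouTube/MyActivity.html'
takeout = get_last_takeout(path=path)
for dt, url, title in read_html(takeout, path):
    print(dt.isoformat(), url, title)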

View file

@@ -0,0 +1,29 @@
from pathlib import Path
from typing import Optional, Iterable
from ...common import get_files
from ...kython.kompress import kopen, kexists
from my.config import google as config
def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
"""
Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
"""
# TODO FIXME zip is not great..
# allow a lambda expression? that way the user could restrict it
for takeout in get_files(config.takeout_path, glob='*.zip'):
if path is None or kexists(takeout, path):
yield takeout
def get_last_takeout(*, path: Optional[str]=None) -> Path:
# TODO more_itertools?
matching = list(get_takeouts(path=path))
return matching[-1]
# TODO might be a good idea to merge across multiple takeouts...
# perhaps even a special takeout module that deals with all of this automatically?
# e.g. accumulate, filter and maybe report useless takeouts?
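For illustration only, a sketch of the 'merge across multiple takeouts' idea from the TODO above, built on the generator just added; the deduplication key is a guess and not part of the commit.

from my.google.takeout.html import read_html
from my.google.takeout.paths import get_takeouts

path = 'Takeout/My Activity/Search/MyActivity.html'
seen = set()
for takeout in get_takeouts(path=path):
    for dt, url, title in read_html(takeout, path):
        if (dt, url) in seen:  # hypothetical dedup key across overlapping archives
            continue
        seen.add((dt, url))
        # ... accumulate or report the merged entries here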

View file

@@ -23,11 +23,11 @@ except:
import ijson # type: ignore
from ..common import get_files, LazyLogger, mcachew
from ..takeout import get_last_takeout
from ..google.takeout.paths import get_last_takeout
from ..kython import kompress
logger = LazyLogger(__package__)
logger = LazyLogger(__name__)
def cache_path(*args, **kwargs):

View file

@@ -2,11 +2,8 @@
from datetime import datetime
from typing import NamedTuple, List
# TODO ugh. reuse it in mypkg/releaste takeout parser separately?
from ..kython.ktakeout import TakeoutHTMLParser
from ..kython.kompress import kopen
from ..takeout import get_last_takeout
from ..google.takeout.html import read_html
from ..google.takeout.paths import get_last_takeout
class Watched(NamedTuple):
@@ -20,19 +17,16 @@ class Watched(NamedTuple):
def get_watched():
path = 'Takeout/My Activity/YouTube/MyActivity.html'
# TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
# TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
last = get_last_takeout(path=path)
watches: List[Watched] = []
def cb(dt, url, title):
for dt, url, title in read_html(last, path):
watches.append(Watched(url=url, title=title, when=dt))
parser = TakeoutHTMLParser(cb)
with kopen(last, path) as fo:
dd = fo.read().decode('utf8')
parser.feed(dd)
# TODO hmm they already come sorted.. wonder if should just rely on it..
return list(sorted(watches, key=lambda e: e.when))
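A hypothetical quick check of the rewritten get_watched (not part of the commit), relying only on the Watched fields used above.

if __name__ == '__main__':
    # the result is sorted by 'when', so the slice gives the five most recent entries
    for w in get_watched()[-5:]:
        print(w.when, w.title, w.url)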

View file

@@ -1,31 +0,0 @@
from pathlib import Path
from typing import Optional
from .common import get_files
from my.config import google as config
from .kython.kompress import kopen
def get_last_takeout(*, path: Optional[str]=None) -> Path:
"""
Ok, sometimes google splits takeout into two zip archives
I guess I could detect it (they've got 001/002 etc suffixes), but for now that works fine..
"""
for takeout in reversed(get_files(config.takeout_path, glob='*.zip')):
if path is None:
return takeout
else:
try:
kopen(takeout, path)
return takeout
except:
# TODO eh, a bit horrible, but works for now..
# TODO move to kompress? 'kexists'?
continue
raise RuntimeError(f'Not found: {path}')
# TODO might be a good idea to merge across multiple takeouts...
# perhaps even a special takeout module that deals with all of this automatically?
# e.g. accumulate, filter and maybe report useless takeouts?

tests/takeout.py (new file, 65 lines)

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
from itertools import islice
from my.core.cachew import disable_cachew
disable_cachew()
import my.location.takeout as LT
from my.kython.kompress import kopen
def ilen(it):
# TODO more_itertools?
return len(list(it))
def test_location_perf():
# 2.80 s for 10 iterations and 10K points
# TODO try switching to jq and see how it goes? not sure..
print(ilen(islice(LT.iter_locations(), 0, 10000)))
# in theory should support any HTML takeout file?
# although IIRC bookmarks and search-history.html weren't working
import pytest # type: ignore
@pytest.mark.parametrize(
'path', [
'YouTube/history/watch-history.html',
'My Activity/YouTube/MyActivity.html',
'My Activity/Chrome/MyActivity.html',
'My Activity/Search/MyActivity.html',
]
)
def test_parser(path: str):
path = 'Takeout/' + path
from my.google.takeout.html import read_html
from my.google.takeout.paths import get_last_takeout
tpath = get_last_takeout(path=path)
results = []
for res in read_html(tpath, path):
results.append(res)
print(len(results))
def parse_takeout_xmllint(data: str):
# without xmllint (splitting by '<div class="content-cell' -- 0.68 secs)
# with xmllint -- 2 seconds
# using html.parser -- 4 seconds (+ all the parsing etc), 30K results
# not *that* much opportunity to speed up I guess
# the only downside is that html.parser isn't iterative.. might be able to work around it with some internal hacks?
# wonder what's the bottleneck..
#
from subprocess import Popen, PIPE, run
from more_itertools import split_before
res = run(
['xmllint', '--html', '--xpath', '//div[contains(@class, "content-cell")]', '-'],
input=data.encode('utf8'),
check=True,
stdout=PIPE,
)
out = res.stdout.decode('utf8')
# out = data
return out.split('<div class="content-cell')
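Finally, a sketch of how the xmllint variant above could be compared against read_html on the same archive (not part of the commit; assumes xmllint is installed and a takeout containing the path exists).

def compare_parsers():
    from my.google.takeout.html import read_html
    from my.google.takeout.paths import get_last_takeout
    path = 'Takeout/My Activity/Search/MyActivity.html'
    tpath = get_last_takeout(path=path)
    with kopen(tpath, path) as fo:
        data = fo.read().decode('utf8')
    cells = parse_takeout_xmllint(data)    # raw '<div class="content-cell' chunks
    parsed = list(read_html(tpath, path))  # fully parsed (datetime, url, title) tuples
    print(len(cells), 'cells via xmllint;', len(parsed), 'entries via html.parser')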