commit 0d4bcc1d7c
8 changed files with 154 additions and 50 deletions
my/core/cachew.py (new file, 29 lines)
@@ -0,0 +1,29 @@
+'''
+# TODO this probably belongs to cachew? or cachew.experimental
+'''
+from contextlib import contextmanager
+
+
+def disable_cachew():
+    '''
+    NOTE: you need to use it before importing any function using @cachew.cachew
+    '''
+    # TODO not sure... maybe it should instead use some hook.. it's a bit ugly
+    import cachew
+
+    @cachew.doublewrap
+    def cachew_off(func=None, *args, **kwargs):
+        return func
+    old = cachew.cachew
+    cachew.cachew = cachew_off
+    return old
+
+
+@contextmanager
+def disabled_cachew():
+    import cachew
+    old = disable_cachew()
+    try:
+        yield
+    finally:
+        cachew.cachew = old
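For reference, a minimal usage sketch of the new helpers (it mirrors tests/takeout.py further down and assumes the cachew package is installed):

from my.core.cachew import disable_cachew, disabled_cachew

disable_cachew()  # must run before importing modules that use @cachew.cachew
import my.location.takeout as LT  # imported with caching monkey-patched away

with disabled_cachew():
    # imports of @cachew.cachew-using modules would go here;
    # the original cachew.cachew decorator is restored on exit
    pass
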
my/core/time.py

@@ -11,6 +11,6 @@ tz_lookup = {
 tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...


-@lru_cache(-1)
+@lru_cache(None)
 def abbr_to_timezone(abbr: str):
     return tz_lookup[abbr]
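For context, functools.lru_cache(None) (i.e. maxsize=None) is the documented way to get an unbounded cache, which suits a fixed abbreviation-to-timezone table; a tiny self-contained illustration, not code from this repo:

from functools import lru_cache

calls = 0

@lru_cache(None)  # maxsize=None: the cache can grow without bound
def square(x: int) -> int:
    global calls
    calls += 1
    return x * x

square(3)
square(3)
assert calls == 1  # the second call is served from the cache
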
my/google/takeout/html.py (renamed from my/kython/ktakeout.py)

@@ -3,12 +3,12 @@ import re
 from pathlib import Path
 from datetime import datetime
 from html.parser import HTMLParser
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
 from collections import OrderedDict
 from urllib.parse import unquote
 import pytz

-from ..core.time import abbr_to_timezone
+from ...core.time import abbr_to_timezone

 # Mar 8, 2018, 5:14:40 PM
 _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"

@@ -49,10 +49,15 @@ class State(Enum):
     PARSING_DATE = 3


+Url = str
+Title = str
+Parsed = Tuple[datetime, Url, Title]
+Callback = Callable[[datetime, Url, Title], None]
+
 # would be easier to use beautiful soup, but ends up in a big memory footprint..
 class TakeoutHTMLParser(HTMLParser):
-    def __init__(self, callback) -> None:
+    def __init__(self, callback: Callback) -> None:
         super().__init__()
         self.state: State = State.OUTSIDE


@@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):

             self.state = State.OUTSIDE
             return
+
+
+def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
+    from ...kython.kompress import kopen
+    results: List[Parsed] = []
+    def cb(dt: datetime, url: Url, title: Title) -> None:
+        results.append((dt, url, title))
+    parser = TakeoutHTMLParser(callback=cb)
+    with kopen(tpath, file) as fo:
+        # TODO careful, wht if it's a string already? make asutf method?
+        data = fo.read().decode('utf8')
+        parser.feed(data)
+    return results
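A short usage sketch of the new read_html helper (the path below is one of those exercised by tests/takeout.py; get_last_takeout comes from the new my/google/takeout/paths.py):

from my.google.takeout.html import read_html
from my.google.takeout.paths import get_last_takeout

path = 'Takeout/My Activity/Chrome/MyActivity.html'
takeout = get_last_takeout(path=path)
for dt, url, title in read_html(takeout, path):
    print(dt.isoformat(), url, title)
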
my/google/takeout/paths.py (new file, 29 lines)
@@ -0,0 +1,29 @@
+from pathlib import Path
+from typing import Optional, Iterable
+
+from ...common import get_files
+from ...kython.kompress import kopen, kexists
+
+from my.config import google as config
+
+
+def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
+    """
+    Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
+    """
+    # TODO FIXME zip is not great..
+    # allow a lambda expression? that way the user could restrict it
+    for takeout in get_files(config.takeout_path, glob='*.zip'):
+        if path is None or kexists(takeout, path):
+            yield takeout
+
+
+def get_last_takeout(*, path: Optional[str]=None) -> Path:
+    # TODO more_itertools?
+    matching = list(get_takeouts(path=path))
+    return matching[-1]
+
+
+# TODO might be a good idea to merge across multiple takeouts...
+# perhaps even a special takeout module that deals with all of this automatically?
+# e.g. accumulate, filter and maybe report useless takeouts?
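Roughly how these helpers compose (this assumes my.config.google.takeout_path points at the directory holding the *.zip takeout archives):

from my.google.takeout.paths import get_takeouts, get_last_takeout

activity = 'Takeout/My Activity/YouTube/MyActivity.html'

# every archive that actually contains the file we care about
for takeout in get_takeouts(path=activity):
    print(takeout)

# or just the most recent matching archive
latest = get_last_takeout(path=activity)
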
my/location/takeout.py

@@ -23,11 +23,11 @@ except:
     import ijson # type: ignore

 from ..common import get_files, LazyLogger, mcachew
-from ..takeout import get_last_takeout
+from ..google.takeout.paths import get_last_takeout
 from ..kython import kompress


-logger = LazyLogger(__package__)
+logger = LazyLogger(__name__)


 def cache_path(*args, **kwargs):
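The LazyLogger(__package__) to LazyLogger(__name__) switch follows the usual stdlib convention; a generic illustration with plain logging (not this repo's LazyLogger wrapper):

import logging

# __name__ is the full dotted module name (e.g. 'my.location.takeout'),
# so log records can be filtered per module rather than per package
logger = logging.getLogger(__name__)
logger.info('initialised %s', __name__)
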
my/media/youtube.py

@@ -2,11 +2,8 @@
 from datetime import datetime
 from typing import NamedTuple, List

-# TODO ugh. reuse it in mypkg/releaste takeout parser separately?
-from ..kython.ktakeout import TakeoutHTMLParser
-
-from ..kython.kompress import kopen
-from ..takeout import get_last_takeout
+from ..google.takeout.html import read_html
+from ..google.takeout.paths import get_last_takeout


 class Watched(NamedTuple):

@@ -20,19 +17,16 @@ class Watched(NamedTuple):


 def get_watched():
-    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/
+    path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
+    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
     last = get_last_takeout(path=path)

     watches: List[Watched] = []
-    def cb(dt, url, title):
+    for dt, url, title in read_html(last, path):
         watches.append(Watched(url=url, title=title, when=dt))

-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(last, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
+    # TODO hmm they already come sorted.. wonder if should just rely on it..

     return list(sorted(watches, key=lambda e: e.when))
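A small usage sketch (the module path my.media.youtube is an assumption here, since the file name is not shown in this view):

import my.media.youtube as yt  # assumed module path

watches = yt.get_watched()  # List[Watched], sorted by watch time
print(len(watches))
print(watches[-1].when, watches[-1].title, watches[-1].url)
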
my/takeout.py (deleted, 31 lines)

@@ -1,31 +0,0 @@
-from pathlib import Path
-from typing import Optional
-
-from .common import get_files
-
-from my.config import google as config
-
-from .kython.kompress import kopen
-
-def get_last_takeout(*, path: Optional[str]=None) -> Path:
-    """
-    Ok, sometimes google splits takeout into two zip archives
-    I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine..
-    """
-    for takeout in reversed(get_files(config.takeout_path, glob='*.zip')):
-        if path is None:
-            return takeout
-        else:
-            try:
-                kopen(takeout, path)
-                return takeout
-            except:
-                # TODO eh, a bit horrible, but works for now..
-                # TODO move ot kompress? 'kexists'?
-                continue
-    raise RuntimeError(f'Not found: {path}')
-
-# TODO might be a good idea to merge across multiple taekouts...
-# perhaps even a special takeout module that deals with all of this automatically?
-# e.g. accumulate, filter and maybe report useless takeouts?
-
tests/takeout.py (new file, 65 lines)
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+from itertools import islice
+
+from my.core.cachew import disable_cachew
+disable_cachew()
+
+import my.location.takeout as LT
+from my.kython.kompress import kopen
+
+
+def ilen(it):
+    # TODO more_itertools?
+    return len(list(it))
+
+
+def test_location_perf():
+    # 2.80 s for 10 iterations and 10K points
+    # TODO try switching to jq and see how it goes? not sure..
+    print(ilen(islice(LT.iter_locations(), 0, 10000)))
+
+
+# in theory should support any HTML takeout file?
+# although IIRC bookmakrs and search-history.html weren't working
+import pytest # type: ignore
+@pytest.mark.parametrize(
+    'path', [
+        'YouTube/history/watch-history.html',
+        'My Activity/YouTube/MyActivity.html',
+        'My Activity/Chrome/MyActivity.html',
+        'My Activity/Search/MyActivity.html',
+    ]
+)
+def test_parser(path: str):
+    path = 'Takeout/' + path
+    from my.google.takeout.html import read_html
+    from my.google.takeout.paths import get_last_takeout
+
+    tpath = get_last_takeout(path=path)
+
+    results = []
+    for res in read_html(tpath, path):
+        results.append(res)
+
+    print(len(results))
+
+
+def parse_takeout_xmllint(data: str):
+    # without xmllint (splitting by '<div class="content-cell' -- 0.68 secs)
+    # with xmllint -- 2 seconds
+    # using html.parser -- 4 seconds (+ all the parsing etc), 30K results
+    # not *that* much opportunity to speedup I guess
+    # the only downside is that html.parser isn't iterative.. might be able to hack with some iternal hacks?
+    # wonder what's the bottleneck..
+    #
+    from subprocess import Popen, PIPE, run
+    from more_itertools import split_before
+    res = run(
+        ['xmllint', '--html', '--xpath', '//div[contains(@class, "content-cell")]', '-'],
+        input=data.encode('utf8'),
+        check=True,
+        stdout=PIPE,
+    )
+    out = res.stdout.decode('utf8')
+    # out = data
+    return out.split('<div class="content-cell')
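Note that these tests need real takeout archives available via my.config.google.takeout_path; a hedged sketch of running just the parser tests programmatically rather than from the shell:

import pytest

# equivalent to `pytest -k test_parser tests/takeout.py`
raise SystemExit(pytest.main(['-k', 'test_parser', 'tests/takeout.py']))
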