takeout module; make it more resilient to multipart takeout archives

This commit is contained in:
Dima Gerasimov 2020-01-27 22:35:45 +00:00
parent 232d62b3b7
commit 56f64c16db
3 changed files with 45 additions and 17 deletions

View file

@ -6,16 +6,7 @@ from pathlib import Path
from kython.ktakeout import TakeoutHTMLParser from kython.ktakeout import TakeoutHTMLParser
from kython.kompress import open as kopen from kython.kompress import open as kopen
from ..common import get_files from ..takeout import get_last_takeout
from mycfg import paths
def _get_last_takeout():
    # Return the latest (max by filename, i.e. most recent date-stamped) takeout zip
    # under the configured takeout directory.
    # TODO FIXME might be a good idea to merge across multiple takeouts...
    # perhaps even a special takeout module that deals with all of this automatically?
    # e.g. accumulate, filter and maybe report useless takeouts?
    return max(get_files(paths.google.takeout_path, glob='*.zip'))
class Watched(NamedTuple): class Watched(NamedTuple):
@ -29,7 +20,8 @@ class Watched(NamedTuple):
def get_watched(): def get_watched():
last = _get_last_takeout() path = 'Takeout/My Activity/YouTube/MyActivity.html'
last = get_last_takeout(path=path)
watches: List[Watched] = [] watches: List[Watched] = []
def cb(dt, url, title): def cb(dt, url, title):
@ -37,18 +29,13 @@ def get_watched():
parser = TakeoutHTMLParser(cb) parser = TakeoutHTMLParser(cb)
with kopen(last, 'Takeout/My Activity/YouTube/MyActivity.html') as fo: with kopen(last, path) as fo:
dd = fo.read().decode('utf8') dd = fo.read().decode('utf8')
parser.feed(dd) parser.feed(dd)
return list(sorted(watches, key=lambda e: e.when)) return list(sorted(watches, key=lambda e: e.when))
def test():
watched = get_watched()
assert len(watched) > 1000
def main(): def main():
# TODO shit. a LOT of watches... # TODO shit. a LOT of watches...
for w in get_watched(): for w in get_watched():

30
my/takeout.py Normal file
View file

@ -0,0 +1,30 @@
from pathlib import Path
from typing import Optional
from .common import get_files
from mycfg import paths
from kython.kompress import open as kopen
def get_last_takeout(*, path: Optional[str]=None) -> Path:
    """
    Return the most recent takeout archive; if *path* is given, return the
    most recent archive that actually contains that path.

    Ok, sometimes google splits takeout into two zip archives.
    I guess I could detect it (they've got 001/002 etc suffixes), but for now
    scanning archives newest-first and probing for *path* works fine..
    """
    # get_files returns archives sorted ascending, so walk them newest-first
    for takeout in reversed(get_files(paths.google.takeout_path, glob='*.zip')):
        if path is None:
            return takeout
        try:
            # we only need to check that *path* exists inside the archive;
            # close the handle immediately so we don't leak it
            with kopen(takeout, path):
                pass
        except Exception:
            # TODO eh, a bit horrible, but works for now..
            # note: was a bare `except:` before, which would also swallow
            # KeyboardInterrupt/SystemExit
            continue
        else:
            return takeout
    raise RuntimeError(f'Not found: {path}')
# TODO might be a good idea to merge across multiple takeouts...
# perhaps even a special takeout module that deals with all of this automatically?
# e.g. accumulate, filter and maybe report useless takeouts?

11
tests/youtube.py Normal file
View file

@ -0,0 +1,11 @@
# TODO move elsewhere?
# these tests would only make sense with some existing data? although some of them would work for everyone..
# not sure what's a good way of handling this..
from my.media.youtube import get_watched
def test():
    # Smoke test: requires real takeout data to be available locally.
    history = get_watched()
    assert len(history) > 1000