takeout module; make more resilient to multipart
commit 56f64c16db
parent 232d62b3b7

3 changed files with 45 additions and 17 deletions
my/media/youtube.py
@@ -6,16 +6,7 @@ from pathlib import Path
 from kython.ktakeout import TakeoutHTMLParser
 from kython.kompress import open as kopen
 
-from ..common import get_files
-
-from mycfg import paths
-
-
-def _get_last_takeout():
-    # TODO FIXME might be a good idea to merge across multiple taekouts...
-    # perhaps even a special takeout module that deals with all of this automatically?
-    # e.g. accumulate, filter and maybe report useless takeouts?
-    return max(get_files(paths.google.takeout_path, glob='*.zip'))
+from ..takeout import get_last_takeout
 
 
 class Watched(NamedTuple):
@@ -29,7 +20,8 @@ class Watched(NamedTuple):
 
 
 def get_watched():
-    last = _get_last_takeout()
+    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    last = get_last_takeout(path=path)
 
     watches: List[Watched] = []
     def cb(dt, url, title):
@@ -37,18 +29,13 @@ def get_watched():
 
     parser = TakeoutHTMLParser(cb)
 
-    with kopen(last, 'Takeout/My Activity/YouTube/MyActivity.html') as fo:
+    with kopen(last, path) as fo:
         dd = fo.read().decode('utf8')
         parser.feed(dd)
 
     return list(sorted(watches, key=lambda e: e.when))
 
 
-def test():
-    watched = get_watched()
-    assert len(watched) > 1000
-
-
 def main():
     # TODO shit. a LOT of watches...
     for w in get_watched():
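The reason `path` is threaded through to `get_last_takeout` is that with a multipart takeout only one of the zip parts may actually contain `MyActivity.html`, so blindly taking the newest archive (as the old `_get_last_takeout` did) could pick a part without it. A minimal usage sketch, assuming `mycfg` points at a directory of takeout zips:

# hypothetical usage of the helper added in this commit
from my.takeout import get_last_takeout

path = 'Takeout/My Activity/YouTube/MyActivity.html'
last = get_last_takeout(path=path)  # newest archive that actually contains `path`
print(last)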
my/takeout.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+from pathlib import Path
+from typing import Optional
+
+from .common import get_files
+
+from mycfg import paths
+
+from kython.kompress import open as kopen
+
+def get_last_takeout(*, path: Optional[str]=None) -> Path:
+    """
+    Ok, sometimes google splits takeout into two zip archives.
+    I guess I could detect it (they've got 001/002 etc suffixes), but for now that works fine..
+    """
+    for takeout in reversed(get_files(paths.google.takeout_path, glob='*.zip')):
+        if path is None:
+            return takeout
+        else:
+            try:
+                kopen(takeout, path)
+                return takeout
+            except Exception:
+                # TODO eh, a bit horrible, but works for now..
+                continue
+    raise RuntimeError(f'Not found: {path}')
+
+# TODO might be a good idea to merge across multiple takeouts...
+# perhaps even a special takeout module that deals with all of this automatically?
+# e.g. accumulate, filter and maybe report useless takeouts?
+
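The docstring notes that split takeouts carry 001/002-style suffixes. A minimal sketch of detecting those groups, not part of this commit, assuming archive names end in `-001.zip`, `-002.zip`, etc.:

import re
from pathlib import Path
from typing import Dict, List

def group_multipart(zips: List[Path]) -> Dict[str, List[Path]]:
    # bucket archives by their name with any trailing part number stripped
    groups: Dict[str, List[Path]] = {}
    for z in zips:
        base = re.sub(r'-\d{3}$', '', z.stem)
        groups.setdefault(base, []).append(z)
    # sort so -001 comes before -002 within each logical takeout
    return {name: sorted(parts) for name, parts in groups.items()}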
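As for the merging TODO, one rough shape for it: walk every archive that contains a given file, oldest first, and let the caller accumulate and deduplicate. Purely illustrative; `iter_takeouts_with` is a hypothetical helper, not part of this commit:

from pathlib import Path
from typing import Iterable, Iterator

from kython.kompress import open as kopen  # same helper the module already uses

def iter_takeouts_with(zips: Iterable[Path], path: str) -> Iterator[Path]:
    # yield each takeout archive that actually contains `path`
    for takeout in zips:  # assumed sorted oldest to newest
        try:
            kopen(takeout, path)
        except Exception:
            continue  # e.g. this part of a multipart takeout lacks `path`
        yield takeout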
tests/youtube.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+# TODO move elsewhere?
+
+# these tests would only make sense with some existing data? although some of them would work for everyone..
+# not sure what's a good way of handling this..
+
+from my.media.youtube import get_watched
+
+
+def test():
+    watched = get_watched()
+    assert len(watched) > 1000
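One common answer to the "only makes sense with existing data" question above is to skip rather than fail when the data is unavailable. A sketch, not in this commit, using pytest:

import pytest

def test_watched():
    # skip data-dependent tests instead of failing on machines without takeouts
    try:
        from my.media.youtube import get_watched
        watched = get_watched()
    except Exception as e:
        pytest.skip(f'takeout data not available: {e}')
    assert len(watched) > 1000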