takeout module; make more resilient to multipart
commit 56f64c16db
parent 232d62b3b7

3 changed files with 45 additions and 17 deletions
my/media/youtube.py
@@ -6,16 +6,7 @@ from pathlib import Path
 from kython.ktakeout import TakeoutHTMLParser
 from kython.kompress import open as kopen
 
-from ..common import get_files
-
-from mycfg import paths
-
-
-def _get_last_takeout():
-    # TODO FIXME might be a good idea to merge across multiple taekouts...
-    # perhaps even a special takeout module that deals with all of this automatically?
-    # e.g. accumulate, filter and maybe report useless takeouts?
-    return max(get_files(paths.google.takeout_path, glob='*.zip'))
+from ..takeout import get_last_takeout
 
 
 class Watched(NamedTuple):
@@ -29,7 +20,8 @@ class Watched(NamedTuple):
 
 
 def get_watched():
-    last = _get_last_takeout()
+    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    last = get_last_takeout(path=path)
 
     watches: List[Watched] = []
     def cb(dt, url, title):
@@ -37,18 +29,13 @@ def get_watched():
 
     parser = TakeoutHTMLParser(cb)
 
-    with kopen(last, 'Takeout/My Activity/YouTube/MyActivity.html') as fo:
+    with kopen(last, path) as fo:
         dd = fo.read().decode('utf8')
         parser.feed(dd)
 
     return list(sorted(watches, key=lambda e: e.when))
 
 
-def test():
-    watched = get_watched()
-    assert len(watched) > 1000
-
-
 def main():
     # TODO shit. a LOT of watches...
     for w in get_watched():
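The reason `path` is threaded through to `get_last_takeout` is that with a multipart takeout only one of the zip parts may actually contain `MyActivity.html`, so blindly taking the newest archive (as the old `_get_last_takeout` did) could pick a part without it. A minimal usage sketch, assuming `mycfg` points at a directory of takeout zips:

# hypothetical usage of the helper added in this commit
from my.takeout import get_last_takeout

path = 'Takeout/My Activity/YouTube/MyActivity.html'
last = get_last_takeout(path=path)  # newest archive that actually contains `path`
print(last)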
my/takeout.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+from pathlib import Path
+from typing import Optional
+
+from .common import get_files
+
+from mycfg import paths
+
+from kython.kompress import open as kopen
+
+def get_last_takeout(*, path: Optional[str]=None) -> Path:
+    """
+    Ok, sometimes google splits takeout into two zip archives.
+    I guess I could detect it (they've got 001/002 etc suffixes), but for now that works fine..
+    """
+    for takeout in reversed(get_files(paths.google.takeout_path, glob='*.zip')):
+        if path is None:
+            return takeout
+        else:
+            try:
+                kopen(takeout, path)
+                return takeout
+            except Exception:
+                # TODO eh, a bit horrible, but works for now..
+                continue
+    raise RuntimeError(f'Not found: {path}')
+
+# TODO might be a good idea to merge across multiple takeouts...
+# perhaps even a special takeout module that deals with all of this automatically?
+# e.g. accumulate, filter and maybe report useless takeouts?
+
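The docstring notes that split takeouts carry 001/002-style suffixes. A minimal sketch of detecting those groups, not part of this commit, assuming archive names end in `-001.zip`, `-002.zip`, etc.:

import re
from pathlib import Path
from typing import Dict, List

def group_multipart(zips: List[Path]) -> Dict[str, List[Path]]:
    # bucket archives by their name with any trailing part number stripped
    groups: Dict[str, List[Path]] = {}
    for z in zips:
        base = re.sub(r'-\d{3}$', '', z.stem)
        groups.setdefault(base, []).append(z)
    # sort so -001 comes before -002 within each logical takeout
    return {name: sorted(parts) for name, parts in groups.items()}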
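As for the merging TODO, one rough shape for it: walk every archive that contains a given file, oldest first, and let the caller accumulate and deduplicate. Purely illustrative; `iter_takeouts_with` is a hypothetical helper, not part of this commit:

from pathlib import Path
from typing import Iterable, Iterator

from kython.kompress import open as kopen  # same helper the module already uses

def iter_takeouts_with(zips: Iterable[Path], path: str) -> Iterator[Path]:
    # yield each takeout archive that actually contains `path`
    for takeout in zips:  # assumed sorted oldest to newest
        try:
            kopen(takeout, path)
        except Exception:
            continue  # e.g. this part of a multipart takeout lacks `path`
        yield takeout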
tests/youtube.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+# TODO move elsewhere?
+
+# these tests would only make sense with some existing data? although some of them would work for everyone..
+# not sure what's a good way of handling this..
+
+from my.media.youtube import get_watched
+
+
+def test():
+    watched = get_watched()
+    assert len(watched) > 1000
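One common answer to the "only makes sense with existing data" question above is to skip rather than fail when the data is unavailable. A sketch, not in this commit, using pytest:

import pytest

def test_watched():
    # skip data-dependent tests instead of failing on machines without takeouts
    try:
        from my.media.youtube import get_watched
        watched = get_watched()
    except Exception as e:
        pytest.skip(f'takeout data not available: {e}')
    assert len(watched) > 1000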