From 60ccca52ad7ed1a64d7e051c3b9a1faa9f9fc45f Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Fri, 24 Apr 2020 15:57:44 +0100
Subject: [PATCH] more takeout tweaks and comments

---
 my/location/takeout.py |  2 +-
 my/media/youtube.py    |  5 ++++-
 my/takeout.py          | 21 ++++++++++++---------
 tests/takeout.py       | 18 ++++++++++++++++++
 4 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 tests/takeout.py

diff --git a/my/location/takeout.py b/my/location/takeout.py
index 3441f73..79ad25c 100644
--- a/my/location/takeout.py
+++ b/my/location/takeout.py
@@ -27,7 +27,7 @@ from ..takeout import get_last_takeout
 from ..kython import kompress
 
 
-logger = LazyLogger(__package__)
+logger = LazyLogger(__name__)
 
 
 def cache_path(*args, **kwargs):
diff --git a/my/media/youtube.py b/my/media/youtube.py
index 2050be3..4e23f5b 100755
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@@ -20,7 +20,9 @@ class Watched(NamedTuple):
 
 
 def get_watched():
-    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    # TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
+    path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
+    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
     last = get_last_takeout(path=path)
 
     watches: List[Watched] = []
@@ -33,6 +35,7 @@ def get_watched():
         dd = fo.read().decode('utf8')
         parser.feed(dd)
 
+    # TODO hmm they already come sorted.. wonder if should just rely on it..
     return list(sorted(watches, key=lambda e: e.when))
 
 
diff --git a/my/takeout.py b/my/takeout.py
index 592f439..26404eb 100644
--- a/my/takeout.py
+++ b/my/takeout.py
@@ -1,24 +1,27 @@
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Iterable
 
 from .common import get_files
 from .kython.kompress import kopen, kexists
 
 from my.config import google as config
 
 
-def get_last_takeout(*, path: Optional[str]=None) -> Path:
+def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
     """
-    Ok, sometimes google splits takeout into two zip archives
-    I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine..
+    Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
     """
     # TODO FIXME zip is not great..
     # allow a lambda expression? that way the user could restrict it
-    for takeout in reversed(get_files(config.takeout_path, glob='*.zip')):
+    for takeout in get_files(config.takeout_path, glob='*.zip'):
         if path is None or kexists(takeout, path):
-            return takeout
-        else:
-            continue
-    raise RuntimeError(f'Not found: {path}')
+            yield takeout
+
+
+def get_last_takeout(*, path: Optional[str]=None) -> Path:
+    # TODO more_itertools?
+    matching = list(get_takeouts(path=path))
+    return matching[-1]
+
 # TODO might be a good idea to merge across multiple takeouts...
 # perhaps even a special takeout module that deals with all of this automatically?
diff --git a/tests/takeout.py b/tests/takeout.py
new file mode 100644
index 0000000..d7bd3ca
--- /dev/null
+++ b/tests/takeout.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+from itertools import islice
+
+from my.core.cachew import disable_cachew
+disable_cachew()
+
+import my.location.takeout as LT
+
+
+def ilen(it):
+    # TODO more_itertools?
+    return len(list(it))
+
+
+def test_location_perf():
+    # 2.80 s for 10 iterations and 10K points
+    # TODO try switching to jq and see how it goes? not sure..
+    print(ilen(islice(LT.iter_locations(), 0, 10000)))
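
Not part of the patch, just a rough usage sketch of the reworked my.takeout API above; it assumes my.config.google.takeout_path is configured and that at least one of the zips there contains the requested file:

    from my.takeout import get_takeouts, get_last_takeout

    # the same path the youtube module asks for
    path = 'Takeout/My Activity/YouTube/MyActivity.html'

    # every archive containing the file, in the order get_files() returns them
    takeouts = list(get_takeouts(path=path))

    # the last matching archive in that order (equivalent to the old behaviour
    # as long as get_files yields them sorted); note that an empty match now
    # surfaces as IndexError from matching[-1] instead of the old RuntimeError
    last = get_last_takeout(path=path)
    print(last)

Splitting the lookup into a generator plus a thin wrapper keeps the old single-archive behaviour while letting callers see every matching archive, which the trailing TODO about merging across takeouts would need.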