From 21e82f0cd66767fcaae3206f172049fb459eccd5 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Fri, 24 Apr 2020 15:19:31 +0100
Subject: [PATCH 1/6] add disable_cachew helper

---
 my/core/cachew.py | 29 +++++++++++++++++++++++++++++
 my/core/time.py   |  2 +-
 my/takeout.py     | 17 ++++++-----------
 3 files changed, 36 insertions(+), 12 deletions(-)
 create mode 100644 my/core/cachew.py

diff --git a/my/core/cachew.py b/my/core/cachew.py
new file mode 100644
index 0000000..551527a
--- /dev/null
+++ b/my/core/cachew.py
@@ -0,0 +1,29 @@
+'''
+# TODO this probably belongs to cachew? or cachew.experimental
+'''
+from contextlib import contextmanager
+
+
+def disable_cachew():
+    '''
+    NOTE: you need to use it before importing any function using @cachew.cachew
+    '''
+    # TODO not sure... maybe it should instead use some hook.. it's a bit ugly to do
+    import cachew
+
+    @cachew.doublewrap
+    def cachew_off(func=None, *args, **kwargs):
+        return func
+    old = cachew.cachew
+    cachew.cachew = cachew_off
+    return old
+
+
+@contextmanager
+def disabled_cachew():
+    import cachew
+    old = disable_cachew()
+    try:
+        yield
+    finally:
+        cachew.cachew = old
diff --git a/my/core/time.py b/my/core/time.py
index d34ebf8..2c642d6 100644
--- a/my/core/time.py
+++ b/my/core/time.py
@@ -11,6 +11,6 @@ tz_lookup = {
 tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
 
 
-@lru_cache(-1)
+@lru_cache(None)
 def abbr_to_timezone(abbr: str):
     return tz_lookup[abbr]
diff --git a/my/takeout.py b/my/takeout.py
index 64dbcda..592f439 100644
--- a/my/takeout.py
+++ b/my/takeout.py
@@ -2,30 +2,25 @@ from pathlib import Path
 from typing import Optional
 
 from .common import get_files
+from .kython.kompress import kopen, kexists
 
 from my.config import google as config
 
-from .kython.kompress import kopen
-
 def get_last_takeout(*, path: Optional[str]=None) -> Path:
     """
     Ok, sometimes google splits takeout into two zip archives
     I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine..
     """
+    # TODO FIXME zip is not great..
+    # allow a lambda expression? that way the user could restrict it
     for takeout in reversed(get_files(config.takeout_path, glob='*.zip')):
-        if path is None:
+        if path is None or kexists(takeout, path):
             return takeout
         else:
-            try:
-                kopen(takeout, path)
-                return takeout
-            except:
-                # TODO eh, a bit horrible, but works for now..
-                # TODO move ot kompress? 'kexists'?
-                continue
+            continue
     raise RuntimeError(f'Not found: {path}')
 
-# TODO might be a good idea to merge across multiple taekouts...
+# TODO might be a good idea to merge across multiple takeouts...
 # perhaps even a special takeout module that deals with all of this automatically?
 # e.g. accumulate, filter and maybe report useless takeouts?
 

From 60ccca52ad7ed1a64d7e051c3b9a1faa9f9fc45f Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Fri, 24 Apr 2020 15:57:44 +0100
Subject: [PATCH 2/6] more takeout tweaks and comments

---
 my/location/takeout.py |  2 +-
 my/media/youtube.py    |  5 ++++-
 my/takeout.py          | 21 ++++++++++++---------
 tests/takeout.py       | 18 ++++++++++++++++++
 4 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 tests/takeout.py

diff --git a/my/location/takeout.py b/my/location/takeout.py
index 3441f73..79ad25c 100644
--- a/my/location/takeout.py
+++ b/my/location/takeout.py
@@ -27,7 +27,7 @@ from ..takeout import get_last_takeout
 from ..kython import kompress
 
 
-logger = LazyLogger(__package__)
+logger = LazyLogger(__name__)
 
 
 def cache_path(*args, **kwargs):
diff --git a/my/media/youtube.py b/my/media/youtube.py
index 2050be3..4e23f5b 100755
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@@ -20,7 +20,9 @@ class Watched(NamedTuple):
 
 
 def get_watched():
-    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    # TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
+    path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
+    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
     last = get_last_takeout(path=path)
 
     watches: List[Watched] = []
@@ -33,6 +35,7 @@ def get_watched():
         dd = fo.read().decode('utf8')
         parser.feed(dd)
 
+    # TODO hmm they already come sorted.. wonder if should just rely on it..
     return list(sorted(watches, key=lambda e: e.when))
 
 
diff --git a/my/takeout.py b/my/takeout.py
index 592f439..26404eb 100644
--- a/my/takeout.py
+++ b/my/takeout.py
@@ -1,24 +1,27 @@
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Iterable
 
 from .common import get_files
 from .kython.kompress import kopen, kexists
 
 from my.config import google as config
 
-def get_last_takeout(*, path: Optional[str]=None) -> Path:
+def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
     """
-    Ok, sometimes google splits takeout into two zip archives
-    I guess I could detect it (they've got 001/002 etc suffixes), but fornow that works fine..
+    Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
     """
     # TODO FIXME zip is not great..
     # allow a lambda expression? that way the user could restrict it
-    for takeout in reversed(get_files(config.takeout_path, glob='*.zip')):
+    for takeout in get_files(config.takeout_path, glob='*.zip'):
         if path is None or kexists(takeout, path):
-            return takeout
-        else:
-            continue
-    raise RuntimeError(f'Not found: {path}')
+            yield takeout
+
+
+def get_last_takeout(*, path: Optional[str]=None) -> Path:
+    # TODO more_itertools?
+    matching = list(get_takeouts(path=path))
+    return matching[-1]
+
 
 # TODO might be a good idea to merge across multiple takeouts...
 # perhaps even a special takeout module that deals with all of this automatically?
diff --git a/tests/takeout.py b/tests/takeout.py
new file mode 100644
index 0000000..d7bd3ca
--- /dev/null
+++ b/tests/takeout.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+from itertools import islice
+
+from my.core.cachew import disable_cachew
+disable_cachew()
+
+import my.location.takeout as LT
+
+
+def ilen(it):
+    # TODO more_itertools?
+    return len(list(it))
+
+
+def test_location_perf():
+    # 2.80 s for 10 iterations and 10K points
+    # TODO try switching to jq and see how it goes? not sure..
+    print(ilen(islice(LT.iter_locations(), 0, 10000)))

From adadffef16263585dade3752a3d1fd33a7955690 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Fri, 24 Apr 2020 16:11:19 +0100
Subject: [PATCH 3/6] add takeout parser test

---
 tests/takeout.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/takeout.py b/tests/takeout.py
index d7bd3ca..bbe6271 100644
--- a/tests/takeout.py
+++ b/tests/takeout.py
@@ -5,6 +5,7 @@ from my.core.cachew import disable_cachew
 disable_cachew()
 
 import my.location.takeout as LT
+from my.kython.kompress import kopen
 
 
 def ilen(it):
@@ -16,3 +17,23 @@ def test_location_perf():
     # 2.80 s for 10 iterations and 10K points
     # TODO try switching to jq and see how it goes? not sure..
     print(ilen(islice(LT.iter_locations(), 0, 10000)))
+
+
+def test_parser():
+    from my.kython.ktakeout import TakeoutHTMLParser
+    from my.takeout import get_last_takeout
+
+    # 4s for parsing with HTMLParser (30K results)
+    path = 'Takeout/My Activity/Chrome/MyActivity.html'
+    tpath = get_last_takeout(path=path)
+
+    results = []
+    def cb(dt, url, title):
+        results.append((dt, url, title))
+
+    parser = TakeoutHTMLParser(cb)
+
+    with kopen(tpath, path) as fo:
+        dd = fo.read().decode('utf8')
+        parser.feed(dd)
+    print(len(results))

From 810fe218393491749121fe29f4dc798f7ca9a117 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Fri, 24 Apr 2020 16:35:20 +0100
Subject: [PATCH 4/6] attempt to use xmllint to speed up takeout parsing

---
 tests/takeout.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/takeout.py b/tests/takeout.py
index bbe6271..6f7c8d8 100644
--- a/tests/takeout.py
+++ b/tests/takeout.py
@@ -37,3 +37,24 @@ def test_parser():
         dd = fo.read().decode('utf8')
         parser.feed(dd)
     print(len(results))
+
+
+def parse_takeout_xmllint(data: str):
+    # without xmllint (splitting by '<div class="content-cell' -- 0.68 secs)
+    # with xmllint -- 2 seconds
+    # using html.parser -- 4 seconds (+ all the parsing etc)
+    # not *that* much opportunity to speedup I guess
+    # the only downside is that html.parser isn't iterative.. might be able to hack with some iternal hacks?
+    # wonder what's the bottleneck..
+    #
+    from subprocess import Popen, PIPE, run
+    from more_itertools import split_before
+    res = run(
+        ['xmllint', '--html', '--xpath', '//div[contains(@class, "content-cell")]', '-'],
+        input=data.encode('utf8'),
+        check=True,
+        stdout=PIPE,
+    )
+    out = res.stdout.decode('utf8')
+    # out = data
+    return out.split('<div class="content-cell')

From d1aa4d19dcfe82a0f99bf26b7611d1b2163ed645 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Fri, 24 Apr 2020 17:01:06 +0100
Subject: [PATCH 5/6] get rid of callbacks in takeout processing interface

---
 my/kython/ktakeout.py | 22 ++++++++++++++++++++--
 my/media/youtube.py   | 13 ++-----------
 my/takeout.py         |  1 +
 tests/takeout.py      | 29 +++++++++++++++++------------
 4 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/my/kython/ktakeout.py b/my/kython/ktakeout.py
index 96a3f58..30688e3 100644
--- a/my/kython/ktakeout.py
+++ b/my/kython/ktakeout.py
@@ -3,7 +3,7 @@ import re
 from pathlib import Path
 from datetime import datetime
 from html.parser import HTMLParser
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
 from collections import OrderedDict
 from urllib.parse import unquote
 import pytz
@@ -49,10 +49,15 @@ class State(Enum):
     PARSING_DATE = 3
 
 
+Url = str
+Title = str
+Parsed = Tuple[datetime, Url, Title]
+Callback = Callable[[datetime, Url, Title], None]
+
 
 # would be easier to use beautiful soup, but ends up in a big memory footprint..
 class TakeoutHTMLParser(HTMLParser):
-    def __init__(self, callback) -> None:
+    def __init__(self, callback: Callback) -> None:
         super().__init__()
         self.state: State = State.OUTSIDE
 
@@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):
 
             self.state = State.OUTSIDE
             return
+
+
+def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
+    from .kompress import kopen
+    results: List[Parsed] = []
+    def cb(dt: datetime, url: Url, title: Title) -> None:
+        results.append((dt, url, title))
+    parser = TakeoutHTMLParser(callback=cb)
+    with kopen(tpath, file) as fo:
+        # TODO careful, what if it's a string already? make asutf method?
+        data = fo.read().decode('utf8')
+        parser.feed(data)
+    return results
diff --git a/my/media/youtube.py b/my/media/youtube.py
index 4e23f5b..6331190 100755
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@@ -2,10 +2,7 @@
 from datetime import datetime
 from typing import NamedTuple, List
 
-# TODO ugh. reuse it in mypkg/releaste takeout parser separately?
-from ..kython.ktakeout import TakeoutHTMLParser
-
-from ..kython.kompress import kopen
+from ..kython.ktakeout import read_html
 from ..takeout import get_last_takeout
 
 
@@ -26,15 +23,9 @@ def get_watched():
     last = get_last_takeout(path=path)
 
     watches: List[Watched] = []
-    def cb(dt, url, title):
+    for dt, url, title in read_html(last, path):
         watches.append(Watched(url=url, title=title, when=dt))
 
-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(last, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
-
     # TODO hmm they already come sorted.. wonder if should just rely on it..
     return list(sorted(watches, key=lambda e: e.when))
 
diff --git a/my/takeout.py b/my/takeout.py
index 26404eb..e38e493 100644
--- a/my/takeout.py
+++ b/my/takeout.py
@@ -3,6 +3,7 @@ from typing import Optional, Iterable
 
 from .common import get_files
 from .kython.kompress import kopen, kexists
+from .kython.ktakeout import read_html
 
 from my.config import google as config
 
diff --git a/tests/takeout.py b/tests/takeout.py
index 6f7c8d8..6acca9b 100644
--- a/tests/takeout.py
+++ b/tests/takeout.py
@@ -19,30 +19,35 @@ def test_location_perf():
     print(ilen(islice(LT.iter_locations(), 0, 10000)))
 
 
-def test_parser():
-    from my.kython.ktakeout import TakeoutHTMLParser
+# in theory should support any HTML takeout file?
+# although IIRC bookmarks and search-history.html weren't working
+import pytest # type: ignore
+@pytest.mark.parametrize(
+    'path', [
+        'YouTube/history/watch-history.html',
+        'My Activity/YouTube/MyActivity.html',
+        'My Activity/Chrome/MyActivity.html',
+        'My Activity/Search/MyActivity.html',
+    ]
+)
+def test_parser(path: str):
+    path = 'Takeout/' + path
+    from my.kython.ktakeout import read_html
     from my.takeout import get_last_takeout
 
-    # 4s for parsing with HTMLParser (30K results)
-    path = 'Takeout/My Activity/Chrome/MyActivity.html'
     tpath = get_last_takeout(path=path)
 
     results = []
-    def cb(dt, url, title):
-        results.append((dt, url, title))
+    for res in read_html(tpath, path):
+        results.append(res)
 
-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(tpath, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
     print(len(results))
 
 
 def parse_takeout_xmllint(data: str):
     # without xmllint (splitting by '<div class="content-cell' -- 0.68 secs)
     # with xmllint -- 2 seconds
-    # using html.parser -- 4 seconds (+ all the parsing etc)
+    # using html.parser -- 4 seconds (+ all the parsing etc), 30K results
     # not *that* much opportunity to speedup I guess
     # the only downside is that html.parser isn't iterative.. might be able to hack with some iternal hacks?
     # wonder what's the bottleneck..

From a84b51807fc0b9fb62cd5d063252aad1e16288da Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Fri, 24 Apr 2020 18:10:33 +0100
Subject: [PATCH 6/6] move takeout to a separate subpackage

---
 my/{kython/ktakeout.py => google/takeout/html.py} | 4 ++--
 my/{takeout.py => google/takeout/paths.py}        | 5 ++---
 my/location/takeout.py                            | 2 +-
 my/media/youtube.py                               | 4 ++--
 tests/takeout.py                                  | 4 ++--
 5 files changed, 9 insertions(+), 10 deletions(-)
 rename my/{kython/ktakeout.py => google/takeout/html.py} (98%)
 rename my/{takeout.py => google/takeout/paths.py} (89%)

diff --git a/my/kython/ktakeout.py b/my/google/takeout/html.py
similarity index 98%
rename from my/kython/ktakeout.py
rename to my/google/takeout/html.py
index 30688e3..2fccee9 100644
--- a/my/kython/ktakeout.py
+++ b/my/google/takeout/html.py
@@ -8,7 +8,7 @@ from collections import OrderedDict
 from urllib.parse import unquote
 import pytz
 
-from ..core.time import abbr_to_timezone
+from ...core.time import abbr_to_timezone
 
 # Mar 8, 2018, 5:14:40 PM
 _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
@@ -126,7 +126,7 @@ class TakeoutHTMLParser(HTMLParser):
 
 
 def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
-    from .kompress import kopen
+    from ...kython.kompress import kopen
     results: List[Parsed] = []
     def cb(dt: datetime, url: Url, title: Title) -> None:
         results.append((dt, url, title))
diff --git a/my/takeout.py b/my/google/takeout/paths.py
similarity index 89%
rename from my/takeout.py
rename to my/google/takeout/paths.py
index e38e493..312e2f4 100644
--- a/my/takeout.py
+++ b/my/google/takeout/paths.py
@@ -1,9 +1,8 @@
 from pathlib import Path
 from typing import Optional, Iterable
 
-from .common import get_files
-from .kython.kompress import kopen, kexists
-from .kython.ktakeout import read_html
+from ...common import get_files
+from ...kython.kompress import kopen, kexists
 
 from my.config import google as config
 
diff --git a/my/location/takeout.py b/my/location/takeout.py
index 79ad25c..da53664 100644
--- a/my/location/takeout.py
+++ b/my/location/takeout.py
@@ -23,7 +23,7 @@ except:
     import ijson # type: ignore
 
 from ..common import get_files, LazyLogger, mcachew
-from ..takeout import get_last_takeout
+from ..google.takeout.paths import get_last_takeout
 from ..kython import kompress
 
 
diff --git a/my/media/youtube.py b/my/media/youtube.py
index 6331190..ffe2740 100755
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@@ -2,8 +2,8 @@
 from datetime import datetime
 from typing import NamedTuple, List
 
-from ..kython.ktakeout import read_html
-from ..takeout import get_last_takeout
+from ..google.takeout.html import read_html
+from ..google.takeout.paths import get_last_takeout
 
 
 class Watched(NamedTuple):
diff --git a/tests/takeout.py b/tests/takeout.py
index 6acca9b..918582f 100644
--- a/tests/takeout.py
+++ b/tests/takeout.py
@@ -32,8 +32,8 @@ import pytest # type: ignore
 )
 def test_parser(path: str):
     path = 'Takeout/' + path
-    from my.kython.ktakeout import read_html
-    from my.takeout import get_last_takeout
+    from my.google.takeout.html import read_html
+    from my.google.takeout.paths import get_last_takeout
 
     tpath = get_last_takeout(path=path)