more takeout to a separate subpackage

Dima Gerasimov 2020-04-24 18:10:33 +01:00
parent d1aa4d19dc
commit a84b51807f
5 changed files with 9 additions and 10 deletions

138 my/google/takeout/html.py Normal file

@@ -0,0 +1,138 @@
from enum import Enum
import re
from pathlib import Path
from datetime import datetime
from html.parser import HTMLParser
from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
from collections import OrderedDict
from urllib.parse import unquote

import pytz

from ...core.time import abbr_to_timezone

# Mar 8, 2018, 5:14:40 PM
_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"

# ugh. something is seriously wrong with datetime, it wouldn't parse timezone-aware UTC timestamps :(
def parse_dt(s: str) -> datetime:
    fmt = _TIME_FORMAT
    # ugh. https://bugs.python.org/issue22377 %Z doesn't work properly
    end = s[-3:]
    tz: Any  # meh
    if end == ' PM' or end == ' AM':
        # old takeouts didn't have a timezone
        # hopefully it was utc? Legacy, so not that much of an issue anymore..
        tz = pytz.utc
    else:
        s, tzabbr = s.rsplit(maxsplit=1)
        tz = abbr_to_timezone(tzabbr)
    dt = datetime.strptime(s, fmt)
    dt = tz.localize(dt)
    return dt


def test_parse_dt():
    parse_dt('Jun 23, 2015, 2:43:45 PM')
    parse_dt('Jan 25, 2019, 8:23:48 AM GMT')
    parse_dt('Jan 22, 2020, 8:34:00 PM UTC')
    parse_dt('Sep 10, 2019, 8:51:45 PM MSK')
class State(Enum):
    OUTSIDE = 0
    INSIDE = 1
    PARSING_LINK = 2
    PARSING_DATE = 3


Url = str
Title = str
Parsed = Tuple[datetime, Url, Title]
Callback = Callable[[datetime, Url, Title], None]


# would be easier to use beautiful soup, but ends up in a big memory footprint..
class TakeoutHTMLParser(HTMLParser):
    def __init__(self, callback: Callback) -> None:
        super().__init__()
        self.state: State = State.OUTSIDE
        self.title_parts: List[str] = []
        self.title: Optional[str] = None
        self.url: Optional[str] = None
        self.callback = callback

    # enter content cell -> scan link -> scan date -> finish till next content cell
    def handle_starttag(self, tag, attrs):
        if self.state == State.INSIDE and tag == 'a':
            self.state = State.PARSING_LINK

            attrs = OrderedDict(attrs)
            hr = attrs['href']

            # sometimes it starts with this prefix; apparently clicks from google search? or visits from the chrome address bar? who knows...
            # TODO handle http?
            prefix = r'https://www.google.com/url?q='
            if hr.startswith(prefix + "http"):
                hr = hr[len(prefix):]
                hr = unquote(hr)  # TODO not sure about that...

            assert self.url is None; self.url = hr
    def handle_endtag(self, tag):
        if self.state == State.PARSING_LINK and tag == 'a':
            assert self.title is None
            assert len(self.title_parts) > 0
            self.title = ''.join(self.title_parts)
            self.title_parts = []
            self.state = State.PARSING_DATE

    # search example:
    #   Visited Emmy Noether - Wikipedia
    #   Dec 17, 2018, 8:16:18 AM UTC

    # youtube example:
    #   Watched Jamie xx - Gosh
    #   JamiexxVEVO
    #   Jun 21, 2018, 5:48:34 AM
    #   Products:
    #     YouTube
    def handle_data(self, data):
        if self.state == State.OUTSIDE:
            if data[:-1].strip() in ("Watched", "Visited"):
                self.state = State.INSIDE
            return

        if self.state == State.PARSING_LINK:
            self.title_parts.append(data)
            return

        # TODO extracting the channel as part of wereyouhere could be useful as well
        # need to check with a regex because there might be some stuff in between
        if self.state == State.PARSING_DATE and re.search(r'\d{4}.*:.*:', data):
            time = parse_dt(data.strip())
            assert time.tzinfo is not None

            assert self.url is not None; assert self.title is not None
            self.callback(time, self.url, self.title)
            self.url = None; self.title = None

            self.state = State.OUTSIDE
            return
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
    from ...kython.kompress import kopen
    results: List[Parsed] = []
    def cb(dt: datetime, url: Url, title: Title) -> None:
        results.append((dt, url, title))
    parser = TakeoutHTMLParser(callback=cb)
    with kopen(tpath, file) as fo:
        # TODO careful, what if it's a string already? make an asutf method?
        data = fo.read().decode('utf8')
        parser.feed(data)
    return results
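
For a sense of how the parser is driven, here is a minimal usage sketch (not part of this commit; the HTML fragment is a synthetic, simplified stand-in for the real takeout markup):

    from my.google.takeout.html import TakeoutHTMLParser

    collected = []
    parser = TakeoutHTMLParser(callback=lambda dt, url, title: collected.append((dt, url, title)))
    # one synthetic 'content cell': activity verb, then the link, then the timestamp
    parser.feed(
        'Visited <a href="https://en.wikipedia.org/wiki/Emmy_Noether">Emmy Noether - Wikipedia</a>'
        ' Dec 17, 2018, 8:16:18 AM UTC'
    )
    print(collected)  # [(timezone-aware datetime, the href, 'Emmy Noether - Wikipedia')]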

29 my/google/takeout/paths.py Normal file

@@ -0,0 +1,29 @@
from pathlib import Path
from typing import Optional, Iterable

from ...common import get_files
from ...kython.kompress import kopen, kexists

from my.config import google as config


def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
    """
    Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
    """
    # TODO FIXME zip is not great..
    # allow a lambda expression? that way the user could restrict it
    for takeout in get_files(config.takeout_path, glob='*.zip'):
        if path is None or kexists(takeout, path):
            yield takeout


def get_last_takeout(*, path: Optional[str]=None) -> Path:
    # TODO more_itertools?
    matching = list(get_takeouts(path=path))
    return matching[-1]


# TODO might be a good idea to merge across multiple takeouts...
# perhaps even a special takeout module that deals with all of this automatically?
# e.g. accumulate, filter and maybe report useless takeouts?
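
Taken together, the intended flow is presumably: pick the newest archive that actually contains the activity log, then parse it. A hedged sketch (the MyActivity path below is an assumption about the usual takeout layout, not something this commit pins down):

    from my.google.takeout.paths import get_last_takeout
    from my.google.takeout.html import read_html

    # assumption: typical location of the search activity log inside the archive
    activity = 'Takeout/My Activity/Search/MyActivity.html'

    takeout = get_last_takeout(path=activity)  # newest archive containing that file
    for dt, url, title in read_html(takeout, activity):
        print(dt, url, title)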