From 4e09c5669e3276a4facdad86d4341c1ca8493f96 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 24 Mar 2020 21:05:15 +0000 Subject: [PATCH] import ktakeout from kython --- my/kython/ktakeout.py | 126 ++++++++++++++++++++++++++++++++++++++++++ my/media/youtube.py | 4 +- 2 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 my/kython/ktakeout.py diff --git a/my/kython/ktakeout.py b/my/kython/ktakeout.py new file mode 100644 index 0000000..513d4d6 --- /dev/null +++ b/my/kython/ktakeout.py @@ -0,0 +1,126 @@ +from enum import Enum +import re +from pathlib import Path +from datetime import datetime +from html.parser import HTMLParser +from typing import List, Dict, Optional, Any +from collections import OrderedDict +from urllib.parse import unquote +import pytz + +# Mar 8, 2018, 5:14:40 PM +_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p" + + +# https://gist.github.com/edwardabraham/8680198 +tz_lookup = { + pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x) + for x in pytz.all_timezones +} +tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu... + + +# ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :( +def parse_dt(s: str) -> datetime: + fmt = _TIME_FORMAT + # + # ugh. https://bugs.python.org/issue22377 %Z doesn't work properly + + end = s[-3:] + tz: Any # meh + if end == ' PM' or end == ' AM': + # old takeouts didn't have timezone + # hopefully it was utc? Legacy, so no that much of an issue anymore.. + tz = pytz.utc + else: + s, tzname = s.rsplit(maxsplit=1) + tz = tz_lookup[tzname] + + dt = datetime.strptime(s, fmt) + dt = tz.localize(dt) + return dt + + +def test_parse_dt(): + parse_dt('Jun 23, 2015, 2:43:45 PM') + parse_dt('Jan 25, 2019, 8:23:48 AM GMT') + parse_dt('Jan 22, 2020, 8:34:00 PM UTC') + parse_dt('Sep 10, 2019, 8:51:45 PM MSK') + + +class State(Enum): + OUTSIDE = 0 + INSIDE = 1 + PARSING_LINK = 2 + PARSING_DATE = 3 + + + +# would be easier to use beautiful soup, but ends up in a big memory footprint.. +class TakeoutHTMLParser(HTMLParser): + def __init__(self, callback) -> None: + super().__init__() + self.state: State = State.OUTSIDE + + self.title_parts: List[str] = [] + self.title: Optional[str] = None + self.url: Optional[str] = None + + self.callback = callback + + # enter content cell -> scan link -> scan date -> finish till next content cell + def handle_starttag(self, tag, attrs): + if self.state == State.INSIDE and tag == 'a': + self.state = State.PARSING_LINK + attrs = OrderedDict(attrs) + hr = attrs['href'] + + # sometimes it's starts with this prefix, it's apparently clicks from google search? or visits from chrome address line? who knows... + # TODO handle http? + prefix = r'https://www.google.com/url?q=' + if hr.startswith(prefix + "http"): + hr = hr[len(prefix):] + hr = unquote(hr) # TODO not sure about that... + assert self.url is None; self.url = hr + + def handle_endtag(self, tag): + if self.state == State.PARSING_LINK and tag == 'a': + assert self.title is None + assert len(self.title_parts) > 0 + self.title = ''.join(self.title_parts) + self.title_parts = [] + + self.state = State.PARSING_DATE + + # search example: + # Visited Emmy Noether - Wikipedia + # Dec 17, 2018, 8:16:18 AM UTC + + # youtube example: + # Watched Jamie xx - Gosh + # JamiexxVEVO + # Jun 21, 2018, 5:48:34 AM + # Products: + #  YouTube + def handle_data(self, data): + if self.state == State.OUTSIDE: + if data[:-1].strip() in ("Watched", "Visited"): + self.state = State.INSIDE + return + + if self.state == State.PARSING_LINK: + self.title_parts.append(data) + return + + # TODO extracting channel as part of wereyouhere could be useful as well + # need to check for regex because there might be some stuff in between + if self.state == State.PARSING_DATE and re.search(r'\d{4}.*:.*:', data): + time = parse_dt(data.strip()) + assert time.tzinfo is not None + + assert self.url is not None; assert self.title is not None + self.callback(time, self.url, self.title) + self.url = None; self.title = None + + self.state = State.OUTSIDE + return diff --git a/my/media/youtube.py b/my/media/youtube.py index e8ee57f..2050be3 100755 --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -2,8 +2,8 @@ from datetime import datetime from typing import NamedTuple, List -# TODO FIXME -from kython.ktakeout import TakeoutHTMLParser +# TODO ugh. reuse it in mypkg/releaste takeout parser separately? +from ..kython.ktakeout import TakeoutHTMLParser from ..kython.kompress import kopen from ..takeout import get_last_takeout