import ktakeout from kython
This commit is contained in:
parent
09a0c8eb28
commit
4e09c5669e
2 changed files with 128 additions and 2 deletions
126
my/kython/ktakeout.py
Normal file
126
my/kython/ktakeout.py
Normal file
|
@ -0,0 +1,126 @@
|
|||
from enum import Enum
|
||||
import re
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from html.parser import HTMLParser
|
||||
from typing import List, Dict, Optional, Any
|
||||
from collections import OrderedDict
|
||||
from urllib.parse import unquote
|
||||
import pytz
|
||||
|
||||
# Takeout timestamp format, e.g. "Mar 8, 2018, 5:14:40 PM"
_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"


# Map timezone abbreviations (e.g. 'MSK') back to pytz timezone objects,
# since strptime's %Z can't be relied on for parsing tz names.
# https://gist.github.com/edwardabraham/8680198
tz_lookup = {
    tz.localize(datetime.now()).tzname(): tz
    for tz in map(pytz.timezone, pytz.all_timezones)
}
tz_lookup['UTC'] = pytz.utc  # ugh. otherwise it'z Zulu...
|
||||
|
||||
|
||||
# ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :(
|
||||
def parse_dt(s: str) -> datetime:
    """Parse a takeout timestamp such as 'Mar 8, 2018, 5:14:40 PM UTC'
    into a timezone-aware datetime.

    Entries without a trailing timezone name (old takeouts) are assumed
    to be UTC.
    """
    # strptime's %Z doesn't work properly for parsing timezone names
    # (https://bugs.python.org/issue22377), so the tz suffix is split off
    # manually and resolved through tz_lookup instead.
    tz: Any  # meh
    if s.endswith((' PM', ' AM')):
        # old takeouts didn't have timezone
        # hopefully it was utc? Legacy, so no that much of an issue anymore..
        tz = pytz.utc
    else:
        s, tzname = s.rsplit(maxsplit=1)
        tz = tz_lookup[tzname]

    naive = datetime.strptime(s, _TIME_FORMAT)
    return tz.localize(naive)
|
||||
|
||||
|
||||
def test_parse_dt():
    """Smoke-test parse_dt on samples with and without an explicit timezone."""
    samples = [
        'Jun 23, 2015, 2:43:45 PM',       # legacy: no timezone suffix
        'Jan 25, 2019, 8:23:48 AM GMT',
        'Jan 22, 2020, 8:34:00 PM UTC',
        'Sep 10, 2019, 8:51:45 PM MSK',
    ]
    for sample in samples:
        parse_dt(sample)
|
||||
|
||||
|
||||
class State(Enum):
    """Position of the takeout HTML scan state machine."""
    OUTSIDE = 0       # not inside an activity cell yet
    INSIDE = 1        # saw a 'Watched'/'Visited' marker, expecting the link
    PARSING_LINK = 2  # inside the <a>, accumulating the title text
    PARSING_DATE = 3  # link finished, expecting the timestamp
|
||||
|
||||
|
||||
|
||||
# would be easier to use beautiful soup, but ends up in a big memory footprint..
|
||||
class TakeoutHTMLParser(HTMLParser):
    """Streaming parser for Google Takeout activity HTML exports.

    Scans for 'Watched'/'Visited' entries and invokes
    ``callback(time, url, title)`` for each completed entry.
    A hand-rolled HTMLParser is used instead of BeautifulSoup because the
    latter ends up with a big memory footprint on large takeout files.
    """
    def __init__(self, callback) -> None:
        # callback: called as callback(datetime, url, title) per parsed entry
        super().__init__()
        # current position in the scan; see State for the phases
        self.state: State = State.OUTSIDE

        # title text can arrive in several handle_data chunks; collect then join
        self.title_parts: List[str] = []
        self.title: Optional[str] = None
        self.url: Optional[str] = None

        self.callback = callback

    # enter content cell -> scan link -> scan date -> finish till next content cell
    def handle_starttag(self, tag, attrs):
        # an <a> inside a content cell carries the watched/visited URL
        if self.state == State.INSIDE and tag == 'a':
            self.state = State.PARSING_LINK
            attrs = OrderedDict(attrs)
            hr = attrs['href']

            # sometimes it starts with this prefix; it's apparently clicks from google search? or visits from chrome address line? who knows...
            # TODO handle http?
            prefix = r'https://www.google.com/url?q='
            if hr.startswith(prefix + "http"):
                hr = hr[len(prefix):]
                hr = unquote(hr) # TODO not sure about that...
            # a leftover url means the previous entry never completed — parsing bug
            assert self.url is None; self.url = hr

    def handle_endtag(self, tag):
        # closing </a> finalizes the title; what follows (until the next cell)
        # is expected to contain the timestamp
        if self.state == State.PARSING_LINK and tag == 'a':
            assert self.title is None
            assert len(self.title_parts) > 0
            self.title = ''.join(self.title_parts)
            self.title_parts = []

            self.state = State.PARSING_DATE

    # search example:
    # Visited Emmy Noether - Wikipedia
    # Dec 17, 2018, 8:16:18 AM UTC

    # youtube example:
    # Watched Jamie xx - Gosh
    # JamiexxVEVO
    # Jun 21, 2018, 5:48:34 AM
    # Products:
    # YouTube
    def handle_data(self, data):
        if self.state == State.OUTSIDE:
            # data[:-1] drops the trailing character after 'Watched'/'Visited'
            # (presumably a separator such as nbsp — TODO confirm against the export)
            if data[:-1].strip() in ("Watched", "Visited"):
                self.state = State.INSIDE
                return

        if self.state == State.PARSING_LINK:
            self.title_parts.append(data)
            return

        # TODO extracting channel as part of wereyouhere could be useful as well
        # need to check for regex because there might be some stuff in between
        if self.state == State.PARSING_DATE and re.search(r'\d{4}.*:.*:', data):
            time = parse_dt(data.strip())
            assert time.tzinfo is not None

            # both pieces must have been collected before the date shows up
            assert self.url is not None; assert self.title is not None
            self.callback(time, self.url, self.title)
            self.url = None; self.title = None

            self.state = State.OUTSIDE
            return
|
|
@ -2,8 +2,8 @@
|
|||
from datetime import datetime
|
||||
from typing import NamedTuple, List
|
||||
|
||||
# TODO FIXME
|
||||
from kython.ktakeout import TakeoutHTMLParser
|
||||
# TODO ugh. reuse it in mypkg/releaste takeout parser separately?
|
||||
from ..kython.ktakeout import TakeoutHTMLParser
|
||||
|
||||
from ..kython.kompress import kopen
|
||||
from ..takeout import get_last_takeout
|
||||
|
|
Loading…
Add table
Reference in a new issue