more takeout to a separate subpackage

Dima Gerasimov 2020-04-24 18:10:33 +01:00
parent d1aa4d19dc
commit a84b51807f
5 changed files with 9 additions and 10 deletions

138 my/google/takeout/html.py Normal file

@@ -0,0 +1,138 @@
from enum import Enum
import re
from pathlib import Path
from datetime import datetime
from html.parser import HTMLParser
from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
from collections import OrderedDict
from urllib.parse import unquote

import pytz

from ...core.time import abbr_to_timezone

# Mar 8, 2018, 5:14:40 PM
_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"

# ugh. something is seriously wrong with datetime, it wouldn't parse timezone-aware UTC timestamps :(
def parse_dt(s: str) -> datetime:
    fmt = _TIME_FORMAT
    # ugh. https://bugs.python.org/issue22377 %Z doesn't work properly
    end = s[-3:]
    tz: Any  # meh
    if end == ' PM' or end == ' AM':
        # old takeouts didn't have a timezone
        # hopefully it was utc? Legacy, so not that much of an issue anymore..
        tz = pytz.utc
    else:
        s, tzabbr = s.rsplit(maxsplit=1)
        tz = abbr_to_timezone(tzabbr)
    dt = datetime.strptime(s, fmt)
    dt = tz.localize(dt)
    return dt


def test_parse_dt():
    parse_dt('Jun 23, 2015, 2:43:45 PM')
    parse_dt('Jan 25, 2019, 8:23:48 AM GMT')
    parse_dt('Jan 22, 2020, 8:34:00 PM UTC')
    parse_dt('Sep 10, 2019, 8:51:45 PM MSK')
class State(Enum):
    OUTSIDE = 0
    INSIDE = 1
    PARSING_LINK = 2
    PARSING_DATE = 3


Url = str
Title = str
Parsed = Tuple[datetime, Url, Title]
Callback = Callable[[datetime, Url, Title], None]


# would be easier to use beautiful soup, but ends up in a big memory footprint..
class TakeoutHTMLParser(HTMLParser):
    def __init__(self, callback: Callback) -> None:
        super().__init__()
        self.state: State = State.OUTSIDE
        self.title_parts: List[str] = []
        self.title: Optional[str] = None
        self.url: Optional[str] = None
        self.callback = callback

    # enter content cell -> scan link -> scan date -> finish till next content cell
    def handle_starttag(self, tag, attrs):
        if self.state == State.INSIDE and tag == 'a':
            self.state = State.PARSING_LINK

            attrs = OrderedDict(attrs)
            hr = attrs['href']

            # sometimes it starts with this prefix; apparently clicks from google search? or visits from the chrome address bar? who knows...
            # TODO handle http?
            prefix = r'https://www.google.com/url?q='
            if hr.startswith(prefix + "http"):
                hr = hr[len(prefix):]
                hr = unquote(hr)  # TODO not sure about that...

            assert self.url is None; self.url = hr
    def handle_endtag(self, tag):
        if self.state == State.PARSING_LINK and tag == 'a':
            assert self.title is None
            assert len(self.title_parts) > 0
            self.title = ''.join(self.title_parts)
            self.title_parts = []
            self.state = State.PARSING_DATE

    # search example:
    #   Visited Emmy Noether - Wikipedia
    #   Dec 17, 2018, 8:16:18 AM UTC

    # youtube example:
    #   Watched Jamie xx - Gosh
    #   JamiexxVEVO
    #   Jun 21, 2018, 5:48:34 AM
    #   Products:
    #     YouTube
    def handle_data(self, data):
        if self.state == State.OUTSIDE:
            if data[:-1].strip() in ("Watched", "Visited"):
                self.state = State.INSIDE
            return

        if self.state == State.PARSING_LINK:
            self.title_parts.append(data)
            return

        # TODO extracting the channel as part of wereyouhere could be useful as well
        # need to check with a regex because there might be some stuff in between
        if self.state == State.PARSING_DATE and re.search(r'\d{4}.*:.*:', data):
            time = parse_dt(data.strip())
            assert time.tzinfo is not None

            assert self.url is not None; assert self.title is not None
            self.callback(time, self.url, self.title)
            self.url = None; self.title = None

            self.state = State.OUTSIDE
            return
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
    from ...kython.kompress import kopen
    results: List[Parsed] = []
    def cb(dt: datetime, url: Url, title: Title) -> None:
        results.append((dt, url, title))
    parser = TakeoutHTMLParser(callback=cb)
    with kopen(tpath, file) as fo:
        # TODO careful, what if it's a string already? make an asutf method?
        data = fo.read().decode('utf8')
        parser.feed(data)
    return results
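
For a sense of how the parser is driven, here is a minimal usage sketch (not part of this commit; the HTML fragment is a synthetic, simplified stand-in for the real takeout markup):

    from my.google.takeout.html import TakeoutHTMLParser

    collected = []
    parser = TakeoutHTMLParser(callback=lambda dt, url, title: collected.append((dt, url, title)))
    # one synthetic 'content cell': activity verb, then the link, then the timestamp
    parser.feed(
        'Visited <a href="https://en.wikipedia.org/wiki/Emmy_Noether">Emmy Noether - Wikipedia</a>'
        ' Dec 17, 2018, 8:16:18 AM UTC'
    )
    print(collected)  # [(timezone-aware datetime, the href, 'Emmy Noether - Wikipedia')]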

29 my/google/takeout/paths.py Normal file

@@ -0,0 +1,29 @@
from pathlib import Path
from typing import Optional, Iterable

from ...common import get_files
from ...kython.kompress import kopen, kexists

from my.config import google as config


def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
    """
    Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need
    """
    # TODO FIXME zip is not great..
    # allow a lambda expression? that way the user could restrict it
    for takeout in get_files(config.takeout_path, glob='*.zip'):
        if path is None or kexists(takeout, path):
            yield takeout


def get_last_takeout(*, path: Optional[str]=None) -> Path:
    # TODO more_itertools?
    matching = list(get_takeouts(path=path))
    return matching[-1]


# TODO might be a good idea to merge across multiple takeouts...
# perhaps even a special takeout module that deals with all of this automatically?
# e.g. accumulate, filter and maybe report useless takeouts?
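
Taken together, the intended flow is presumably: pick the newest archive that actually contains the activity log, then parse it. A hedged sketch (the MyActivity path below is an assumption about the usual takeout layout, not something this commit pins down):

    from my.google.takeout.paths import get_last_takeout
    from my.google.takeout.html import read_html

    # assumption: typical location of the search activity log inside the archive
    activity = 'Takeout/My Activity/Search/MyActivity.html'

    takeout = get_last_takeout(path=activity)  # newest archive containing that file
    for dt, url, title in read_html(takeout, activity):
        print(dt, url, title)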