'''
Google Takeout exports: browsing history, search/youtube/google play activity
'''

from __future__ import annotations

from my.core import __NOT_HPI_MODULE__  # isort: skip

import re
from collections.abc import Iterable
from datetime import datetime
from enum import Enum
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Callable
from urllib.parse import unquote

import pytz

from my.core.time import abbr_to_timezone


# NOTE: https://bugs.python.org/issue22377 %Z doesn't work properly
_TIME_FORMATS = [
    "%b %d, %Y, %I:%M:%S %p",  # Mar 8, 2018, 5:14:40 PM
    "%d %b %Y, %H:%M:%S",  # 6 Oct 2020, 14:32:28
]


# ugh. something is seriously wrong with datetime: it wouldn't parse a timezone-aware UTC timestamp :(
def parse_dt(s: str) -> datetime:
    end = s[-3:]
    tz: Any  # meh
    if end == ' PM' or end == ' AM':
        # old takeouts didn't have a timezone
        # hopefully it was UTC? Legacy format, so not that much of an issue anymore..
        # todo although maybe worth adding timezone from location provider?
        # note: need to use pytz here for the localize call later
        tz = pytz.utc
    else:
        s, tzabbr = s.rsplit(maxsplit=1)
        tz = abbr_to_timezone(tzabbr)

    dt: datetime | None = None
    for fmt in _TIME_FORMATS:
        try:
            dt = datetime.strptime(s, fmt)
            break
        except ValueError:
            continue
    if dt is None:
        raise RuntimeError(f"None of the formats {_TIME_FORMATS} matched {s!r}")
    return tz.localize(dt)


def test_parse_dt() -> None:
    parse_dt('Jun 23, 2015, 2:43:45 PM')
    parse_dt('Jan 25, 2019, 8:23:48 AM GMT')
    parse_dt('Jan 22, 2020, 8:34:00 PM UTC')
    parse_dt('Sep 10, 2019, 8:51:45 PM MSK')

    # these testcases are interesting: in pytz, abbreviation resolution might depend on the _current_ date!
    # so these used to fail during winter
    # you can see all the different values used in the _tzinfos field
    parse_dt('Jun 01, 2018, 11:00:00 PM BST')
    parse_dt('Jun 01, 2018, 11:00:00 PM PDT')
    parse_dt('Feb 01, 2018, 11:00:00 PM PST')

    parse_dt('6 Oct 2020, 14:32:28 PDT')


class State(Enum):
    OUTSIDE = 0
    INSIDE = 1
    PARSING_LINK = 2
    PARSING_DATE = 3


Url = str
Title = str
Parsed = tuple[datetime, Url, Title]
Callback = Callable[[datetime, Url, Title], None]
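# A Callback receives one parsed visit at a time; a minimal sketch of a handler
# (the name on_visit is hypothetical, just to illustrate the signature):
#
#   def on_visit(dt: datetime, url: Url, title: Title) -> None:
#       print(f'{dt.isoformat()} {url} {title}')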


# would be easier to use beautiful soup, but it ends up with a big memory footprint..
class TakeoutHTMLParser(HTMLParser):
    def __init__(self, callback: Callback) -> None:
        super().__init__()
        self.state: State = State.OUTSIDE

        self.title_parts: list[str] = []
        self.title: str | None = None
        self.url: str | None = None

        self.callback = callback

    # enter content cell -> scan link -> scan date -> finish, then wait for the next content cell
    def handle_starttag(self, tag, attrs):
        if self.state == State.INSIDE and tag == 'a':
            self.state = State.PARSING_LINK
            [hr] = (v for k, v in attrs if k == 'href')
            assert hr is not None

            # sometimes it starts with this prefix; apparently these are clicks from google search? or visits from the chrome address bar? who knows...
            # TODO handle http?
            prefix = r'https://www.google.com/url?q='
            if hr.startswith(prefix + "http"):
                hr = hr[len(prefix):]
                hr = unquote(hr)  # TODO not sure about that...
            assert self.url is None; self.url = hr

    def handle_endtag(self, tag):
        if self.state == State.PARSING_LINK and tag == 'a':
            assert self.title is None
            assert len(self.title_parts) > 0
            self.title = ''.join(self.title_parts)
            self.title_parts = []

            self.state = State.PARSING_DATE

    # search example:
    # Visited Emmy Noether - Wikipedia
    # Dec 17, 2018, 8:16:18 AM UTC

    # youtube example:
    # Watched Jamie xx - Gosh
    # JamiexxVEVO
    # Jun 21, 2018, 5:48:34 AM
    # Products:
    #  YouTube
    def handle_data(self, data):
        if self.state == State.OUTSIDE:
            if data[:-1].strip() in ("Watched", "Visited"):
                self.state = State.INSIDE
                return

        if self.state == State.PARSING_LINK:
            self.title_parts.append(data)
            return

        # TODO extracting channel as part of wereyouhere could be useful as well
        # need to check with a regex because there might be some other stuff in between
        if self.state == State.PARSING_DATE and re.search(r'\d{4}.*:.*:', data):
            time = parse_dt(data.strip())
            assert time.tzinfo is not None

            assert self.url is not None; assert self.title is not None
            self.callback(time, self.url, self.title)
            self.url = None; self.title = None

            self.state = State.OUTSIDE
            return


def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
    results: list[Parsed] = []

    def cb(dt: datetime, url: Url, title: Title) -> None:
        results.append((dt, url, title))

    parser = TakeoutHTMLParser(callback=cb)
    with (tpath / file).open() as fo:
        data = fo.read()
        parser.feed(data)
    return results
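

# Usage sketch (hypothetical paths, not part of the original module): a Takeout
# export typically contains files like 'My Activity/Chrome/MyActivity.html'.
if __name__ == '__main__':
    import sys

    takeout_dir = Path(sys.argv[1])  # path to an unpacked Takeout directory
    for dt, url, title in read_html(takeout_dir, 'My Activity/Chrome/MyActivity.html'):
        print(dt.isoformat(), url, title)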
|