diff --git a/my/kython/ktakeout.py b/my/kython/ktakeout.py index 96a3f58..30688e3 100644 --- a/my/kython/ktakeout.py +++ b/my/kython/ktakeout.py @@ -3,7 +3,7 @@ import re from pathlib import Path from datetime import datetime from html.parser import HTMLParser -from typing import List, Dict, Optional, Any +from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple from collections import OrderedDict from urllib.parse import unquote import pytz @@ -49,10 +49,15 @@ class State(Enum): PARSING_DATE = 3 +Url = str +Title = str +Parsed = Tuple[datetime, Url, Title] +Callback = Callable[[datetime, Url, Title], None] + # would be easier to use beautiful soup, but ends up in a big memory footprint.. class TakeoutHTMLParser(HTMLParser): - def __init__(self, callback) -> None: + def __init__(self, callback: Callback) -> None: super().__init__() self.state: State = State.OUTSIDE @@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser): self.state = State.OUTSIDE return + + +def read_html(tpath: Path, file: str) -> Iterable[Parsed]: + from .kompress import kopen + results: List[Parsed] = [] + def cb(dt: datetime, url: Url, title: Title) -> None: + results.append((dt, url, title)) + parser = TakeoutHTMLParser(callback=cb) + with kopen(tpath, file) as fo: + # TODO careful, what if it's a string already? make as_utf method? + data = fo.read().decode('utf8') + parser.feed(data) + return results diff --git a/my/media/youtube.py b/my/media/youtube.py index 4e23f5b..6331190 100755 --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -2,10 +2,7 @@ from datetime import datetime from typing import NamedTuple, List -# TODO ugh. reuse it in mypkg/releaste takeout parser separately? 
-from ..kython.ktakeout import TakeoutHTMLParser - -from ..kython.kompress import kopen +from ..kython.ktakeout import read_html from ..takeout import get_last_takeout @@ -26,15 +23,9 @@ def get_watched(): last = get_last_takeout(path=path) watches: List[Watched] = [] - def cb(dt, url, title): + for dt, url, title in read_html(last, path): watches.append(Watched(url=url, title=title, when=dt)) - parser = TakeoutHTMLParser(cb) - - with kopen(last, path) as fo: - dd = fo.read().decode('utf8') - parser.feed(dd) - # TODO hmm they already come sorted.. wonder if should just rely on it.. return list(sorted(watches, key=lambda e: e.when)) diff --git a/my/takeout.py b/my/takeout.py index 26404eb..e38e493 100644 --- a/my/takeout.py +++ b/my/takeout.py @@ -3,6 +3,7 @@ from typing import Optional, Iterable from .common import get_files from .kython.kompress import kopen, kexists +from .kython.ktakeout import read_html from my.config import google as config diff --git a/tests/takeout.py b/tests/takeout.py index 6f7c8d8..6acca9b 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -19,30 +19,35 @@ def test_location_perf(): print(ilen(islice(LT.iter_locations(), 0, 10000))) -def test_parser(): - from my.kython.ktakeout import TakeoutHTMLParser +# in theory should support any HTML takeout file? 
+# although IIRC bookmarks and search-history.html weren't working +import pytest # type: ignore +@pytest.mark.parametrize( + 'path', [ + 'YouTube/history/watch-history.html', + 'My Activity/YouTube/MyActivity.html', + 'My Activity/Chrome/MyActivity.html', + 'My Activity/Search/MyActivity.html', + ] +) +def test_parser(path: str): + path = 'Takeout/' + path + from my.kython.ktakeout import read_html from my.takeout import get_last_takeout - # 4s for parsing with HTMLParser (30K results) - path = 'Takeout/My Activity/Chrome/MyActivity.html' tpath = get_last_takeout(path=path) results = [] - def cb(dt, url, title): - results.append((dt, url, title)) + for res in read_html(tpath, path): + results.append(res) - parser = TakeoutHTMLParser(cb) - - with kopen(tpath, path) as fo: - dd = fo.read().decode('utf8') - parser.feed(dd) print(len(results)) def parse_takeout_xmllint(data: str): # without xmllint (splitting by '