get rid of callbacks in takeout processing interface

This commit is contained in:
Dima Gerasimov 2020-04-24 17:01:06 +01:00
parent 810fe21839
commit d1aa4d19dc
4 changed files with 40 additions and 25 deletions

View file

@ -3,7 +3,7 @@ import re
from pathlib import Path
from datetime import datetime
from html.parser import HTMLParser
from typing import List, Dict, Optional, Any
from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
from collections import OrderedDict
from urllib.parse import unquote
import pytz
@ -49,10 +49,15 @@ class State(Enum):
PARSING_DATE = 3
Url = str
Title = str
Parsed = Tuple[datetime, Url, Title]
Callback = Callable[[datetime, Url, Title], None]
# would be easier to use beautiful soup, but ends up in a big memory footprint..
class TakeoutHTMLParser(HTMLParser):
def __init__(self, callback) -> None:
def __init__(self, callback: Callback) -> None:
super().__init__()
self.state: State = State.OUTSIDE
@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):
self.state = State.OUTSIDE
return
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
    """Parse a Google Takeout HTML file into (datetime, url, title) tuples.

    tpath: path to the takeout archive (opened via kompress.kopen, which
           handles both plain directories and compressed archives)
    file:  name of the HTML file inside the takeout

    Returns the fully materialized list of parsed entries (html.parser is
    not iterative, so we cannot stream them lazily anyway).
    """
    from .kompress import kopen
    results: List[Parsed] = []
    def cb(dt: datetime, url: Url, title: Title) -> None:
        results.append((dt, url, title))
    parser = TakeoutHTMLParser(callback=cb)
    with kopen(tpath, file) as fo:
        data = fo.read()
        # kopen may hand back a text or a binary handle; only decode
        # when we actually received bytes (resolves the old TODO about
        # "what if it's a string already?")
        if isinstance(data, bytes):
            data = data.decode('utf8')
        parser.feed(data)
    return results

View file

@ -2,10 +2,7 @@
from datetime import datetime
from typing import NamedTuple, List
# TODO ugh. reuse it in mypkg/release takeout parser separately?
from ..kython.ktakeout import TakeoutHTMLParser
from ..kython.kompress import kopen
from ..kython.ktakeout import read_html
from ..takeout import get_last_takeout
@ -26,15 +23,9 @@ def get_watched():
last = get_last_takeout(path=path)
watches: List[Watched] = []
def cb(dt, url, title):
for dt, url, title in read_html(last, path):
watches.append(Watched(url=url, title=title, when=dt))
parser = TakeoutHTMLParser(cb)
with kopen(last, path) as fo:
dd = fo.read().decode('utf8')
parser.feed(dd)
# TODO hmm they already come sorted.. wonder if should just rely on it..
return list(sorted(watches, key=lambda e: e.when))

View file

@ -3,6 +3,7 @@ from typing import Optional, Iterable
from .common import get_files
from .kython.kompress import kopen, kexists
from .kython.ktakeout import read_html
from my.config import google as config

View file

@ -19,30 +19,35 @@ def test_location_perf():
print(ilen(islice(LT.iter_locations(), 0, 10000)))
def test_parser():
from my.kython.ktakeout import TakeoutHTMLParser
# in theory should support any HTML takeout file?
# although IIRC bookmarks and search-history.html weren't working
import pytest # type: ignore
@pytest.mark.parametrize(
'path', [
'YouTube/history/watch-history.html',
'My Activity/YouTube/MyActivity.html',
'My Activity/Chrome/MyActivity.html',
'My Activity/Search/MyActivity.html',
]
)
def test_parser(path: str):
path = 'Takeout/' + path
from my.kython.ktakeout import read_html
from my.takeout import get_last_takeout
# 4s for parsing with HTMLParser (30K results)
path = 'Takeout/My Activity/Chrome/MyActivity.html'
tpath = get_last_takeout(path=path)
results = []
def cb(dt, url, title):
results.append((dt, url, title))
for res in read_html(tpath, path):
results.append(res)
parser = TakeoutHTMLParser(cb)
with kopen(tpath, path) as fo:
dd = fo.read().decode('utf8')
parser.feed(dd)
print(len(results))
def parse_takeout_xmllint(data: str):
# without xmllint (splitting by '<div class="content-cell' -- 0.68 secs)
# with xmllint -- 2 seconds
# using html.parser -- 4 seconds (+ all the parsing etc)
# using html.parser -- 4 seconds (+ all the parsing etc), 30K results
# not *that* much opportunity to speedup I guess
# the only downside is that html.parser isn't iterative.. might be able to work around it with some internal hacks?
# wonder what's the bottleneck..