get rid of callbacks in takeout processing interface

Dima Gerasimov 2020-04-24 17:01:06 +01:00
parent 810fe21839
commit d1aa4d19dc
4 changed files with 40 additions and 25 deletions
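
The shape of the change, in short: instead of constructing TakeoutHTMLParser with a callback and driving the file handling by hand, callers now call a new read_html helper and iterate over (datetime, url, title) tuples. A before/after sketch of the calling convention, using only names from the diffs below (kopen is the repo's helper for opening files inside archives):

    # before: each caller wires up a callback and feeds the parser itself
    results = []
    def cb(dt, url, title):
        results.append((dt, url, title))
    parser = TakeoutHTMLParser(cb)
    with kopen(tpath, path) as fo:
        parser.feed(fo.read().decode('utf8'))

    # after: one call, plain iteration
    results = list(read_html(tpath, path))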

View file

@@ -3,7 +3,7 @@ import re
 from pathlib import Path
 from datetime import datetime
 from html.parser import HTMLParser
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
 from collections import OrderedDict
 from urllib.parse import unquote
 import pytz
@@ -49,10 +49,15 @@ class State(Enum):
     PARSING_DATE = 3
 
+Url = str
+Title = str
+Parsed = Tuple[datetime, Url, Title]
+Callback = Callable[[datetime, Url, Title], None]
+
 
 # would be easier to use beautiful soup, but ends up in a big memory footprint..
 class TakeoutHTMLParser(HTMLParser):
-    def __init__(self, callback) -> None:
+    def __init__(self, callback: Callback) -> None:
         super().__init__()
 
         self.state: State = State.OUTSIDE
@@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):
                 self.state = State.OUTSIDE
                 return
+
+
+def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
+    from .kompress import kopen
+    results: List[Parsed] = []
+    def cb(dt: datetime, url: Url, title: Title) -> None:
+        results.append((dt, url, title))
+    parser = TakeoutHTMLParser(callback=cb)
+    with kopen(tpath, file) as fo:
+        # TODO careful, what if it's a string already? add an as_utf8 method?
+        data = fo.read().decode('utf8')
+        parser.feed(data)
+    return results
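
A usage sketch for the new helper, assuming a takeout resolved via get_last_takeout as in the other files of this commit:

    from my.takeout import get_last_takeout
    from my.kython.ktakeout import read_html

    path = 'Takeout/My Activity/Chrome/MyActivity.html'
    tpath = get_last_takeout(path=path)
    for dt, url, title in read_html(tpath, path):
        print(dt, url, title)

Note that read_html is annotated as Iterable[Parsed] but still materializes a full list internally, so memory behaviour is unchanged for now; the looser annotation leaves room to make it lazy later.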

View file

@@ -2,10 +2,7 @@
 from datetime import datetime
 from typing import NamedTuple, List
 
-# TODO ugh. reuse it in mypkg/releaste takeout parser separately?
-from ..kython.ktakeout import TakeoutHTMLParser
-from ..kython.kompress import kopen
+from ..kython.ktakeout import read_html
 
 from ..takeout import get_last_takeout
@@ -26,15 +23,9 @@ def get_watched():
     last = get_last_takeout(path=path)
 
     watches: List[Watched] = []
-    def cb(dt, url, title):
+    for dt, url, title in read_html(last, path):
         watches.append(Watched(url=url, title=title, when=dt))
 
-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(last, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
-
     # TODO hmm they already come sorted.. wonder if should just rely on it..
     return list(sorted(watches, key=lambda e: e.when))
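
With the iterable interface, the body could arguably shrink to a comprehension; a hypothetical variant, not part of this commit:

    def get_watched():
        last = get_last_takeout(path=path)
        watches = [Watched(url=url, title=title, when=dt) for dt, url, title in read_html(last, path)]
        # they already seem to come sorted, but don't rely on it
        return sorted(watches, key=lambda e: e.when)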

View file

@@ -3,6 +3,7 @@ from typing import Optional, Iterable
 from .common import get_files
 from .kython.kompress import kopen, kexists
+from .kython.ktakeout import read_html
 
 from my.config import google as config

View file

@@ -19,30 +19,35 @@ def test_location_perf():
     print(ilen(islice(LT.iter_locations(), 0, 10000)))
 
-def test_parser():
-    from my.kython.ktakeout import TakeoutHTMLParser
+# in theory should support any HTML takeout file?
+# although IIRC bookmarks and search-history.html weren't working
+import pytest # type: ignore
+@pytest.mark.parametrize(
+    'path', [
+        'YouTube/history/watch-history.html',
+        'My Activity/YouTube/MyActivity.html',
+        'My Activity/Chrome/MyActivity.html',
+        'My Activity/Search/MyActivity.html',
+    ]
+)
+def test_parser(path: str):
+    path = 'Takeout/' + path
+    from my.kython.ktakeout import read_html
     from my.takeout import get_last_takeout
 
-    # 4s for parsing with HTMLParser (30K results)
-    path = 'Takeout/My Activity/Chrome/MyActivity.html'
     tpath = get_last_takeout(path=path)
 
     results = []
-    def cb(dt, url, title):
-        results.append((dt, url, title))
-
-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(tpath, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
+    for res in read_html(tpath, path):
+        results.append(res)
 
     print(len(results))
 
 def parse_takeout_xmllint(data: str):
     # without xmllint (splitting by '<div class="content-cell' -- 0.68 secs)
     # with xmllint -- 2 seconds
-    # using html.parser -- 4 seconds (+ all the parsing etc)
+    # using html.parser -- 4 seconds (+ all the parsing etc), 30K results
     # not *that* much opportunity to speedup I guess
     # the only downside is that html.parser isn't iterative.. might be able to hack with some internal hacks?
     # wonder what's the bottleneck..
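
On the "html.parser isn't iterative" point: HTMLParser.feed can be called repeatedly, so one way to avoid decoding the whole file up front is to feed it in chunks and drain parsed results between feeds. A minimal sketch, assuming the TakeoutHTMLParser and kopen interfaces from this commit; the name read_html_lazy, the chunk size, and the incremental decoder are illustrative, not part of the change:

    import codecs
    from collections import deque

    def read_html_lazy(tpath, file, chunk_size=2**16):
        buf = deque()
        parser = TakeoutHTMLParser(callback=lambda dt, url, title: buf.append((dt, url, title)))
        # incremental decoder handles multibyte chars split across chunk boundaries
        dec = codecs.getincrementaldecoder('utf8')()
        with kopen(tpath, file) as fo:
            while True:
                chunk = fo.read(chunk_size)
                if not chunk:
                    break
                parser.feed(dec.decode(chunk))
                while buf:  # yield whatever the last feed produced
                    yield buf.popleft()
        parser.close()
        while buf:
            yield buf.popleft()

This bounds memory by the chunk size plus the parser's own state, though as the comments above speculate, the real bottleneck may well be elsewhere.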