get rid of callbacks in takeout processing interface
parent 810fe21839
commit d1aa4d19dc
4 changed files with 40 additions and 25 deletions

File 1: my/kython/ktakeout.py (path inferred from the imports in the other changed files)

@@ -3,7 +3,7 @@ import re
 from pathlib import Path
 from datetime import datetime
 from html.parser import HTMLParser
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
 from collections import OrderedDict
 from urllib.parse import unquote
 import pytz
@@ -49,10 +49,15 @@ class State(Enum):
     PARSING_DATE = 3


+Url = str
+Title = str
+Parsed = Tuple[datetime, Url, Title]
+Callback = Callable[[datetime, Url, Title], None]
+
+
 # would be easier to use beautiful soup, but ends up in a big memory footprint..
 class TakeoutHTMLParser(HTMLParser):
-    def __init__(self, callback) -> None:
+    def __init__(self, callback: Callback) -> None:
         super().__init__()
         self.state: State = State.OUTSIDE
@@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):
                 self.state = State.OUTSIDE
                 return
+
+
+def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
+    from .kompress import kopen
+    results: List[Parsed] = []
+    def cb(dt: datetime, url: Url, title: Title) -> None:
+        results.append((dt, url, title))
+    parser = TakeoutHTMLParser(callback=cb)
+    with kopen(tpath, file) as fo:
+        # TODO careful, what if it's a string already? make asutf method?
+        data = fo.read().decode('utf8')
+        parser.feed(data)
+    return results
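Note: with the callback gone, callers simply iterate over parsed entries. A minimal usage sketch of the new interface; the archive path and file name below are hypothetical placeholders:

from pathlib import Path

from my.kython.ktakeout import read_html

# hypothetical inputs: a takeout archive and an HTML file inside it
tpath = Path('/path/to/takeout-20190101.zip')
file = 'Takeout/My Activity/Chrome/MyActivity.html'

for dt, url, title in read_html(tpath, file):
    # each parsed entry is a (datetime, url, title) tuple
    print(dt.isoformat(), url, title)
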
File 2: the YouTube watch-history module (exact path not shown)

@@ -2,10 +2,7 @@
 from datetime import datetime
 from typing import NamedTuple, List

-# TODO ugh. reuse it in mypkg/releaste takeout parser separately?
-from ..kython.ktakeout import TakeoutHTMLParser
-
-from ..kython.kompress import kopen
+from ..kython.ktakeout import read_html
 from ..takeout import get_last_takeout
@@ -26,15 +23,9 @@ def get_watched():
     last = get_last_takeout(path=path)

     watches: List[Watched] = []
-    def cb(dt, url, title):
+    for dt, url, title in read_html(last, path):
         watches.append(Watched(url=url, title=title, when=dt))

-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(last, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
-
     # TODO hmm they already come sorted.. wonder if should just rely on it..
     return list(sorted(watches, key=lambda e: e.when))
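For reference, Watched is a NamedTuple defined earlier in this file, outside the diff; a sketch consistent with the fields used above (the actual definition may differ):

from datetime import datetime
from typing import NamedTuple

class Watched(NamedTuple):
    url: str
    title: str
    when: datetime
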
File 3: my/takeout.py (inferred; it provides the get_last_takeout used by the other files)

@@ -3,6 +3,7 @@ from typing import Optional, Iterable

 from .common import get_files
 from .kython.kompress import kopen, kexists
+from .kython.ktakeout import read_html

 from my.config import google as config
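The hunk above only adds the import; presumably read_html gets used elsewhere in this module. For context, a hypothetical sketch of the kind of get_last_takeout helper the other files call, assuming config.takeout_path points at a directory of takeout zip archives and reusing the kexists import shown above; the real implementation may differ:

from pathlib import Path
from typing import Optional

def get_last_takeout(*, path: str) -> Optional[Path]:
    # hypothetical: return the newest takeout archive that contains `path`
    takeouts = sorted(Path(config.takeout_path).glob('*.zip'))
    for t in reversed(takeouts):
        if kexists(t, path):  # does the archive contain this member?
            return t
    return None
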
File 4: the test module (exact path not shown)

@@ -19,30 +19,35 @@ def test_location_perf():
     print(ilen(islice(LT.iter_locations(), 0, 10000)))


-def test_parser():
-    from my.kython.ktakeout import TakeoutHTMLParser
+# in theory should support any HTML takeout file?
+# although IIRC bookmarks and search-history.html weren't working
+import pytest # type: ignore
+@pytest.mark.parametrize(
+    'path', [
+        'YouTube/history/watch-history.html',
+        'My Activity/YouTube/MyActivity.html',
+        'My Activity/Chrome/MyActivity.html',
+        'My Activity/Search/MyActivity.html',
+    ]
+)
+def test_parser(path: str):
+    path = 'Takeout/' + path
+    from my.kython.ktakeout import read_html
     from my.takeout import get_last_takeout

-    # 4s for parsing with HTMLParser (30K results)
-    path = 'Takeout/My Activity/Chrome/MyActivity.html'
     tpath = get_last_takeout(path=path)

     results = []
-    def cb(dt, url, title):
-        results.append((dt, url, title))
+    for res in read_html(tpath, path):
+        results.append(res)

-    parser = TakeoutHTMLParser(cb)
-
-    with kopen(tpath, path) as fo:
-        dd = fo.read().decode('utf8')
-        parser.feed(dd)
     print(len(results))


 def parse_takeout_xmllint(data: str):
     # without xmllint (splitting by '<div class="content-cell' -- 0.68 secs)
     # with xmllint -- 2 seconds
-    # using html.parser -- 4 seconds (+ all the parsing etc)
+    # using html.parser -- 4 seconds (+ all the parsing etc), 30K results
     # not *that* much opportunity to speedup I guess
     # the only downside is that html.parser isn't iterative.. might be able to hack with some internal hacks?
     # wonder what's the bottleneck..
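The last comment notes that html.parser isn't iterative. One way to make read_html lazy without touching parser internals is to feed the input in chunks and drain collected results between chunks, bounding memory by chunk size instead of file size. A sketch of that idea, assuming the existing TakeoutHTMLParser and kopen, and that kopen's file object supports sized reads; an incremental decoder avoids splitting multi-byte UTF-8 characters at chunk boundaries:

import codecs
from datetime import datetime
from pathlib import Path
from typing import Iterator, Tuple

from my.kython.kompress import kopen
from my.kython.ktakeout import TakeoutHTMLParser

def read_html_lazy(tpath: Path, file: str, chunk_size: int = 2 ** 16) -> Iterator[Tuple[datetime, str, str]]:
    buf = []  # filled by the parser callback, drained after each chunk
    parser = TakeoutHTMLParser(callback=lambda dt, url, title: buf.append((dt, url, title)))
    decoder = codecs.getincrementaldecoder('utf8')()
    with kopen(tpath, file) as fo:
        while True:
            chunk = fo.read(chunk_size)
            if not chunk:
                break
            parser.feed(decoder.decode(chunk))
            yield from buf
            buf.clear()
        parser.close()  # flush anything the parser still buffers
        yield from buf

The public interface stays a plain iterable, so get_watched and the test above would not need to change.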