get rid of callbacks in takeout processing interface

This commit is contained in:
Dima Gerasimov 2020-04-24 17:01:06 +01:00
parent 810fe21839
commit d1aa4d19dc
4 changed files with 40 additions and 25 deletions

View file

@ -3,7 +3,7 @@ import re
from pathlib import Path
from datetime import datetime
from html.parser import HTMLParser
from typing import List, Dict, Optional, Any
from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
from collections import OrderedDict
from urllib.parse import unquote
import pytz
@ -49,10 +49,15 @@ class State(Enum):
PARSING_DATE = 3
# A URL, represented as a plain string.
Url = str
# Title text, represented as a plain string.
Title = str
# One parsed entry: (timestamp, url, title).
Parsed = Tuple[datetime, Url, Title]
# Signature of the callback handed to TakeoutHTMLParser: it receives the
# three fields of one entry as separate arguments and returns nothing.
Callback = Callable[[datetime, Url, Title], None]
# It would be easier to use Beautiful Soup, but that results in a large memory footprint.
class TakeoutHTMLParser(HTMLParser):
def __init__(self, callback) -> None:
def __init__(self, callback: Callback) -> None:
super().__init__()
self.state: State = State.OUTSIDE
@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):
self.state = State.OUTSIDE
return
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
    """Parse one HTML file and return the entries found in it.

    Opens ``file`` under ``tpath`` through ``kopen``, reads the whole
    document, feeds it to ``TakeoutHTMLParser`` and collects every
    ``(datetime, url, title)`` triple the parser reports via its callback.

    Args:
        tpath: location handed to ``kopen`` as its first argument.
        file: name of the HTML file, handed to ``kopen`` as its second argument.

    Returns:
        A list of ``Parsed`` triples, in the order the parser emitted them.

    Raises:
        UnicodeDecodeError: if the file yields bytes that are not valid UTF-8.
    """
    from .kompress import kopen

    results: List[Parsed] = []

    def cb(dt: datetime, url: Url, title: Title) -> None:
        # The parser pushes entries one at a time; gather them into a list
        # so callers get a plain return value instead of supplying a callback.
        results.append((dt, url, title))

    parser = TakeoutHTMLParser(callback=cb)
    with kopen(tpath, file) as fo:
        raw = fo.read()

    # The handle may yield bytes or already-decoded text; calling .decode()
    # unconditionally would fail on str, so decode only when bytes come back.
    # TODO: consider a shared "as utf8" helper if other readers need this too.
    if isinstance(raw, (bytes, bytearray)):
        data = raw.decode('utf8')
    else:
        data = raw

    parser.feed(data)
    return results