get rid of callbacks in takeout processing interface
This commit is contained in:
parent
810fe21839
commit
d1aa4d19dc
4 changed files with 40 additions and 25 deletions
|
@ -3,7 +3,7 @@ import re
|
|||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from html.parser import HTMLParser
|
||||
from typing import List, Dict, Optional, Any
|
||||
from typing import List, Dict, Optional, Any, Callable, Iterable, Tuple
|
||||
from collections import OrderedDict
|
||||
from urllib.parse import unquote
|
||||
import pytz
|
||||
|
@ -49,10 +49,15 @@ class State(Enum):
|
|||
PARSING_DATE = 3
|
||||
|
||||
|
||||
# Readable names for the plain-string fields of a parsed entry.
Url = str
|
||||
Title = str
|
||||
# One parsed entry: a (datetime, url, title) tuple.
Parsed = Tuple[datetime, Url, Title]
|
||||
# Type of the callback accepted by TakeoutHTMLParser.__init__: it takes the
# same three fields as positional arguments and returns nothing.
Callback = Callable[[datetime, Url, Title], None]
|
||||
|
||||
|
||||
# It would be easier to use Beautiful Soup, but that results in a large memory footprint.
|
||||
class TakeoutHTMLParser(HTMLParser):
|
||||
def __init__(self, callback) -> None:
|
||||
def __init__(self, callback: Callback) -> None:
|
||||
super().__init__()
|
||||
self.state: State = State.OUTSIDE
|
||||
|
||||
|
@ -118,3 +123,16 @@ class TakeoutHTMLParser(HTMLParser):
|
|||
|
||||
self.state = State.OUTSIDE
|
||||
return
|
||||
|
||||
|
||||
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
    """Parse one HTML file and return the entries found in it.

    The file is opened with ``kopen(tpath, file)``, read in full, and fed to
    a ``TakeoutHTMLParser``. Every ``(datetime, url, title)`` triple the
    parser reports through its callback is collected.

    Args:
        tpath: Location handed to ``kopen`` as its first argument.
        file: Name handed to ``kopen`` as its second argument.

    Returns:
        A list of ``Parsed`` tuples, in the order the parser reported them.
    """
    from .kompress import kopen

    results: List[Parsed] = []

    def cb(dt: datetime, url: Url, title: Title) -> None:
        # The parser interface is callback-based; collect into a list so
        # callers get a plain iterable instead of supplying a callback.
        results.append((dt, url, title))

    parser = TakeoutHTMLParser(callback=cb)
    with kopen(tpath, file) as fo:
        raw = fo.read()

    # The stream may yield bytes or (depending on how kopen opened it) text
    # that is already decoded; only decode when we actually received bytes.
    data = raw.decode('utf8') if isinstance(raw, bytes) else raw
    parser.feed(data)
    return results
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue