HPI/my/hackernews/dogsheep.py
2022-02-04 23:38:50 +00:00

64 lines
1.9 KiB
Python

"""
Hackernews data via Dogsheep [[https://github.com/dogsheep/hacker-news-to-sqlite][hacker-news-to-sqlite]]
"""
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from typing import Iterator, Sequence, Optional, Dict
from my.config import hackernews as user_config
from ..core import Paths
@dataclass
class config(user_config.dogsheep):
    """Module settings, layered on top of the user's ``hackernews.dogsheep`` config."""

    # path(s) or glob pointing at the dogsheep database
    export_path: Paths
# todo so much boilerplate... really need some common wildcard imports?...
# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
    """Return the files that ``get_files`` resolves from ``config.export_path``."""
    export_files = get_files(config.export_path)
    return export_files
from .common import hackernews_link
# TODO not sure if worth splitting into Comment and Story?
@dataclass(unsafe_hash=True)
class Item:
    """A single row of the ``items`` table (a story or a comment)."""

    id: str
    type: str
    # TODO is it utc??
    created: datetime
    # only present for Story
    title: Optional[str]
    # should be present for Comment and might be for Story
    text_html: Optional[str]
    # might be present for Story
    url: Optional[str]
    # todo process 'deleted'? fields?
    # todo process 'parent'?

    @property
    def permalink(self) -> str:
        """Link to this item, as built by ``hackernews_link`` from its id."""
        link = hackernews_link(self.id)
        return link
from ..core.error import Res
from ..core.dataset import connect_readonly
def items() -> Iterator[Res[Item]]:
    """Yield an ``Item`` for every row of the ``items`` table, ordered by ``time``.

    Only one database is read: the greatest path returned by ``inputs()``
    (presumably the most recent export, if file names sort chronologically --
    TODO confirm). The database is opened through ``connect_readonly``.

    Although the return type allows errors (``Res``), this body only ever
    yields ``Item`` instances.
    """
    latest_db = max(inputs())
    with connect_readonly(latest_db) as db:
        table = db['items']
        for row in table.all(order_by='time'):
            item = Item(
                id=row['id'],
                type=row['type'],
                # NOTE(review): fromtimestamp() without tz gives a naive local-time
                # datetime; assumes 'time' is a unix epoch value -- confirm
                created=datetime.fromtimestamp(row['time']),
                title=row['title'],
                # todo hmm maybe a method to strip off html tags would be nice
                text_html=row['text'],
                url=row['url'],
            )
            yield item