From 590e09f80b8394769b1d8f942438b94cdbba4f37 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Fri, 4 Feb 2022 21:56:53 +0000
Subject: [PATCH] hackernews: add initial dogsheep database importer

---
 my/config.py              |  5 +++
 my/hackernews/common.py   |  2 ++
 my/hackernews/dogsheep.py | 64 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+)
 create mode 100644 my/hackernews/common.py
 create mode 100644 my/hackernews/dogsheep.py

diff --git a/my/config.py b/my/config.py
index cd1bfe8..ad3b854 100644
--- a/my/config.py
+++ b/my/config.py
@@ -112,3 +112,8 @@ class instagram:
         export_path: Paths
     class gdpr:
         export_path: Paths
+
+
+class hackernews:
+    class dogsheep:
+        export_path: Paths
diff --git a/my/hackernews/common.py b/my/hackernews/common.py
new file mode 100644
index 0000000..8c7dd1e
--- /dev/null
+++ b/my/hackernews/common.py
@@ -0,0 +1,2 @@
+def hackernews_link(id: str) -> str:  # URL of the Hacker News item page for the given item id
+    return f'https://news.ycombinator.com/item?id={id}'
diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py
new file mode 100644
index 0000000..7329690
--- /dev/null
+++ b/my/hackernews/dogsheep.py
@@ -0,0 +1,64 @@
+"""
+Hackernews data via Dogsheep [[https://github.com/dogsheep/hacker-news-to-sqlite][hacker-news-to-sqlite]]
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Iterator, Sequence, Optional, Dict
+
+
+from my.config import hackernews as user_config
+
+
+from ..core import Paths
+@dataclass
+class config(user_config.dogsheep):  # inherits the user's settings from my.config.hackernews.dogsheep
+    # path[s]/glob to the dogsheep database
+    export_path: Paths
+
+
+# todo so much boilerplate... really need some common wildcard imports?...
+# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
+from ..core import get_files
+from pathlib import Path
+def inputs() -> Sequence[Path]:
+    return get_files(config.export_path)
+
+
+from .common import hackernews_link
+
+# TODO not sure if worth splitting into Comment and Story?
+@dataclass(unsafe_hash=True)
+class Item:
+    id: str
+    type: str
+    # timezone-aware UTC; items() parses it from the Unix timestamp in the 'time' column
+    created: datetime
+    title: Optional[str] # only present for Story
+    text_html: Optional[str] # should be present for Comment and might for Story
+    url: Optional[str] # might be present for Story
+    # todo process 'deleted'? fields?
+    # todo process 'parent'?
+
+    @property
+    def permalink(self) -> str:
+        return hackernews_link(self.id)
+
+
+from datetime import timezone
+from ..core.error import Res
+from ..core.dataset import connect_readonly
+def items() -> Iterator[Res[Item]]:
+    """Yield an Item per row of the 'items' table in the max() input path, ordered by 'time'."""
+    with connect_readonly(max(inputs())) as db:
+        for r in db['items'].all(order_by='time'):
+            yield Item(
+                id=r['id'],
+                type=r['type'],
+                created=datetime.fromtimestamp(r['time'], tz=timezone.utc),  # pin UTC; without tz the result is naive local time
+                title=r['title'],
+                # todo hmm maybe a method to strip off html tags would be nice
+                text_html=r['text'],
+                url=r['url'],
+            )