hackernews: add initial dogsheep database importer

2022-02-04 21:56:53 +00:00 · 2022-02-04 21:56:53 +00:00 · 590e09f80b
commit 590e09f80b
parent 1e635502a2
3 changed files with 71 additions and 0 deletions
--- a/my/config.py
+++ b/my/config.py
@@ -112,3 +112,8 @@ class instagram:
        export_path: Paths
    class gdpr:
        export_path: Paths
+
+
+class hackernews:
+    # section consumed by my/hackernews/dogsheep.py (its config class extends this one)
+    class dogsheep:
+        # path[s]/glob to the dogsheep database
+        export_path: Paths
--- a/my/hackernews/common.py
+++ b/my/hackernews/common.py
@@ -0,0 +1,2 @@
+def hackernews_link(id: str) -> str:
+    """Return the news.ycombinator.com item URL for the given item id."""
+    url = f'https://news.ycombinator.com/item?id={id}'
+    return url
--- a/my/hackernews/dogsheep.py
+++ b/my/hackernews/dogsheep.py
@@ -0,0 +1,64 @@
+"""
+Hackernews data via Dogsheep [[https://github.com/dogsheep/hacker-news-to-sqlite][hacker-news-to-sqlite]]
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Iterator, Sequence, Optional, Dict
+
+
+from my.config import hackernews as user_config
+
+
+from ..core import Paths
+@dataclass
+# module configuration: extends the user's `hackernews.dogsheep` section from my.config
+class config(user_config.dogsheep):
+    # path[s]/glob to the dogsheep database
+    export_path: Paths
+
+
+# todo so much boilerplate... really need some common wildcard imports?...
+# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
+from ..core import get_files
+from pathlib import Path
+def inputs() -> Sequence[Path]:
+    """Return the paths that ``get_files`` resolves from the configured ``export_path``."""
+    export_path = config.export_path
+    return get_files(export_path)
+
+
+from .common import hackernews_link
+
+# TODO not sure if worth splitting into Comment and Story?
+@dataclass(unsafe_hash=True)
+class Item:
+    """A single entry of the ``items`` table, as yielded by ``items()`` in this module."""
+    # value of the 'id' column; also used to build the permalink
+    id: str
+    # value of the 'type' column
+    # NOTE(review): presumably distinguishes Story from Comment (see the notes below) -- confirm
+    type: str
+    # derived from the 'time' column
+    # TODO is it utc??
+    created: datetime
+    title: Optional[str]  # only present for Story
+    text_html: Optional[str] # should be present for Comment and might for Story
+    url: Optional[str] # might be present for Story
+    # todo process 'deleted'? fields?
+    # todo process 'parent'?
+
+    @property
+    def permalink(self) -> str:
+        """Link to this item's page, built from ``id`` via ``hackernews_link``."""
+        return hackernews_link(self.id)
+
+
+from ..core.error import Res
+from ..core.dataset import connect_readonly
+def items() -> Iterator[Res[Item]]:
+    """
+    Yield every row of the ``items`` table as an ``Item``, ordered by the ``time`` column.
+
+    Only the greatest path returned by ``inputs()`` is read.
+    ``created`` is a timezone-aware datetime in UTC.
+
+    Raises ValueError (from ``max``) on first iteration if ``inputs()`` is empty.
+    """
+    # imported locally so this function is self-contained with respect to the module-level imports
+    from datetime import timezone
+
+    # NOTE(review): presumably the greatest path is the most recent export -- confirm against file naming
+    latest = max(inputs())
+    with connect_readonly(latest) as db:
+        table = db['items']  # not called 'items' to avoid shadowing this function
+        for row in table.all(order_by='time'):
+            yield Item(
+                id=row['id'],
+                type=row['type'],
+                # 'time' is an epoch timestamp; pin it to UTC explicitly, otherwise the result
+                # is a naive datetime in whatever timezone the machine happens to be in
+                created=datetime.fromtimestamp(row['time'], tz=timezone.utc),
+                title=row['title'],
+                # todo hmm maybe a method to strip off html tags would be nice
+                text_html=row['text'],
+                url=row['url'],
+            )