hackernews: add initial dogsheep database importer

2022-02-04 21:56:53 +00:00 · 2022-02-04 21:56:53 +00:00 · 590e09f80b
commit 590e09f80b
parent 1e635502a2
3 changed files with 71 additions and 0 deletions
--- a/my/config.py
+++ b/my/config.py
@ -112,3 +112,8 @@ class instagram:
        export_path: Paths
    class gdpr:
        export_path: Paths
 # User config section for Hackernews data; my/hackernews/dogsheep.py imports it as `user_config`.
 class hackernews:
    # Settings for the Dogsheep-based importer (subclassed by `config` in my/hackernews/dogsheep.py).
    class dogsheep:
        # path[s]/glob to the dogsheep database
        export_path: Paths
--- a/my/hackernews/common.py
+++ b/my/hackernews/common.py
@ -0,0 +1,2 @@
 def hackernews_link(id: str) -> str:
     """Return the news.ycombinator.com item URL for the given item id."""
     permalink = f'https://news.ycombinator.com/item?id={id}'
     return permalink
--- a/my/hackernews/dogsheep.py
+++ b/my/hackernews/dogsheep.py
@ -0,0 +1,64 @@
 """
 Hackernews data via Dogsheep [[https://github.com/dogsheep/hacker-news-to-sqlite][hacker-news-to-sqlite]]
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Iterator, Sequence, Optional, Dict
 from my.config import hackernews as user_config
 from ..core import Paths
@dataclass
class config(user_config.dogsheep):
    """Settings for this module, layered on top of the user's ``hackernews.dogsheep`` config."""

    # path[s]/glob to the dogsheep database
    export_path: Paths
 # todo so much boilerplate... really need some common wildcard imports?...
 # at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
 from ..core import get_files
 from pathlib import Path
 def inputs() -> Sequence[Path]:
     """Return the files that ``get_files`` resolves from ``config.export_path``."""
     export_path = config.export_path
     return get_files(export_path)
 from .common import hackernews_link
 # TODO not sure if worth splitting into Comment and Story?
@dataclass(unsafe_hash=True)
class Item:
    """One row of the Hackernews ``items`` table; ``type`` says what kind of item it is."""

    id: str
    type: str
    # TODO is it utc??
    created: datetime
    # only present for Story
    title: Optional[str]
    # should be present for Comment and might for Story
    text_html: Optional[str]
    # might be present for Story
    url: Optional[str]
    # todo process 'deleted'? fields?
    # todo process 'parent'?

    @property
    def permalink(self) -> str:
        """URL that ``hackernews_link`` builds from this item's id."""
        link = hackernews_link(self.id)
        return link
 from ..core.error import Res
 from ..core.dataset import connect_readonly
 def items() -> Iterator[Res[Item]]:
     """
     Yield every row of the ``items`` table as an ``Item``, ordered by ``time``.

     Only a single database is read: the maximum of ``inputs()``
     (presumably the most recent export -- this relies on path ordering, TODO confirm).

     ``created`` is a timezone-aware datetime in UTC.
     """
     # function-scope import: the module-level import only brings in `datetime`
     from datetime import timezone

     # note: max() raises ValueError if inputs() is empty
     latest = max(inputs())
     with connect_readonly(latest) as db:
         # named `table` so it doesn't shadow this function's own name
         table = db['items']
         for row in table.all(order_by='time'):
             yield Item(
                 id=row['id'],
                 type=row['type'],
                 # `time` is an epoch timestamp; without tz= the result would be a naive
                 # datetime in the machine's local timezone, i.e. dependent on where this runs
                 created=datetime.fromtimestamp(row['time'], tz=timezone.utc),
                 title=row['title'],
                 # todo hmm maybe a method to strip off html tags would be nice
                 text_html=row['text'],
                 url=row['url'],
             )
		`@ -0,0 +1,2 @@`
							`def hackernews_link(id: str) -> str:`
							`return f'https://news.ycombinator.com/item?id={id}'`