hackernews: add initial dogsheep database importer
This commit is contained in:
parent
1e635502a2
commit
590e09f80b
3 changed files with 71 additions and 0 deletions
|
@ -112,3 +112,8 @@ class instagram:
|
||||||
export_path: Paths
|
export_path: Paths
|
||||||
class gdpr:
|
class gdpr:
|
||||||
export_path: Paths
|
export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
|
class hackernews:
|
||||||
|
class dogsheep:
|
||||||
|
export_path: Paths
|
||||||
|
|
2
my/hackernews/common.py
Normal file
2
my/hackernews/common.py
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
def hackernews_link(id: str) -> str:
    """
    Build the news.ycombinator.com URL of a single item.

    The given ``id`` is interpolated as-is into the ``item?id=`` query parameter.
    """
    # `id` shadows the builtin, but it is the public parameter name, so it stays
    link = f'https://news.ycombinator.com/item?id={id}'
    return link
|
64
my/hackernews/dogsheep.py
Normal file
64
my/hackernews/dogsheep.py
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
"""
|
||||||
|
Hackernews data via Dogsheep [[https://github.com/dogsheep/hacker-news-to-sqlite][hacker-news-to-sqlite]]
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Iterator, Sequence, Optional, Dict
|
||||||
|
|
||||||
|
|
||||||
|
from my.config import hackernews as user_config
|
||||||
|
|
||||||
|
|
||||||
|
from ..core import Paths
|
||||||
|
@dataclass
class config(user_config.dogsheep):
    # Where the dogsheep database lives: a path, several paths, or a glob.
    export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
|
# todo so much boilerplate... really need some common wildcard imports?...
|
||||||
|
# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
|
||||||
|
from ..core import get_files
|
||||||
|
from pathlib import Path
|
||||||
|
def inputs() -> Sequence[Path]:
    """
    Return the files that ``get_files`` resolves from the configured ``export_path``.
    """
    export_path = config.export_path
    return get_files(export_path)
|
||||||
|
|
||||||
|
|
||||||
|
from .common import hackernews_link
|
||||||
|
|
||||||
|
# TODO not sure if worth splitting into Comment and Story?
|
||||||
|
@dataclass(unsafe_hash=True)
class Item:
    """
    A single Hackernews item; stories and comments currently share this one type.
    """

    id: str
    type: str
    # TODO is it utc??
    created: datetime
    # set for Story only
    title: Optional[str]
    # expected for Comment, may also be set for Story
    text_html: Optional[str]
    # may be set for Story
    url: Optional[str]
    # todo process 'deleted'? fields?
    # todo process 'parent'?

    @property
    def permalink(self) -> str:
        """URL of this item, built from its ``id`` by ``hackernews_link``."""
        link = hackernews_link(self.id)
        return link
|
||||||
|
|
||||||
|
|
||||||
|
from ..core.error import Res
|
||||||
|
from ..core.dataset import connect_readonly
|
||||||
|
def items() -> Iterator[Res[Item]]:
    """
    Yield an ``Item`` for every row of the ``items`` table of one dogsheep database.

    Only a single database is read: the greatest path among ``inputs()``
    (presumably the latest export -- this relies on the file naming, confirm).
    Rows are requested ordered by their ``time`` column.

    ``created`` is a timezone-aware UTC datetime. ``time`` is a Unix timestamp,
    and without an explicit ``tz`` ``datetime.fromtimestamp`` would return a
    naive value in the local timezone of whichever machine runs this.

    Raises ``ValueError`` (from ``max``) when ``inputs()`` is empty.
    """
    # imported here so the function does not depend on a module-level import change
    from datetime import timezone

    latest = max(inputs())
    with connect_readonly(latest) as db:
        # not called `items`, to avoid shadowing this function's own name
        table = db['items']
        for row in table.all(order_by='time'):
            yield Item(
                id=row['id'],
                type=row['type'],
                created=datetime.fromtimestamp(row['time'], tz=timezone.utc),
                title=row['title'],
                # todo hmm maybe a method to strip off html tags would be nice
                text_html=row['text'],
                url=row['url'],
            )
|
Loading…
Add table
Reference in a new issue