my.hackernews.dogsheep: use utc datetime + minor cleanup

This commit is contained in:
karlicoss 2023-10-27 02:27:04 +01:00
parent bef0423b4f
commit 3a25c9042c

View file

@ -4,18 +4,19 @@ Hackernews data via Dogsheep [[hacker-news-to-sqlite][https://github.com/dogshee
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Sequence, Optional
from my.core import get_files, Paths, Res
from my.core import get_files, Paths, Res, datetime_aware
from my.core.sqlite import sqlite_connection
import my.config
from my.config import hackernews as user_config
from .common import hackernews_link
@dataclass
class config(user_config.dogsheep):
class config(my.config.hackernews.dogsheep):
# path[s]/glob to the dogsheep database
export_path: Paths
@ -26,24 +27,23 @@ def inputs() -> Sequence[Path]:
return get_files(config.export_path)
from .common import hackernews_link
# TODO not sure if worth splitting into Comment and Story?
@dataclass(unsafe_hash=True)
class Item:
id: str
type: str
# TODO is it utc??
created: datetime
created: datetime_aware # checked and it's utc
title: Optional[str] # only present for Story
text_html: Optional[str] # should be present for Comment and might for Story
url: Optional[str] # might be present for Story
text_html: Optional[str] # should be present for Comment and might for Story
url: Optional[str] # might be present for Story
# todo process 'deleted'? fields?
# todo process 'parent'?
@property
def permalink(self) -> str:
return hackernews_link(self.id)
# TODO hmm kinda annoying that permalink isn't getting serialized
# maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
# so we could just take .permalink thing
@ -56,7 +56,7 @@ def items() -> Iterator[Res[Item]]:
yield Item(
id=r['id'],
type=r['type'],
created=datetime.fromtimestamp(r['time']),
created=datetime.fromtimestamp(r['time'], tz=timezone.utc),
title=r['title'],
# todo hmm maybe a method to strip off html tags would be nice
text_html=r['text'],