diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py index aac0b1a..de6c58d 100644 --- a/my/hackernews/dogsheep.py +++ b/my/hackernews/dogsheep.py @@ -4,18 +4,19 @@ Hackernews data via Dogsheep [[hacker-news-to-sqlite][https://github.com/dogshee from __future__ import annotations from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Iterator, Sequence, Optional -from my.core import get_files, Paths, Res +from my.core import get_files, Paths, Res, datetime_aware from my.core.sqlite import sqlite_connection +import my.config -from my.config import hackernews as user_config +from .common import hackernews_link @dataclass -class config(user_config.dogsheep): +class config(my.config.hackernews.dogsheep): # paths[s]/glob to the dogsheep database export_path: Paths @@ -26,24 +27,23 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) -from .common import hackernews_link - # TODO not sure if worth splitting into Comment and Story? @dataclass(unsafe_hash=True) class Item: id: str type: str - # TODO is it urc?? - created: datetime + created: datetime_aware # checked and it's utc title: Optional[str] # only present for Story - text_html: Optional[str] # should be present for Comment and might for Story - url: Optional[str] # might be present for Story + text_html: Optional[str] # should be present for Comment and might for Story + url: Optional[str] # might be present for Story # todo process 'deleted'? fields? # todo process 'parent'? @property def permalink(self) -> str: return hackernews_link(self.id) + + # TODO hmm kinda annoying that permalink isn't getting serialized # maybe won't be such a big problem if we used hpi query directly on objects, without jsons? # so we could just take .permalink thing @@ -56,7 +56,7 @@ def items() -> Iterator[Res[Item]]: yield Item( id=r['id'], type=r['type'], - created=datetime.fromtimestamp(r['time']), + created=datetime.fromtimestamp(r['time'], tz=timezone.utc), title=r['title'], # todo hmm maybe a method to strip off html tags would be nice text_html=r['text'],