my.hackernews.dogsheep: use utc datetime + minor cleanup

This commit is contained in:
karlicoss 2023-10-27 02:27:04 +01:00
parent bef0423b4f
commit 3a25c9042c

View file

@@ -4,18 +4,19 @@ Hackernews data via Dogsheep [[hacker-news-to-sqlite][https://github.com/dogshee
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Iterator, Sequence, Optional from typing import Iterator, Sequence, Optional
from my.core import get_files, Paths, Res from my.core import get_files, Paths, Res, datetime_aware
from my.core.sqlite import sqlite_connection from my.core.sqlite import sqlite_connection
import my.config
from my.config import hackernews as user_config from .common import hackernews_link
@dataclass @dataclass
class config(user_config.dogsheep): class config(my.config.hackernews.dogsheep):
# path[s]/glob to the dogsheep database # path[s]/glob to the dogsheep database
export_path: Paths export_path: Paths
@@ -26,15 +27,12 @@ def inputs() -> Sequence[Path]:
return get_files(config.export_path) return get_files(config.export_path)
from .common import hackernews_link
# TODO not sure if worth splitting into Comment and Story? # TODO not sure if worth splitting into Comment and Story?
@dataclass(unsafe_hash=True) @dataclass(unsafe_hash=True)
class Item: class Item:
id: str id: str
type: str type: str
# TODO is it urc?? created: datetime_aware # checked and it's utc
created: datetime
title: Optional[str] # only present for Story title: Optional[str] # only present for Story
text_html: Optional[str] # should be present for Comment and might for Story text_html: Optional[str] # should be present for Comment and might for Story
url: Optional[str] # might be present for Story url: Optional[str] # might be present for Story
@@ -44,6 +42,8 @@ class Item:
@property @property
def permalink(self) -> str: def permalink(self) -> str:
return hackernews_link(self.id) return hackernews_link(self.id)
# TODO hmm kinda annoying that permalink isn't getting serialized # TODO hmm kinda annoying that permalink isn't getting serialized
# maybe won't be such a big problem if we used hpi query directly on objects, without jsons? # maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
# so we could just take .permalink thing # so we could just take .permalink thing
@@ -56,7 +56,7 @@ def items() -> Iterator[Res[Item]]:
yield Item( yield Item(
id=r['id'], id=r['id'],
type=r['type'], type=r['type'],
created=datetime.fromtimestamp(r['time']), created=datetime.fromtimestamp(r['time'], tz=timezone.utc),
title=r['title'], title=r['title'],
# todo hmm maybe a method to strip off html tags would be nice # todo hmm maybe a method to strip off html tags would be nice
text_html=r['text'], text_html=r['text'],