HPI/my/hackernews/harmonic.py

"""
[[https://play.google.com/store/apps/details?id=com.simon.harmonichackernews][Harmonic]] app for Hackernews
"""

from __future__ import annotations

# third-party packages imported by this module (see the lxml / orjson imports below)
# NOTE(review): presumably read by HPI tooling to install module dependencies -- confirm
REQUIRES = ['lxml', 'orjson']

from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, TypedDict, cast

import orjson
from lxml import etree
from more_itertools import one

import my.config
from my.core import (
    Paths,
    Res,
    Stats,
    datetime_aware,
    get_files,
    make_logger,
    stat,
)
from my.core.common import unique_everseen

from .common import SavedBase, hackernews_link

import my.config  # isort: skip


# module-level logger, named after this module; used by _saved() to report progress
logger = make_logger(__name__)


@dataclass
class harmonic(my.config.harmonic):
    """
    Configuration for this module, layered on top of the user's ``my.config.harmonic``.
    """

    # location(s) of the exported files: passed to get_files() in inputs(),
    # each resulting file is then parsed as XML in _saved()
    export_path: Paths


def inputs() -> Sequence[Path]:
    """
    Resolve the configured ``export_path`` into the export files to process.
    """
    export_path = harmonic.export_path
    return get_files(export_path)


class Cached(TypedDict):
    author: str
    created_at_i: int
    id: str
    points: int
    test: str | None
    title: str
    type: str  # TODO Literal['story', 'comment']? comments are only in 'children' field tho
    url: str
    # TODO also has children with comments, but not sure I need it?


# TODO if we ever add/use a .text property, need to html.unescape it first
# TODO reuse SavedBase in materialistic?
@dataclass
class Saved(SavedBase):
    """
    A single saved story, backed by the cached JSON blob kept in ``raw``.
    """

    raw: Cached

    @property
    def when(self) -> datetime_aware:
        """Timezone-aware (UTC) datetime built from ``raw['created_at_i']``."""
        created_at = self.raw['created_at_i']
        return datetime.fromtimestamp(created_at, tz=timezone.utc)

    @property
    def uid(self) -> str:
        """Value of ``raw['id']``."""
        item_id = self.raw['id']
        return item_id

    @property
    def url(self) -> str:
        """Value of ``raw['url']``."""
        target = self.raw['url']
        return target

    @property
    def title(self) -> str:
        """Value of ``raw['title']``."""
        headline = self.raw['title']
        return headline

    @property
    def hackernews_link(self) -> str:
        """Link produced by the shared ``hackernews_link`` helper for this item's uid."""
        item_id = self.uid
        return hackernews_link(item_id)

    def __hash__(self) -> int:
        # meh, hashing the serialized blob seems to be the easiest and fastest way to hash a dict
        serialized = orjson.dumps(self.raw)
        return hash(serialized)


# common prefix of the 'name' attributes that _saved() looks up in the exported XML
_PREFIX = 'com.simon.harmonichackernews.KEY_SHARED_PREFERENCES'


def _find_pref(tree: Any, key: str) -> Any:
    """Return the only element in tree whose 'name' attribute is _PREFIX + key (ValueError otherwise)."""
    return one(cast(list[Any], tree.xpath(f'//*[@name="{_PREFIX}{key}"]')))


def _saved() -> Iterator[Res[Saved]]:
    """
    Yield bookmarked stories from every export file returned by inputs().

    Each file is parsed as XML, and the following entries are looked up by their 'name' attribute:

    - ``<prefix>_CACHED_STORIES_STRINGS``: children whose text starts with a story id, followed by '-'
    - ``<prefix>_CACHED_STORY<id>``: JSON blob of the story with that id
    - ``<prefix>_BOOKMARKS``: '-' separated entries of the form ``<id>q<timestamp>``

    Bookmarks without a cached story blob are skipped.
    An export with an empty bookmarks entry contributes nothing.
    Items are not deduplicated across export files here, saved() does that.

    Raises ValueError (from more_itertools.one) if a file doesn't contain exactly one element for a looked up name,
    or if a non-empty bookmark entry isn't of the ``<id>q<timestamp>`` form.
    """
    paths = inputs()
    total = len(paths)
    width = len(str(total))
    for idx, path in enumerate(paths):
        logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
        # TODO defensive for each item!
        tr = etree.parse(path)

        stories_index = _find_pref(tr, '_CACHED_STORIES_STRINGS')
        # text of each child starts with the story id, the rest (after the first '-') isn't needed
        cached_ids = [child.text.split('-')[0] for child in stories_index]

        cached: dict[str, Cached] = {
            sid: orjson.loads(_find_pref(tr, f'_CACHED_STORY{sid}').text)
            for sid in cached_ids
        }

        bookmarks = _find_pref(tr, '_BOOKMARKS')
        # lxml gives None as the text of an element without content
        # presumably that's what an emptied bookmarks list looks like in the export
        bookmarks_text = bookmarks.text or ''
        for entry in bookmarks_text.split('-'):
            if not entry:
                # no bookmarks at all, or nothing between two separators
                continue

            # not sure if timestamp is any useful?
            sid, _timestamp = entry.split('q')

            story = cached.get(sid)
            if story is None:
                # TODO warn or error?
                continue

            yield Saved(story)


def saved() -> Iterator[Res[Saved]]:
    """
    Yield whatever _saved() produces, filtered through unique_everseen to drop repeated items.
    """
    deduplicated = unique_everseen(_saved)
    yield from deduplicated


def stats() -> Stats:
    """
    Merge the results of stat() for inputs and saved (in that order) into a single dict.
    """
    combined: Stats = {}
    for source in (inputs, saved):
        combined.update(stat(source))
    return combined