#!/usr/bin/python3
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterator, List, NamedTuple, Optional, Sequence

import pytz

from ..common import setup_logger

from kython.kerror import ResT, echain, unwrap, sort_res_by
from kython.konsume import wrap, zoom, ignore


_POLAR_DIR = Path('~/.polar')


def get_logger():
    return logging.getLogger('my.reading.polar')


def _get_datas() -> List[Path]:
    return list(sorted(_POLAR_DIR.expanduser().glob('*/state.json')))
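
# Expected on-disk layout, inferred from the glob above (the directory name
# doubles as the book uid, see Loader below):
#
#   ~/.polar/<book uid>/state.json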


def parse_dt(s: str) -> datetime:
    return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ'))
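
# Judging by the format string, Polar stores timestamps as ISO 8601 UTC with
# a literal trailing 'Z'; a hypothetical example:
#
#   parse_dt('2019-08-17T19:12:35.342Z')
#   # -> datetime.datetime(2019, 8, 17, 19, 12, 35, 342000, tzinfo=<UTC>)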


Uid = str


class Error(Exception):
    def __init__(self, p: Path, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)  # type: ignore
        self.uid: Uid = p.parent.name


# Ok I guess handling comment-level errors is a bit too much..
Cid = str

class Comment(NamedTuple):
    cid: Cid
    created: datetime
    text: str


Hid = str

class Highlight(NamedTuple):
    hid: Hid
    created: datetime
    selection: str
    comments: Sequence[Comment]


Result = ResT['Book', Error]

class Book(NamedTuple):
    uid: Uid
    created: datetime
    filename: str
    title: Optional[str]
    items: Sequence[Highlight]
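
# A Result is either a fully parsed Book or an Error tagged with the book's
# uid: iter_entries below yields errors as values instead of raising, so
# consumers can decide how to handle partially corrupted data.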


class Loader:
    def __init__(self, p: Path) -> None:
        self.path = p
        self.uid = self.path.parent.name
        self.err = Error(p)
        self.logger = get_logger()

    def error(self, cause, extra=''):
        return echain(Error(self.path, extra), cause)
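
    # Everything below leans on the kython.konsume wrapper: zoom() is assumed
    # to descend into a key and mark it consumed, while ignore()/consume()
    # discard a value or whole subtree; wrap() in load_items then presumably
    # flags anything left untouched, so new fields in Polar's format get
    # noticed rather than silently dropped.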

    def load_item(self, meta) -> Iterator[Highlight]:
        # TODO this should be destructive zoom?
        meta['notes'].zoom()
        meta['pagemarks'].zoom()
        if 'notes' in meta:
            # TODO something nicer?
            notes = meta['notes'].zoom()
        else:
            notes = []  # TODO FIXME dict?
        comments = meta['comments'].zoom()
        meta['questions'].zoom()
        meta['flashcards'].zoom()
        highlights = meta['textHighlights'].zoom()
        meta['areaHighlights'].zoom()
        meta['screenshots'].zoom()
        meta['thumbnails'].zoom()
        if 'readingProgress' in meta:
            meta['readingProgress'].zoom()

        # TODO want to ignore the whole subtree..
        pi = meta['pageInfo'].zoom()
        pi['num'].zoom()

        # TODO how to make it nicer?
        cmap: Dict[Hid, List[Comment]] = {}
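
        # collect comments keyed by the highlight they refer to; a comment
        # entry is assumed to look roughly like this (keys taken from the
        # accesses below, values illustrative):
        #   {"id": ..., "guid": ..., "created": "...Z", "lastUpdated": "...Z",
        #    "content": {"HTML": "<p>...</p>"}, "ref": "<kind>:<highlight id>"}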
        vals = list(comments.values())
        for v in vals:
            cid = v['id'].zoom()
            v['guid'].zoom()
            # TODO values should probably be checked by flow analysis??
            crt = v['created'].zoom()
            updated = v['lastUpdated'].zoom()
            content = v['content'].zoom()
            html = content['HTML'].zoom()
            refv = v['ref'].zoom().value
            # ref is assumed to be of the form '<kind>:<highlight id>'
            [_, hlid] = refv.split(':')
            cmap.setdefault(hlid, []).append(Comment(
                cid=cid.value,
                created=parse_dt(crt.value),
                text=html.value,  # TODO perhaps convert from html to text or org?
            ))
            v.consume()
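
        # now the highlights themselves: attach the comments collected above
        # by highlight id (a text highlight entry is assumed to carry at
        # least id, guid, created, lastUpdated and text.TEXT, plus the
        # geometry and media fields that are discarded below)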
        for h in list(highlights.values()):
            hid = h['id'].zoom().value
            comments = cmap.pop(hid, [])

            h['guid'].consume()
            crt = h['created'].zoom().value
            updated = h['lastUpdated'].zoom().value
            h['rects'].ignore()

            h['textSelections'].ignore()
            h['notes'].consume()
            h['questions'].consume()
            h['flashcards'].consume()
            h['color'].consume()
            h['images'].ignore()
            # TODO eh, quite excessive \ns...
            text = h['text'].zoom()['TEXT'].zoom().value

            yield Highlight(
                hid=hid,
                created=parse_dt(crt),
                selection=text,
                comments=tuple(comments),
            )
            h.consume()

        if len(cmap) > 0:
            raise RuntimeError(f'Unconsumed comments: {cmap}')
        # TODO sort by date?

    def load_items(self, metas) -> Iterator[Highlight]:
        for p, meta in metas.items():
            with wrap(meta) as meta:
                yield from self.load_item(meta)

    def load(self) -> Iterator[Result]:
        self.logger.info('processing %s', self.path)
        j = json.loads(self.path.read_text())

        # TODO konsume here as well?
        di = j['docInfo']
        added = di['added']
        filename = di['filename']
        title = di.get('title', None)
        tags = di['tags']
        pm = j['pageMetas']

        yield Book(
            uid=self.uid,
            created=parse_dt(added),
            filename=filename,
            title=title,
            items=list(self.load_items(pm)),
        )
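
# Rough shape of state.json as assumed by Loader.load (keys taken from the
# accesses above, values illustrative):
#
#   {
#     "docInfo": {"added": "...Z", "filename": "...", "title": "...", "tags": {...}},
#     "pageMetas": {
#       "<page>": {"comments": {...}, "textHighlights": {...}, "notes": {...}, ...}
#     }
#   }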


def iter_entries() -> Iterator[Result]:
    logger = get_logger()
    for d in _get_datas():
        loader = Loader(d)
        try:
            yield from loader.load()
        except Exception as ee:
            err = loader.error(ee)
            logger.exception(err)
            yield err


def get_entries() -> List[Result]:
    # sorting by first annotation is reasonable I guess???
    # TODO
    return list(sort_res_by(iter_entries(), key=lambda e: e.created))


def main():
    logger = get_logger()
    setup_logger(logger, level=logging.DEBUG)

    for entry in iter_entries():
        logger.info('processed %s', entry.uid)
        try:
            ee = unwrap(entry)
        except Error as e:
            logger.exception(e)
        else:
            for i in ee.items:
                logger.info(i)


if __name__ == '__main__':
    main()