#!/usr/bin/python3
"""Importer for Polar annotation data.

Parses each document's state.json (synced under BDIR) into Book /
Highlight / Comment named tuples.
"""
from pathlib import Path
from datetime import datetime
import logging
from typing import List, Dict, Iterator, NamedTuple, Sequence, Optional
import json
import pytz

from kython.kerror import ResT, echain, unwrap, sort_res_by
from kython.klogging import setup_logzero
from kython.konsume import wrap, zoom, ignore


# Root directory Polar syncs its per-document state into; one subdirectory
# (named by document uid) per annotated document.
BDIR = Path('/L/zzz_syncthing/data/.polar')


def get_logger() -> logging.Logger:
    """Module-wide logger."""
    return logging.getLogger('polar-provider')


def _get_datas() -> List[Path]:
    """All state.json files, sorted by path (i.e. by document uid)."""
    return list(sorted(BDIR.glob('*/state.json')))


def parse_dt(s: str) -> datetime:
    """Parse a Polar timestamp like '2019-01-01T01:02:03.000Z' into aware UTC."""
    return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ'))

Uid = str

class Error(Exception):
    """Document-level failure; carries the uid (parent dir name) of the document."""
    def __init__(self, p: Path, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)  # type: ignore
        self.uid: Uid = p.parent.name

# Ok I guess handling comment-level errors is a bit too much..
Cid = str
class Comment(NamedTuple):
    # cid: Polar comment id; created: comment timestamp; text: HTML body.
    cid: Cid
    created: datetime
    text: str

Hid = str
class Highlight(NamedTuple):
    # hid: Polar highlight id; selection: highlighted text;
    # comments: comments whose 'ref' points at this highlight.
    hid: Hid
    created: datetime
    selection: str
    comments: Sequence[Comment]


# One result per document: a parsed Book, or an Error explaining the failure.
Result = ResT['Book', Error]

class Book(NamedTuple):
    uid: Uid
    created: datetime
    filename: str
    title: Optional[str]
    items: Sequence[Highlight]


class Loader:
    """Loads one document's state.json into a Book."""

    def __init__(self, p: Path) -> None:
        self.path = p
        self.uid = self.path.parent.name
        self.err = Error(p)
        self.logger = get_logger()

    def error(self, cause, extra=''):
        """Chain `cause` under a fresh document-level Error for this path."""
        return echain(Error(self.path, extra), cause)

    def load_item(self, meta) -> Iterator[Highlight]:
        """Yield Highlights (with their Comments attached) for one page's metadata.

        `meta` is a kython.konsume wrapper over one pageMetas entry.
        NOTE(review): zoom()/consume()/ignore() appear to mark keys as
        visited so unexpected leftover keys can be detected — semantics
        assumed from usage here; confirm against kython.konsume.
        """
        # TODO this should be destructive zoom?
        meta['notes'].zoom()
        meta['pagemarks'].zoom()
        if 'notes' in meta:
            # TODO something nicer?
            notes = meta['notes'].zoom()
        else:
            notes = [] # TODO FIXME dict?
        # NOTE(review): `notes` is never used below this point.
        comments = meta['comments'].zoom()
        meta['questions'].zoom()
        meta['flashcards'].zoom()
        highlights = meta['textHighlights'].zoom()
        meta['areaHighlights'].zoom()
        meta['screenshots'].zoom()
        meta['thumbnails'].zoom()
        if 'readingProgress' in meta:
            meta['readingProgress'].zoom()

        # TODO want to ignore the whole subtree..
        pi = meta['pageInfo'].zoom()
        pi['num'].zoom()

        # Group comments by the highlight they reference; a comment's 'ref'
        # value has the form '<kind>:<highlight id>'.
        # TODO how to make it nicer?
        cmap: Dict[Hid, List[Comment]] = {}
        vals = list(comments.values())
        for v in vals:
            cid = v['id'].zoom()
            v['guid'].zoom()
            # TODO values should probably be checked by flow analysis??
            crt = v['created'].zoom()
            updated = v['lastUpdated'].zoom()
            content = v['content'].zoom()
            html = content['HTML'].zoom()
            refv = v['ref'].zoom().value
            [_, hlid] = refv.split(':')
            ccs = cmap.get(hlid, [])
            cmap[hlid] = ccs
            ccs.append(Comment(
                cid=cid.value,
                created=parse_dt(crt.value),
                text=html.value, # TODO perhaps convert from html to text or org?
            ))
            v.consume()
        for h in list(highlights.values()):
            hid = h['id'].zoom().value
            # Attach (and remove from cmap) the comments targeting this highlight.
            if hid in cmap:
                comments = cmap[hid]
                del cmap[hid]
            else:
                comments = []

            h['guid'].consume()
            crt = h['created'].zoom().value
            updated = h['lastUpdated'].zoom().value
            h['rects'].ignore()

            h['textSelections'].ignore()
            h['notes'].consume()
            h['questions'].consume()
            h['flashcards'].consume()
            h['color'].consume()
            h['images'].ignore()
            # TODO eh, quite excessive \ns...
            text = h['text'].zoom()['TEXT'].zoom().value

            yield Highlight(
                hid=hid,
                created=parse_dt(crt),
                selection=text,
                comments=tuple(comments),
            )
            h.consume()

        # Any comment whose target highlight never appeared is a data error.
        if len(cmap) > 0:
            raise RuntimeError(f'Unconsumed comments: {cmap}')
        # TODO sort by date?
+ + + def load_items(self, metas) -> Iterator[Highlight]: + for p, meta in metas.items(): + with wrap(meta) as meta: + yield from self.load_item(meta) + + def load(self) -> Iterator[Result]: + self.logger.info('processing %s', self.path) + j = json.loads(self.path.read_text()) + + # TODO konsume here as well? + di = j['docInfo'] + added = di['added'] + filename = di['filename'] + title = di.get('title', None) + tags = di['tags'] + pm = j['pageMetas'] + + yield Book( + uid=self.uid, + created=parse_dt(added), + filename=filename, + title=title, + items=list(self.load_items(pm)), + ) + + +def iter_entries() -> Iterator[Result]: + logger = get_logger() + for d in _get_datas(): + loader = Loader(d) + try: + yield from loader.load() + except Exception as ee: + err = loader.error(ee) + logger.exception(err) + yield err + +def get_entries() -> List[Result]: + # sorting by first annotation is reasonable I guess??? + # TODO + return list(sort_res_by(iter_entries(), key=lambda e: e.created)) + +def main(): + logger = get_logger() + setup_logzero(logger, level=logging.DEBUG) + + for entry in iter_entries(): + logger.info('processed %s', entry.uid) + try: + ee = unwrap(entry) + except Error as e: + logger.exception(e) + else: + for i in ee.items: + logger.info(i) + + +if __name__ == '__main__': + main()