HPI/reading/polar.py
2019-05-03 23:30:59 +01:00

170 lines
4.5 KiB
Python
Executable file

#!/usr/bin/python3
from pathlib import Path
import logging
from typing import List, Dict, Iterator, NamedTuple, Sequence, Optional
import json
from kython.kerror import ResT, echain, unwrap, sort_res_by
from kython.klogging import setup_logzero
# Root of the Polar data directory; documents are looked up as
# <BDIR>/<uid>/state.json (see the glob in _get_datas).
BDIR = Path('/L/zzz_syncthing/data/.polar')
def get_logger():
    """Return the logger used by the Polar provider."""
    logger_name = 'polar-provider'
    return logging.getLogger(logger_name)
def _get_datas() -> List[Path]:
    """Return every ``<BDIR>/*/state.json`` path, in sorted order."""
    state_files = BDIR.glob('*/state.json')
    # sorted() already returns a new list.
    return sorted(state_files)
# Identifier of a Polar document: the name of the directory that holds its
# state.json file.
Uid = str
class Error(Exception):
    """Failure associated with one Polar document.

    ``p`` is the path of the document's state file; the name of its parent
    directory is kept as ``uid``. All remaining arguments are forwarded to
    ``Exception`` unchanged.
    """

    def __init__(self, p: Path, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)  # type: ignore
        document_dir = p.parent
        self.uid: Uid = document_dir.name
# TODO not sure if I even need comment?
# Ok I guess handling comment-level errors is a bit too much..
# Identifier of a single comment; filled from the comment's 'id' field.
Cid = str
class Item(NamedTuple):
    """A single comment extracted from a Polar document."""
    cid: Cid  # value of the comment's 'id' field
    created: str  # raw 'created' value; TODO datetime (parse iso)
    comment: str  # value of the comment's content 'HTML' field, kept as-is
# What Loader.load yields: a Book on success, or an Error when the document's
# top-level structure could not be read.
ResultBook = ResT['Book', Error]
class Book(NamedTuple):
    """One Polar document together with the comments found in it."""
    uid: Uid  # name of the directory containing the document's state.json
    filename: str  # docInfo['filename']
    title: Optional[str]  # docInfo['title']; None when the key is absent
    items: Sequence[Item]  # comments collected from all pageMetas entries
from kython.konsume import zoom, akeq
class Loader:
    """Parse one Polar ``state.json`` file into a ``Book``.

    Per-page metadata is walked through ``kython.konsume`` wrappers: every key
    the loader knows about is explicitly marked as handled with ``zoom()`` /
    ``consume()`` / ``consume_all()``.
    NOTE(review): presumably ``wrap`` complains about keys left unhandled when
    its context exits -- confirm against kython.konsume.
    """

    def __init__(self, p: Path) -> None:
        self.path = p
        # Documents live at <uid>/state.json, so the directory name is the id.
        self.uid = self.path.parent.name
        self.err = Error(p)
        self.logger = get_logger()

    def error(self, cause, extra):
        """Return an ``Error`` for this file carrying *extra*, linked to *cause*.

        The linking is delegated to ``echain`` (presumably it records *cause*
        as the exception's cause -- confirm against kython.kerror).
        """
        return echain(Error(self.path, extra), cause)

    def load_item(self, meta) -> Iterator[Item]:
        """Yield an ``Item`` for every comment in one wrapped page-meta entry.

        *meta* is expected to be a konsume-wrapped mapping with the keys found
        in Polar's ``pageMetas`` values. ``notes`` and ``readingProgress`` are
        treated as optional; all other keys accessed here must be present.
        """
        # TODO this should be destructive zoom?
        meta['pagemarks'].zoom()
        # 'notes' is optional: only touch it when present. (It used to be
        # indexed unconditionally right before this check, which raised
        # KeyError for documents without notes and made the check pointless.)
        if 'notes' in meta:
            # TODO something nicer?
            meta['notes'].zoom()
        comments = meta['comments'].zoom()
        meta['questions'].zoom()
        meta['flashcards'].zoom()
        highlights = meta['textHighlights'].zoom()
        meta['areaHighlights'].zoom()
        meta['screenshots'].zoom()
        meta['thumbnails'].zoom()
        if 'readingProgress' in meta:
            meta['readingProgress'].zoom()
        # TODO want to ignore the whole subtree..
        page_info = meta['pageInfo'].zoom()
        page_info['num'].zoom()

        # TODO how to make it nicer?
        # Snapshot the values first: consume() below presumably removes the
        # comment from `comments`, which must not happen during iteration.
        for comment in list(comments.values()):
            cid = comment['id'].zoom()
            comment['guid'].zoom()
            # TODO values should probably be checked by flow analysis??
            created = comment['created'].zoom()
            comment['lastUpdated'].zoom()  # acknowledged, not used yet
            content = comment['content'].zoom()
            html = content['HTML'].zoom()
            comment['ref'].zoom()  # TODO it actually might be pretty useful.. similar to hypothesis??
            yield Item(
                cid=cid.value,
                created=created.value,
                comment=html.value,  # TODO perhaps convert from html to text or org?
            )
            comment.consume()
        highlights.consume_all()
        # TODO need to process text highlights...

    def load_items(self, metas) -> Iterator[Item]:
        """Yield the items of every page-meta entry in the *metas* mapping."""
        from kython.konsume import wrap
        # Only the values are needed; the keys of `metas` are not used.
        for raw_meta in metas.values():
            with wrap(raw_meta) as meta:
                yield from self.load_item(meta)

    def load(self) -> Iterator[ResultBook]:
        """Read the state file and yield exactly one result for it.

        Yields an ``Error`` (after logging it) when the top-level keys cannot
        be read, otherwise a ``Book``. Failures while reading or parsing the
        JSON, or while extracting the items, are not caught here and
        propagate to the caller.
        """
        self.logger.info('processing %s', self.path)
        j = json.loads(self.path.read_text())
        try:
            di = j['docInfo']
            filename = di['filename']
            title = di.get('title', None)
            # Value is not used yet; the lookup is kept so that a document
            # without 'tags' is still reported as an error.
            di['tags']
            pm = j['pageMetas']
        except Exception as ex:
            err = self.error(ex, j)
            self.logger.exception(err)
            yield err
            return

        # TODO should I group by book???
        yield Book(
            uid=self.uid,
            filename=filename,
            title=title,
            items=list(self.load_items(pm)),
        )
# "textHighlights": {},
# "comments": {},
# TODO
# "pagemarks": {},
# "notes": {},
# "questions": {},
# "flashcards": {},
# "areaHighlights": {},
# "screenshots": {},
# "thumbnails": {},
# "readingProgress": {},
# "pageInfo": {
# "num": 1
# }
def iter_entries() -> Iterator[ResultBook]:
    """Yield the load result of every state file returned by ``_get_datas``."""
    for state_path in _get_datas():
        loader = Loader(state_path)
        yield from loader.load()
def main():
    """Load every Polar document and log the books and comments found.

    Entries produced by ``iter_entries`` are either a ``Book`` or an
    ``Error``. Errors are logged and skipped; for books, the uid and each
    item are logged.
    """
    logger = get_logger()
    setup_logzero(logger, level=logging.DEBUG)
    for entry in iter_entries():
        # Unwrap the entry itself: it is the book-level result that may be an
        # Error. Reading `.items` from an Error would raise AttributeError,
        # and the items of a Book are plain Item objects that need no
        # unwrapping. `unwrap` is expected to raise the entry when it is an
        # Error and return it otherwise.
        try:
            book = unwrap(entry)
        except Error as e:
            logger.exception(e)
            continue
        logger.info('processed %s', book.uid)
        for item in book.items:
            logger.info(item)
# Run the provider only when executed as a script, not on import.
if __name__ == '__main__':
    main()