#!/usr/bin/env python3
"""Crawl PDF files under the configured roots and collect their annotations."""
from pathlib import Path
from datetime import datetime
import re
from multiprocessing.pool import Pool
from subprocess import CompletedProcess
import sys
import io
from typing import NamedTuple, List, Optional
from contextlib import redirect_stderr
import logging
from pprint import pprint
import itertools

from kython import import_file
from kython.klogging import setup_logzero


from ..ext.pdfannots import pdfannots # type: ignore

from .private import ROOT_PATHS, is_ignored


# Base strptime layout of an annotation date; an optional UTC offset may follow.
_DATE_FORMAT = '%Y%m%d%H%M%S'


def get_logger():
    """Return the logger shared by the annotation crawler."""
    return logging.getLogger('annotation-crawler')


def get_candidates() -> List[Path]:
    """Return every ``*.pdf`` found recursively under ``ROOT_PATHS``, sorted."""
    pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in ROOT_PATHS)
    return sorted(pdfs)


# TODO cachew?
class Annotation(NamedTuple):
    """A single annotation extracted from a PDF."""
    author: Optional[str]
    page: int                  # taken from ``ann.page.pageno``
    highlight: Optional[str]   # the annotation's ``text`` attribute
    comment: Optional[str]     # the annotation's ``contents`` attribute
    date: Optional[datetime]   # None when the annotation carries no date


class Pdf(NamedTuple):
    """Result of processing one PDF file."""
    path: Path
    annotations: List[Annotation]
    stderr: str  # whatever was written to stderr while the file was processed

    @property
    def date(self) -> Optional[datetime]:
        """Date of the last annotation in the list, or None if there are none."""
        if not self.annotations:
            return None
        return self.annotations[-1].date


def _parse_pdf_date(raw: str) -> datetime:
    """Parse an annotation date string such as ``20190630213504+0100``.

    Apostrophes are dropped, an optional leading ``D:`` is stripped and a
    trailing ``Z`` / ``Z0000`` is rewritten as ``+0000`` before parsing.

    Raises:
        RuntimeError: if the cleaned-up string matches neither the plain
            layout nor the layout with a UTC offset.
    """
    cleaned = raw.replace("'", "")
    if cleaned.startswith('D:'):
        cleaned = cleaned[2:]
    # 20190630213504+0100
    cleaned = re.sub(r'Z(0000)?$', '+0000', cleaned)
    # TODO is it utc if there is not timestamp?
    for fmt in (_DATE_FORMAT, _DATE_FORMAT + '%z'):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    raise RuntimeError(cleaned)


def as_annotation(ann) -> Annotation:
    """Convert an annotation object produced by pdfannots into an ``Annotation``.

    Reads the ``author``, ``text``, ``contents`` and ``date`` instance
    attributes plus ``ann.page.pageno``.

    Raises:
        RuntimeError: if the date string cannot be parsed.
    """
    # vars() returns the object's live __dict__; copy it so that *ann* itself
    # is left untouched.
    fields = dict(vars(ann))
    raw_date = fields['date']
    date: Optional[datetime] = None
    if raw_date is not None:
        date = _parse_pdf_date(raw_date)
    return Annotation(
        author   =fields['author'],
        page     =ann.page.pageno,
        highlight=fields['text'],
        comment  =fields['contents'],
        date     =date,
    )


class PdfAnnotsException(Exception):
    """Raised when processing a PDF fails; ``path`` is the offending file."""

    def __init__(self, path: Path) -> None:
        super().__init__(path)
        self.path = path


def _get_annots(p: Path) -> Pdf:
    """Extract annotations from the PDF at *p*, capturing stderr output."""
    captured = io.StringIO()
    with p.open('rb') as fo:
        with redirect_stderr(captured):
            (annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
            # outlines are kinda like TOC, I don't really need them
    return Pdf(
        path=p,
        annotations=list(map(as_annotation, annots)),
        stderr=captured.getvalue(),
    )


def get_annots(p: Path) -> Pdf:
    """Like ``_get_annots`` but wraps any failure in ``PdfAnnotsException``.

    The wrapper carries the path so the caller can tell which file failed.
    """
    try:
        return _get_annots(p)
    except Exception as e:
        raise PdfAnnotsException(p) from e


def test():
    """Smoke test; relies on a PDF at a hard-coded local path."""
    res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
    assert len(res.annotations) > 0


def test2():
    """Smoke test; relies on a PDF at a hard-coded local path."""
    res = get_annots(Path('/L/zzz_borg/downloads/nonlinear2.pdf'))
    print(res)


def get_annotated_pdfs(pdfs: Optional[List[Path]] = None) -> List[Pdf]:
    """Process *pdfs* in a worker pool and return those that have annotations.

    Args:
        pdfs: files to process; defaults to ``get_candidates()``.

    Returns:
        The ``Pdf`` results that produced no stderr output and contain at
        least one annotation.

    Exits the interpreter with status 2 if any file produced an error.
    """
    logger = get_logger()
    setup_logzero(logger, level=logging.DEBUG)

    if pdfs is None:
        pdfs = get_candidates()
    logger.info('processing %d pdfs', len(pdfs))

    collected: List[Pdf] = []
    errors: List[str] = []

    def callback(res: Pdf):
        if is_ignored(res.path):
            return
        logger.info('processed %s', res.path)

        if res.stderr:
            err = 'while processing %s: %s' % (res.path, res.stderr)
            logger.error(err)
            errors.append(err)
        elif res.annotations:
            logger.info('collected %s annotations', len(res.annotations))
            collected.append(res)

    def error_cb(err):
        if isinstance(err, PdfAnnotsException):
            if is_ignored(err.path):
                # TODO log?
                return
            logger.error('while processing %s', err.path)
            # Report the underlying cause; fall back to the wrapper itself
            # if no cause is attached.
            err = err.__cause__ or err
        # This callback is not running inside an ``except`` block, so
        # logger.exception() would append a bogus "NoneType: None" traceback.
        logger.error('%s', err)
        errors.append(str(err))

    with Pool() as p:
        handles = [p.apply_async(
            get_annots,
            (pdf, ),
            callback=callback,
            error_callback=error_cb,
        ) for pdf in pdfs if not is_ignored(pdf)] # TODO log if we skip?
        for h in handles:
            h.wait()

    # TODO more defensive error processing?
    if errors:
        logger.error('had %d errors while processing', len(errors))
        sys.exit(2)

    return collected


def main():
    """Collect annotations from all candidate PDFs and pretty-print them."""
    logger = get_logger()

    for r in get_annotated_pdfs():
        logger.warning('collected annotations in: %s', r.path)
        for a in r.annotations:
            pprint(a)


if __name__ == '__main__':
    main()