#!/usr/bin/env python3
"""Collect annotations (highlights and comments) from PDF files via pdfannots."""
from .common import import_file

from pathlib import Path

# path to pdfannots (https://github.com/0xabu/pdfannots)
import mycfg.repos.pdfannots.pdfannots as pdfannots
from mycfg import paths

from datetime import datetime
import re
from subprocess import CompletedProcess
import sys
import io
from typing import NamedTuple, List, Optional
from contextlib import redirect_stderr
import logging


# Base timestamp layout of annotation dates; an optional UTC offset may follow.
_DATE_FORMAT = '%Y%m%d%H%M%S'


def get_logger() -> logging.Logger:
    """Return the logger used by this module ('my.pdfs')."""
    return logging.getLogger('my.pdfs')


def get_candidates(roots=None) -> List[Path]:
    """Return a sorted list of every ``*.pdf`` found recursively under *roots*.

    :param roots: iterable of directories (each one is passed to ``Path``);
        when None, ``paths.pdfs.roots`` from the user config is used.
    """
    if roots is None:
        roots = paths.pdfs.roots

    import itertools
    found = itertools.chain.from_iterable(
        Path(root).glob('**/*.pdf') for root in roots
    )
    return sorted(found)


def is_ignored(p):
    """Delegate to ``paths.pdfs.is_ignored`` to decide whether *p* is skipped."""
    return paths.pdfs.is_ignored(p)


# TODO cachew?
class Annotation(NamedTuple):
    """A single annotation extracted from a PDF."""
    author: Optional[str]
    page: int
    highlight: Optional[str]
    comment: Optional[str]
    date: Optional[datetime]


class Pdf(NamedTuple):
    """Result of processing one PDF file."""
    path: Path
    annotations: List[Annotation]
    # whatever was written to stderr while the file was being processed
    stderr: str

    @property
    def date(self) -> Optional[datetime]:
        """Date of the last annotation in the list, or None if there are none."""
        if not self.annotations:
            return None
        return self.annotations[-1].date


def _parse_date(raw: Optional[str]) -> Optional[datetime]:
    """Parse an annotation date string; None is passed through unchanged.

    :raises RuntimeError: if the (cleaned up) string matches neither the
        plain timestamp format nor the timestamp-with-offset format.
    """
    if raw is None:
        return None

    cleaned = raw.replace("'", "")
    # 20190630213504+0100
    cleaned = re.sub(r'Z0000$', '+0000', cleaned)
    # TODO is it utc if there is not timestamp?
    for fmt in (_DATE_FORMAT, _DATE_FORMAT + '%z'):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    raise RuntimeError(cleaned)


def as_annotation(ann) -> Annotation:
    """Convert an annotation object produced by pdfannots into an `Annotation`.

    The object's attributes are only read: ``vars(ann)`` is the object's own
    ``__dict__``, so modifying it would modify *ann* itself and make a second
    conversion of the same object fail.

    :raises KeyError: if *ann* lacks one of the expected attributes.
    :raises RuntimeError: if the annotation date cannot be parsed.
    """
    fields = vars(ann)
    return Annotation(
        author=fields['author'],
        page=ann.page.pageno,
        highlight=fields['text'],
        comment=fields['contents'],
        date=_parse_date(fields['date']),
    )


class PdfAnnotsException(Exception):
    """Raised when processing a PDF fails; remembers which file it was."""

    def __init__(self, path: Path) -> None:
        super().__init__(path)
        self.path = path


def _get_annots(p: Path) -> Pdf:
    """Run pdfannots on the file at *p*, capturing anything written to stderr."""
    captured = io.StringIO()
    with p.open('rb') as fo:
        with redirect_stderr(captured):
            # outlines are kinda like TOC, I don't really need them
            (annots, _outlines) = pdfannots.process_file(fo, emit_progress=False)

    return Pdf(
        path=p,
        annotations=[as_annotation(a) for a in annots],
        stderr=captured.getvalue(),
    )


def get_annots(p: Path) -> Pdf:
    """Extract annotations from the PDF at *p*.

    :raises PdfAnnotsException: on any failure; the original exception is
        chained as ``__cause__``.
    """
    try:
        return _get_annots(p)
    except Exception as e:
        raise PdfAnnotsException(p) from e


def get_annotated_pdfs(roots=None) -> List[Pdf]:
    """Process all candidate PDFs in a process pool and return the annotated ones.

    Ignored files (see `is_ignored`) are skipped. A file that produced stderr
    output counts as an error and is not collected; a file without annotations
    is silently dropped.

    :param roots: forwarded to `get_candidates`.
    :return: `Pdf` results that have at least one annotation.

    Calls ``sys.exit(2)`` if any file resulted in an error.
    """
    logger = get_logger()

    pdfs = get_candidates(roots=roots)
    logger.info('processing %d pdfs', len(pdfs))

    collected: List[Pdf] = []
    errors: List[str] = []

    def callback(res: Pdf):
        if is_ignored(res.path):
            return
        logger.info('processed %s', res.path)

        if res.stderr:
            err = f'while processing {res.path}: {res.stderr}'
            logger.error(err)
            errors.append(err)
        elif res.annotations:
            logger.info('collected %s annotations', len(res.annotations))
            collected.append(res)

    def error_cb(err):
        if isinstance(err, PdfAnnotsException):
            if is_ignored(err.path):
                # TODO log?
                return
            logger.error('while processing %s', err.path)
            err = err.__cause__
        # The exception arrives as an argument rather than being raised here,
        # so there is no active exception for logger.exception() to attach;
        # log it as a plain error instead.
        logger.error(err)
        errors.append(str(err))

    from multiprocessing.pool import Pool
    with Pool() as pool:
        handles = [
            pool.apply_async(
                get_annots,
                (pdf, ),
                callback=callback,
                error_callback=error_cb,
            )
            for pdf in pdfs
            if not is_ignored(pdf)  # TODO log if we skip?
        ]
        # wait inside the `with` block: leaving it terminates the pool
        for handle in handles:
            handle.wait()

    # TODO more defensive error processing?
    if errors:
        logger.error('had %d errors while processing', len(errors))
        sys.exit(2)

    return collected


def test():
    """Manual check against a local file that is expected to have annotations."""
    res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
    assert len(res.annotations) > 0


def test2():
    """Manual check: print the result of processing a local file."""
    res = get_annots(Path('/L/zzz_borg/downloads/nonlinear2.pdf'))
    print(res)


def main():
    """Collect annotations from the configured roots and pretty-print them."""
    from pprint import pprint

    logger = get_logger()
    from .common import setup_logger
    setup_logger(logger, level=logging.DEBUG)

    for pdf in get_annotated_pdfs():
        logger.warning('collected annotations in: %s', pdf.path)
        for annotation in pdf.annotations:
            pprint(annotation)