HPI/my/pdfs.py

#!/usr/bin/env python3
from pathlib import Path
from datetime import datetime
import re
from multiprocessing.pool import Pool
from subprocess import CompletedProcess
import sys
import io
from typing import NamedTuple, List, Optional
from contextlib import redirect_stderr
import logging
from pprint import pprint
import itertools

from kython import import_file
from kython.klogging import setup_logzero


from ..ext.pdfannots import pdfannots # type: ignore

from .private import ROOT_PATHS, is_ignored


def get_logger():
    """Return the logger shared by every function in this module."""
    logger_name = 'annotation-crawler'
    return logging.getLogger(logger_name)


def get_candidates() -> List[Path]:
    """Return every ``*.pdf`` found recursively under ``ROOT_PATHS``, sorted."""
    found: List[Path] = []
    for root in ROOT_PATHS:
        # '**/*.pdf' makes the glob descend into all subdirectories of the root
        found.extend(Path(root).glob('**/*.pdf'))
    found.sort()
    return found


# TODO cachew?
class Annotation(NamedTuple):
    """A single annotation extracted from a PDF (built by ``as_annotation``)."""
    # taken from the raw annotation's ``author`` attribute
    author: Optional[str]
    # page number, taken from the raw annotation's ``page.pageno``
    page: int
    # the raw annotation's ``text`` attribute
    # NOTE(review): presumably the highlighted passage -- confirm against pdfannots
    highlight: Optional[str]
    # the raw annotation's ``contents`` attribute
    # NOTE(review): presumably the note typed by the annotator -- confirm against pdfannots
    comment: Optional[str]
    # parsed timestamp; timezone-aware only when the raw date string carried an offset
    date: Optional[datetime]


class Pdf(NamedTuple):
    """Result of processing one PDF file."""
    # location of the processed file
    path: Path
    # annotations extracted from the file; may be empty
    annotations: List[Annotation]
    # everything written to stderr while the file was being processed
    stderr: str

    @property
    def date(self) -> Optional[datetime]:
        """Date of the last annotation in ``annotations``.

        Returns None when the file has no annotations (instead of failing
        with IndexError), or when the last annotation carries no date.
        """
        if not self.annotations:
            return None
        return self.annotations[-1].date


def as_annotation(ann) -> Annotation:
    """Convert a raw ``pdfannots`` annotation object into an ``Annotation``.

    ``ann`` is only read, never modified: ``vars()`` returns the object's live
    attribute dictionary, so writing to it would alter the caller's object
    (and make a second conversion of the same object fail).

    Args:
        ann: object exposing ``page.pageno`` and the instance attributes
            ``author``, ``text``, ``contents`` and ``date``. ``date`` is either
            None or a string such as ``20190630213504+0100``.

    Returns:
        The corresponding ``Annotation``. Its ``date`` is None when the raw
        date is None, naive when the string has no UTC offset, and
        timezone-aware otherwise.

    Raises:
        RuntimeError: if the date string matches none of the supported formats
            (the cleaned-up string is the exception argument).
        KeyError: if one of the expected attributes is missing from ``vars(ann)``.
    """
    # Treat the attribute dict as read-only (see docstring).
    fields = vars(ann)
    page = ann.page.pageno

    raw_date = fields['date']
    date: Optional[datetime] = None
    if raw_date is not None:
        # Drop apostrophes so the result looks like 20190630213504+0100
        # (presumably offsets arrive written as +01'00').
        cleaned = raw_date.replace("'", "")
        # Rewrite a trailing 'Z0000' as an explicit zero offset that %z can parse.
        cleaned = re.sub(r'Z0000$', '+0000', cleaned)
        base_fmt = '%Y%m%d%H%M%S'
        # TODO is it utc if there is not timestamp?
        for fmt in (base_fmt, base_fmt + '%z'):
            try:
                date = datetime.strptime(cleaned, fmt)
                break
            except ValueError:
                # not this format; try the next one
                pass
        else:
            # no format matched
            raise RuntimeError(cleaned)

    return Annotation(
        author=fields['author'],
        page=page,
        highlight=fields['text'],
        comment=fields['contents'],
        date=date,
    )


class PdfAnnotsException(Exception):
    """Raised when extracting annotations from a PDF fails.

    The offending file is available as ``path``; the underlying error is
    attached as the exception's cause by ``get_annots``.
    """

    def __init__(self, path: Path) -> None:
        super().__init__(path)
        self.path = path


def _get_annots(p: Path) -> Pdf:
    """Extract the annotations of the PDF at ``p``.

    Anything written to stderr while the file is being processed is captured
    and returned in the ``stderr`` field of the result instead of being
    printed.
    """
    captured = io.StringIO()
    with p.open('rb') as pdf_file, redirect_stderr(captured):
        # outlines are kinda like TOC, I don't really need them
        annots, _outlines = pdfannots.process_file(pdf_file, emit_progress=False)
    converted = [as_annotation(raw) for raw in annots]
    return Pdf(path=p, annotations=converted, stderr=captured.getvalue())


def get_annots(p: Path) -> Pdf:
    """Extract annotations from ``p``, tagging any failure with the file path.

    Raises:
        PdfAnnotsException: on any error; it carries ``p`` and has the
            original exception as its cause.
    """
    try:
        result = _get_annots(p)
    except Exception as exc:
        raise PdfAnnotsException(p) from exc
    return result


def test():
    """Check that a known annotated PDF yields at least one annotation."""
    sample = Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf')
    pdf = get_annots(sample)
    assert len(pdf.annotations) > 0


def test2():
    """Process a sample PDF and print the result for manual inspection."""
    sample = Path('/L/zzz_borg/downloads/nonlinear2.pdf')
    print(get_annots(sample))


def get_annotated_pdfs(pdfs=None) -> List[Pdf]:
    """Extract annotations from PDFs in a process pool.

    Args:
        pdfs: paths of the files to process; when None, the result of
            ``get_candidates()`` is used. Paths for which ``is_ignored``
            is true are skipped.

    Returns:
        The ``Pdf`` results that wrote nothing to stderr and contain at least
        one annotation, in the order their completion callbacks ran.

    Raises:
        SystemExit: with status 2 (via ``sys.exit``) if any file failed to
            process or produced stderr output.
    """
    logger = get_logger()
    setup_logzero(logger, level=logging.DEBUG)

    if pdfs is None:
        pdfs = get_candidates()
    logger.info('processing %d pdfs', len(pdfs))

    collected: List[Pdf] = []
    errors: List[str] = []

    def callback(res: Pdf):
        # invoked by the pool for every file that was processed without raising
        if is_ignored(res.path):
            return
        logger.info('processed %s', res.path)

        if len(res.stderr) > 0:
            # stderr output counts as a failure; such a result is not collected
            err = 'while processing %s: %s' % (res.path, res.stderr)
            logger.error(err)
            errors.append(err)
        elif len(res.annotations) > 0:
            logger.info('collected %s annotations', len(res.annotations))
            collected.append(res)

    def error_cb(err):
        # invoked by the pool for every file whose processing raised
        if isinstance(err, PdfAnnotsException):
            if is_ignored(err.path):
                # TODO log?
                return
            logger.error('while processing %s', err.path)
            # report the underlying failure rather than the wrapper
            err = err.__cause__
        # This callback does not run inside an `except` block, so
        # logger.exception() would have no active exception to attach and
        # would only append a bogus 'NoneType: None' to the record.
        logger.error('%s', err)
        errors.append(str(err))

    with Pool() as p:
        handles = [p.apply_async(
            get_annots,
            (pdf, ),
            callback=callback,
            error_callback=error_cb,
        ) for pdf in pdfs if not is_ignored(pdf)] # TODO log if we skip?
        # block until every task has finished (and its callback has run)
        for h in handles:
            h.wait()

    # TODO more defensive error processing?
    if len(errors) > 0:
        logger.error('had %d errors while processing', len(errors))
        sys.exit(2)

    return collected


def main():
    """Crawl all candidate PDFs and pretty-print every collected annotation."""
    logger = get_logger()

    # iterating an empty result is a no-op, so no separate emptiness check is needed
    for pdf in get_annotated_pdfs():
        logger.warning('collected annotations in: %s', pdf.path)
        for annotation in pdf.annotations:
            pprint(annotation)


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()