#!/usr/bin/env python3
'''
PDF documents and annotations on your filesystem
'''
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
import re
import sys
import io
import logging
from pathlib import Path
from typing import NamedTuple, List, Optional, Iterator
from contextlib import redirect_stderr


from .common import mcachew, group_by_key
from .error import Res, split_errors

# path to pdfannots (https://github.com/0xabu/pdfannots)
import my.config.repos.pdfannots.pdfannots as pdfannots
from my.config import pdfs as config
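
# Note on configuration: judging by how `config` is used below, `my.config.pdfs`
# is expected to provide roughly the following (names inferred from usage; the
# exact shape of the user's config is an assumption):
#
#     class pdfs:
#         roots = ['/path/to/your/pdfs']   # directories scanned by candidates()
#
#         @staticmethod
#         def is_ignored(p) -> bool:
#             return False                 # e.g. skip some extremely heavy files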


def get_logger():
    return logging.getLogger('my.pdfs')


def is_ignored(p: Path) -> bool:
    # ignore some extremely heavy files
    return config.is_ignored(p)


def candidates(roots=None) -> Iterator[Path]:
    if roots is None:
        roots = config.roots

    for r in roots:
        for p in Path(r).rglob('*.pdf'):
            if not is_ignored(p):
                yield p

# TODO canonical names
# TODO defensive if pdf was removed, also cachew key needs to be defensive


class Annotation(NamedTuple):
    path: str
    author: Optional[str]
    page: int
    highlight: Optional[str]
    comment: Optional[str]
    date: Optional[datetime]


def as_annotation(*, raw_ann, path: str) -> Annotation:
    d = vars(raw_ann)
    d['page'] = raw_ann.page.pageno
    # drop positional info ('boxes', 'rect'); not needed for Annotation
    for a in ('boxes', 'rect'):
        if a in d:
            del d[a]
    dates = d['date']
    date: Optional[datetime] = None
    if dates is not None:
        dates = dates.replace("'", "")
        # e.g. 20190630213504+0100
        dates = re.sub('Z0000$', '+0000', dates)
        FMT = '%Y%m%d%H%M%S'
        # TODO is it UTC if there is no timezone offset?
        for fmt in [FMT, FMT + '%z']:
            try:
                date = datetime.strptime(dates, fmt)
                break
            except ValueError:
                pass
        else:
            # TODO defensive?
            raise RuntimeError(dates)
    return Annotation(
        path      = path,
        author    = d['author'],
        page      = d['page'],
        highlight = d['text'],
        comment   = d['contents'],
        date      = date,
    )
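
# Illustrative sanity check for the date handling above (not part of the module;
# the cleaned-up string format, e.g. '20190630213504+0100', is taken from the
# comment in as_annotation):
#
#     datetime.strptime('20190630213504+0100', '%Y%m%d%H%M%S%z')
#     # -> 2019-06-30 21:35:04+01:00
#     datetime.strptime('20190630213504', '%Y%m%d%H%M%S')
#     # -> 2019-06-30 21:35:04  (naive; see the TODO about missing timezones)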


def get_annots(p: Path) -> List[Annotation]:
    with p.open('rb') as fo:
        f = io.StringIO()
        # capture pdfannots' stderr chatter so it doesn't pollute our output
        with redirect_stderr(f):
            (annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
            # outlines are kinda like TOC, I don't really need them
    return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
    # TODO stderr?


def hash_files(pdfs: List[Path]):
    # if mtime hasn't changed then the file hasn't changed either
    return [(pdf, pdf.stat().st_mtime) for pdf in pdfs]

# TODO might make more sense to be more fine grained here, e.g. cache annotations for individual files

@mcachew(hashf=hash_files)
def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
    logger = get_logger()

    logger.info('processing %d pdfs', len(pdfs))

    # TODO how to print to stdout synchronously?
    with ProcessPoolExecutor() as pool:
        futures = [
            pool.submit(get_annots, pdf)
            for pdf in pdfs
        ]
        for f, pdf in zip(futures, pdfs):
            try:
                yield from f.result()
            except Exception as e:
                logger.error('While processing %s:', pdf)
                logger.exception(e)
                # TODO not sure if should attach pdf as well; it's a bit annoying to pass around?
                # also really have to think about interaction with cachew...
                yield e


def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
    pdfs = list(sorted(candidates(roots=roots)))
    yield from _iter_annotations(pdfs=pdfs)


class Pdf(NamedTuple):
    path: Path
    annotations: List[Annotation]

    @property
    def date(self):
        return self.annotations[-1].date


def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
    it = iter_annotations(roots=roots)
    vit, eit = split_errors(it, ET=Exception)

    for k, g in group_by_key(vit, key=lambda a: a.path).items():
        yield Pdf(path=Path(k), annotations=g)
    yield from eit
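
# Minimal usage sketch (mirrors what main() does below; assumes my.config.pdfs
# points at real directories):
#
#     for res in annotated_pdfs():
#         if isinstance(res, Exception):
#             ...  # this pdf failed to process
#         else:
#             print(res.path, len(res.annotations))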


def test():
    res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
    assert len(res) > 3


def test2():
    res = get_annots(Path('/L/zzz_borg/downloads/nonlinear2.pdf'))
    print(res)


def test_with_error():
    # TODO need example of pdf file...
    import tempfile
    with tempfile.TemporaryDirectory() as td:
        root = Path(td)
        g = root / 'garbage.pdf'
        g.write_text('garbage')
        roots = [
            root,
            # '/usr/share/doc/texlive-doc/latex/amsrefs/',
        ]
        # TODO find some pdfs that actually have annotations...
        annots = list(iter_annotations(roots=roots))
        assert len(annots) == 1
        assert isinstance(annots[0], Exception)


def main():
    from pprint import pprint

    logger = get_logger()
    from .common import setup_logger
    setup_logger(logger, level=logging.DEBUG)

    collected = list(annotated_pdfs())
    if len(collected) > 0:
        for r in collected:
            if isinstance(r, Exception):
                logger.exception(r)
            else:
                logger.info('collected annotations in: %s', r.path)
                for a in r.annotations:
                    pprint(a)
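

# The shebang suggests this file is meant to be run directly; a standard entry
# point guard for that (assuming main() is the intended entry point):
if __name__ == '__main__':
    main()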