192 lines
5.7 KiB
Python
192 lines
5.7 KiB
Python
'''
PDF documents and annotations on your filesystem
'''

# dependencies needed by this module (installed via `hpi module install my.pdfs`)
REQUIRES = [
    'git+https://github.com/0xabu/pdfannots',
    # todo not sure if should use pypi version?
]
|
|
|
|
from datetime import datetime
|
|
from dataclasses import dataclass
|
|
import io
|
|
from pathlib import Path
|
|
import time
|
|
from typing import NamedTuple, List, Optional, Iterator, Sequence
|
|
|
|
|
|
from my.core import LazyLogger, get_files, Paths, PathIsh
|
|
from my.core.cachew import mcachew
|
|
from my.core.cfg import Attrs, make_config
|
|
from my.core.error import Res, split_errors
|
|
|
|
|
|
from more_itertools import bucket
|
|
import pdfannots
|
|
|
|
|
|
from my.config import pdfs as user_config
|
|
|
|
@dataclass
class pdfs(user_config):
    # locations (files/directories/globs) to search for pdf files
    paths: Paths = ()  # allowed to be empty for 'filelist' logic

    def is_ignored(self, p: Path) -> bool:
        """
        Used to ignore some extremely heavy files

        is_ignored function taken either from config,
        or if not defined, it's a function that returns False
        """
        custom_ignore = getattr(user_config, 'is_ignored', None)
        if custom_ignore is None:
            return False
        return custom_ignore(p)

    @staticmethod
    def _migration(attrs: Attrs) -> Attrs:
        # support the old attribute name by copying it over to 'paths'
        legacy_key = 'roots'  # legacy name
        if legacy_key in attrs:
            attrs['paths'] = attrs[legacy_key]
            from my.core.warnings import high
            high(f'"{legacy_key}" is deprecated! Use "paths" instead.')
        return attrs
|
|
|
|
|
|
# merge the dataclass defaults above with the user's config;
# _migration keeps configs that still use the legacy 'roots' attribute working
config = make_config(pdfs, migration=pdfs._migration)

logger = LazyLogger(__name__)
|
|
|
|
def inputs() -> Sequence[Path]:
    """All pdf files under the configured paths, minus the ignored ones."""
    candidates = get_files(config.paths, glob='**/*.pdf')
    return [f for f in candidates if not config.is_ignored(f)]
|
|
|
|
|
|
# TODO canonical names/fingerprinting?
# TODO defensive if pdf was removed, also cachew key needs to be defensive
class Annotation(NamedTuple):
    """A single annotation (highlight/comment) extracted from a pdf file."""
    path: str                    # path of the pdf this annotation was extracted from
    author: Optional[str]
    page: int                    # page number (pos.page.pageno as reported by pdfannots)
    highlight: Optional[str]
    comment: Optional[str]
    created: Optional[datetime]  # note: can be tz unaware in some bad pdfs...

    @property
    def date(self) -> Optional[datetime]:
        # legacy name
        return self.created
|
|
|
|
|
|
def _as_annotation(*, raw: pdfannots.Annotation, path: str) -> Annotation:
    """
    Convert a raw pdfannots annotation into our Annotation tuple.

    NOTE: previously this wrote the page number into vars(raw) — vars() returns
    the object's live __dict__, so that mutated the input annotation as a side
    effect. We now read the attributes directly and leave `raw` untouched.
    """
    pos = raw.pos
    # make mypy happy (pos always present for Annotation https://github.com/0xabu/pdfannots/blob/dbdfefa158971e1746fae2da139918e9f59439ea/pdfannots/types.py#L302)
    assert pos is not None
    return Annotation(
        path      = path,
        author    = raw.author,
        page      = pos.page.pageno,
        highlight = raw.gettext(),
        comment   = raw.contents,
        created   = raw.created,
    )
|
|
|
|
|
|
def get_annots(p: Path) -> List[Annotation]:
    """
    Extract all annotations from a single pdf file.

    Can be slow on heavy files, so we time it and log how long it took.
    """
    b = time.time()
    with p.open('rb') as fo:
        doc = pdfannots.process_file(fo, emit_progress_to=None)
        annots = list(doc.iter_annots())
        # also has outlines are kinda like TOC, I don't really need them
    a = time.time()
    took = a - b
    tooks = f'took {took:0.1f} seconds'
    if took > 5:
        # make slow extractions stand out in the logs
        tooks = tooks.upper()
    logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))
    return [_as_annotation(raw=a, path=str(p)) for a in annots]
|
|
|
|
|
|
def _hash_files(pdfs: Sequence[Path]):
|
|
# if mtime hasn't changed then the file hasn't changed either
|
|
return [(pdf, pdf.stat().st_mtime) for pdf in pdfs]
|
|
|
|
|
|
# TODO might make more sense to be more fine grained here, e.g. cache annotations for individual files
@mcachew(depends_on=_hash_files)
def _iter_annotations(pdfs: Sequence[Path]) -> Iterator[Res[Annotation]]:
    """
    Extract annotations from all given pdfs in parallel.

    Errors from individual files are yielded inline (as Exception objects)
    rather than aborting the whole run.
    """
    logger.info('processing %d pdfs', len(pdfs))

    # todo how to print to stdout synchronously?
    # todo global config option not to use pools? useful for debugging..
    from concurrent.futures import ProcessPoolExecutor
    from my.core.utils.concurrent import DummyExecutor
    workers = None  # use 0 for debugging
    # DummyExecutor runs everything in-process, which makes debugging/tracebacks saner
    Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor
    with Pool(workers) as pool:
        futures = [
            pool.submit(get_annots, pdf)
            for pdf in pdfs
        ]
        # futures are in the same order as pdfs, so zip pairs each result with its file
        for f, pdf in zip(futures, pdfs):
            try:
                yield from f.result()
            except Exception as e:
                logger.error('While processing %s:', pdf)
                logger.exception(e)
                # todo add a comment that it can be ignored... or something like that
                # TODO not sure if should attach pdf as well; it's a bit annoying to pass around?
                # also really have to think about interaction with cachew...
                yield e
|
|
|
|
|
|
def annotations() -> Iterator[Res[Annotation]]:
    """All annotations from all configured pdfs; errors are yielded inline."""
    yield from _iter_annotations(pdfs=inputs())
|
|
|
|
|
|
class Pdf(NamedTuple):
    """A pdf file together with all annotations found in it."""
    path: Path
    annotations: Sequence[Annotation]

    @property
    def created(self) -> Optional[datetime]:
        # timestamp of the last annotation, or None when there are no annotations
        if not self.annotations:
            return None
        return self.annotations[-1].created

    @property
    def date(self) -> Optional[datetime]:
        # legacy name for 'created'
        return self.created
|
|
|
|
|
|
def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]:
    """Group all annotations by their source file, yielding one Pdf per file."""
    if filelist is not None:
        # hacky... keeping it backwards compatible
        # https://github.com/karlicoss/HPI/pull/74
        config.paths = filelist
    annots, errors = split_errors(annotations(), ET=Exception)

    by_path = bucket(annots, key=lambda a: a.path)
    for path in by_path:
        yield Pdf(path=Path(path), annotations=list(by_path[path]))
    # propagate extraction errors after the successfully grouped files
    yield from errors
|
|
|
|
|
|
from my.core import stat, Stats
def stats() -> Stats:
    """Stats over both the flat annotations and the grouped-by-file pdfs."""
    res: Stats = {}
    res.update(stat(annotations))
    res.update(stat(annotated_pdfs))
    return res
|
|
|
|
|
|
### legacy/misc stuff
iter_annotations = annotations  # for backwards compatibility
###

# can use 'hpi query my.pdfs.annotations -o pprint' to test
#
|