my.pdfs: cleanup/refactor

- modernize: - add REQUIRES spec for pdfannots library - config dataclass/config stub - stats function - absolute my.core imports in anticipation of splitting core - use 'paths' instead of 'roots' (better reflects the semantics), use get_files backward compatible via config migration - properly run tests/mypy
2021-03-30 22:00:21 +01:00 · 2021-03-30 22:00:21 +01:00 · ad177a1ccd
commit ad177a1ccd
parent e7604c188e
6 changed files with 177 additions and 108 deletions
--- a/my/config.py
+++ b/my/config.py
@ -83,3 +83,7 @@ class commits:
    emails: Optional[Sequence[str]]
    names: Optional[Sequence[str]]
    roots: Sequence[PathIsh]
 class pdfs:
    paths: Paths
--- a/my/core/common.py
+++ b/my/core/common.py
@ -164,7 +164,9 @@ def get_files(
                warnings.warn(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!")
            paths.extend(map(Path, do_glob(gs)))
        elif src.is_dir():
-            gp: Iterable[Path] = src.glob(glob) # todo not sure if should be recursive?
+            # todo not sure if should be recursive?
            # note: glob='**/*.ext' works without any changes.. so perhaps it's ok as it is
            gp: Iterable[Path] = src.glob(glob)
            paths.extend(gp)
        else:
            if not src.is_file():
--- a/my/core/stats.py
+++ b/my/core/stats.py
@ -29,6 +29,8 @@ def is_data_provider(fun) -> bool:
    1. returns iterable or something like that
    2. takes no arguments? (otherwise not callable by stats anyway?)
    """
    # todo maybe for 2 allow default arguments? not sure
    # one example which could benefit is my.pdfs
    if fun is None:
        return False
    # todo. uh.. very similar to what cachew is trying to do?
--- a/my/pdfs.py
+++ b/my/pdfs.py
@ -1,75 +1,77 @@
 #!/usr/bin/env python3
 '''
 PDF documents and annotations on your filesystem
 '''
-from concurrent.futures import ProcessPoolExecutor
+REQUIRES = [
    'git+https://github.com/0xabu/pdfannots',
 ]
 from contextlib import redirect_stderr
 from datetime import datetime
 from dataclasses import dataclass
 import io
 from pathlib import Path
 import re
 import sys
-import io
+import time
-import logging
+from typing import NamedTuple, List, Optional, Iterator, Sequence
 from pathlib import Path
 from typing import NamedTuple, List, Optional, Iterator
 from contextlib import redirect_stderr
-from .common import mcachew, group_by_key
+from my.core import LazyLogger, get_files, Paths, PathIsh
-from .error import Res, split_errors
+from my.core.cfg import Attrs, make_config
-
+from my.core.common import mcachew, group_by_key
-# path to pdfannots (https://github.com/0xabu/pdfannots)
+from my.core.error import Res, split_errors
 import my.config.repos.pdfannots.pdfannots as pdfannots
 from my.config import pdfs as config
-def get_logger():
+import pdfannots  # type: ignore[import]
    return logging.getLogger('my.pdfs')
-def is_ignored(p: Path) -> bool:
+from my.config import pdfs as user_config
    """
    Used to ignore some extremely heavy files
    is_ignored function taken either from config,
    or if not defined, it's a function that returns False
    """
    if hasattr(config, 'is_ignored'):
        return config.is_ignored(p)
-    # Default
+@dataclass
-    return lambda x: False
+class pdfs(user_config):
    paths: Paths = ()  # allowed to be empty for 'filelist' logic
    def is_ignored(self, p: Path) -> bool:
        """
        Used to ignore some extremely heavy files
        is_ignored function taken either from config,
        or if not defined, it's a function that returns False
        """
        user_ignore = getattr(user_config, 'is_ignored', None)
        if user_ignore is not None:
            return user_ignore(p)
        return False
    @staticmethod
    def _migration(attrs: Attrs) -> Attrs:
        roots = 'roots'
        if roots in attrs:  # legacy name
            attrs['paths'] = attrs[roots]
            from my.core.warnings import high
            high(f'"{roots}" is deprecated! Use "paths" instead.')
        return attrs
-def candidates(filelist=None, roots=None) -> Iterator[Path]:
+config = make_config(pdfs, migration=pdfs._migration)
    if filelist is not None:
        return candidates_from_filelist(filelist)
    else:
        return candidates_from_roots(roots)
-def candidates_from_filelist(filelist) -> Iterator[Path]:
+logger = LazyLogger(__name__)
    for f in filelist:
        p = Path(f)
        if not is_ignored(p):
            yield p
-def candidates_from_roots(roots=None) -> Iterator[Path]:
+def inputs() -> Sequence[Path]:
-    if roots is None:
+    # TODO ignoring could be handled on get_files/user config site as well?..
-        roots = config.roots
+    all_files = get_files(config.paths, glob='**/*.pdf')
    return [p for p in all_files if not config.is_ignored(p)]
    for r in roots:
        for p in Path(r).rglob('*.pdf'):
            if not is_ignored(p):
                yield p
-# TODO canonical names
+# TODO canonical names/fingerprinting?
 # TODO defensive if pdf was removed, also cachew key needs to be defensive
 class Annotation(NamedTuple):
    path: str
    author: Optional[str]
    page: int
    highlight: Optional[str]
    comment: Optional[str]
-    date: Optional[datetime]
+    date: Optional[datetime]  # TODO tz aware/unaware?
 def as_annotation(*, raw_ann, path: str) -> Annotation:
@ -106,29 +108,40 @@ def as_annotation(*, raw_ann, path: str) -> Annotation:
 def get_annots(p: Path) -> List[Annotation]:
    b = time.time()
    with p.open('rb') as fo:
        f = io.StringIO()
        with redirect_stderr(f):
            # FIXME
            (annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
            # outlines are kinda like TOC, I don't really need them
    a = time.time()
    took = a - b
    tooks = f'took {took:0.1f} seconds'
    if took > 5:
        tooks = tooks.upper()
    logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))
    return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
    # TODO stderr?
-def hash_files(pdfs: List[Path]):
+def _hash_files(pdfs: Sequence[Path]):
    # if mtime hasn't changed then the file hasn't changed either
    return [(pdf, pdf.stat().st_mtime) for pdf in pdfs]
 # TODO might make more sense to be more fine grained here, e.g. cache annotations for indifidual files
-
+@mcachew(depends_on=_hash_files)
-@mcachew(hashf=hash_files)
+def _iter_annotations(pdfs: Sequence[Path]) -> Iterator[Res[Annotation]]:
 def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
    logger = get_logger()
    logger.info('processing %d pdfs', len(pdfs))
-    # TODO how to print to stdout synchronously?
+    # todo how to print to stdout synchronously?
-    with ProcessPoolExecutor() as pool:
+    # todo global config option not to use pools? useful for debugging..
    from concurrent.futures import ProcessPoolExecutor
    from my.core.common import DummyExecutor
    workers = None  # use 0 for debugging
    Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor
    with Pool(workers) as pool:
        futures = [
            pool.submit(get_annots, pdf)
            for pdf in pdfs
@ -139,75 +152,61 @@ def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
            except Exception as e:
                logger.error('While processing %s:', pdf)
                logger.exception(e)
                # todo add a comment that it can be ignored... or something like that
                # TODO not sure if should attach pdf as well; it's a bit annoying to pass around?
                # also really have to think about interaction with cachew...
                yield e
-def iter_annotations(filelist=None, roots=None) -> Iterator[Res[Annotation]]:
+def annotations() -> Iterator[Res[Annotation]]:
-    pdfs = list(sorted(candidates(filelist=filelist, roots=None)))
+    pdfs = inputs()
    yield from _iter_annotations(pdfs=pdfs)
 class Pdf(NamedTuple):
    path: Path
-    annotations: List[Annotation]
+    annotations: Sequence[Annotation]
    @property
-    def date(self):
+    def date(self) -> Optional[datetime]:
        # TODO tz aware/unaware
        return self.annotations[-1].date
-def annotated_pdfs(filelist=None, roots=None) -> Iterator[Res[Pdf]]:
+def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]:
-    it = iter_annotations(filelist=filelist, roots=roots)
+    if filelist is not None:
-    vit, eit = split_errors(it, ET=Exception)
+        # hacky... keeping it backwards compatible
        # https://github.com/karlicoss/HPI/pull/74
        config.paths = filelist
    ait = annotations()
    vit, eit = split_errors(ait, ET=Exception)
    for k, g in group_by_key(vit, key=lambda a: a.path).items():
        yield Pdf(path=Path(k), annotations=g)
    yield from eit
-def test():
+from my.core import stat, Stats
-    res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
+def stats() -> Stats:
-    assert len(res) > 3
+    return {
        **stat(annotations)   ,
        **stat(annotated_pdfs),
    }
-def test2():
+### legacy/misc stuff
    res = get_annots(Path('/L/zzz_borg/downloads/nonlinear2.pdf'))
    print(res)
-
+# todo retire later if favor of hpi query?
-def test_with_error():
+def main() -> None:
    # TODO need example of pdf file...
    import tempfile
    with tempfile.TemporaryDirectory() as td:
        root = Path(td)
        g = root / 'garbage.pdf'
        g.write_text('garbage')
        roots = [
            root,
            # '/usr/share/doc/texlive-doc/latex/amsrefs/',
        ]
        # TODO find some pdfs that actually has annotations...
        annots = list(iter_annotations(roots=roots))
    assert len(annots) == 1
    assert isinstance(annots[0], Exception)
 def main():
    from pprint import pprint
    collected = annotated_pdfs()
    for r in collected:
        if isinstance(r, Exception):
            logger.exception(r)
        else:
            logger.info('collected annotations in: %s', r.path)
            for a in r.annotations:
                pprint(a)
-    logger = get_logger()
+iter_annotations = annotations  # for backwards compatibility
-    from .common import setup_logger
+###
    setup_logger(logger, level=logging.DEBUG)
    collected = list(annotated_pdfs())
    if len(collected) > 0:
        for r in collected:
            if isinstance(r, Exception):
                logger.exception(r)
            else:
                logger.info('collected annotations in: %s', r.path)
                for a in r.annotations:
                    pprint(a)
--- a/tests/pdfs.py
+++ b/tests/pdfs.py
@ -1,11 +1,60 @@
 import inspect
 from pathlib import Path
 import tempfile
-from my.pdfs import get_annots, annotated_pdfs
+from more_itertools import ilen
 import pytest
 from .common import testdata
 def test_module(with_config) -> None:
    # TODO crap. if module is imported too early (on the top level, it makes it super hard to overrride config)
    # need to at least detect it...
    from my.pdfs import annotations, annotated_pdfs
    # todo check types etc as well
    assert ilen(annotations()) >= 3
    assert ilen(annotated_pdfs()) >= 1
 def test_with_error(with_config, tmp_path: Path) -> None:
    """should handle crappy files gracefully"""
    root = tmp_path
    g = root / 'garbage.pdf'
    g.write_text('garbage')
    from my.config import pdfs
    del pdfs.roots  # meh. otherwise legacy config value 'wins'
    pdfs.paths = (root,)
    from my.pdfs import annotations
    annots = list(annotations())
    [annot] = annots
    assert isinstance(annot, Exception)
@pytest.fixture
 def with_config():
    from .common import reset_modules
    reset_modules()  # todo ugh.. getting boilerplaty.. need to make it a bit more automatic..
    # extra_data = Path(__file__).absolute().parent / 'extra/data/polar'
    # assert extra_data.exists(), extra_data
    # todo hmm, turned out no annotations in these ones.. whatever
    class user_config:
        roots = [
            testdata(),
        ]
    import my.core.cfg as C
    with C.tmp_config() as config:
        config.pdfs = user_config  # type: ignore
        try:
            yield
        finally:
            reset_modules()
 EXPECTED_HIGHLIGHTS = {
    'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ',
    'And yet, unlearn we must, ',
@ -18,6 +67,8 @@ def test_get_annots() -> None:
    Test get_annots, with a real PDF file
    get_annots should return a list of three Annotation objects
    """
    from my.pdfs import get_annots
    annotations = get_annots(testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf')
    assert len(annotations) == 3
    assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS
@ -28,9 +79,12 @@ def test_annotated_pdfs_with_filelist() -> None:
    Test annotated_pdfs, with a real PDF file
    annotated_pdfs should return a list of one Pdf object, with three Annotations
    """
-    filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf']
+    from my.pdfs import annotated_pdfs
    annotations_generator = annotated_pdfs(filelist=filelist, roots=None)
    filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf']
    annotations_generator = annotated_pdfs(filelist=filelist)
    import inspect
    assert inspect.isgeneratorfunction(annotated_pdfs)
    highlights_from_pdfs = []
@ -41,3 +95,8 @@ def test_annotated_pdfs_with_filelist() -> None:
    assert len(highlights_from_pdfs) == 3
    assert set(highlights_from_pdfs) == EXPECTED_HIGHLIGHTS
 # todo old test on my(karlicoss) computer:
 # - mature-optimization_wtf.pdf: >3 annotations?
 # - nonlinear2.pdf
--- a/tox.ini
+++ b/tox.ini
@ -37,11 +37,12 @@ commands =
    hpi module install my.coding.commits
    hpi module install my.pdfs
    python3 -m pytest tests \
            # ignore some tests which might take a while to run on ci..
            --ignore tests/takeout.py        \
-            --ignore tests/extra/polar.py    \
+            --ignore tests/extra/polar.py
            --ignore tests/pdfs/test_pdfs.py \
            {posargs}
@ -82,6 +83,7 @@ commands =
    hpi module install my.arbtt
    hpi module install my.coding.commits
    hpi module install my.goodreads
    hpi module install my.pdfs
    # todo fuck. -p my.github isn't checking the subpackages?? wtf...
    # guess it wants .pyi file??
@ -103,6 +105,7 @@ commands =
                    -p my.arbtt                       \
                    -p my.coding.commits              \
                    -p my.goodreads                   \
                    -p my.pdfs                        \
                    --txt-report  .coverage.mypy-misc \
                    --html-report .coverage.mypy-misc \
                    {posargs}