diff --git a/my/pdfs.py b/my/pdfs.py index 0ab4af3..524c68b 100644 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -1,64 +1,64 @@ ''' PDF documents and annotations on your filesystem ''' + REQUIRES = [ 'git+https://github.com/0xabu/pdfannots', # todo not sure if should use pypi version? ] -from datetime import datetime -from dataclasses import dataclass -import io -from pathlib import Path import time -from typing import NamedTuple, List, Optional, Iterator, Sequence +from datetime import datetime +from pathlib import Path +from typing import Iterator, List, NamedTuple, Optional, Protocol, Sequence +import pdfannots +from more_itertools import bucket -from my.core import LazyLogger, get_files, Paths, PathIsh +from my.core import PathIsh, Paths, Stats, get_files, make_logger, stat from my.core.cachew import mcachew -from my.core.cfg import Attrs, make_config from my.core.error import Res, split_errors -from more_itertools import bucket -import pdfannots - - -from my.config import pdfs as user_config - -@dataclass -class pdfs(user_config): - paths: Paths = () # allowed to be empty for 'filelist' logic +class config(Protocol): + @property + def paths(self) -> Paths: + return () # allowed to be empty for 'filelist' logic def is_ignored(self, p: Path) -> bool: """ - Used to ignore some extremely heavy files - is_ignored function taken either from config, - or if not defined, it's a function that returns False + You can override this in user config if you want to ignore some files that are too heavy """ - user_ignore = getattr(user_config, 'is_ignored', None) - if user_ignore is not None: - return user_ignore(p) - return False - @staticmethod - def _migration(attrs: Attrs) -> Attrs: - roots = 'roots' - if roots in attrs: # legacy name - attrs['paths'] = attrs[roots] - from my.core.warnings import high - high(f'"{roots}" is deprecated!
Use "paths" instead.') - return attrs + +def make_config() -> config: + from my.config import pdfs as user_config + + class migration: + @property + def paths(self) -> Paths: + roots = getattr(user_config, 'roots', None) + if roots is not None: + from my.core.warnings import high + + high('"roots" is deprecated! Use "paths" instead.') + return roots + else: + return () + + class combined_config(user_config, migration, config): ... + + return combined_config() -config = make_config(pdfs, migration=pdfs._migration) +logger = make_logger(__name__) -logger = LazyLogger(__name__) def inputs() -> Sequence[Path]: - all_files = get_files(config.paths, glob='**/*.pdf') - return [p for p in all_files if not config.is_ignored(p)] + cfg = make_config() + all_files = get_files(cfg.paths, glob='**/*.pdf') + return [p for p in all_files if not cfg.is_ignored(p)] # TODO canonical names/fingerprinting? @@ -121,14 +121,13 @@ def _iter_annotations(pdfs: Sequence[Path]) -> Iterator[Res[Annotation]]: # todo how to print to stdout synchronously? # todo global config option not to use pools? useful for debugging.. from concurrent.futures import ProcessPoolExecutor + from my.core.utils.concurrent import DummyExecutor + workers = None # use 0 for debugging Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor with Pool(workers) as pool: - futures = [ - pool.submit(get_annots, pdf) - for pdf in pdfs - ] + futures = [pool.submit(get_annots, pdf) for pdf in pdfs] for f, pdf in zip(futures, pdfs): try: yield from f.result() @@ -161,11 +160,13 @@ class Pdf(NamedTuple): return self.created -def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]: +def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]] = None) -> Iterator[Res[Pdf]]: if filelist is not None: # hacky... 
keeping it backwards compatible # https://github.com/karlicoss/HPI/pull/74 - config.paths = filelist + from my.config import pdfs as user_config + + user_config.paths = filelist ait = annotations() vit, eit = split_errors(ait, ET=Exception) @@ -176,10 +177,9 @@ def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Re yield from eit -from my.core import stat, Stats def stats() -> Stats: return { - **stat(annotations) , + **stat(annotations), **stat(annotated_pdfs), } diff --git a/my/tests/common.py b/my/tests/common.py index e3060e1..f8b645d 100644 --- a/my/tests/common.py +++ b/my/tests/common.py @@ -20,6 +20,10 @@ def reset_modules() -> None: ''' to_unload = [m for m in sys.modules if re.match(r'my[.]?', m)] for m in to_unload: + if 'my.pdfs' in m: + # temporary hack -- since my.pdfs migrated to a 'lazy' config, this isn't necessary anymore + # but if we reset module anyway, it confuses the ProcessPool inside my.pdfs + continue del sys.modules[m] diff --git a/tests/pdfs.py b/tests/pdfs.py index 63b1319..6db669f 100644 --- a/tests/pdfs.py +++ b/tests/pdfs.py @@ -1,17 +1,16 @@ +import inspect from pathlib import Path +import pytest from more_itertools import ilen -import pytest - +from my.core.cfg import tmp_config from my.tests.common import testdata +from my.pdfs import annotated_pdfs, annotations, get_annots + def test_module(with_config) -> None: - # TODO crap. if module is imported too early (on the top level, it makes it super hard to override config) - # need to at least detect it... - from my.pdfs import annotations, annotated_pdfs - # todo check types etc as well assert ilen(annotations()) >= 3 assert ilen(annotated_pdfs()) >= 1 @@ -22,12 +21,13 @@ def test_with_error(with_config, tmp_path: Path) -> None: root = tmp_path g = root / 'garbage.pdf' g.write_text('garbage') + from my.config import pdfs + # meh. 
otherwise legacy config value 'wins' del pdfs.roots # type: ignore[attr-defined] pdfs.paths = (root,) - from my.pdfs import annotations annots = list(annotations()) [annot] = annots assert isinstance(annot, Exception) @@ -35,9 +35,6 @@ def test_with_error(with_config, tmp_path: Path) -> None: @pytest.fixture def with_config(): - from my.tests.common import reset_modules - reset_modules() # todo ugh.. getting boilerplaty.. need to make it a bit more automatic.. - # extra_data = Path(__file__).absolute().parent / 'extra/data/polar' # assert extra_data.exists(), extra_data # todo hmm, turned out no annotations in these ones.. whatever @@ -47,13 +44,9 @@ def with_config(): testdata(), ] - import my.core.cfg as C - with C.tmp_config() as config: + with tmp_config() as config: config.pdfs = user_config - try: - yield - finally: - reset_modules() + yield EXPECTED_HIGHLIGHTS = { @@ -68,8 +61,6 @@ def test_get_annots() -> None: Test get_annots, with a real PDF file get_annots should return a list of three Annotation objects """ - from my.pdfs import get_annots - annotations = get_annots(testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf') assert len(annotations) == 3 assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS @@ -80,12 +71,9 @@ def test_annotated_pdfs_with_filelist() -> None: Test annotated_pdfs, with a real PDF file annotated_pdfs should return a list of one Pdf object, with three Annotations """ - from my.pdfs import annotated_pdfs - filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf'] annotations_generator = annotated_pdfs(filelist=filelist) - import inspect assert inspect.isgeneratorfunction(annotated_pdfs) highlights_from_pdfs = []