pdfs: migrate config to Protocol with properties

allows removing a whole bunch of hacky crap from tests!
2024-08-23 00:47:00 +01:00 · 2024-08-23 00:47:00 +01:00 · 5a67f0bafe
commit 5a67f0bafe
parent d154825591
3 changed files with 56 additions and 64 deletions
--- a/my/pdfs.py
+++ b/my/pdfs.py
@ -1,64 +1,64 @@
 '''
 PDF documents and annotations on your filesystem
 '''
+
 REQUIRES = [
    'git+https://github.com/0xabu/pdfannots',
    # todo not sure if should use pypi version?
 ]

-from datetime import datetime
-from dataclasses import dataclass
-import io
-from pathlib import Path
 import time
-from typing import NamedTuple, List, Optional, Iterator, Sequence
+from datetime import datetime
+from pathlib import Path
+from typing import Iterator, List, NamedTuple, Optional, Protocol, Sequence

+import pdfannots
+from more_itertools import bucket

-from my.core import LazyLogger, get_files, Paths, PathIsh
+from my.core import PathIsh, Paths, Stats, get_files, make_logger, stat
 from my.core.cachew import mcachew
-from my.core.cfg import Attrs, make_config
 from my.core.error import Res, split_errors


-from more_itertools import bucket
-import pdfannots
-
-
-from my.config import pdfs as user_config
-
-@dataclass
-class pdfs(user_config):
-    paths: Paths = ()  # allowed to be empty for 'filelist' logic
+class config(Protocol):
+    @property
+    def paths(self) -> Paths:
+        return ()  # allowed to be empty for 'filelist' logic

    def is_ignored(self, p: Path) -> bool:
        """
-        Used to ignore some extremely heavy files
-        is_ignored function taken either from config,
-        or if not defined, it's a function that returns False
+        You can override this in user config if you want to ignore some files that are too heavy
        """
-        user_ignore = getattr(user_config, 'is_ignored', None)
-        if user_ignore is not None:
-            return user_ignore(p)
-
        return False

-    @staticmethod
-    def _migration(attrs: Attrs) -> Attrs:
-        roots = 'roots'
-        if roots in attrs:  # legacy name
-            attrs['paths'] = attrs[roots]
+
+def make_config() -> config:
+    from my.config import pdfs as user_config
+
+    class migration:
+        @property
+        def paths(self) -> Paths:
+            roots = getattr(user_config, 'roots', None)
+            if roots is not None:
                from my.core.warnings import high
-            high(f'"{roots}" is deprecated! Use "paths" instead.')
-        return attrs
+
+                high('"roots" is deprecated! Use "paths" instead.')
+                return roots
+            else:
+                return ()
+
+    class combined_config(user_config, migration, config): ...
+
+    return combined_config()


-config = make_config(pdfs, migration=pdfs._migration)
+logger = make_logger(__name__)

-logger = LazyLogger(__name__)

 def inputs() -> Sequence[Path]:
-    all_files = get_files(config.paths, glob='**/*.pdf')
-    return [p for p in all_files if not config.is_ignored(p)]
+    cfg = make_config()
+    all_files = get_files(cfg.paths, glob='**/*.pdf')
+    return [p for p in all_files if not cfg.is_ignored(p)]


 # TODO canonical names/fingerprinting?
@ -121,14 +121,13 @@ def _iter_annotations(pdfs: Sequence[Path]) -> Iterator[Res[Annotation]]:
    # todo how to print to stdout synchronously?
    # todo global config option not to use pools? useful for debugging..
    from concurrent.futures import ProcessPoolExecutor
+
    from my.core.utils.concurrent import DummyExecutor
+
    workers = None  # use 0 for debugging
    Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor
    with Pool(workers) as pool:
-        futures = [
-            pool.submit(get_annots, pdf)
-            for pdf in pdfs
-        ]
+        futures = [pool.submit(get_annots, pdf) for pdf in pdfs]
        for f, pdf in zip(futures, pdfs):
            try:
                yield from f.result()
@ -161,11 +160,13 @@ class Pdf(NamedTuple):
        return self.created


-def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]:
+def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]] = None) -> Iterator[Res[Pdf]]:
    if filelist is not None:
        # hacky... keeping it backwards compatible
        # https://github.com/karlicoss/HPI/pull/74
-        config.paths = filelist
+        from my.config import pdfs as user_config
+
+        user_config.paths = filelist
    ait = annotations()
    vit, eit = split_errors(ait, ET=Exception)

@ -176,10 +177,9 @@ def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Re
    yield from eit


-from my.core import stat, Stats
 def stats() -> Stats:
    return {
-        **stat(annotations)   ,
+        **stat(annotations),
        **stat(annotated_pdfs),
    }

--- a/my/tests/common.py
+++ b/my/tests/common.py
@ -20,6 +20,10 @@ def reset_modules() -> None:
    '''
    to_unload = [m for m in sys.modules if re.match(r'my[.]?', m)]
    for m in to_unload:
+        if 'my.pdfs' in m:
+            # temporary hack -- since my.pdfs migrated to a 'lazy' config, this isn't necessary anymore
+            # but if we reset module anyway, it confuses the ProcessPool inside my.pdfs
+            continue
        del sys.modules[m]


--- a/tests/pdfs.py
+++ b/tests/pdfs.py
@ -1,17 +1,16 @@
+import inspect
 from pathlib import Path

+import pytest
 from more_itertools import ilen

-import pytest
-
+from my.core.cfg import tmp_config
 from my.tests.common import testdata

+from my.pdfs import annotated_pdfs, annotations, get_annots
+

 def test_module(with_config) -> None:
-    # TODO crap. if module is imported too early (on the top level, it makes it super hard to override config)
-    # need to at least detect it...
-    from my.pdfs import annotations, annotated_pdfs
-
    # todo check types etc as well
    assert ilen(annotations()) >= 3
    assert ilen(annotated_pdfs()) >= 1
@ -22,12 +21,13 @@ def test_with_error(with_config, tmp_path: Path) -> None:
    root = tmp_path
    g = root / 'garbage.pdf'
    g.write_text('garbage')
+
    from my.config import pdfs
+
    # meh. otherwise legacy config value 'wins'
    del pdfs.roots  # type: ignore[attr-defined]
    pdfs.paths = (root,)

-    from my.pdfs import annotations
    annots = list(annotations())
    [annot] = annots
    assert isinstance(annot, Exception)
@ -35,9 +35,6 @@ def test_with_error(with_config, tmp_path: Path) -> None:

@pytest.fixture
 def with_config():
-    from my.tests.common import reset_modules
-    reset_modules()  # todo ugh.. getting boilerplaty.. need to make it a bit more automatic..
-
    # extra_data = Path(__file__).absolute().parent / 'extra/data/polar'
    # assert extra_data.exists(), extra_data
    # todo hmm, turned out no annotations in these ones.. whatever
@ -47,13 +44,9 @@ def with_config():
            testdata(),
        ]

-    import my.core.cfg as C
-    with C.tmp_config() as config:
+    with tmp_config() as config:
        config.pdfs = user_config
-        try:
        yield
-        finally:
-            reset_modules()


 EXPECTED_HIGHLIGHTS = {
@ -68,8 +61,6 @@ def test_get_annots() -> None:
    Test get_annots, with a real PDF file
    get_annots should return a list of three Annotation objects
    """
-    from my.pdfs import get_annots
-
    annotations = get_annots(testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf')
    assert len(annotations) == 3
    assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS
@ -80,12 +71,9 @@ def test_annotated_pdfs_with_filelist() -> None:
    Test annotated_pdfs, with a real PDF file
    annotated_pdfs should return a list of one Pdf object, with three Annotations
    """
-    from my.pdfs import annotated_pdfs
-
    filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf']
    annotations_generator = annotated_pdfs(filelist=filelist)

-    import inspect
    assert inspect.isgeneratorfunction(annotated_pdfs)

    highlights_from_pdfs = []