Merge pull request #74 from thetomcraig/pdfs-process-filelist

Add "filelist" parameter to annotated_pdfs
2020-08-20 21:08:57 +01:00 · 2020-08-20 21:08:57 +01:00 · cde5502151
commit cde5502151
parent 5b2cc577f2 5dc62ff085
5 changed files with 70 additions and 7 deletions
--- a/my/pdfs.py
+++ b/my/pdfs.py
@ -26,11 +26,31 @@ def get_logger():
 def is_ignored(p: Path) -> bool:
    """
    Tell whether a path should be skipped (used to ignore some extremely heavy files).

    If the user config defines is_ignored, the decision is delegated to it;
    otherwise no path is ignored.

    p: path of the candidate file
    returns: True if the file should be skipped, False otherwise
    """
    if hasattr(config, 'is_ignored'):
        return config.is_ignored(p)
    # Default: ignore nothing.
    # This has to be the boolean itself rather than a predicate function:
    # a function object is always truthy, so callers doing `not is_ignored(p)`
    # would end up skipping every single file.
    return False
-def candidates(roots=None) -> Iterator[Path]:
+
 def candidates(filelist=None, roots=None) -> Iterator[Path]:
    """
    Pick the source of candidate paths.

    An explicit filelist (anything other than None) takes precedence;
    without one, the candidates are looked up from roots.
    """
    if filelist is None:
        return candidates_from_roots(roots)
    return candidates_from_filelist(filelist)
 def candidates_from_filelist(filelist) -> Iterator[Path]:
    """
    Yield each entry of filelist as a Path, leaving out the ones
    that is_ignored rejects.
    """
    for entry in filelist:
        path = Path(entry)
        if is_ignored(path):
            continue
        yield path
 def candidates_from_roots(roots=None) -> Iterator[Path]:
    if roots is None:
        roots = config.roots
@ -124,8 +144,8 @@ def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
                yield e
def iter_annotations(filelist=None, roots=None) -> Iterator[Res[Annotation]]:
    """
    Yield annotations (or errors) from the candidate PDFs, processed in sorted order.

    filelist: optional explicit collection of files; passed on to candidates()
    roots: optional roots to look for files in; passed on to candidates()
    """
    # Forward `roots` as received. Hard-coding roots=None here would silently
    # discard the caller's argument whenever no filelist is given.
    pdfs = sorted(candidates(filelist=filelist, roots=roots))
    yield from _iter_annotations(pdfs=pdfs)
@ -138,8 +158,8 @@ class Pdf(NamedTuple):
        return self.annotations[-1].date
-def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
+def annotated_pdfs(filelist=None, roots=None) -> Iterator[Res[Pdf]]:
-    it = iter_annotations(roots=roots)
+    it = iter_annotations(filelist=filelist, roots=roots)
    vit, eit = split_errors(it, ET=Exception)
    for k, g in group_by_key(vit, key=lambda a: a.path).items():
@ -190,3 +210,4 @@ def main():
                logger.info('collected annotations in: %s', r.path)
                for a in r.annotations:
                    pprint(a)
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/pdfs/Information
+++ b/tests/pdfs/Information
--- a/tests/pdfs/init.py
+++ b/tests/pdfs/init.py
--- a/tests/pdfs/test_pdfs.py
+++ b/tests/pdfs/test_pdfs.py
@ -0,0 +1,42 @@
 #!/usr/bin/env python3
 import inspect
 from pathlib import Path
 import tempfile
 from my.pdfs import get_annots, annotated_pdfs
 # Directory holding this test module; the sample PDF is looked up relative to it.
 ROOT = Path(__file__).parent.absolute()
 # Highlight texts the tests expect from the sample PDF (one of them is empty).
 EXPECTED_HIGHLIGHTS = {
    'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ',
    'And yet, unlearn we must, ',
    '',
 }
 def test_get_annots():
    """
    Run get_annots against a real PDF file.
    Three annotations are expected, and their highlights
    should match EXPECTED_HIGHLIGHTS.
    """
    pdf_path = ROOT / 'Information Architecture for the World Wide Web.pdf'
    found = get_annots(pdf_path)
    assert len(found) == 3
    assert {annot.highlight for annot in found} == EXPECTED_HIGHLIGHTS
 def test_annotated_pdfs_with_filelist():
    """
    Run annotated_pdfs against an explicit filelist holding a real PDF file.
    annotated_pdfs is expected to be a generator function, and the Pdf objects
    it yields should carry three annotations in total, with highlights
    matching EXPECTED_HIGHLIGHTS.
    """
    sample = ROOT / 'Information Architecture for the World Wide Web.pdf'
    pdfs = annotated_pdfs(filelist=[sample], roots=None)
    assert inspect.isgeneratorfunction(annotated_pdfs)
    collected = []
    for pdf in pdfs:
        collected += [annot.highlight for annot in pdf.annotations]
    assert len(collected) == 3
    assert set(collected) == EXPECTED_HIGHLIGHTS