Merge pull request #74 from thetomcraig/pdfs-process-filelist

Add "filelist" parameter to annotated_pdfs
2020-08-20 21:08:57 +01:00 · 2020-08-20 21:08:57 +01:00 · cde5502151
commit cde5502151
parent 5b2cc577f2 5dc62ff085
5 changed files with 70 additions and 7 deletions
--- a/my/pdfs.py
+++ b/my/pdfs.py
@ -26,11 +26,31 @@ def get_logger():


 def is_ignored(p: Path) -> bool:
-    # ignore some extremely heavy files
-    return config.is_ignored(p)
+    """
+    Tell whether path p should be skipped (used to ignore some extremely heavy files).
+    Delegates to config.is_ignored when the config defines it;
+    if it is not defined, nothing is ignored and False is returned.
+    """
+    if hasattr(config, 'is_ignored'):
+        return config.is_ignored(p)
+
+    # Default: return the bool itself -- a lambda object here would always be truthy
+    return False


-def candidates(roots=None) -> Iterator[Path]:
+def candidates(filelist=None, roots=None) -> Iterator[Path]:
+    # an explicit filelist takes precedence; otherwise fall back to the roots
+    if filelist is None:
+        return candidates_from_roots(roots)
+    return candidates_from_filelist(filelist)
+
+def candidates_from_filelist(filelist) -> Iterator[Path]:
+    # lazily turn each entry into a Path and yield those that are not ignored
+    paths = map(Path, filelist)
+    kept = (p for p in paths if not is_ignored(p))
+    yield from kept
+
+def candidates_from_roots(roots=None) -> Iterator[Path]:
    if roots is None:
        roots = config.roots

@ -124,8 +144,8 @@ def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
                yield e


-def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
-    pdfs = list(sorted(candidates(roots=roots)))
+def iter_annotations(filelist=None, roots=None) -> Iterator[Res[Annotation]]:
+    pdfs = sorted(candidates(filelist=filelist, roots=roots))  # forward the caller's roots (was hard-coded to None)
     yield from _iter_annotations(pdfs=pdfs)


@ -138,8 +158,8 @@ class Pdf(NamedTuple):
        return self.annotations[-1].date


-def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
-    it = iter_annotations(roots=roots)
+def annotated_pdfs(filelist=None, roots=None) -> Iterator[Res[Pdf]]:
+    it = iter_annotations(filelist=filelist, roots=roots)
    vit, eit = split_errors(it, ET=Exception)

    for k, g in group_by_key(vit, key=lambda a: a.path).items():
@ -190,3 +210,4 @@ def main():
                logger.info('collected annotations in: %s', r.path)
                for a in r.annotations:
                    pprint(a)
+
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/pdfs/Information
+++ b/tests/pdfs/Information
--- a/tests/pdfs/init.py
+++ b/tests/pdfs/init.py
--- a/tests/pdfs/test_pdfs.py
+++ b/tests/pdfs/test_pdfs.py
@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+import inspect
+from pathlib import Path
+import tempfile
+
+from my.pdfs import get_annots, annotated_pdfs
+
+
+ROOT = Path(__file__).parent.absolute()
+EXPECTED_HIGHLIGHTS = {'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ',
+                       'And yet, unlearn we must, ',
+                       '',
+                       }
+
+def test_get_annots():
+    """
+    Run get_annots against the bundled sample PDF file;
+    it is expected to produce three annotations carrying the known highlights
+    """
+    annotations = get_annots(ROOT / 'Information Architecture for the World Wide Web.pdf')
+    assert len(annotations) == 3
+    assert {a.highlight for a in annotations} == EXPECTED_HIGHLIGHTS
+
+
+def test_annotated_pdfs_with_filelist():
+    """
+    Run annotated_pdfs on an explicit filelist holding the sample PDF file;
+    the Pdf objects it produces should carry three annotations in total
+    """
+    sample = ROOT / 'Information Architecture for the World Wide Web.pdf'
+    results = annotated_pdfs(filelist=[sample], roots=None)
+
+    assert inspect.isgeneratorfunction(annotated_pdfs)
+
+    # flatten the highlights of every Pdf the generator produces
+    highlights_from_pdfs = [
+        a.highlight for pdf_object in results for a in pdf_object.annotations
+    ]
+
+    assert len(highlights_from_pdfs) == 3
+    assert set(highlights_from_pdfs) == EXPECTED_HIGHLIGHTS