diff --git a/my/pdfs.py b/my/pdfs.py
index aab8d2d..3a4905d 100755
--- a/my/pdfs.py
+++ b/my/pdfs.py
@@ -26,11 +26,40 @@ def get_logger():
 
 
 def is_ignored(p: Path) -> bool:
-    # ignore some extremely heavy files
-    return config.is_ignored(p)
+    """
+    Used to ignore some extremely heavy files.
+
+    Delegates to config.is_ignored when the config defines it;
+    otherwise no file is ignored.
+    """
+    if hasattr(config, 'is_ignored'):
+        return config.is_ignored(p)
+
+    # Default: must be a bool, not a callable -- a function object is always truthy
+    return False
 
 
-def candidates(roots=None) -> Iterator[Path]:
+def candidates(filelist=None, roots=None) -> Iterator[Path]:
+    """
+    Return an iterator over candidate paths: from filelist when it is given, otherwise from roots.
+    """
+    if filelist is not None:
+        return candidates_from_filelist(filelist)
+    else:
+        return candidates_from_roots(roots)
+
+
+def candidates_from_filelist(filelist) -> Iterator[Path]:
+    """
+    Yield each entry of filelist as a Path, skipping the ones is_ignored rejects.
+    """
+    for f in filelist:
+        p = Path(f)
+        if not is_ignored(p):
+            yield p
+
+
+def candidates_from_roots(roots=None) -> Iterator[Path]:
     if roots is None:
         roots = config.roots
 
@@ -125,7 +154,10 @@ def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
 
 
-def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
-    pdfs = list(sorted(candidates(roots=roots)))
+def iter_annotations(filelist=None, roots=None) -> Iterator[Res[Annotation]]:
+    """
+    Yield the results of _iter_annotations over the sorted candidates (from filelist or roots).
+    """
+    pdfs = sorted(candidates(filelist=filelist, roots=roots))
     yield from _iter_annotations(pdfs=pdfs)
 
 
@@ -138,8 +170,8 @@ class Pdf(NamedTuple):
         return self.annotations[-1].date
 
 
-def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
-    it = iter_annotations(roots=roots)
+def annotated_pdfs(filelist=None, roots=None) -> Iterator[Res[Pdf]]:
+    it = iter_annotations(filelist=filelist, roots=roots)
     vit, eit = split_errors(it, ET=Exception)
 
     for k, g in group_by_key(vit, key=lambda a: a.path).items():
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/pdfs/Information Architecture for the World Wide Web.pdf b/tests/pdfs/Information Architecture for the World Wide Web.pdf
new file mode 100644
index 0000000..4c69942
Binary files /dev/null and b/tests/pdfs/Information Architecture for the World Wide Web.pdf differ
diff --git a/tests/pdfs/__init__.py b/tests/pdfs/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/pdfs/test_pdfs.py b/tests/pdfs/test_pdfs.py
new file mode 100644
index 0000000..afc35d9
--- /dev/null
+++ b/tests/pdfs/test_pdfs.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+import inspect
+from pathlib import Path
+import tempfile
+
+from my.pdfs import get_annots, annotated_pdfs
+
+
+ROOT = Path(__file__).parent.absolute()
+EXPECTED_HIGHLIGHTS = set(['Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ',
+                           'And yet, unlearn we must, ',
+                           '',
+                           ])
+
+def test_get_annots():
+    """
+    Test get_annots, with a real PDF file
+    get_annots should return a list of three Annotation objects
+    """
+    annotations = get_annots(Path(ROOT / 'Information Architecture for the World Wide Web.pdf'))
+    assert len(annotations) == 3
+    assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS
+
+
+def test_annotated_pdfs_with_filelist():
+    """
+    Test annotated_pdfs, with a real PDF file
+    annotated_pdfs should return a list of one Pdf object, with three Annotations
+    """
+    filelist = [Path(ROOT / 'Information Architecture for the World Wide Web.pdf')]
+    annotations_generator = annotated_pdfs(filelist=filelist, roots=None)
+
+    assert inspect.isgeneratorfunction(annotated_pdfs)
+
+    highlights_from_pdfs = []
+
+    for pdf_object in list(annotations_generator):
+        highlights_from_pdfs.extend([a.highlight for a in pdf_object.annotations])
+
+    assert len(highlights_from_pdfs) == 3
+    assert set(highlights_from_pdfs) == EXPECTED_HIGHLIGHTS