Merge pull request #74 from thetomcraig/pdfs-process-filelist
Add "filelist" parameter to annotated_pdfs
This commit is contained in:
commit
cde5502151
5 changed files with 70 additions and 7 deletions
33
my/pdfs.py
33
my/pdfs.py
|
@ -26,11 +26,31 @@ def get_logger():
|
||||||
|
|
||||||
|
|
||||||
def is_ignored(p: Path) -> bool:
    """
    Used to ignore some extremely heavy files.

    The decision is delegated to ``config.is_ignored`` when the config
    defines it; if it is not defined, no path is ignored.
    """
    if hasattr(config, 'is_ignored'):
        return config.is_ignored(p)

    # Default: nothing is ignored. This has to be the bool itself --
    # returning a function object here would be truthy, so callers doing
    # `if not is_ignored(p)` would treat every path as ignored.
    return False
|
||||||
|
|
||||||
def candidates(roots=None) -> Iterator[Path]:
|
|
||||||
|
def candidates(filelist=None, roots=None) -> Iterator[Path]:
    """
    Pick the source of candidate paths.

    Any filelist other than None (including an empty one) is handed to
    candidates_from_filelist; otherwise roots is handed to
    candidates_from_roots.
    """
    if filelist is None:
        return candidates_from_roots(roots)
    return candidates_from_filelist(filelist)
|
||||||
|
|
||||||
|
def candidates_from_filelist(filelist) -> Iterator[Path]:
    """
    Yield each entry of filelist as a Path, skipping those for which
    is_ignored is true.

    Entries are converted and checked lazily, one at a time, in input order.
    """
    as_paths = map(Path, filelist)
    yield from (path for path in as_paths if not is_ignored(path))
|
||||||
|
|
||||||
|
def candidates_from_roots(roots=None) -> Iterator[Path]:
|
||||||
if roots is None:
|
if roots is None:
|
||||||
roots = config.roots
|
roots = config.roots
|
||||||
|
|
||||||
|
@ -124,8 +144,8 @@ def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
|
|
||||||
def iter_annotations(filelist=None, roots=None) -> Iterator[Res[Annotation]]:
    """
    Yield the results of _iter_annotations for all candidate pdfs.

    Both filelist and roots are forwarded to candidates(); the resulting
    paths are sorted before being processed.
    """
    # roots must be forwarded as given: hard-coding roots=None here would
    # silently discard a caller-supplied roots whenever filelist is None.
    pdfs = sorted(candidates(filelist=filelist, roots=roots))
    yield from _iter_annotations(pdfs=pdfs)
|
||||||
|
|
||||||
|
|
||||||
|
@ -138,8 +158,8 @@ class Pdf(NamedTuple):
|
||||||
return self.annotations[-1].date
|
return self.annotations[-1].date
|
||||||
|
|
||||||
|
|
||||||
def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
|
def annotated_pdfs(filelist=None, roots=None) -> Iterator[Res[Pdf]]:
|
||||||
it = iter_annotations(roots=roots)
|
it = iter_annotations(filelist=filelist, roots=roots)
|
||||||
vit, eit = split_errors(it, ET=Exception)
|
vit, eit = split_errors(it, ET=Exception)
|
||||||
|
|
||||||
for k, g in group_by_key(vit, key=lambda a: a.path).items():
|
for k, g in group_by_key(vit, key=lambda a: a.path).items():
|
||||||
|
@ -190,3 +210,4 @@ def main():
|
||||||
logger.info('collected annotations in: %s', r.path)
|
logger.info('collected annotations in: %s', r.path)
|
||||||
for a in r.annotations:
|
for a in r.annotations:
|
||||||
pprint(a)
|
pprint(a)
|
||||||
|
|
||||||
|
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
BIN
tests/pdfs/Information Architecture for the World Wide Web.pdf
Normal file
BIN
tests/pdfs/Information Architecture for the World Wide Web.pdf
Normal file
Binary file not shown.
0
tests/pdfs/__init__.py
Normal file
0
tests/pdfs/__init__.py
Normal file
42
tests/pdfs/test_pdfs.py
Normal file
42
tests/pdfs/test_pdfs.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import inspect
|
||||||
|
from pathlib import Path
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from my.pdfs import get_annots, annotated_pdfs
|
||||||
|
|
||||||
|
|
||||||
|
# Directory containing this test module; the sample PDF lives next to it.
ROOT = Path(__file__).parent.absolute()

# Highlight texts the tests expect from the sample PDF
# (one of the three expected highlights is an empty string).
EXPECTED_HIGHLIGHTS = {
    'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ',
    'And yet, unlearn we must, ',
    '',
}
|
||||||
|
|
||||||
|
def test_get_annots():
    """
    Run get_annots against the real sample PDF.

    Three annotations are expected, and their highlights must match
    EXPECTED_HIGHLIGHTS.
    """
    pdf_path = ROOT / 'Information Architecture for the World Wide Web.pdf'
    found = get_annots(pdf_path)

    assert len(found) == 3
    assert {annot.highlight for annot in found} == EXPECTED_HIGHLIGHTS
|
||||||
|
|
||||||
|
|
||||||
|
def test_annotated_pdfs_with_filelist():
    """
    Run annotated_pdfs against the real sample PDF, passed via filelist.

    annotated_pdfs must be a generator function, and the annotations of
    the Pdf objects it yields must carry the three expected highlights.
    """
    pdf_path = ROOT / 'Information Architecture for the World Wide Web.pdf'
    results = annotated_pdfs(filelist=[pdf_path], roots=None)

    assert inspect.isgeneratorfunction(annotated_pdfs)

    # Flatten the highlights of every yielded Pdf into a single list.
    collected = [
        annot.highlight
        for pdf in results
        for annot in pdf.annotations
    ]

    assert len(collected) == 3
    assert set(collected) == EXPECTED_HIGHLIGHTS
|
Loading…
Add table
Reference in a new issue