Add a "filelist" parameter to annotated_pdfs

This commit is contained in:
Tom Craig 2020-08-16 12:57:20 -07:00
parent 5eecd8721d
commit 882ceb62fc

View file

@ -26,11 +26,31 @@ def get_logger():
def is_ignored(p: Path) -> bool:
    """
    Tell whether path ``p`` should be skipped (used to ignore some
    extremely heavy files).

    The decision is delegated to ``config.is_ignored`` when the config
    defines it; otherwise nothing is ignored and False is returned.
    """
    checker = getattr(config, 'is_ignored', None)
    if checker is not None:
        return checker(p)
    # Default: return the boolean itself. Returning a function here (as in
    # `lambda x: False`) would be truthy and mark every path as ignored.
    return False
def candidates(filelist=None, roots=None) -> Iterator[Path]:
    """
    Pick the source of candidate paths.

    An explicit ``filelist`` (anything other than None) takes precedence
    and is handed to ``candidates_from_filelist``; otherwise ``roots`` is
    forwarded to ``candidates_from_roots``.
    """
    if filelist is None:
        return candidates_from_roots(roots)
    return candidates_from_filelist(filelist)
def candidates_from_filelist(filelist) -> Iterator[Path]:
    """
    Lazily yield each entry of ``filelist`` as a ``Path``, dropping the
    ones for which ``is_ignored`` returns a truthy value.
    """
    yield from (
        path
        for path in map(Path, filelist)
        if not is_ignored(path)
    )
def candidates_from_roots(roots=None) -> Iterator[Path]:
if roots is None: if roots is None:
roots = config.roots roots = config.roots
@ -58,7 +78,7 @@ def as_annotation(*, raw_ann, path: str) -> Annotation:
for a in ('boxes', 'rect'): for a in ('boxes', 'rect'):
if a in d: if a in d:
del d[a] del d[a]
dates = d['date'] dates = d.get('date')
date: Optional[datetime] = None date: Optional[datetime] = None
if dates is not None: if dates is not None:
dates = dates.replace("'", "") dates = dates.replace("'", "")
@ -124,8 +144,8 @@ def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
yield e yield e
def iter_annotations(filelist=None, roots=None) -> Iterator[Res[Annotation]]:
    """
    Yield annotations (or errors) for every candidate PDF.

    Candidates come from ``candidates(filelist=..., roots=...)``; they are
    sorted before being passed on to ``_iter_annotations``.
    """
    # Forward the caller's `roots`; passing a hard-coded None here would
    # silently discard an explicitly supplied value.
    pdfs = sorted(candidates(filelist=filelist, roots=roots))
    yield from _iter_annotations(pdfs=pdfs)
@ -138,8 +158,8 @@ class Pdf(NamedTuple):
return self.annotations[-1].date return self.annotations[-1].date
def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]: def annotated_pdfs(filelist=None, roots=None) -> Iterator[Res[Pdf]]:
it = iter_annotations(roots=roots) it = iter_annotations(filelist=filelist, roots=roots)
vit, eit = split_errors(it, ET=Exception) vit, eit = split_errors(it, ET=Exception)
for k, g in group_by_key(vit, key=lambda a: a.path).items(): for k, g in group_by_key(vit, key=lambda a: a.path).items():
@ -190,3 +210,4 @@ def main():
logger.info('collected annotations in: %s', r.path) logger.info('collected annotations in: %s', r.path)
for a in r.annotations: for a in r.annotations:
pprint(a) pprint(a)