Use experimental cachew Exceptions

This commit is contained in:
Dima Gerasimov 2020-01-08 22:07:34 +00:00
parent a57be019d0
commit 012249ceca
2 changed files with 16 additions and 5 deletions

View file

@ -125,4 +125,6 @@ def mcachew(*args, **kwargs):
warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew') warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew')
return lambda orig_func: orig_func return lambda orig_func: orig_func
else: else:
import cachew.experimental
cachew.experimental.enable_exceptions() # TODO do it only once?
return cachew.cachew(*args, **kwargs) return cachew.cachew(*args, **kwargs)

View file

@ -23,6 +23,7 @@ def get_logger():
def is_ignored(p: Path) -> bool: def is_ignored(p: Path) -> bool:
# ignore some extremely heavy files
return paths.pdfs.is_ignored(p) return paths.pdfs.is_ignored(p)
@ -85,19 +86,22 @@ def get_annots(p: Path) -> List[Annotation]:
with p.open('rb') as fo: with p.open('rb') as fo:
f = io.StringIO() f = io.StringIO()
with redirect_stderr(f): with redirect_stderr(f):
# TODO FIXME defensive, try on garbage file (s)
(annots, outlines) = pdfannots.process_file(fo, emit_progress=False) (annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
# outlines are kinda like TOC, I don't really need them # outlines are kinda like TOC, I don't really need them
return [as_annotation(raw_ann=a, path=str(p)) for a in annots] return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
# TODO stderr? # TODO stderr?
# TODO cachew needs to be based on mtime, hence take candidates, not roots def hash_files(pdfs: List[Path]):
# @mcachew # if mtime hasn't changed then the file hasn't changed either
def iter_annotations(roots=None) -> Iterator[Res[Annotation]]: return [(pdf, pdf.stat().st_mtime) for pdf in pdfs]
# TODO might make more sense to be more fine-grained here, e.g. cache annotations for individual files
@mcachew(hashf=hash_files)
def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
logger = get_logger() logger = get_logger()
pdfs = list(sorted(candidates(roots=roots)))
logger.info('processing %d pdfs', len(pdfs)) logger.info('processing %d pdfs', len(pdfs))
# TODO how to print to stdout synchronously? # TODO how to print to stdout synchronously?
@ -117,6 +121,11 @@ def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
yield e yield e
def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
    """Yield annotation results for the PDFs found under ``roots``.

    The candidate PDFs are collected via ``candidates(roots=roots)`` and
    sorted before being handed to ``_iter_annotations``, so the list is
    passed in a stable order.

    :param roots: forwarded unchanged to ``candidates``; ``None`` by default.
    :return: iterator over whatever ``_iter_annotations`` yields for the
        sorted list of PDFs.
    """
    # sorted() already returns a new list, so no extra list() wrapper is needed
    pdfs = sorted(candidates(roots=roots))
    yield from _iter_annotations(pdfs=pdfs)
class Pdf(NamedTuple): class Pdf(NamedTuple):
path: Path path: Path
annotations: List[Annotation] annotations: List[Annotation]