implement proper provider for pdf annotations

This commit is contained in:
Dima Gerasimov 2019-08-29 21:22:54 +01:00
parent f94c9ab997
commit a016e494c2

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from pathlib import Path from pathlib import Path
import codecs from datetime import datetime
import re
from multiprocessing.pool import Pool from multiprocessing.pool import Pool
from subprocess import CompletedProcess from subprocess import CompletedProcess
import sys import sys
@ -17,30 +18,35 @@ from kython.klogging import setup_logzero
from ..ext.pdfannots import pdfannots # type: ignore from ..ext.pdfannots import pdfannots # type: ignore
from .private import ROOT_PATHS, is_handled from .private import ROOT_PATHS, is_ignored
def get_logger(): def get_logger():
return logging.getLogger('annotation-crawler') return logging.getLogger('annotation-crawler')
def get_pdfs() -> List[Path]: def get_candidates() -> List[Path]:
pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in ROOT_PATHS) pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in ROOT_PATHS)
return list(sorted(pdfs)) return list(sorted(pdfs))
# TODO cachew? # TODO cachew?
class Result(NamedTuple):
path: Path
annotations: List
stderr: str
class Annotation(NamedTuple): class Annotation(NamedTuple):
author: Optional[str] author: Optional[str]
page: int page: int
highlight: Optional[str] highlight: Optional[str]
comment: Optional[str] comment: Optional[str]
date: Optional[datetime]
class Pdf(NamedTuple):
    """Annotations extracted from one PDF file, plus the captured stderr."""
    # file the annotations were read from
    path: Path
    # annotations in the order the extractor produced them
    annotations: List[Annotation]
    # text written to stderr while the file was being processed
    stderr: str

    @property
    def date(self) -> Optional[datetime]:
        """Return the date of the last annotation, or None if there are none.

        The last annotation may itself carry no date, in which case the
        result is None as well.
        """
        if not self.annotations:
            # indexing [-1] on an empty list would raise IndexError
            return None
        return self.annotations[-1].date
def as_annotation(ann) -> Annotation: def as_annotation(ann) -> Annotation:
@ -49,11 +55,28 @@ def as_annotation(ann) -> Annotation:
for a in ('boxes', 'rect'): for a in ('boxes', 'rect'):
if a in d: if a in d:
del d[a] del d[a]
dates = d['date']
date: Optional[datetime] = None
if dates is not None:
dates = dates.replace("'", "")
# 20190630213504+0100
dates = re.sub('Z0000$', '+0000', dates)
FMT = '%Y%m%d%H%M%S'
# TODO is it UTC if there is no timezone offset?
for fmt in [FMT, FMT + '%z']:
try:
date = datetime.strptime(dates, fmt)
break
except ValueError:
pass
else:
raise RuntimeError(dates)
return Annotation( return Annotation(
author =d['author'], author =d['author'],
page =d['page'], page =d['page'],
highlight=d['text'], highlight=d['text'],
comment =d['contents'], comment =d['contents'],
date =date,
) )
@ -62,21 +85,21 @@ class PdfAnnotsException(Exception):
self.path = path self.path = path
def _get_annots(p: Path) -> Result: def _get_annots(p: Path) -> Pdf:
progress = False progress = False
with p.open('rb') as fo: with p.open('rb') as fo:
f = io.StringIO() f = io.StringIO()
with redirect_stderr(f): with redirect_stderr(f):
(annots, outlines) = pdfannots.process_file(fo, emit_progress=progress) (annots, outlines) = pdfannots.process_file(fo, emit_progress=progress)
# outlines are kinda like TOC, I don't really need them # outlines are kinda like TOC, I don't really need them
return Result( return Pdf(
path=p, path=p,
annotations=list(map(as_annotation, annots)), annotations=list(map(as_annotation, annots)),
stderr=f.getvalue(), stderr=f.getvalue(),
) )
def get_annots(p: Path) -> Result: def get_annots(p: Path) -> Pdf:
try: try:
return _get_annots(p) return _get_annots(p)
except Exception as e: except Exception as e:
@ -93,17 +116,17 @@ def test2():
print(res) print(res)
def main(): def get_annotated_pdfs() -> List[Pdf]:
logger = get_logger() logger = get_logger()
setup_logzero(logger, level=logging.DEBUG) setup_logzero(logger, level=logging.DEBUG)
pdfs = get_pdfs() pdfs = get_candidates()
logger.info('processing %d pdfs', len(pdfs)) logger.info('processing %d pdfs', len(pdfs))
unhandled = [] collected = []
errors = [] errors = []
def callback(res: Result): def callback(res: Pdf):
if is_handled(res.path): if is_ignored(res.path):
return return
logger.info('processed %s', res.path) logger.info('processed %s', res.path)
@ -112,12 +135,12 @@ def main():
logger.error(err) logger.error(err)
errors.append(err) errors.append(err)
elif len(res.annotations) > 0: elif len(res.annotations) > 0:
logger.warning('unhandled: %s', res) logger.info('collected %s annotations', len(res.annotations))
unhandled.append(res) collected.append(res)
def error_cb(err): def error_cb(err):
if isinstance(err, PdfAnnotsException): if isinstance(err, PdfAnnotsException):
if is_handled(err.path): if is_ignored(err.path):
# TODO log? # TODO log?
return return
logger.error('while processing %s', err.path) logger.error('while processing %s', err.path)
@ -131,21 +154,27 @@ def main():
(pdf, ), (pdf, ),
callback=callback, callback=callback,
error_callback=error_cb, error_callback=error_cb,
) for pdf in pdfs if not is_handled(pdf)] # TODO log if we skip? ) for pdf in pdfs if not is_ignored(pdf)] # TODO log if we skip?
for h in handles: for h in handles:
h.wait() h.wait()
if len(unhandled) > 0: # TODO more defensive error processing?
for r in unhandled:
logger.warning('unhandled annotations in: %s', r.path)
for a in r.annotations:
pprint(a)
sys.exit(1)
if len(errors) > 0: if len(errors) > 0:
logger.error('had %d errors while processing', len(errors)) logger.error('had %d errors while processing', len(errors))
sys.exit(2) sys.exit(2)
return collected
def main():
    """Collect the annotated PDFs and pretty-print each file's annotations."""
    log = get_logger()
    # iterating an empty result is a no-op, so no explicit emptiness check
    for pdf in get_annotated_pdfs():
        log.warning('collected annotations in: %s', pdf.path)
        for annotation in pdf.annotations:
            pprint(annotation)
if __name__ == '__main__': if __name__ == '__main__':