implement proper provider for pdf annotations
This commit is contained in:
parent
f94c9ab997
commit
a016e494c2
1 changed file with 61 additions and 32 deletions
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import codecs
|
from datetime import datetime
|
||||||
|
import re
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import Pool
|
||||||
from subprocess import CompletedProcess
|
from subprocess import CompletedProcess
|
||||||
import sys
|
import sys
|
||||||
|
@ -17,30 +18,35 @@ from kython.klogging import setup_logzero
|
||||||
|
|
||||||
from ..ext.pdfannots import pdfannots # type: ignore
|
from ..ext.pdfannots import pdfannots # type: ignore
|
||||||
|
|
||||||
from .private import ROOT_PATHS, is_handled
|
from .private import ROOT_PATHS, is_ignored
|
||||||
|
|
||||||
|
|
||||||
def get_logger():
|
def get_logger():
|
||||||
return logging.getLogger('annotation-crawler')
|
return logging.getLogger('annotation-crawler')
|
||||||
|
|
||||||
|
|
||||||
def get_pdfs() -> List[Path]:
|
def get_candidates() -> List[Path]:
|
||||||
pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in ROOT_PATHS)
|
pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in ROOT_PATHS)
|
||||||
return list(sorted(pdfs))
|
return list(sorted(pdfs))
|
||||||
|
|
||||||
|
|
||||||
# TODO cachew?
|
# TODO cachew?
|
||||||
class Result(NamedTuple):
|
|
||||||
path: Path
|
|
||||||
annotations: List
|
|
||||||
stderr: str
|
|
||||||
|
|
||||||
|
|
||||||
class Annotation(NamedTuple):
|
class Annotation(NamedTuple):
|
||||||
author: Optional[str]
|
author: Optional[str]
|
||||||
page: int
|
page: int
|
||||||
highlight: Optional[str]
|
highlight: Optional[str]
|
||||||
comment: Optional[str]
|
comment: Optional[str]
|
||||||
|
date: Optional[datetime]
|
||||||
|
|
||||||
|
|
||||||
|
class Pdf(NamedTuple):
|
||||||
|
path: Path
|
||||||
|
annotations: List[Annotation]
|
||||||
|
stderr: str
|
||||||
|
|
||||||
|
@property
|
||||||
|
def date(self):
|
||||||
|
return self.annotations[-1].date
|
||||||
|
|
||||||
|
|
||||||
def as_annotation(ann) -> Annotation:
|
def as_annotation(ann) -> Annotation:
|
||||||
|
@ -49,11 +55,28 @@ def as_annotation(ann) -> Annotation:
|
||||||
for a in ('boxes', 'rect'):
|
for a in ('boxes', 'rect'):
|
||||||
if a in d:
|
if a in d:
|
||||||
del d[a]
|
del d[a]
|
||||||
|
dates = d['date']
|
||||||
|
date: Optional[datetime] = None
|
||||||
|
if dates is not None:
|
||||||
|
dates = dates.replace("'", "")
|
||||||
|
# 20190630213504+0100
|
||||||
|
dates = re.sub('Z0000$', '+0000', dates)
|
||||||
|
FMT = '%Y%m%d%H%M%S'
|
||||||
|
# TODO is it UTC if there is no timezone offset?
|
||||||
|
for fmt in [FMT, FMT + '%z']:
|
||||||
|
try:
|
||||||
|
date = datetime.strptime(dates, fmt)
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
raise RuntimeError(dates)
|
||||||
return Annotation(
|
return Annotation(
|
||||||
author =d['author'],
|
author =d['author'],
|
||||||
page =d['page'],
|
page =d['page'],
|
||||||
highlight=d['text'],
|
highlight=d['text'],
|
||||||
comment =d['contents'],
|
comment =d['contents'],
|
||||||
|
date =date,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -62,21 +85,21 @@ class PdfAnnotsException(Exception):
|
||||||
self.path = path
|
self.path = path
|
||||||
|
|
||||||
|
|
||||||
def _get_annots(p: Path) -> Result:
|
def _get_annots(p: Path) -> Pdf:
|
||||||
progress = False
|
progress = False
|
||||||
with p.open('rb') as fo:
|
with p.open('rb') as fo:
|
||||||
f = io.StringIO()
|
f = io.StringIO()
|
||||||
with redirect_stderr(f):
|
with redirect_stderr(f):
|
||||||
(annots, outlines) = pdfannots.process_file(fo, emit_progress=progress)
|
(annots, outlines) = pdfannots.process_file(fo, emit_progress=progress)
|
||||||
# outlines are kinda like TOC, I don't really need them
|
# outlines are kinda like TOC, I don't really need them
|
||||||
return Result(
|
return Pdf(
|
||||||
path=p,
|
path=p,
|
||||||
annotations=list(map(as_annotation, annots)),
|
annotations=list(map(as_annotation, annots)),
|
||||||
stderr=f.getvalue(),
|
stderr=f.getvalue(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_annots(p: Path) -> Result:
|
def get_annots(p: Path) -> Pdf:
|
||||||
try:
|
try:
|
||||||
return _get_annots(p)
|
return _get_annots(p)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -93,17 +116,17 @@ def test2():
|
||||||
print(res)
|
print(res)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def get_annotated_pdfs() -> List[Pdf]:
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
setup_logzero(logger, level=logging.DEBUG)
|
setup_logzero(logger, level=logging.DEBUG)
|
||||||
|
|
||||||
pdfs = get_pdfs()
|
pdfs = get_candidates()
|
||||||
logger.info('processing %d pdfs', len(pdfs))
|
logger.info('processing %d pdfs', len(pdfs))
|
||||||
|
|
||||||
unhandled = []
|
collected = []
|
||||||
errors = []
|
errors = []
|
||||||
def callback(res: Result):
|
def callback(res: Pdf):
|
||||||
if is_handled(res.path):
|
if is_ignored(res.path):
|
||||||
return
|
return
|
||||||
logger.info('processed %s', res.path)
|
logger.info('processed %s', res.path)
|
||||||
|
|
||||||
|
@ -112,12 +135,12 @@ def main():
|
||||||
logger.error(err)
|
logger.error(err)
|
||||||
errors.append(err)
|
errors.append(err)
|
||||||
elif len(res.annotations) > 0:
|
elif len(res.annotations) > 0:
|
||||||
logger.warning('unhandled: %s', res)
|
logger.info('collected %s annotations', len(res.annotations))
|
||||||
unhandled.append(res)
|
collected.append(res)
|
||||||
|
|
||||||
def error_cb(err):
|
def error_cb(err):
|
||||||
if isinstance(err, PdfAnnotsException):
|
if isinstance(err, PdfAnnotsException):
|
||||||
if is_handled(err.path):
|
if is_ignored(err.path):
|
||||||
# TODO log?
|
# TODO log?
|
||||||
return
|
return
|
||||||
logger.error('while processing %s', err.path)
|
logger.error('while processing %s', err.path)
|
||||||
|
@ -131,21 +154,27 @@ def main():
|
||||||
(pdf, ),
|
(pdf, ),
|
||||||
callback=callback,
|
callback=callback,
|
||||||
error_callback=error_cb,
|
error_callback=error_cb,
|
||||||
) for pdf in pdfs if not is_handled(pdf)] # TODO log if we skip?
|
) for pdf in pdfs if not is_ignored(pdf)] # TODO log if we skip?
|
||||||
for h in handles:
|
for h in handles:
|
||||||
h.wait()
|
h.wait()
|
||||||
|
|
||||||
if len(unhandled) > 0:
|
# TODO more defensive error processing?
|
||||||
for r in unhandled:
|
|
||||||
logger.warning('unhandled annotations in: %s', r.path)
|
|
||||||
for a in r.annotations:
|
|
||||||
pprint(a)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if len(errors) > 0:
|
if len(errors) > 0:
|
||||||
logger.error('had %d errors while processing', len(errors))
|
logger.error('had %d errors while processing', len(errors))
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
|
return collected
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
collected = get_annotated_pdfs()
|
||||||
|
if len(collected) > 0:
|
||||||
|
for r in collected:
|
||||||
|
logger.warning('collected annotations in: %s', r.path)
|
||||||
|
for a in r.annotations:
|
||||||
|
pprint(a)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Add table
Reference in a new issue