move pdf processor
This commit is contained in:
commit
184d2eeb94
1 changed files with 150 additions and 0 deletions
150
reading/pdfs.py
Executable file
150
reading/pdfs.py
Executable file
|
@ -0,0 +1,150 @@
|
|||
#!/usr/bin/env python3
|
||||
from pathlib import Path
|
||||
import codecs
|
||||
from multiprocessing.pool import Pool
|
||||
from subprocess import CompletedProcess
|
||||
import sys
|
||||
import io
|
||||
from typing import NamedTuple, List, Optional
|
||||
from contextlib import redirect_stderr
|
||||
import logging
|
||||
from pprint import pprint
|
||||
import itertools
|
||||
|
||||
from kython import import_file
|
||||
from kython.klogging import setup_logzero
|
||||
|
||||
|
||||
from annotations_crawler_config import ROOT_PATHS, is_handled
|
||||
pdfannots = import_file('/L/soft/pdfannots/pdfannots.py')
|
||||
|
||||
|
||||
def get_logger():
    """Return the logger shared by the whole annotation crawler."""
    logger_name = 'annotation-crawler'
    return logging.getLogger(logger_name)
|
||||
|
||||
|
||||
def get_pdfs():
    """Return a sorted list of every ``*.pdf`` found recursively under ROOT_PATHS."""
    found = []
    for root in ROOT_PATHS:
        # '**/*.pdf' descends into all subdirectories of the root.
        found.extend(Path(root).glob('**/*.pdf'))
    found.sort()
    return found
|
||||
|
||||
|
||||
class Result(NamedTuple):
    """Outcome of extracting annotations from a single PDF file."""

    # The PDF that was processed.
    path: Path
    # Annotations extracted from that file.
    annotations: List
    # Everything written to stderr while the file was being processed.
    stderr: str
|
||||
|
||||
|
||||
class Annotation(NamedTuple):
    """Plain, picklable summary of one PDF annotation."""

    # Who made the annotation, if recorded.
    author: Optional[str]
    # Page number the annotation sits on (taken from ``ann.page.pageno``).
    page: int
    # Highlighted text, if any (the source object's ``text`` attribute).
    highlight: Optional[str]
    # Free-form comment, if any (the source object's ``contents`` attribute).
    comment: Optional[str]


def as_annotation(ann) -> Annotation:
    """Convert an annotation object produced by pdfannots into an ``Annotation``.

    The source object is only read, never modified. ``vars(ann)`` is the
    object's live ``__dict__``, so writing to it (as an earlier version did by
    overwriting ``page`` and deleting ``boxes``/``rect``) would corrupt the
    caller's object and make a second conversion of the same object fail.

    Raises:
        KeyError: if ``ann`` lacks an ``author``, ``text`` or ``contents``
            instance attribute.
        AttributeError: if ``ann.page`` has no ``pageno``.
    """
    fields = vars(ann)  # read-only use: do not assign to or delete from this mapping
    return Annotation(
        author=fields['author'],
        page=ann.page.pageno,
        highlight=fields['text'],
        comment=fields['contents'],
    )
|
||||
|
||||
|
||||
class PdfAnnotsException(Exception):
    """Raised when annotation extraction fails; remembers which PDF failed."""

    def __init__(self, path: Path) -> None:
        # Keep ``path`` as the sole positional argument: ``args == (path,)`` is
        # what lets the exception be re-created from its args (e.g. when it is
        # pickled and sent back from a worker process).
        super().__init__(path)
        self.path = path
|
||||
|
||||
|
||||
def _get_annots(p: Path) -> Result:
    """Extract annotations from the PDF at ``p``, capturing stderr output.

    Anything written to ``sys.stderr`` during processing is collected and
    returned in ``Result.stderr`` instead of reaching the terminal.
    """
    captured = io.StringIO()
    with p.open('rb') as pdf_file, redirect_stderr(captured):
        # The second value is the outline (roughly a table of contents); it is
        # not needed here.
        extracted, _outlines = pdfannots.process_file(pdf_file, emit_progress=False)
    return Result(
        path=p,
        annotations=[as_annotation(item) for item in extracted],
        stderr=captured.getvalue(),
    )
|
||||
|
||||
|
||||
def get_annots(p: Path) -> Result:
    """Extract annotations from ``p``, tagging any failure with the file path.

    Raises:
        PdfAnnotsException: wraps (and chains from) whatever exception the
            extraction raised, so the caller knows which file was involved.
    """
    try:
        result = _get_annots(p)
    except Exception as cause:
        raise PdfAnnotsException(p) from cause
    return result
|
||||
|
||||
|
||||
def test():
    """Check that a known local PDF yields at least one annotation."""
    sample = Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf')
    found = get_annots(sample).annotations
    assert len(found) > 0
|
||||
|
||||
|
||||
def test2():
    """Smoke check: extract annotations from a known local PDF and print the result."""
    sample = Path('/L/zzz_borg/downloads/nonlinear2.pdf')
    print(get_annots(sample))
|
||||
|
||||
|
||||
def main():
    """Process every PDF under ROOT_PATHS in parallel and report what needs attention.

    Files for which ``is_handled`` returns true are skipped. For the rest:

    * a file that produced stderr output, or whose processing raised, is
      recorded as an error;
    * a file that has annotations (and no stderr output) is recorded as
      unhandled.

    Exits with status 1 if any unhandled annotations were found, otherwise with
    status 2 if there were errors; returns normally when there is nothing to
    report.
    """
    logger = get_logger()
    setup_logzero(logger, level=logging.DEBUG)

    pdfs = get_pdfs()
    logger.info('processing %d pdfs', len(pdfs))

    unhandled = []
    errors = []

    def callback(res: Result):
        """Record the outcome of one successfully processed file."""
        if is_handled(res.path):
            return
        logger.info('processed %s', res.path)

        if len(res.stderr) > 0:
            err = 'while processing %s: %s' % (res.path, res.stderr)
            logger.error(err)
            errors.append(err)
        elif len(res.annotations) > 0:
            logger.warning('unhandled: %s', res)
            unhandled.append(res)

    def error_cb(err):
        """Record a failure raised while processing one file."""
        if isinstance(err, PdfAnnotsException):
            if is_handled(err.path):
                # TODO log?
                return
            logger.error('while processing %s', err.path)
            err = err.__cause__
        # This callback is not running inside an ``except`` block, so
        # logger.exception() would find no active exception and append a
        # meaningless "NoneType: None" to the record. Log the error text itself.
        logger.error('%s', err)
        errors.append(str(err))

    with Pool() as p:
        handles = [p.apply_async(
            get_annots,
            (pdf, ),
            callback=callback,
            error_callback=error_cb,
        ) for pdf in pdfs if not is_handled(pdf)]  # TODO log if we skip?
        # Wait inside the ``with`` block: leaving it terminates the pool.
        for h in handles:
            h.wait()

    for r in unhandled:
        logger.warning('unhandled annotations in: %s', r.path)
        for a in r.annotations:
            pprint(a)

    # Report errors before exiting so they are not hidden when unhandled
    # annotations were found as well.
    if len(errors) > 0:
        logger.error('had %d errors while processing', len(errors))

    if len(unhandled) > 0:
        sys.exit(1)
    if len(errors) > 0:
        sys.exit(2)
|
||||
|
||||
|
||||
|
||||
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
Loading…
Add table
Reference in a new issue