prettier pdf handling; prepare for cachew

This commit is contained in:
Dima Gerasimov 2020-01-08 20:59:22 +00:00
parent ea4dcdafb0
commit a57be019d0
2 changed files with 99 additions and 106 deletions

View file

@ -19,7 +19,7 @@ try:
import ijson.backends.yajl2_cffi as ijson # type: ignore import ijson.backends.yajl2_cffi as ijson # type: ignore
except: except:
# fallback to default backend. warning? # fallback to default backend. warning?
import ijson import ijson # type: ignore
from kython import kompress # TODO from kython import kompress # TODO

View file

@ -1,43 +1,46 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from .common import import_file from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
import re
import sys
import io
import logging
from pathlib import Path from pathlib import Path
from typing import NamedTuple, List, Optional, Iterator
from contextlib import redirect_stderr
from .common import import_file, mcachew, group_by_key
from .error import Res, split_errors
# path to pdfannots (https://github.com/0xabu/pdfannots) # path to pdfannots (https://github.com/0xabu/pdfannots)
import mycfg.repos.pdfannots.pdfannots as pdfannots import mycfg.repos.pdfannots.pdfannots as pdfannots
from mycfg import paths from mycfg import paths
from datetime import datetime
import re
from subprocess import CompletedProcess
import sys
import io
from typing import NamedTuple, List, Optional
from contextlib import redirect_stderr
import logging
def get_logger(): def get_logger():
return logging.getLogger('my.pdfs') return logging.getLogger('my.pdfs')
def get_candidates(roots=None) -> List[Path]: def is_ignored(p: Path) -> bool:
if roots is None:
roots = paths.pdfs.roots
import itertools
pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in roots)
return list(sorted(pdfs))
def is_ignored(p):
return paths.pdfs.is_ignored(p) return paths.pdfs.is_ignored(p)
# TODO cachew? def candidates(roots=None) -> Iterator[Path]:
if roots is None:
roots = paths.pdfs.roots
for r in roots:
for p in Path(r).rglob('*.pdf'):
if not is_ignored(p):
yield p
# TODO canonical names
# TODO defensive if pdf was removed, also cachew key needs to be defensive
class Annotation(NamedTuple): class Annotation(NamedTuple):
path: str
author: Optional[str] author: Optional[str]
page: int page: int
highlight: Optional[str] highlight: Optional[str]
@ -45,19 +48,9 @@ class Annotation(NamedTuple):
date: Optional[datetime] date: Optional[datetime]
class Pdf(NamedTuple): def as_annotation(*, raw_ann, path: str) -> Annotation:
path: Path d = vars(raw_ann)
annotations: List[Annotation] d['page'] = raw_ann.page.pageno
stderr: str
@property
def date(self):
return self.annotations[-1].date
def as_annotation(ann) -> Annotation:
d = vars(ann)
d['page'] = ann.page.pageno
for a in ('boxes', 'rect'): for a in ('boxes', 'rect'):
if a in d: if a in d:
del d[a] del d[a]
@ -76,95 +69,75 @@ def as_annotation(ann) -> Annotation:
except ValueError: except ValueError:
pass pass
else: else:
# TODO defensive?
raise RuntimeError(dates) raise RuntimeError(dates)
return Annotation( return Annotation(
author =d['author'], path = path,
page =d['page'], author = d['author'],
highlight=d['text'], page = d['page'],
comment =d['contents'], highlight = d['text'],
date =date, comment = d['contents'],
date = date,
) )
class PdfAnnotsException(Exception): def get_annots(p: Path) -> List[Annotation]:
def __init__(self, path: Path) -> None:
self.path = path
def _get_annots(p: Path) -> Pdf:
progress = False
with p.open('rb') as fo: with p.open('rb') as fo:
f = io.StringIO() f = io.StringIO()
with redirect_stderr(f): with redirect_stderr(f):
(annots, outlines) = pdfannots.process_file(fo, emit_progress=progress) # TODO FIXME defensive, try on garbage file (s)
(annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
# outlines are kinda like TOC, I don't really need them # outlines are kinda like TOC, I don't really need them
return Pdf( return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
path=p, # TODO stderr?
annotations=list(map(as_annotation, annots)),
stderr=f.getvalue(),
)
def get_annots(p: Path) -> Pdf: # TODO cachew needs to be based on mtime, hence take candidates, not roots
try: # @mcachew
return _get_annots(p) def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
except Exception as e:
raise PdfAnnotsException(p) from e
def get_annotated_pdfs(roots=None) -> List[Pdf]:
logger = get_logger() logger = get_logger()
pdfs = get_candidates(roots=roots) pdfs = list(sorted(candidates(roots=roots)))
logger.info('processing %d pdfs', len(pdfs)) logger.info('processing %d pdfs', len(pdfs))
collected = [] # TODO how to print to stdout synchronously?
errors = [] with ProcessPoolExecutor() as pool:
def callback(res: Pdf): futures = [
if is_ignored(res.path): pool.submit(get_annots, pdf)
return for pdf in pdfs
logger.info('processed %s', res.path) ]
for f, pdf in zip(futures, pdfs):
try:
yield from f.result()
except Exception as e:
logger.error('While processing %s:', pdf)
logger.exception(e)
# TODO not sure if should attach pdf as well; it's a bit annoying to pass around?
# also really have to think about interaction with cachew...
yield e
if len(res.stderr) > 0:
err = 'while processing %s: %s' % (res.path, res.stderr)
logger.error(err)
errors.append(err)
elif len(res.annotations) > 0:
logger.info('collected %s annotations', len(res.annotations))
collected.append(res)
def error_cb(err): class Pdf(NamedTuple):
if isinstance(err, PdfAnnotsException): path: Path
if is_ignored(err.path): annotations: List[Annotation]
# TODO log?
return
logger.error('while processing %s', err.path)
err = err.__cause__
logger.exception(err)
errors.append(str(err))
from multiprocessing.pool import Pool @property
with Pool() as p: def date(self):
handles = [p.apply_async( return self.annotations[-1].date
get_annots,
(pdf, ),
callback=callback,
error_callback=error_cb,
) for pdf in pdfs if not is_ignored(pdf)] # TODO log if we skip?
for h in handles:
h.wait()
# TODO more defensive error processing?
if len(errors) > 0:
logger.error('had %d errors while processing', len(errors))
sys.exit(2)
return collected def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
it = iter_annotations(roots=roots)
vit, eit = split_errors(it, ET=Exception)
for k, g in group_by_key(vit, key=lambda a: a.path).items():
yield Pdf(path=Path(k), annotations=g)
yield from eit
def test(): def test():
res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf')) res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
assert len(res.annotations) > 0 assert len(res) > 3
def test2(): def test2():
@ -172,6 +145,23 @@ def test2():
print(res) print(res)
def test_with_error():
# TODO need example of pdf file...
import tempfile
with tempfile.TemporaryDirectory() as td:
root = Path(td)
g = root / 'garbage.pdf'
g.write_text('garbage')
roots = [
root,
# '/usr/share/doc/texlive-doc/latex/amsrefs/',
]
# TODO find some pdfs that actually has annotations...
annots = list(iter_annotations(roots=roots))
assert len(annots) == 1
assert isinstance(annots[0], Exception)
def main(): def main():
from pprint import pprint from pprint import pprint
@ -179,9 +169,12 @@ def main():
from .common import setup_logger from .common import setup_logger
setup_logger(logger, level=logging.DEBUG) setup_logger(logger, level=logging.DEBUG)
collected = get_annotated_pdfs() collected = list(annotated_pdfs())
if len(collected) > 0: if len(collected) > 0:
for r in collected: for r in collected:
logger.warning('collected annotations in: %s', r.path) if isinstance(r, Exception):
for a in r.annotations: logger.exception(r)
pprint(a) else:
logger.info('collected annotations in: %s', r.path)
for a in r.annotations:
pprint(a)