HPI/my/pdfs.py
2020-03-24 21:15:21 +01:00

189 lines
5.2 KiB
Python
Executable file

#!/usr/bin/env python3
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime
import re
import sys
import io
import logging
from pathlib import Path
from typing import NamedTuple, List, Optional, Iterator
from contextlib import redirect_stderr
from .common import mcachew, group_by_key
from .error import Res, split_errors
# path to pdfannots (https://github.com/0xabu/pdfannots)
import mycfg.repos.pdfannots.pdfannots as pdfannots
from mycfg import paths
def get_logger():
    """Return the ``my.pdfs`` logger used throughout this module."""
    logger_name = 'my.pdfs'
    return logging.getLogger(logger_name)
def is_ignored(p: Path) -> bool:
    """Tell whether ``p`` should be skipped.

    The decision is delegated entirely to ``paths.pdfs.is_ignored`` from the
    user config; it exists to ignore some extremely heavy files.
    """
    skip = paths.pdfs.is_ignored(p)
    return skip
def candidates(roots=None) -> Iterator[Path]:
    """Lazily yield every ``*.pdf`` found recursively under ``roots``.

    When ``roots`` is ``None`` the roots come from ``paths.pdfs.roots``.
    Files for which ``is_ignored`` returns true are left out.
    """
    search_roots = paths.pdfs.roots if roots is None else roots
    for root in search_roots:
        found = Path(root).rglob('*.pdf')
        yield from (pdf for pdf in found if not is_ignored(pdf))
# TODO canonical names
# TODO defensive if pdf was removed, also cachew key needs to be defensive
class Annotation(NamedTuple):
    """A single annotation extracted from a PDF file."""

    # path of the PDF the annotation belongs to
    path: str
    author: Optional[str]
    # page number as reported by the raw annotation's ``page.pageno``
    page: int
    # text of the raw annotation (its ``text`` attribute)
    highlight: Optional[str]
    # contents of the raw annotation (its ``contents`` attribute)
    comment: Optional[str]
    # parsed annotation date; None when the raw annotation carries no date
    date: Optional[datetime]
def as_annotation(*, raw_ann, path: str) -> Annotation:
    """Convert a raw pdfannots annotation object into an ``Annotation``.

    Args:
        raw_ann: raw annotation object; its ``page.pageno``, ``author``,
            ``text``, ``contents`` and ``date`` attributes are read. The
            object is left unmodified.
        path: path of the PDF the annotation came from, stored as is.

    Returns:
        The converted ``Annotation``. Its ``date`` is ``None`` when the raw
        annotation has no date.

    Raises:
        KeyError: if an expected attribute is missing from ``raw_ann``.
        RuntimeError: if the date string matches none of the known formats.
    """
    # vars() returns the object's own __dict__ rather than a copy, so it is
    # only ever read here: writing to it would silently modify raw_ann and
    # break a second conversion of the same object.
    d = vars(raw_ann)
    page = raw_ann.page.pageno

    dates = d['date']
    date: Optional[datetime] = None
    if dates is not None:
        dates = dates.replace("'", "")
        # 20190630213504+0100
        dates = re.sub('Z0000$', '+0000', dates)
        FMT = '%Y%m%d%H%M%S'
        # TODO is it utc if there is no timezone offset?
        for fmt in [FMT, FMT + '%z']:
            try:
                date = datetime.strptime(dates, fmt)
                break
            except ValueError:
                pass
        else:
            # none of the formats matched
            # TODO defensive?
            raise RuntimeError(dates)

    return Annotation(
        path=path,
        author=d['author'],
        page=page,
        highlight=d['text'],
        comment=d['contents'],
        date=date,
    )
def get_annots(p: Path) -> List[Annotation]:
    """Extract all annotations from the PDF at ``p``.

    Anything written to stderr while pdfannots processes the file is
    redirected into an in-memory buffer and discarded.
    """
    swallowed_stderr = io.StringIO()
    with p.open('rb') as pdf_file, redirect_stderr(swallowed_stderr):
        # outlines are kinda like TOC, I don't really need them
        raw_annots, _outlines = pdfannots.process_file(pdf_file, emit_progress=False)
    pdf_path = str(p)
    # TODO stderr?
    return [as_annotation(raw_ann=raw, path=pdf_path) for raw in raw_annots]
def hash_files(pdfs: List[Path]):
    """Pair every PDF path with its modification time.

    Passed as ``hashf`` to the cache decorator on ``_iter_annotations``.
    If mtime hasn't changed then the file hasn't changed either.
    """
    stamped = []
    for pdf in pdfs:
        mtime = pdf.stat().st_mtime
        stamped.append((pdf, mtime))
    return stamped
# TODO might make more sense to be more fine grained here, e.g. cache annotations for individual files
@mcachew(hashf=hash_files)
def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
    """Yield the annotations of every PDF in ``pdfs``, or the error it caused.

    Each file is handed to ``get_annots`` in a process pool. Results are
    consumed in the order of ``pdfs``. A file that fails is logged and its
    exception is yielded in place of its annotations.
    """
    log = get_logger()
    log.info('processing %d pdfs', len(pdfs))

    # TODO how to print to stdout synchronously?
    with ProcessPoolExecutor() as pool:
        # submit everything up front so the files are processed in parallel
        submitted = [(pdf, pool.submit(get_annots, pdf)) for pdf in pdfs]

        for pdf, future in submitted:
            try:
                yield from future.result()
            except Exception as err:
                log.error('While processing %s:', pdf)
                log.exception(err)
                # TODO not sure if should attach pdf as well; it's a bit annoying to pass around?
                # also really have to think about interaction with cachew...
                yield err
def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
    """Yield annotations (or errors) for all candidate PDFs under ``roots``.

    The candidate paths are sorted before being passed on to
    ``_iter_annotations``.
    """
    # sorted() already returns a list
    sorted_pdfs = sorted(candidates(roots=roots))
    yield from _iter_annotations(pdfs=sorted_pdfs)
class Pdf(NamedTuple):
    """A PDF file together with the annotations found in it."""

    path: Path
    annotations: List[Annotation]

    @property
    def date(self) -> Optional[datetime]:
        """Date of the last annotation in ``annotations``.

        Returns ``None`` when there are no annotations at all (instead of
        failing with ``IndexError``) or when the last annotation has no date.
        """
        if not self.annotations:
            return None
        return self.annotations[-1].date
def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
    """Yield one ``Pdf`` per annotated file under ``roots``, then the errors.

    Annotations are grouped by their ``path``. Exceptions produced while
    extracting annotations are yielded after all the ``Pdf`` objects.
    """
    values, errors = split_errors(iter_annotations(roots=roots), ET=Exception)
    by_path = group_by_key(values, key=lambda a: a.path)
    for pdf_path, annots in by_path.items():
        yield Pdf(path=Path(pdf_path), annotations=annots)
    yield from errors
def test():
    """Check that a known local PDF yields more than three annotations."""
    sample = Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf')
    annots = get_annots(sample)
    assert len(annots) > 3
def test2():
    """Print the annotations extracted from another known local PDF."""
    sample = Path('/L/zzz_borg/downloads/nonlinear2.pdf')
    annots = get_annots(sample)
    print(annots)
def test_with_error():
    """Check that an unparseable PDF comes back as a single exception."""
    # TODO need example of pdf file...
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_root = Path(tmp_dir)
        garbage = tmp_root / 'garbage.pdf'
        garbage.write_text('garbage')
        search_roots = [
            tmp_root,
            # '/usr/share/doc/texlive-doc/latex/amsrefs/',
        ]
        # TODO find some pdfs that actually has annotations...
        results = list(iter_annotations(roots=search_roots))

    assert len(results) == 1
    only = results[0]
    assert isinstance(only, Exception)
def main():
    """Collect the annotated PDFs and report them.

    Errors are logged with their traceback; for every successfully processed
    PDF its path is logged and each annotation is pretty-printed.
    """
    from pprint import pprint

    logger = get_logger()
    from .common import setup_logger
    setup_logger(logger, level=logging.DEBUG)

    # collect everything first so that all processing is finished before reporting
    collected = list(annotated_pdfs())
    for r in collected:
        if isinstance(r, Exception):
            # We are not inside an ``except`` block here, so logger.exception()
            # would have no active exception to report; pass the exception
            # explicitly to get its real traceback in the log.
            logger.error(r, exc_info=r)
        else:
            logger.info('collected annotations in: %s', r.path)
            for a in r.annotations:
                pprint(a)