prettier pdf handling; prepare for cachew

2020-01-08 20:59:22 +00:00 · 2020-01-08 20:59:22 +00:00 · a57be019d0
commit a57be019d0
parent ea4dcdafb0
2 changed files with 99 additions and 106 deletions
--- a/my/location/takeout.py
+++ b/my/location/takeout.py
@@ -19,7 +19,7 @@ try:
    import ijson.backends.yajl2_cffi as ijson # type: ignore
 except:
    # fallback to default backend. warning?
-    import ijson
+    import ijson # type: ignore
 from kython import kompress # TODO
--- a/my/pdfs.py
+++ b/my/pdfs.py
@@ -1,43 +1,46 @@
 #!/usr/bin/env python3
-from .common import import_file
+from concurrent.futures import ProcessPoolExecutor
-
+from datetime import datetime
 import re
 import sys
 import io
 import logging
 from pathlib import Path
 from typing import NamedTuple, List, Optional, Iterator
 from contextlib import redirect_stderr
 from .common import import_file, mcachew, group_by_key
 from .error import Res, split_errors
 # path to pdfannots (https://github.com/0xabu/pdfannots)
 import mycfg.repos.pdfannots.pdfannots as pdfannots
 from mycfg import paths
 from datetime import datetime
 import re
 from subprocess import CompletedProcess
 import sys
 import io
 from typing import NamedTuple, List, Optional
 from contextlib import redirect_stderr
 import logging
 def get_logger():
    """Return the logger registered under the name 'my.pdfs'."""
    logger_name = 'my.pdfs'
    return logging.getLogger(logger_name)
-def get_candidates(roots=None) -> List[Path]:
+def is_ignored(p: Path) -> bool:
    if roots is None:
        roots = paths.pdfs.roots
    import itertools
    pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in roots)
    return list(sorted(pdfs))
 def is_ignored(p):
    return paths.pdfs.is_ignored(p)
-# TODO cachew?
+def candidates(roots=None) -> Iterator[Path]:
    if roots is None:
        roots = paths.pdfs.roots
    for r in roots:
        for p in Path(r).rglob('*.pdf'):
            if not is_ignored(p):
                yield p
 # TODO canonical names
 # TODO defensive if pdf was removed, also cachew key needs to be defensive
 class Annotation(NamedTuple):
    path: str
    author: Optional[str]
    page: int
    highlight: Optional[str]
@@ -45,19 +48,9 @@ class Annotation(NamedTuple):
    date: Optional[datetime]
-class Pdf(NamedTuple):
+def as_annotation(*, raw_ann, path: str) -> Annotation:
-    path: Path
+    d = vars(raw_ann)
-    annotations: List[Annotation]
+    d['page'] = raw_ann.page.pageno
    stderr: str
    @property
    def date(self):
        return self.annotations[-1].date
 def as_annotation(ann) -> Annotation:
    d = vars(ann)
    d['page'] = ann.page.pageno
    for a in ('boxes', 'rect'):
        if a in d:
            del d[a]
@@ -76,95 +69,75 @@ def as_annotation(ann) -> Annotation:
            except ValueError:
                pass
        else:
            # TODO defensive?
            raise RuntimeError(dates)
    return Annotation(
-        author   =d['author'],
+        path      = path,
-        page     =d['page'],
+        author    = d['author'],
-        highlight=d['text'],
+        page      = d['page'],
-        comment  =d['contents'],
+        highlight = d['text'],
-        date     =date,
+        comment   = d['contents'],
        date      = date,
    )
-class PdfAnnotsException(Exception):
+def get_annots(p: Path) -> List[Annotation]:
    def __init__(self, path: Path) -> None:
        self.path = path
 def _get_annots(p: Path) -> Pdf:
    progress = False
    with p.open('rb') as fo:
        f = io.StringIO()
        with redirect_stderr(f):
-            (annots, outlines) = pdfannots.process_file(fo, emit_progress=progress)
+            # TODO FIXME defensive, try on garbage file (s)
            (annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
            # outlines are kinda like TOC, I don't really need them
-    return Pdf(
+    return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
-        path=p,
+    # TODO stderr?
        annotations=list(map(as_annotation, annots)),
        stderr=f.getvalue(),
    )
-def get_annots(p: Path) -> Pdf:
+# TODO cachew needs to be based on mtime, hence take candidates, not roots
-    try:
+# @mcachew
-        return _get_annots(p)
+def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
    except Exception as e:
        raise PdfAnnotsException(p) from e
 def get_annotated_pdfs(roots=None) -> List[Pdf]:
    logger = get_logger()
-    pdfs = get_candidates(roots=roots)
+    pdfs = list(sorted(candidates(roots=roots)))
    logger.info('processing %d pdfs', len(pdfs))
-    collected = []
+    # TODO how to print to stdout synchronously?
-    errors = []
+    with ProcessPoolExecutor() as pool:
-    def callback(res: Pdf):
+        futures = [
-        if is_ignored(res.path):
+            pool.submit(get_annots, pdf)
-            return
+            for pdf in pdfs
-        logger.info('processed %s', res.path)
+        ]
        for f, pdf in zip(futures, pdfs):
            try:
                yield from f.result()
            except Exception as e:
                logger.error('While processing %s:', pdf)
                logger.exception(e)
                # TODO not sure if should attach pdf as well; it's a bit annoying to pass around?
                # also really have to think about interaction with cachew...
                yield e
        if len(res.stderr) > 0:
            err = 'while processing %s: %s' % (res.path, res.stderr)
            logger.error(err)
            errors.append(err)
        elif len(res.annotations) > 0:
            logger.info('collected %s annotations', len(res.annotations))
            collected.append(res)
-    def error_cb(err):
+class Pdf(NamedTuple):
-        if isinstance(err, PdfAnnotsException):
+    path: Path
-            if is_ignored(err.path):
+    annotations: List[Annotation]
                # TODO log?
                return
            logger.error('while processing %s', err.path)
            err = err.__cause__
        logger.exception(err)
        errors.append(str(err))
-    from multiprocessing.pool import Pool
+    @property
-    with Pool() as p:
+    def date(self):
-        handles = [p.apply_async(
+        return self.annotations[-1].date
            get_annots,
            (pdf, ),
            callback=callback,
            error_callback=error_cb,
        ) for pdf in pdfs if not is_ignored(pdf)] # TODO log if we skip?
        for h in handles:
            h.wait()
    # TODO more defensive error processing?
    if len(errors) > 0:
        logger.error('had %d errors while processing', len(errors))
        sys.exit(2)
-    return collected
+def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
    it = iter_annotations(roots=roots)
    vit, eit = split_errors(it, ET=Exception)
    for k, g in group_by_key(vit, key=lambda a: a.path).items():
        yield Pdf(path=Path(k), annotations=g)
    yield from eit
 def test():
    res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
-    assert len(res.annotations) > 0
+    assert len(res) > 3
 def test2():
@@ -172,6 +145,23 @@ def test2():
    print(res)
 def test_with_error():
    """Check that a junk ``.pdf`` yields exactly one item, and that it is an Exception.

    A file named ``garbage.pdf`` containing the text ``garbage`` is written
    into a temporary directory, which is then passed as the only root to
    ``iter_annotations``.
    """
    # TODO need example of pdf file...
    from tempfile import TemporaryDirectory

    with TemporaryDirectory() as tmp:
        tmp_root = Path(tmp)
        (tmp_root / 'garbage.pdf').write_text('garbage')
        # TODO find some pdfs that actually have annotations...
        # (candidate extra root: '/usr/share/doc/texlive-doc/latex/amsrefs/')
        search_roots = [tmp_root]
        # Drain the iterator here, while the temporary directory still exists.
        results = list(iter_annotations(roots=search_roots))

    assert len(results) == 1
    first = results[0]
    assert isinstance(first, Exception)
 def main():
    from pprint import pprint
@@ -179,9 +169,12 @@ def main():
    from .common import setup_logger
    setup_logger(logger, level=logging.DEBUG)
-    collected = get_annotated_pdfs()
+    collected = list(annotated_pdfs())
    if len(collected) > 0:
        for r in collected:
-            logger.warning('collected annotations in: %s', r.path)
+            if isinstance(r, Exception):
-            for a in r.annotations:
+                logger.exception(r)
-                pprint(a)
+            else:
                logger.info('collected annotations in: %s', r.path)
                for a in r.annotations:
                    pprint(a)