prettier pdf handling; prepare for cachew
This commit is contained in:
parent
ea4dcdafb0
commit
a57be019d0
2 changed files with 99 additions and 106 deletions
|
@ -19,7 +19,7 @@ try:
|
||||||
import ijson.backends.yajl2_cffi as ijson # type: ignore
|
import ijson.backends.yajl2_cffi as ijson # type: ignore
|
||||||
except:
|
except:
|
||||||
# fallback to default backend. warning?
|
# fallback to default backend. warning?
|
||||||
import ijson
|
import ijson # type: ignore
|
||||||
|
|
||||||
from kython import kompress # TODO
|
from kython import kompress # TODO
|
||||||
|
|
||||||
|
|
203
my/pdfs.py
203
my/pdfs.py
|
@ -1,43 +1,46 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
from .common import import_file
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
from datetime import datetime
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import NamedTuple, List, Optional, Iterator
|
||||||
|
from contextlib import redirect_stderr
|
||||||
|
|
||||||
|
|
||||||
|
from .common import import_file, mcachew, group_by_key
|
||||||
|
from .error import Res, split_errors
|
||||||
|
|
||||||
# path to pdfannots (https://github.com/0xabu/pdfannots)
|
# path to pdfannots (https://github.com/0xabu/pdfannots)
|
||||||
import mycfg.repos.pdfannots.pdfannots as pdfannots
|
import mycfg.repos.pdfannots.pdfannots as pdfannots
|
||||||
from mycfg import paths
|
from mycfg import paths
|
||||||
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
import re
|
|
||||||
from subprocess import CompletedProcess
|
|
||||||
import sys
|
|
||||||
import io
|
|
||||||
from typing import NamedTuple, List, Optional
|
|
||||||
from contextlib import redirect_stderr
|
|
||||||
import logging
|
|
||||||
|
|
||||||
|
|
||||||
def get_logger():
|
def get_logger():
|
||||||
return logging.getLogger('my.pdfs')
|
return logging.getLogger('my.pdfs')
|
||||||
|
|
||||||
|
|
||||||
def get_candidates(roots=None) -> List[Path]:
|
def is_ignored(p: Path) -> bool:
|
||||||
if roots is None:
|
|
||||||
roots = paths.pdfs.roots
|
|
||||||
|
|
||||||
import itertools
|
|
||||||
pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in roots)
|
|
||||||
return list(sorted(pdfs))
|
|
||||||
|
|
||||||
|
|
||||||
def is_ignored(p):
|
|
||||||
return paths.pdfs.is_ignored(p)
|
return paths.pdfs.is_ignored(p)
|
||||||
|
|
||||||
|
|
||||||
# TODO cachew?
|
def candidates(roots=None) -> Iterator[Path]:
|
||||||
|
if roots is None:
|
||||||
|
roots = paths.pdfs.roots
|
||||||
|
|
||||||
|
for r in roots:
|
||||||
|
for p in Path(r).rglob('*.pdf'):
|
||||||
|
if not is_ignored(p):
|
||||||
|
yield p
|
||||||
|
|
||||||
|
# TODO canonical names
|
||||||
|
# TODO defensive if pdf was removed, also cachew key needs to be defensive
|
||||||
|
|
||||||
|
|
||||||
class Annotation(NamedTuple):
|
class Annotation(NamedTuple):
|
||||||
|
path: str
|
||||||
author: Optional[str]
|
author: Optional[str]
|
||||||
page: int
|
page: int
|
||||||
highlight: Optional[str]
|
highlight: Optional[str]
|
||||||
|
@ -45,19 +48,9 @@ class Annotation(NamedTuple):
|
||||||
date: Optional[datetime]
|
date: Optional[datetime]
|
||||||
|
|
||||||
|
|
||||||
class Pdf(NamedTuple):
|
def as_annotation(*, raw_ann, path: str) -> Annotation:
|
||||||
path: Path
|
d = vars(raw_ann)
|
||||||
annotations: List[Annotation]
|
d['page'] = raw_ann.page.pageno
|
||||||
stderr: str
|
|
||||||
|
|
||||||
@property
|
|
||||||
def date(self):
|
|
||||||
return self.annotations[-1].date
|
|
||||||
|
|
||||||
|
|
||||||
def as_annotation(ann) -> Annotation:
|
|
||||||
d = vars(ann)
|
|
||||||
d['page'] = ann.page.pageno
|
|
||||||
for a in ('boxes', 'rect'):
|
for a in ('boxes', 'rect'):
|
||||||
if a in d:
|
if a in d:
|
||||||
del d[a]
|
del d[a]
|
||||||
|
@ -76,95 +69,75 @@ def as_annotation(ann) -> Annotation:
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
# TODO defensive?
|
||||||
raise RuntimeError(dates)
|
raise RuntimeError(dates)
|
||||||
return Annotation(
|
return Annotation(
|
||||||
author =d['author'],
|
path = path,
|
||||||
page =d['page'],
|
author = d['author'],
|
||||||
highlight=d['text'],
|
page = d['page'],
|
||||||
comment =d['contents'],
|
highlight = d['text'],
|
||||||
date =date,
|
comment = d['contents'],
|
||||||
|
date = date,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class PdfAnnotsException(Exception):
|
def get_annots(p: Path) -> List[Annotation]:
|
||||||
def __init__(self, path: Path) -> None:
|
|
||||||
self.path = path
|
|
||||||
|
|
||||||
|
|
||||||
def _get_annots(p: Path) -> Pdf:
|
|
||||||
progress = False
|
|
||||||
with p.open('rb') as fo:
|
with p.open('rb') as fo:
|
||||||
f = io.StringIO()
|
f = io.StringIO()
|
||||||
with redirect_stderr(f):
|
with redirect_stderr(f):
|
||||||
(annots, outlines) = pdfannots.process_file(fo, emit_progress=progress)
|
# TODO FIXME defensive, try on garbage file (s)
|
||||||
|
(annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
|
||||||
# outlines are kinda like TOC, I don't really need them
|
# outlines are kinda like TOC, I don't really need them
|
||||||
return Pdf(
|
return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
|
||||||
path=p,
|
# TODO stderr?
|
||||||
annotations=list(map(as_annotation, annots)),
|
|
||||||
stderr=f.getvalue(),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_annots(p: Path) -> Pdf:
|
# TODO cachew needs to be based on mtime, hence take candidates, not roots
|
||||||
try:
|
# @mcachew
|
||||||
return _get_annots(p)
|
def iter_annotations(roots=None) -> Iterator[Res[Annotation]]:
|
||||||
except Exception as e:
|
|
||||||
raise PdfAnnotsException(p) from e
|
|
||||||
|
|
||||||
|
|
||||||
def get_annotated_pdfs(roots=None) -> List[Pdf]:
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
pdfs = get_candidates(roots=roots)
|
pdfs = list(sorted(candidates(roots=roots)))
|
||||||
logger.info('processing %d pdfs', len(pdfs))
|
logger.info('processing %d pdfs', len(pdfs))
|
||||||
|
|
||||||
collected = []
|
# TODO how to print to stdout synchronously?
|
||||||
errors = []
|
with ProcessPoolExecutor() as pool:
|
||||||
def callback(res: Pdf):
|
futures = [
|
||||||
if is_ignored(res.path):
|
pool.submit(get_annots, pdf)
|
||||||
return
|
for pdf in pdfs
|
||||||
logger.info('processed %s', res.path)
|
]
|
||||||
|
for f, pdf in zip(futures, pdfs):
|
||||||
|
try:
|
||||||
|
yield from f.result()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error('While processing %s:', pdf)
|
||||||
|
logger.exception(e)
|
||||||
|
# TODO not sure if should attach pdf as well; it's a bit annoying to pass around?
|
||||||
|
# also really have to think about interaction with cachew...
|
||||||
|
yield e
|
||||||
|
|
||||||
if len(res.stderr) > 0:
|
|
||||||
err = 'while processing %s: %s' % (res.path, res.stderr)
|
|
||||||
logger.error(err)
|
|
||||||
errors.append(err)
|
|
||||||
elif len(res.annotations) > 0:
|
|
||||||
logger.info('collected %s annotations', len(res.annotations))
|
|
||||||
collected.append(res)
|
|
||||||
|
|
||||||
def error_cb(err):
|
class Pdf(NamedTuple):
|
||||||
if isinstance(err, PdfAnnotsException):
|
path: Path
|
||||||
if is_ignored(err.path):
|
annotations: List[Annotation]
|
||||||
# TODO log?
|
|
||||||
return
|
|
||||||
logger.error('while processing %s', err.path)
|
|
||||||
err = err.__cause__
|
|
||||||
logger.exception(err)
|
|
||||||
errors.append(str(err))
|
|
||||||
|
|
||||||
from multiprocessing.pool import Pool
|
@property
|
||||||
with Pool() as p:
|
def date(self):
|
||||||
handles = [p.apply_async(
|
return self.annotations[-1].date
|
||||||
get_annots,
|
|
||||||
(pdf, ),
|
|
||||||
callback=callback,
|
|
||||||
error_callback=error_cb,
|
|
||||||
) for pdf in pdfs if not is_ignored(pdf)] # TODO log if we skip?
|
|
||||||
for h in handles:
|
|
||||||
h.wait()
|
|
||||||
|
|
||||||
# TODO more defensive error processing?
|
|
||||||
if len(errors) > 0:
|
|
||||||
logger.error('had %d errors while processing', len(errors))
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
return collected
|
def annotated_pdfs(roots=None) -> Iterator[Res[Pdf]]:
|
||||||
|
it = iter_annotations(roots=roots)
|
||||||
|
vit, eit = split_errors(it, ET=Exception)
|
||||||
|
|
||||||
|
for k, g in group_by_key(vit, key=lambda a: a.path).items():
|
||||||
|
yield Pdf(path=Path(k), annotations=g)
|
||||||
|
yield from eit
|
||||||
|
|
||||||
|
|
||||||
def test():
|
def test():
|
||||||
res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
|
res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
|
||||||
assert len(res.annotations) > 0
|
assert len(res) > 3
|
||||||
|
|
||||||
|
|
||||||
def test2():
|
def test2():
|
||||||
|
@ -172,6 +145,23 @@ def test2():
|
||||||
print(res)
|
print(res)
|
||||||
|
|
||||||
|
|
||||||
|
def test_with_error():
|
||||||
|
# TODO need example of pdf file...
|
||||||
|
import tempfile
|
||||||
|
with tempfile.TemporaryDirectory() as td:
|
||||||
|
root = Path(td)
|
||||||
|
g = root / 'garbage.pdf'
|
||||||
|
g.write_text('garbage')
|
||||||
|
roots = [
|
||||||
|
root,
|
||||||
|
# '/usr/share/doc/texlive-doc/latex/amsrefs/',
|
||||||
|
]
|
||||||
|
# TODO find some pdfs that actually has annotations...
|
||||||
|
annots = list(iter_annotations(roots=roots))
|
||||||
|
assert len(annots) == 1
|
||||||
|
assert isinstance(annots[0], Exception)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
|
@ -179,9 +169,12 @@ def main():
|
||||||
from .common import setup_logger
|
from .common import setup_logger
|
||||||
setup_logger(logger, level=logging.DEBUG)
|
setup_logger(logger, level=logging.DEBUG)
|
||||||
|
|
||||||
collected = get_annotated_pdfs()
|
collected = list(annotated_pdfs())
|
||||||
if len(collected) > 0:
|
if len(collected) > 0:
|
||||||
for r in collected:
|
for r in collected:
|
||||||
logger.warning('collected annotations in: %s', r.path)
|
if isinstance(r, Exception):
|
||||||
for a in r.annotations:
|
logger.exception(r)
|
||||||
pprint(a)
|
else:
|
||||||
|
logger.info('collected annotations in: %s', r.path)
|
||||||
|
for a in r.annotations:
|
||||||
|
pprint(a)
|
||||||
|
|
Loading…
Add table
Reference in a new issue