my.pdf: handle update to pdfannots 0.2

undoes f5b47dd695; tests work properly now

resolves https://github.com/karlicoss/HPI/issues/180
This commit is contained in:
Dima Gerasimov 2021-12-19 18:03:43 +00:00 committed by karlicoss
parent 074b8685d6
commit 9578b13fca
2 changed files with 19 additions and 35 deletions

View file

@ -3,9 +3,9 @@ PDF documents and annotations on your filesystem
''' '''
REQUIRES = [ REQUIRES = [
'git+https://github.com/0xabu/pdfannots', 'git+https://github.com/0xabu/pdfannots',
# todo not sure if should use pypi version?
] ]
from contextlib import redirect_stderr
from datetime import datetime from datetime import datetime
from dataclasses import dataclass from dataclasses import dataclass
import io import io
@ -20,7 +20,7 @@ from my.core.common import mcachew, group_by_key
from my.core.error import Res, split_errors from my.core.error import Res, split_errors
import pdfannots # type: ignore[import] import pdfannots
from my.config import pdfs as user_config from my.config import pdfs as user_config
@ -56,7 +56,6 @@ config = make_config(pdfs, migration=pdfs._migration)
logger = LazyLogger(__name__) logger = LazyLogger(__name__)
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
# TODO ignoring could be handled on get_files/user config site as well?..
all_files = get_files(config.paths, glob='**/*.pdf') all_files = get_files(config.paths, glob='**/*.pdf')
return [p for p in all_files if not config.is_ignored(p)] return [p for p in all_files if not config.is_ignored(p)]
@ -77,38 +76,35 @@ class Annotation(NamedTuple):
return self.created return self.created
def as_annotation(*, raw_ann, path: str) -> Annotation: def _as_annotation(*, raw: pdfannots.Annotation, path: str) -> Annotation:
d = vars(raw_ann) d = vars(raw)
d['page'] = raw_ann.page.pageno pos = raw.pos
for a in ('boxes', 'rect'): # make mypy happy (pos always present for Annotation https://github.com/0xabu/pdfannots/blob/dbdfefa158971e1746fae2da139918e9f59439ea/pdfannots/types.py#L302)
if a in d: assert pos is not None
del d[a] d['page'] = pos.page.pageno
return Annotation( return Annotation(
path = path, path = path,
author = d['author'], author = d['author'],
page = d['page'], page = d['page'],
highlight = d['text'], highlight = raw.gettext(),
comment = d['contents'], comment = d['contents'],
created = d.get('created'), # todo can be non-defensive once pr is merged created = d['created'],
) )
def get_annots(p: Path) -> List[Annotation]: def get_annots(p: Path) -> List[Annotation]:
b = time.time() b = time.time()
with p.open('rb') as fo: with p.open('rb') as fo:
f = io.StringIO() doc = pdfannots.process_file(fo, emit_progress_to=None)
with redirect_stderr(f): annots = [a for a in doc.iter_annots()]
# FIXME # also has outlines, which are kinda like TOC; I don't really need them
(annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
# outlines are kinda like TOC, I don't really need them
a = time.time() a = time.time()
took = a - b took = a - b
tooks = f'took {took:0.1f} seconds' tooks = f'took {took:0.1f} seconds'
if took > 5: if took > 5:
tooks = tooks.upper() tooks = tooks.upper()
logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots)) logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))
return [as_annotation(raw_ann=a, path=str(p)) for a in annots] return [_as_annotation(raw=a, path=str(p)) for a in annots]
# TODO stderr?
def _hash_files(pdfs: Sequence[Path]): def _hash_files(pdfs: Sequence[Path]):
@ -186,18 +182,8 @@ def stats() -> Stats:
### legacy/misc stuff ### legacy/misc stuff
# todo retire later in favor of hpi query?
def main() -> None:
from pprint import pprint
collected = annotated_pdfs()
for r in collected:
if isinstance(r, Exception):
logger.exception(r)
else:
logger.info('collected annotations in: %s', r.path)
for a in r.annotations:
pprint(a)
iter_annotations = annotations # for backwards compatibility iter_annotations = annotations # for backwards compatibility
### ###
# can use 'hpi query my.pdfs.annotations -o pprint' to test
#

View file

@ -4,8 +4,6 @@ from more_itertools import ilen
import pytest import pytest
pytestmark = pytest.mark.skip("TODO fix pdfannots (see https://github.com/karlicoss/HPI/issues/180)")
from .common import testdata from .common import testdata
@ -58,8 +56,8 @@ def with_config():
EXPECTED_HIGHLIGHTS = { EXPECTED_HIGHLIGHTS = {
'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ', 'Since 1994, when we first began organizing web sites, we have enjoyed a rare oppor-tunity to participate in the birth of a new discipline.',
'And yet, unlearn we must, ', 'And yet, unlearn we must,',
'', '',
} }