my.pdf: handle update to pdfannots 0.2
undoes f5b47dd695, tests work properly now
resolves https://github.com/karlicoss/HPI/issues/180
This commit is contained in:
parent
074b8685d6
commit
9578b13fca
2 changed files with 19 additions and 35 deletions
48
my/pdfs.py
48
my/pdfs.py
|
@ -3,9 +3,9 @@ PDF documents and annotations on your filesystem
|
||||||
'''
|
'''
|
||||||
REQUIRES = [
|
REQUIRES = [
|
||||||
'git+https://github.com/0xabu/pdfannots',
|
'git+https://github.com/0xabu/pdfannots',
|
||||||
|
# todo not sure if should use pypi version?
|
||||||
]
|
]
|
||||||
|
|
||||||
from contextlib import redirect_stderr
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import io
|
import io
|
||||||
|
@ -20,7 +20,7 @@ from my.core.common import mcachew, group_by_key
|
||||||
from my.core.error import Res, split_errors
|
from my.core.error import Res, split_errors
|
||||||
|
|
||||||
|
|
||||||
import pdfannots # type: ignore[import]
|
import pdfannots
|
||||||
|
|
||||||
|
|
||||||
from my.config import pdfs as user_config
|
from my.config import pdfs as user_config
|
||||||
|
@ -56,7 +56,6 @@ config = make_config(pdfs, migration=pdfs._migration)
|
||||||
logger = LazyLogger(__name__)
|
logger = LazyLogger(__name__)
|
||||||
|
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
# TODO ignoring could be handled on get_files/user config site as well?..
|
|
||||||
all_files = get_files(config.paths, glob='**/*.pdf')
|
all_files = get_files(config.paths, glob='**/*.pdf')
|
||||||
return [p for p in all_files if not config.is_ignored(p)]
|
return [p for p in all_files if not config.is_ignored(p)]
|
||||||
|
|
||||||
|
@ -77,38 +76,35 @@ class Annotation(NamedTuple):
|
||||||
return self.created
|
return self.created
|
||||||
|
|
||||||
|
|
||||||
def as_annotation(*, raw_ann, path: str) -> Annotation:
|
def _as_annotation(*, raw: pdfannots.Annotation, path: str) -> Annotation:
|
||||||
d = vars(raw_ann)
|
d = vars(raw)
|
||||||
d['page'] = raw_ann.page.pageno
|
pos = raw.pos
|
||||||
for a in ('boxes', 'rect'):
|
# make mypy happy (pos is always present for Annotation https://github.com/0xabu/pdfannots/blob/dbdfefa158971e1746fae2da139918e9f59439ea/pdfannots/types.py#L302)
|
||||||
if a in d:
|
assert pos is not None
|
||||||
del d[a]
|
d['page'] = pos.page.pageno
|
||||||
return Annotation(
|
return Annotation(
|
||||||
path = path,
|
path = path,
|
||||||
author = d['author'],
|
author = d['author'],
|
||||||
page = d['page'],
|
page = d['page'],
|
||||||
highlight = d['text'],
|
highlight = raw.gettext(),
|
||||||
comment = d['contents'],
|
comment = d['contents'],
|
||||||
created = d.get('created'), # todo can be non-defensive once pr is merged
|
created = d['created'],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_annots(p: Path) -> List[Annotation]:
|
def get_annots(p: Path) -> List[Annotation]:
|
||||||
b = time.time()
|
b = time.time()
|
||||||
with p.open('rb') as fo:
|
with p.open('rb') as fo:
|
||||||
f = io.StringIO()
|
doc = pdfannots.process_file(fo, emit_progress_to=None)
|
||||||
with redirect_stderr(f):
|
annots = [a for a in doc.iter_annots()]
|
||||||
# FIXME
|
# doc also has outlines, which are kinda like a TOC; I don't really need them
|
||||||
(annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
|
|
||||||
# outlines are kinda like TOC, I don't really need them
|
|
||||||
a = time.time()
|
a = time.time()
|
||||||
took = a - b
|
took = a - b
|
||||||
tooks = f'took {took:0.1f} seconds'
|
tooks = f'took {took:0.1f} seconds'
|
||||||
if took > 5:
|
if took > 5:
|
||||||
tooks = tooks.upper()
|
tooks = tooks.upper()
|
||||||
logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))
|
logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))
|
||||||
return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
|
return [_as_annotation(raw=a, path=str(p)) for a in annots]
|
||||||
# TODO stderr?
|
|
||||||
|
|
||||||
|
|
||||||
def _hash_files(pdfs: Sequence[Path]):
|
def _hash_files(pdfs: Sequence[Path]):
|
||||||
|
@ -186,18 +182,8 @@ def stats() -> Stats:
|
||||||
|
|
||||||
|
|
||||||
### legacy/misc stuff
|
### legacy/misc stuff
|
||||||
|
|
||||||
# todo retire later if favor of hpi query?
|
|
||||||
def main() -> None:
|
|
||||||
from pprint import pprint
|
|
||||||
collected = annotated_pdfs()
|
|
||||||
for r in collected:
|
|
||||||
if isinstance(r, Exception):
|
|
||||||
logger.exception(r)
|
|
||||||
else:
|
|
||||||
logger.info('collected annotations in: %s', r.path)
|
|
||||||
for a in r.annotations:
|
|
||||||
pprint(a)
|
|
||||||
|
|
||||||
iter_annotations = annotations # for backwards compatibility
|
iter_annotations = annotations # for backwards compatibility
|
||||||
###
|
###
|
||||||
|
|
||||||
|
# can use 'hpi query my.pdfs.annotations -o pprint' to test
|
||||||
|
#
|
||||||
|
|
|
@ -4,8 +4,6 @@ from more_itertools import ilen
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
pytestmark = pytest.mark.skip("TODO fix pdfannots (see https://github.com/karlicoss/HPI/issues/180)")
|
|
||||||
|
|
||||||
from .common import testdata
|
from .common import testdata
|
||||||
|
|
||||||
|
|
||||||
|
@ -58,8 +56,8 @@ def with_config():
|
||||||
|
|
||||||
|
|
||||||
EXPECTED_HIGHLIGHTS = {
|
EXPECTED_HIGHLIGHTS = {
|
||||||
'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ',
|
'Since 1994, when we first began organizing web sites, we have enjoyed a rare oppor-tunity to participate in the birth of a new discipline.',
|
||||||
'And yet, unlearn we must, ',
|
'And yet, unlearn we must,',
|
||||||
'',
|
'',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue