my.pdfs: cleanup/refactor

- modernize:
  - add REQUIRES spec for pdfannots library
  - config dataclass/config stub
  - stats function
  - absolute my.core imports in anticipation of splitting core
- use 'paths' instead of 'roots' (better reflects the semantics), use get_files
  backward compatible via config migration
- properly run tests/mypy
This commit is contained in:
Dima Gerasimov 2021-03-30 22:00:21 +01:00 committed by karlicoss
parent e7604c188e
commit ad177a1ccd
6 changed files with 177 additions and 108 deletions

View file

@ -83,3 +83,7 @@ class commits:
emails: Optional[Sequence[str]] emails: Optional[Sequence[str]]
names: Optional[Sequence[str]] names: Optional[Sequence[str]]
roots: Sequence[PathIsh] roots: Sequence[PathIsh]
class pdfs:
paths: Paths

View file

@ -164,7 +164,9 @@ def get_files(
warnings.warn(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!") warnings.warn(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!")
paths.extend(map(Path, do_glob(gs))) paths.extend(map(Path, do_glob(gs)))
elif src.is_dir(): elif src.is_dir():
gp: Iterable[Path] = src.glob(glob) # todo not sure if should be recursive? # todo not sure if should be recursive?
# note: glob='**/*.ext' works without any changes.. so perhaps it's ok as it is
gp: Iterable[Path] = src.glob(glob)
paths.extend(gp) paths.extend(gp)
else: else:
if not src.is_file(): if not src.is_file():

View file

@ -29,6 +29,8 @@ def is_data_provider(fun) -> bool:
1. returns iterable or something like that 1. returns iterable or something like that
2. takes no arguments? (otherwise not callable by stats anyway?) 2. takes no arguments? (otherwise not callable by stats anyway?)
""" """
# todo maybe for 2 allow default arguments? not sure
# one example which could benefit is my.pdfs
if fun is None: if fun is None:
return False return False
# todo. uh.. very similar to what cachew is trying to do? # todo. uh.. very similar to what cachew is trying to do?

View file

@ -1,75 +1,77 @@
#!/usr/bin/env python3
''' '''
PDF documents and annotations on your filesystem PDF documents and annotations on your filesystem
''' '''
from concurrent.futures import ProcessPoolExecutor REQUIRES = [
'git+https://github.com/0xabu/pdfannots',
]
from contextlib import redirect_stderr
from datetime import datetime from datetime import datetime
from dataclasses import dataclass
import io
from pathlib import Path
import re import re
import sys import sys
import io import time
import logging from typing import NamedTuple, List, Optional, Iterator, Sequence
from pathlib import Path
from typing import NamedTuple, List, Optional, Iterator
from contextlib import redirect_stderr
from .common import mcachew, group_by_key from my.core import LazyLogger, get_files, Paths, PathIsh
from .error import Res, split_errors from my.core.cfg import Attrs, make_config
from my.core.common import mcachew, group_by_key
# path to pdfannots (https://github.com/0xabu/pdfannots) from my.core.error import Res, split_errors
import my.config.repos.pdfannots.pdfannots as pdfannots
from my.config import pdfs as config
def get_logger(): import pdfannots # type: ignore[import]
return logging.getLogger('my.pdfs')
def is_ignored(p: Path) -> bool: from my.config import pdfs as user_config
"""
Used to ignore some extremely heavy files
is_ignored function taken either from config,
or if not defined, it's a function that returns False
"""
if hasattr(config, 'is_ignored'):
return config.is_ignored(p)
# Default @dataclass
return lambda x: False class pdfs(user_config):
paths: Paths = () # allowed to be empty for 'filelist' logic
def is_ignored(self, p: Path) -> bool:
"""
Used to ignore some extremely heavy files
is_ignored function taken either from config,
or if not defined, it's a function that returns False
"""
user_ignore = getattr(user_config, 'is_ignored', None)
if user_ignore is not None:
return user_ignore(p)
return False
@staticmethod
def _migration(attrs: Attrs) -> Attrs:
roots = 'roots'
if roots in attrs: # legacy name
attrs['paths'] = attrs[roots]
from my.core.warnings import high
high(f'"{roots}" is deprecated! Use "paths" instead.')
return attrs
def candidates(filelist=None, roots=None) -> Iterator[Path]: config = make_config(pdfs, migration=pdfs._migration)
if filelist is not None:
return candidates_from_filelist(filelist)
else:
return candidates_from_roots(roots)
def candidates_from_filelist(filelist) -> Iterator[Path]: logger = LazyLogger(__name__)
for f in filelist:
p = Path(f)
if not is_ignored(p):
yield p
def candidates_from_roots(roots=None) -> Iterator[Path]: def inputs() -> Sequence[Path]:
if roots is None: # TODO ignoring could be handled on get_files/user config site as well?..
roots = config.roots all_files = get_files(config.paths, glob='**/*.pdf')
return [p for p in all_files if not config.is_ignored(p)]
for r in roots:
for p in Path(r).rglob('*.pdf'):
if not is_ignored(p):
yield p
# TODO canonical names # TODO canonical names/fingerprinting?
# TODO defensive if pdf was removed, also cachew key needs to be defensive # TODO defensive if pdf was removed, also cachew key needs to be defensive
class Annotation(NamedTuple): class Annotation(NamedTuple):
path: str path: str
author: Optional[str] author: Optional[str]
page: int page: int
highlight: Optional[str] highlight: Optional[str]
comment: Optional[str] comment: Optional[str]
date: Optional[datetime] date: Optional[datetime] # TODO tz aware/unaware?
def as_annotation(*, raw_ann, path: str) -> Annotation: def as_annotation(*, raw_ann, path: str) -> Annotation:
@ -106,29 +108,40 @@ def as_annotation(*, raw_ann, path: str) -> Annotation:
def get_annots(p: Path) -> List[Annotation]: def get_annots(p: Path) -> List[Annotation]:
b = time.time()
with p.open('rb') as fo: with p.open('rb') as fo:
f = io.StringIO() f = io.StringIO()
with redirect_stderr(f): with redirect_stderr(f):
# FIXME
(annots, outlines) = pdfannots.process_file(fo, emit_progress=False) (annots, outlines) = pdfannots.process_file(fo, emit_progress=False)
# outlines are kinda like TOC, I don't really need them # outlines are kinda like TOC, I don't really need them
a = time.time()
took = a - b
tooks = f'took {took:0.1f} seconds'
if took > 5:
tooks = tooks.upper()
logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots))
return [as_annotation(raw_ann=a, path=str(p)) for a in annots] return [as_annotation(raw_ann=a, path=str(p)) for a in annots]
# TODO stderr? # TODO stderr?
def hash_files(pdfs: List[Path]): def _hash_files(pdfs: Sequence[Path]):
# if mtime hasn't changed then the file hasn't changed either # if mtime hasn't changed then the file hasn't changed either
return [(pdf, pdf.stat().st_mtime) for pdf in pdfs] return [(pdf, pdf.stat().st_mtime) for pdf in pdfs]
# TODO might make more sense to be more fine grained here, e.g. cache annotations for individual files # TODO might make more sense to be more fine grained here, e.g. cache annotations for individual files
@mcachew(depends_on=_hash_files)
@mcachew(hashf=hash_files) def _iter_annotations(pdfs: Sequence[Path]) -> Iterator[Res[Annotation]]:
def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
logger = get_logger()
logger.info('processing %d pdfs', len(pdfs)) logger.info('processing %d pdfs', len(pdfs))
# TODO how to print to stdout synchronously? # todo how to print to stdout synchronously?
with ProcessPoolExecutor() as pool: # todo global config option not to use pools? useful for debugging..
from concurrent.futures import ProcessPoolExecutor
from my.core.common import DummyExecutor
workers = None # use 0 for debugging
Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor
with Pool(workers) as pool:
futures = [ futures = [
pool.submit(get_annots, pdf) pool.submit(get_annots, pdf)
for pdf in pdfs for pdf in pdfs
@ -139,75 +152,61 @@ def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]:
except Exception as e: except Exception as e:
logger.error('While processing %s:', pdf) logger.error('While processing %s:', pdf)
logger.exception(e) logger.exception(e)
# todo add a comment that it can be ignored... or something like that
# TODO not sure if should attach pdf as well; it's a bit annoying to pass around? # TODO not sure if should attach pdf as well; it's a bit annoying to pass around?
# also really have to think about interaction with cachew... # also really have to think about interaction with cachew...
yield e yield e
def iter_annotations(filelist=None, roots=None) -> Iterator[Res[Annotation]]: def annotations() -> Iterator[Res[Annotation]]:
pdfs = list(sorted(candidates(filelist=filelist, roots=None))) pdfs = inputs()
yield from _iter_annotations(pdfs=pdfs) yield from _iter_annotations(pdfs=pdfs)
class Pdf(NamedTuple): class Pdf(NamedTuple):
path: Path path: Path
annotations: List[Annotation] annotations: Sequence[Annotation]
@property @property
def date(self): def date(self) -> Optional[datetime]:
# TODO tz aware/unaware
return self.annotations[-1].date return self.annotations[-1].date
def annotated_pdfs(filelist=None, roots=None) -> Iterator[Res[Pdf]]: def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]:
it = iter_annotations(filelist=filelist, roots=roots) if filelist is not None:
vit, eit = split_errors(it, ET=Exception) # hacky... keeping it backwards compatible
# https://github.com/karlicoss/HPI/pull/74
config.paths = filelist
ait = annotations()
vit, eit = split_errors(ait, ET=Exception)
for k, g in group_by_key(vit, key=lambda a: a.path).items(): for k, g in group_by_key(vit, key=lambda a: a.path).items():
yield Pdf(path=Path(k), annotations=g) yield Pdf(path=Path(k), annotations=g)
yield from eit yield from eit
def test(): from my.core import stat, Stats
res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf')) def stats() -> Stats:
assert len(res) > 3 return {
**stat(annotations) ,
**stat(annotated_pdfs),
}
def test2(): ### legacy/misc stuff
res = get_annots(Path('/L/zzz_borg/downloads/nonlinear2.pdf'))
print(res)
# todo retire later in favor of hpi query?
def test_with_error(): def main() -> None:
# TODO need example of pdf file...
import tempfile
with tempfile.TemporaryDirectory() as td:
root = Path(td)
g = root / 'garbage.pdf'
g.write_text('garbage')
roots = [
root,
# '/usr/share/doc/texlive-doc/latex/amsrefs/',
]
# TODO find some pdfs that actually has annotations...
annots = list(iter_annotations(roots=roots))
assert len(annots) == 1
assert isinstance(annots[0], Exception)
def main():
from pprint import pprint from pprint import pprint
collected = annotated_pdfs()
for r in collected:
if isinstance(r, Exception):
logger.exception(r)
else:
logger.info('collected annotations in: %s', r.path)
for a in r.annotations:
pprint(a)
logger = get_logger() iter_annotations = annotations # for backwards compatibility
from .common import setup_logger ###
setup_logger(logger, level=logging.DEBUG)
collected = list(annotated_pdfs())
if len(collected) > 0:
for r in collected:
if isinstance(r, Exception):
logger.exception(r)
else:
logger.info('collected annotations in: %s', r.path)
for a in r.annotations:
pprint(a)

View file

@ -1,11 +1,60 @@
import inspect
from pathlib import Path from pathlib import Path
import tempfile
from my.pdfs import get_annots, annotated_pdfs from more_itertools import ilen
import pytest
from .common import testdata from .common import testdata
def test_module(with_config) -> None:
# TODO crap. if module is imported too early (on the top level), it makes it super hard to override config
# need to at least detect it...
from my.pdfs import annotations, annotated_pdfs
# todo check types etc as well
assert ilen(annotations()) >= 3
assert ilen(annotated_pdfs()) >= 1
def test_with_error(with_config, tmp_path: Path) -> None:
"""should handle crappy files gracefully"""
root = tmp_path
g = root / 'garbage.pdf'
g.write_text('garbage')
from my.config import pdfs
del pdfs.roots # meh. otherwise legacy config value 'wins'
pdfs.paths = (root,)
from my.pdfs import annotations
annots = list(annotations())
[annot] = annots
assert isinstance(annot, Exception)
@pytest.fixture
def with_config():
from .common import reset_modules
reset_modules() # todo ugh.. getting boilerplaty.. need to make it a bit more automatic..
# extra_data = Path(__file__).absolute().parent / 'extra/data/polar'
# assert extra_data.exists(), extra_data
# todo hmm, turned out no annotations in these ones.. whatever
class user_config:
roots = [
testdata(),
]
import my.core.cfg as C
with C.tmp_config() as config:
config.pdfs = user_config # type: ignore
try:
yield
finally:
reset_modules()
EXPECTED_HIGHLIGHTS = { EXPECTED_HIGHLIGHTS = {
'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ', 'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ',
'And yet, unlearn we must, ', 'And yet, unlearn we must, ',
@ -18,6 +67,8 @@ def test_get_annots() -> None:
Test get_annots, with a real PDF file Test get_annots, with a real PDF file
get_annots should return a list of three Annotation objects get_annots should return a list of three Annotation objects
""" """
from my.pdfs import get_annots
annotations = get_annots(testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf') annotations = get_annots(testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf')
assert len(annotations) == 3 assert len(annotations) == 3
assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS
@ -28,9 +79,12 @@ def test_annotated_pdfs_with_filelist() -> None:
Test annotated_pdfs, with a real PDF file Test annotated_pdfs, with a real PDF file
annotated_pdfs should return a list of one Pdf object, with three Annotations annotated_pdfs should return a list of one Pdf object, with three Annotations
""" """
filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf'] from my.pdfs import annotated_pdfs
annotations_generator = annotated_pdfs(filelist=filelist, roots=None)
filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf']
annotations_generator = annotated_pdfs(filelist=filelist)
import inspect
assert inspect.isgeneratorfunction(annotated_pdfs) assert inspect.isgeneratorfunction(annotated_pdfs)
highlights_from_pdfs = [] highlights_from_pdfs = []
@ -41,3 +95,8 @@ def test_annotated_pdfs_with_filelist() -> None:
assert len(highlights_from_pdfs) == 3 assert len(highlights_from_pdfs) == 3
assert set(highlights_from_pdfs) == EXPECTED_HIGHLIGHTS assert set(highlights_from_pdfs) == EXPECTED_HIGHLIGHTS
# todo old test on my(karlicoss) computer:
# - mature-optimization_wtf.pdf: >3 annotations?
# - nonlinear2.pdf

View file

@ -37,11 +37,12 @@ commands =
hpi module install my.coding.commits hpi module install my.coding.commits
hpi module install my.pdfs
python3 -m pytest tests \ python3 -m pytest tests \
# ignore some tests which might take a while to run on ci.. # ignore some tests which might take a while to run on ci..
--ignore tests/takeout.py \ --ignore tests/takeout.py \
--ignore tests/extra/polar.py \ --ignore tests/extra/polar.py
--ignore tests/pdfs/test_pdfs.py \
{posargs} {posargs}
@ -82,6 +83,7 @@ commands =
hpi module install my.arbtt hpi module install my.arbtt
hpi module install my.coding.commits hpi module install my.coding.commits
hpi module install my.goodreads hpi module install my.goodreads
hpi module install my.pdfs
# todo fuck. -p my.github isn't checking the subpackages?? wtf... # todo fuck. -p my.github isn't checking the subpackages?? wtf...
# guess it wants .pyi file?? # guess it wants .pyi file??
@ -103,6 +105,7 @@ commands =
-p my.arbtt \ -p my.arbtt \
-p my.coding.commits \ -p my.coding.commits \
-p my.goodreads \ -p my.goodreads \
-p my.pdfs \
--txt-report .coverage.mypy-misc \ --txt-report .coverage.mypy-misc \
--html-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \
{posargs} {posargs}