From ad177a1ccd9e1f6104991a28121e49c2fb56a3ea Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 30 Mar 2021 22:00:21 +0100 Subject: [PATCH] my.pdfs: cleanup/refactor - modernize: - add REQUIRES spec for pdfannots library - config dataclass/config stub - stats function - absolute my.core imports in anticipation of splitting core - use 'paths' instead of 'roots' (better reflects the semantics), use get_files backward compatible via config migration - properly run tests/mypy --- my/config.py | 4 + my/core/common.py | 4 +- my/core/stats.py | 2 + my/pdfs.py | 199 +++++++++++++++++++++++----------------------- tests/pdfs.py | 69 ++++++++++++++-- tox.ini | 7 +- 6 files changed, 177 insertions(+), 108 deletions(-) diff --git a/my/config.py b/my/config.py index 306ce5d..096c58a 100644 --- a/my/config.py +++ b/my/config.py @@ -83,3 +83,7 @@ class commits: emails: Optional[Sequence[str]] names: Optional[Sequence[str]] roots: Sequence[PathIsh] + + +class pdfs: + paths: Paths diff --git a/my/core/common.py b/my/core/common.py index 0bc621b..565ef5b 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -164,7 +164,9 @@ def get_files( warnings.warn(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!") paths.extend(map(Path, do_glob(gs))) elif src.is_dir(): - gp: Iterable[Path] = src.glob(glob) # todo not sure if should be recursive? + # todo not sure if should be recursive? + # note: glob='**/*.ext' works without any changes.. so perhaps it's ok as it is + gp: Iterable[Path] = src.glob(glob) paths.extend(gp) else: if not src.is_file(): diff --git a/my/core/stats.py b/my/core/stats.py index 2d61e24..1a59f3d 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -29,6 +29,8 @@ def is_data_provider(fun) -> bool: 1. returns iterable or something like that 2. takes no arguments? (otherwise not callable by stats anyway?) """ + # todo maybe for 2 allow default arguments? not sure + # one example which could benefit is my.pdfs if fun is None: return False # todo. uh.. very similar to what cachew is trying to do? diff --git a/my/pdfs.py b/my/pdfs.py index 3a4905d..cfe630e 100755 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -1,75 +1,77 @@ -#!/usr/bin/env python3 ''' PDF documents and annotations on your filesystem ''' -from concurrent.futures import ProcessPoolExecutor +REQUIRES = [ + 'git+https://github.com/0xabu/pdfannots', +] + +from contextlib import redirect_stderr from datetime import datetime +from dataclasses import dataclass +import io +from pathlib import Path import re import sys -import io -import logging -from pathlib import Path -from typing import NamedTuple, List, Optional, Iterator -from contextlib import redirect_stderr +import time +from typing import NamedTuple, List, Optional, Iterator, Sequence -from .common import mcachew, group_by_key -from .error import Res, split_errors - -# path to pdfannots (https://github.com/0xabu/pdfannots) -import my.config.repos.pdfannots.pdfannots as pdfannots -from my.config import pdfs as config +from my.core import LazyLogger, get_files, Paths, PathIsh +from my.core.cfg import Attrs, make_config +from my.core.common import mcachew, group_by_key +from my.core.error import Res, split_errors -def get_logger(): - return logging.getLogger('my.pdfs') +import pdfannots # type: ignore[import] -def is_ignored(p: Path) -> bool: - """ - Used to ignore some extremely heavy files - is_ignored function taken either from config, - or if not defined, it's a function that returns False - """ - if hasattr(config, 'is_ignored'): - return config.is_ignored(p) +from my.config import pdfs as user_config - # Default - return lambda x: False +@dataclass +class pdfs(user_config): + paths: Paths = () # allowed to be empty for 'filelist' logic + + def is_ignored(self, p: Path) -> bool: + """ + Used to ignore some extremely heavy files + is_ignored function taken either from config, + or if not defined, it's a function that returns False + """ + user_ignore = getattr(user_config, 'is_ignored', None) + if user_ignore is not None: + return user_ignore(p) + + return False + + @staticmethod + def _migration(attrs: Attrs) -> Attrs: + roots = 'roots' + if roots in attrs: # legacy name + attrs['paths'] = attrs[roots] + from my.core.warnings import high + high(f'"{roots}" is deprecated! Use "paths" instead.') + return attrs -def candidates(filelist=None, roots=None) -> Iterator[Path]: - if filelist is not None: - return candidates_from_filelist(filelist) - else: - return candidates_from_roots(roots) +config = make_config(pdfs, migration=pdfs._migration) -def candidates_from_filelist(filelist) -> Iterator[Path]: - for f in filelist: - p = Path(f) - if not is_ignored(p): - yield p +logger = LazyLogger(__name__) -def candidates_from_roots(roots=None) -> Iterator[Path]: - if roots is None: - roots = config.roots +def inputs() -> Sequence[Path]: + # TODO ignoring could be handled on get_files/user config site as well?.. + all_files = get_files(config.paths, glob='**/*.pdf') + return [p for p in all_files if not config.is_ignored(p)] - for r in roots: - for p in Path(r).rglob('*.pdf'): - if not is_ignored(p): - yield p -# TODO canonical names +# TODO canonical names/fingerprinting? # TODO defensive if pdf was removed, also cachew key needs to be defensive - - class Annotation(NamedTuple): path: str author: Optional[str] page: int highlight: Optional[str] comment: Optional[str] - date: Optional[datetime] + date: Optional[datetime] # TODO tz aware/unaware? def as_annotation(*, raw_ann, path: str) -> Annotation: @@ -106,29 +108,40 @@ def as_annotation(*, raw_ann, path: str) -> Annotation: def get_annots(p: Path) -> List[Annotation]: + b = time.time() with p.open('rb') as fo: f = io.StringIO() with redirect_stderr(f): + # FIXME (annots, outlines) = pdfannots.process_file(fo, emit_progress=False) # outlines are kinda like TOC, I don't really need them + a = time.time() + took = a - b + tooks = f'took {took:0.1f} seconds' + if took > 5: + tooks = tooks.upper() + logger.debug('extracting %s %s: %d annotations', tooks, p, len(annots)) return [as_annotation(raw_ann=a, path=str(p)) for a in annots] # TODO stderr? -def hash_files(pdfs: List[Path]): +def _hash_files(pdfs: Sequence[Path]): # if mtime hasn't changed then the file hasn't changed either return [(pdf, pdf.stat().st_mtime) for pdf in pdfs] + # TODO might make more sense to be more fine grained here, e.g. cache annotations for indifidual files - -@mcachew(hashf=hash_files) -def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]: - logger = get_logger() - +@mcachew(depends_on=_hash_files) +def _iter_annotations(pdfs: Sequence[Path]) -> Iterator[Res[Annotation]]: logger.info('processing %d pdfs', len(pdfs)) - # TODO how to print to stdout synchronously? - with ProcessPoolExecutor() as pool: + # todo how to print to stdout synchronously? + # todo global config option not to use pools? useful for debugging.. + from concurrent.futures import ProcessPoolExecutor + from my.core.common import DummyExecutor + workers = None # use 0 for debugging + Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor + with Pool(workers) as pool: futures = [ pool.submit(get_annots, pdf) for pdf in pdfs @@ -139,75 +152,61 @@ def _iter_annotations(pdfs: List[Path]) -> Iterator[Res[Annotation]]: except Exception as e: logger.error('While processing %s:', pdf) logger.exception(e) + # todo add a comment that it can be ignored... or something like that # TODO not sure if should attach pdf as well; it's a bit annoying to pass around? # also really have to think about interaction with cachew... yield e -def iter_annotations(filelist=None, roots=None) -> Iterator[Res[Annotation]]: - pdfs = list(sorted(candidates(filelist=filelist, roots=None))) +def annotations() -> Iterator[Res[Annotation]]: + pdfs = inputs() yield from _iter_annotations(pdfs=pdfs) class Pdf(NamedTuple): path: Path - annotations: List[Annotation] + annotations: Sequence[Annotation] @property - def date(self): + def date(self) -> Optional[datetime]: + # TODO tz aware/unaware return self.annotations[-1].date -def annotated_pdfs(filelist=None, roots=None) -> Iterator[Res[Pdf]]: - it = iter_annotations(filelist=filelist, roots=roots) - vit, eit = split_errors(it, ET=Exception) +def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]: + if filelist is not None: + # hacky... keeping it backwards compatible + # https://github.com/karlicoss/HPI/pull/74 + config.paths = filelist + ait = annotations() + vit, eit = split_errors(ait, ET=Exception) for k, g in group_by_key(vit, key=lambda a: a.path).items(): yield Pdf(path=Path(k), annotations=g) yield from eit -def test(): - res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf')) - assert len(res) > 3 +from my.core import stat, Stats +def stats() -> Stats: + return { + **stat(annotations) , + **stat(annotated_pdfs), + } -def test2(): - res = get_annots(Path('/L/zzz_borg/downloads/nonlinear2.pdf')) - print(res) +### legacy/misc stuff - -def test_with_error(): - # TODO need example of pdf file... - import tempfile - with tempfile.TemporaryDirectory() as td: - root = Path(td) - g = root / 'garbage.pdf' - g.write_text('garbage') - roots = [ - root, - # '/usr/share/doc/texlive-doc/latex/amsrefs/', - ] - # TODO find some pdfs that actually has annotations... - annots = list(iter_annotations(roots=roots)) - assert len(annots) == 1 - assert isinstance(annots[0], Exception) - - -def main(): +# todo retire later if favor of hpi query? +def main() -> None: from pprint import pprint + collected = annotated_pdfs() + for r in collected: + if isinstance(r, Exception): + logger.exception(r) + else: + logger.info('collected annotations in: %s', r.path) + for a in r.annotations: + pprint(a) - logger = get_logger() - from .common import setup_logger - setup_logger(logger, level=logging.DEBUG) - - collected = list(annotated_pdfs()) - if len(collected) > 0: - for r in collected: - if isinstance(r, Exception): - logger.exception(r) - else: - logger.info('collected annotations in: %s', r.path) - for a in r.annotations: - pprint(a) - +iter_annotations = annotations # for backwards compatibility +### diff --git a/tests/pdfs.py b/tests/pdfs.py index b8261c5..0290ad0 100644 --- a/tests/pdfs.py +++ b/tests/pdfs.py @@ -1,11 +1,60 @@ -import inspect from pathlib import Path -import tempfile -from my.pdfs import get_annots, annotated_pdfs +from more_itertools import ilen + +import pytest from .common import testdata + +def test_module(with_config) -> None: + # TODO crap. if module is imported too early (on the top level, it makes it super hard to overrride config) + # need to at least detect it... + from my.pdfs import annotations, annotated_pdfs + + # todo check types etc as well + assert ilen(annotations()) >= 3 + assert ilen(annotated_pdfs()) >= 1 + + +def test_with_error(with_config, tmp_path: Path) -> None: + """should handle crappy files gracefully""" + root = tmp_path + g = root / 'garbage.pdf' + g.write_text('garbage') + from my.config import pdfs + del pdfs.roots # meh. otherwise legacy config value 'wins' + pdfs.paths = (root,) + + from my.pdfs import annotations + annots = list(annotations()) + [annot] = annots + assert isinstance(annot, Exception) + + +@pytest.fixture +def with_config(): + from .common import reset_modules + reset_modules() # todo ugh.. getting boilerplaty.. need to make it a bit more automatic.. + + # extra_data = Path(__file__).absolute().parent / 'extra/data/polar' + # assert extra_data.exists(), extra_data + # todo hmm, turned out no annotations in these ones.. whatever + + class user_config: + roots = [ + testdata(), + ] + + import my.core.cfg as C + with C.tmp_config() as config: + config.pdfs = user_config # type: ignore + try: + yield + finally: + reset_modules() + + EXPECTED_HIGHLIGHTS = { 'Since 1994, when we first began organizing web sites, we have enjoyed a rare opportunity to participate in the birth of a new discipline. ', 'And yet, unlearn we must, ', @@ -18,6 +67,8 @@ def test_get_annots() -> None: Test get_annots, with a real PDF file get_annots should return a list of three Annotation objects """ + from my.pdfs import get_annots + annotations = get_annots(testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf') assert len(annotations) == 3 assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS @@ -28,9 +79,12 @@ def test_annotated_pdfs_with_filelist() -> None: Test annotated_pdfs, with a real PDF file annotated_pdfs should return a list of one Pdf object, with three Annotations """ - filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf'] - annotations_generator = annotated_pdfs(filelist=filelist, roots=None) + from my.pdfs import annotated_pdfs + filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf'] + annotations_generator = annotated_pdfs(filelist=filelist) + + import inspect assert inspect.isgeneratorfunction(annotated_pdfs) highlights_from_pdfs = [] @@ -41,3 +95,8 @@ def test_annotated_pdfs_with_filelist() -> None: assert len(highlights_from_pdfs) == 3 assert set(highlights_from_pdfs) == EXPECTED_HIGHLIGHTS + + +# todo old test on my(karlicoss) computer: +# - mature-optimization_wtf.pdf: >3 annotations? +# - nonlinear2.pdf diff --git a/tox.ini b/tox.ini index 5b6f789..70565f0 100644 --- a/tox.ini +++ b/tox.ini @@ -37,11 +37,12 @@ commands = hpi module install my.coding.commits + hpi module install my.pdfs + python3 -m pytest tests \ # ignore some tests which might take a while to run on ci.. --ignore tests/takeout.py \ - --ignore tests/extra/polar.py \ - --ignore tests/pdfs/test_pdfs.py \ + --ignore tests/extra/polar.py {posargs} @@ -82,6 +83,7 @@ commands = hpi module install my.arbtt hpi module install my.coding.commits hpi module install my.goodreads + hpi module install my.pdfs # todo fuck. -p my.github isn't checking the subpackages?? wtf... # guess it wants .pyi file?? @@ -103,6 +105,7 @@ commands = -p my.arbtt \ -p my.coding.commits \ -p my.goodreads \ + -p my.pdfs \ --txt-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \ {posargs}