import pdfs provider

This commit is contained in:
Dima Gerasimov 2019-10-01 22:28:55 +01:00
parent d407e84d74
commit 06940b8323
2 changed files with 40 additions and 32 deletions

View file

@@ -3,4 +3,7 @@
import my_configuration import my_configuration
# TODO maybe just import everything?
# TODO how to make it mypy friendly? maybe defensive import? or mypy config? or interface file?
paths = my_configuration.paths # type: ignore paths = my_configuration.paths # type: ignore

View file

@@ -1,35 +1,41 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from . import paths
from .common import import_file
from pathlib import Path from pathlib import Path
# path to pdfannots (https://github.com/0xabu/pdfannots)
pdfannots = import_file(paths.pdfs.pdfannots_py)
from datetime import datetime from datetime import datetime
import re import re
from multiprocessing.pool import Pool
from subprocess import CompletedProcess from subprocess import CompletedProcess
import sys import sys
import io import io
from typing import NamedTuple, List, Optional from typing import NamedTuple, List, Optional
from contextlib import redirect_stderr from contextlib import redirect_stderr
import logging import logging
from pprint import pprint
import itertools
from kython import import_file
from kython.klogging import setup_logzero
from ..ext.pdfannots import pdfannots # type: ignore
from .private import ROOT_PATHS, is_ignored
def get_logger(): def get_logger():
return logging.getLogger('annotation-crawler') return logging.getLogger('my.pdfs')
def get_candidates() -> List[Path]: def get_candidates(roots=None) -> List[Path]:
pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in ROOT_PATHS) if roots is None:
roots = paths.pdfs.roots
import itertools
pdfs = itertools.chain.from_iterable(Path(p).glob('**/*.pdf') for p in roots)
return list(sorted(pdfs)) return list(sorted(pdfs))
def is_ignored(p):
return paths.pdfs.is_ignored(p)
# TODO cachew? # TODO cachew?
class Annotation(NamedTuple): class Annotation(NamedTuple):
author: Optional[str] author: Optional[str]
@@ -106,22 +112,10 @@ def get_annots(p: Path) -> Pdf:
raise PdfAnnotsException(p) from e raise PdfAnnotsException(p) from e
def test(): def get_annotated_pdfs(roots=None) -> List[Pdf]:
res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
assert len(res.annotations) > 0
def test2():
res = get_annots(Path('/L/zzz_borg/downloads/nonlinear2.pdf'))
print(res)
def get_annotated_pdfs(pdfs=None) -> List[Pdf]:
logger = get_logger() logger = get_logger()
setup_logzero(logger, level=logging.DEBUG)
if pdfs is None: pdfs = get_candidates(roots=roots)
pdfs = get_candidates()
logger.info('processing %d pdfs', len(pdfs)) logger.info('processing %d pdfs', len(pdfs))
collected = [] collected = []
@@ -149,6 +143,7 @@ def get_annotated_pdfs(pdfs=None) -> List[Pdf]:
logger.exception(err) logger.exception(err)
errors.append(str(err)) errors.append(str(err))
from multiprocessing.pool import Pool
with Pool() as p: with Pool() as p:
handles = [p.apply_async( handles = [p.apply_async(
get_annots, get_annots,
@@ -167,8 +162,22 @@ def get_annotated_pdfs(pdfs=None) -> List[Pdf]:
return collected return collected
def test():
res = get_annots(Path('/L/zzz_syncthing/TODO/TOREAD/done/mature-optimization_wtf.pdf'))
assert len(res.annotations) > 0
def test2():
res = get_annots(Path('/L/zzz_borg/downloads/nonlinear2.pdf'))
print(res)
def main(): def main():
from pprint import pprint
logger = get_logger() logger = get_logger()
from kython.klogging import setup_logzero
setup_logzero(logger, level=logging.DEBUG)
collected = get_annotated_pdfs() collected = get_annotated_pdfs()
if len(collected) > 0: if len(collected) > 0:
@@ -176,7 +185,3 @@ def main():
logger.warning('collected annotations in: %s', r.path) logger.warning('collected annotations in: %s', r.path)
for a in r.annotations: for a in r.annotations:
pprint(a) pprint(a)
if __name__ == '__main__':
main()