Merge pull request #50 from karlicoss/polar

polar module updates
2020-05-17 14:01:49 +01:00 · 2020-05-17 14:01:49 +01:00 · c07ea0a600
commit c07ea0a600
parent 647b6087dd 65138808e7
7 changed files with 229 additions and 61 deletions
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@ -33,6 +33,7 @@ modules = [
    ('twint'  , 'my.twitter.twint'       ),
    ('twitter', 'my.twitter.archive'     ),
    ('lastfm' , 'my.lastfm'              ),
    ('polar'  , 'my.reading.polar'       ),
 ]
 def indent(s, spaces=4):
@ -117,4 +118,15 @@ for cls, p in modules:
        """
        export_path: Paths
    #+end_src
 - [[file:../my/reading/polar.py][my.reading.polar]]
    [[https://github.com/burtonator/polar-books][Polar]] articles and highlights
    #+begin_src python
    class polar:
        '''
        Polar config is optional, you only need it if you want to specify custom 'polar_dir'
        '''
        polar_dir: Path = Path('~/.polar').expanduser()
    #+end_src
 :end:
--- a/my/core/init.py
+++ b/my/core/init.py
@ -1,4 +1,4 @@
 # this file only keeps the most common & critical types/utility functions
 from .common import PathIsh, Paths, Json
-from .common import get_files
+from .common import get_files, LazyLogger
 from .cfg import make_config
--- a/my/core/common.py
+++ b/my/core/common.py
@ -134,7 +134,8 @@ def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path,
                    warnings.warn(f"Treating {ss} as glob path. Explicit glob={glob} argument is ignored!")
                paths.extend(map(Path, do_glob(ss)))
            else:
-                assert src.is_file(), src
+                if not src.is_file():
                    raise RuntimeError(f"Expected '{src}' to exist")
                # todo assert matches glob??
                paths.append(src)
@ -245,3 +246,10 @@ def isoparse(s: str) -> tzdatetime:
    assert s.endswith('Z'), s
    s = s[:-1] + '+00:00'
    return fromisoformat(s)
 import re
 # https://stackoverflow.com/a/295466/706389
 def get_valid_filename(s: str) -> str:
    '''
    Turn an arbitrary value into a string that is safe to use as a file name.

    The value is stringified, surrounding whitespace is stripped, spaces become
    underscores, and every character that is not a word character, '-' or '.' is dropped.
    '''
    underscored = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', underscored)
--- a/my/core/init.py
+++ b/my/core/init.py
@ -30,6 +30,7 @@ def setup_config() -> None:
    import os
    import warnings
    from typing import Optional
    import appdirs # type: ignore[import]
    # not sure if that's necessary, i.e. could rely on PYTHONPATH instead
    # on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path?
@ -37,9 +38,7 @@ def setup_config() -> None:
    if mvar is not None:
        mycfg_dir = Path(mvar)
    else:
-        # TODO use appdir??
+        mycfg_dir = Path(appdirs.user_config_dir('my'))
        cfg_dir = Path('~/.config').expanduser()
        mycfg_dir = cfg_dir / 'my'
    if not mycfg_dir.exists():
        warnings.warn(f"my.config package isn't found! (expected at {mycfg_dir}). This is likely to result in issues.")
--- a/my/kython/konsume.py
+++ b/my/kython/konsume.py
@ -11,7 +11,7 @@ def zoom(w, *keys):
 # TODO need to support lists
 class Zoomable:
-    def __init__(self, parent, *args, **kwargs):
+    def __init__(self, parent, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs) # type: ignore
        self.parent = parent
@ -21,19 +21,19 @@ class Zoomable:
    def dependants(self):
        raise NotImplementedError
-    def ignore(self):
+    def ignore(self) -> None:
        self.consume_all()
-    def consume_all(self):
+    def consume_all(self) -> None:
        for d in self.dependants:
            d.consume_all()
        self.consume()
-    def consume(self):
+    def consume(self) -> None:
        assert self.parent is not None
        self.parent._remove(self)
-    def zoom(self):
+    def zoom(self) -> 'Zoomable':
        self.consume()
        return self
@ -56,6 +56,8 @@ class Wdict(Zoomable, OrderedDict):
    def this_consumed(self):
        return len(self) == 0
    # TODO specify mypy type for the index special method?
 class Wlist(Zoomable, list):
    def _remove(self, xx):
@ -83,7 +85,8 @@ class Wvalue(Zoomable):
    def __repr__(self):
        return 'WValue{' + repr(self.value) + '}'
-def _wrap(j, parent=None):
+from typing import Tuple
 def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]:
    res: Zoomable
    cc: List[Zoomable]
    if isinstance(j, dict):
@ -109,13 +112,14 @@ def _wrap(j, parent=None):
        raise RuntimeError(f'Unexpected type: {type(j)} {j}')
 from contextlib import contextmanager
 from typing import Iterator
 class UnconsumedError(Exception):
    '''
    Raised by wrap() when a wrapped child was not fully consumed by the parser
    (and throwing is enabled).
    '''
 # TODO think about error policy later...
@contextmanager
-def wrap(j, throw=True):
+def wrap(j, throw=True) -> Iterator[Zoomable]:
    w, children = _wrap(j)
    yield w
@ -123,33 +127,41 @@ def wrap(j, throw=True):
    for c in children:
        if not c.this_consumed(): # TODO hmm. how does it figure out if it's consumed???
            if throw:
-                raise UnconsumedError(str(c))
+                # TODO need to keep a full path or something...
                raise UnconsumedError(f'''
 Expected {c} to be fully consumed by the parser.
 '''.lstrip())
            else:
                # TODO log?
                pass
-
+from typing import cast
 def test_unconsumed():
    '''
    wrap() must raise UnconsumedError when parts of the wrapped JSON are left untouched.
    '''
    import pytest # type: ignore

    # nothing is consumed at all
    untouched = {'a': 1234}
    with pytest.raises(UnconsumedError), wrap(untouched) as w:
        w = cast(Wdict, w)

    # only the innermost value is zoomed into -- presumably 'c' itself stays unconsumed
    nested = {'c': {'d': 2222}}
    with pytest.raises(UnconsumedError), wrap(nested) as w:
        w = cast(Wdict, w)
        w['c']['d'].zoom()
 def test_consumed():
    '''
    wrap() must exit cleanly once every level of the wrapped JSON has been zoomed into.
    '''
    flat = {'a': 1234}
    with wrap(flat) as wrapped:
        root = cast(Wdict, wrapped)
        root['a'].zoom()

    nested = {'c': {'d': 2222}}
    with wrap(nested) as wrapped:
        root = cast(Wdict, wrapped)
        inner = root['c'].zoom()
        inner['d'].zoom()
 def test_types():
    # (string, number, object, array, boolean or null)
    with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w:
        w = cast(Wdict, w)
        w['string'].zoom()
        w['number'].consume()
        w['boolean'].zoom()
@ -159,5 +171,31 @@ def test_types():
 def test_consume_all():
    '''
    consume_all() on a subtree should be enough for wrap() to exit without raising.
    '''
    payload = {'aaa': {'bbb': {'hi': 123}}}
    with wrap(payload) as wrapped:
        root = cast(Wdict, wrapped)
        # zoom() returns the node itself, so the subtree can be addressed directly
        root['aaa'].zoom()['bbb'].consume_all()
 def test_consume_few():
    '''
    Currently skipped: zooming into one key and then calling consume_all() on the
    parent is the behaviour we would like to support eventually.
    '''
    import pytest
    pytest.skip('Will think about it later..')
    payload = {
        'important': 123,
        'unimportant': 'whatever',
    }
    with wrap(payload) as wrapped:
        root = cast(Wdict, wrapped)
        root['important'].zoom()
        root.consume_all()
        # TODO hmm, we want smth like this to work..
 def test_zoom() -> None:
    '''
    Looking up a missing key raises KeyError; zooming into the existing key still works afterwards.
    '''
    import pytest # type: ignore

    payload = {'aaa': 'whatever'}
    with wrap(payload) as wrapped:
        root = cast(Wdict, wrapped)
        with pytest.raises(KeyError):
            root['nosuchkey'].zoom()
        root['aaa'].zoom()
 # TODO type check this...
--- a/my/reading/polar.py
+++ b/my/reading/polar.py
@ -1,33 +1,56 @@
 """
 [[https://github.com/burtonator/polar-books][Polar]] articles and highlights
 """
 from pathlib import Path
 from typing import Type, Any, cast, TYPE_CHECKING
 import my.config
 # Resolve the base class for the 'polar' config defined below.
 # At runtime it is my.config.polar when the user defined one, otherwise None (handled further down).
 if not TYPE_CHECKING:
    user_config = getattr(my.config, 'polar', None)
 else:
    # mypy can't handle dynamic base classes... https://github.com/python/mypy/issues/2477
    user_config = object
 # by default, Polar doesn't need any config, so perhaps makes sense to make it defensive here
 if user_config is None:
    # no 'polar' attribute in my.config: substitute an empty placeholder class to inherit from
    class user_config: # type: ignore[no-redef]
        pass
 from ..core import PathIsh
 from dataclasses import dataclass
@dataclass
 class polar(user_config):
    '''
    Polar config is optional, you only need it if you want to specify custom 'polar_dir'
    '''
    # root of the Polar data directory; '*/state.json' files are looked up under it
    polar_dir: PathIsh = Path('~/.polar').expanduser()
    # consulted when parsing page metadata: wrap() is called with throw=not defensive
    defensive: bool = True # pass False if you want it to fail faster on errors (useful for debugging)
 from ..core import make_config
 config = make_config(polar)
 # todo not sure where it keeps stuff on Windows?
 # https://github.com/burtonator/polar-bookshelf/issues/296
 from datetime import datetime
-from typing import List, Dict, Iterator, NamedTuple, Sequence, Optional
+from typing import List, Dict, Iterable, NamedTuple, Sequence, Optional
 import json
 import pytz
-from ..common import LazyLogger, get_files
+from ..core import LazyLogger, Json
-
+from ..core.common import isoparse
-from ..error import Res, echain, unwrap, sort_res_by
+from ..error import Res, echain, sort_res_by
-from ..kython.konsume import wrap, zoom, ignore
+from ..kython.konsume import wrap, zoom, ignore, Zoomable, Wdict
 _POLAR_DIR = Path('~').expanduser() / '.polar'
 logger = LazyLogger(__name__)
 # TODO use core.isoparse
 def parse_dt(s: str) -> datetime:
    '''
    Parse a timestamp of the form '%Y-%m-%dT%H:%M:%S.%fZ' and attach the UTC timezone to it.
    '''
    naive = datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ')
    return pytz.utc.localize(naive)
 Uid = str
 # Ok I guess handling comment-level errors is a bit too much..
 Cid = str
 class Comment(NamedTuple):
@ -41,18 +64,26 @@ class Highlight(NamedTuple):
    created: datetime
    selection: str
    comments: Sequence[Comment]
    tags: Sequence[str]
    color: Optional[str] = None
 Uid = str
 class Book(NamedTuple):
    uid: Uid
    created: datetime
-    filename: str
+    uid: Uid
    path: Path
    title: Optional[str]
    # TODO hmmm. I think this needs to be defensive as well...
    # think about it later.
    items: Sequence[Highlight]
-Error = Exception # for backwards compat with Orger; can remove later
+    tags: Sequence[str]
    @property
    def filename(self) -> str:
        # TODO deprecate
        return str(self.path)
 Result = Res[Book]
@ -61,37 +92,45 @@ class Loader:
        self.path = p
        self.uid = self.path.parent.name
-    def error(self, cause, extra='') -> Exception:
+    def error(self, cause: Exception, extra: str ='') -> Exception:
        if len(extra) > 0:
            extra = '\n' + extra
        return echain(Exception(f'while processing {self.path}{extra}'), cause)
-    def load_item(self, meta) -> Iterator[Highlight]:
+    def load_item(self, meta: Zoomable) -> Iterable[Highlight]:
        meta = cast(Wdict, meta)
        # TODO this should be destructive zoom?
-        meta['notes'].zoom()
+        meta['notes'].zoom() # TODO ??? is it deliberate?
-        meta['pagemarks'].zoom()
+
        meta['pagemarks'].consume_all()
        if 'notes' in meta:
            # TODO something nicer?
            notes = meta['notes'].zoom()
        else:
            notes = [] # TODO FIXME dict?
-        comments = meta['comments'].zoom()
+        comments = list(meta['comments'].zoom().values()) if 'comments' in meta else []
        meta['questions'].zoom()
        meta['flashcards'].zoom()
        highlights = meta['textHighlights'].zoom()
-        meta['areaHighlights'].zoom()
+
        # TODO could be useful to at least add a meta about area highlights/screens
        meta['areaHighlights'].consume_all()
        meta['screenshots'].zoom()
        meta['thumbnails'].zoom()
        if 'readingProgress' in meta:
-            meta['readingProgress'].zoom()
+            meta['readingProgress'].consume_all()
-        # TODO want to ignore the whold subtree..
+        # TODO want to ignore the whole subtree..
        pi = meta['pageInfo'].zoom()
        pi['num'].zoom()
        if 'dimensions' in pi:
            pi['dimensions'].consume_all()
        # TODO how to make it nicer?
        cmap: Dict[Hid, List[Comment]] = {}
-        vals = list(comments.values())
+        vals = list(comments)
        for v in vals:
            cid = v['id'].zoom()
            v['guid'].zoom()
@ -106,7 +145,7 @@ class Loader:
            cmap[hlid] = ccs
            ccs.append(Comment(
                cid=cid.value,
-                created=parse_dt(crt.value),
+                created=isoparse(crt.value),
                text=html.value, # TODO perhaps coonvert from html to text or org?
            ))
            v.consume()
@ -123,20 +162,32 @@ class Loader:
            updated = h['lastUpdated'].zoom().value
            h['rects'].ignore()
            # TODO make it more generic..
            htags: List[str] = []
            if 'tags' in h:
                ht = h['tags'].zoom()
                for k, v in list(ht.items()):
                    ctag = v.zoom()
                    ctag['id'].consume()
                    ct = ctag['label'].zoom()
                    htags.append(ct.value)
            h['textSelections'].ignore()
            h['notes'].consume()
            h['questions'].consume()
            h['flashcards'].consume()
-            h['color'].consume()
+            color = h['color'].zoom().value
            h['images'].ignore()
            # TODO eh, quite excessive \ns...
            text = h['text'].zoom()['TEXT'].zoom().value
            yield Highlight(
                hid=hid,
-                created=parse_dt(crt),
+                created=isoparse(crt),
                selection=text,
                comments=tuple(comments),
                tags=tuple(htags),
                color=color,
            )
            h.consume()
@ -146,34 +197,41 @@ class Loader:
        # TODO sort by date?
-    def load_items(self, metas) -> Iterator[Highlight]:
+    def load_items(self, metas: Json) -> Iterable[Highlight]:
        for p, meta in metas.items():
-            with wrap(meta, throw=False) as meta:
+            with wrap(meta, throw=not config.defensive) as meta:
                yield from self.load_item(meta)
-    def load(self) -> Iterator[Result]:
+    def load(self) -> Iterable[Result]:
        logger.info('processing %s', self.path)
        j = json.loads(self.path.read_text())
        # TODO konsume here as well?
        di = j['docInfo']
        added = di['added']
-        filename = di['filename']
+        filename = di['filename'] # TODO here
        title = di.get('title', None)
-        tags = di['tags']
+        tags_dict = di['tags']
-        pm = j['pageMetas']
+        pm = j['pageMetas'] # TODO FIXME handle this too
        # todo defensive?
        tags = tuple(t['label'] for t in tags_dict.values())
        path = Path(config.polar_dir) / 'stash' / filename
        yield Book(
            created=isoparse(added),
            uid=self.uid,
-            created=parse_dt(added),
+            path=path,
            filename=filename,
            title=title,
            items=list(self.load_items(pm)),
            tags=tags,
        )
-def iter_entries() -> Iterator[Result]:
+def iter_entries() -> Iterable[Result]:
-    for d in get_files(_POLAR_DIR, glob='*/state.json'):
+    from ..core import get_files
    for d in get_files(config.polar_dir, glob='*/state.json'):
        loader = Loader(d)
        try:
            yield from loader.load()
@ -185,16 +243,18 @@ def iter_entries() -> Iterator[Result]:
 def get_entries() -> List[Result]:
    '''
    Materialize iter_entries() into a list, passed through sort_res_by keyed on 'created'.
    '''
    # sorting by first annotation is reasonable I guess???
    # todo perhaps worth making it a pattern? X() returns iterable, get_X returns reasonably sorted list?
    entries = iter_entries()
    ordered = sort_res_by(entries, key=lambda book: book.created)
    return list(ordered)
 def main():
-    for entry in iter_entries():
+    for e in iter_entries():
-        try:
+        if isinstance(e, Exception):
            ee = unwrap(entry)
        except Error as e:
            logger.exception(e)
        else:
-            logger.info('processed %s', ee.uid)
+            logger.info('processed %s', e.uid)
-            for i in ee.items:
+            for i in e.items:
                logger.info(i)
 Error = Exception # for backwards compat with Orger; can remove later
--- a/tests/extra/polar.py
+++ b/tests/extra/polar.py
@ -0,0 +1,51 @@
 from pathlib import Path
 import sys
 from importlib import reload
 from my.core.common import get_valid_filename
 ROOT = Path(__file__).parent.absolute()
 OUTPUTS = ROOT / 'outputs'
 import pytest # type: ignore
 def test_hpi(prepare: str) -> None:
    '''
    Smoke test: the polar module yields more than one entry for the prepared config.
    '''
    from my.reading.polar import get_entries

    entries = list(get_entries())
    assert len(entries) > 1
 def test_orger(prepare: str, tmp_path: Path) -> None:
    '''
    Load the orger Polar module from file and run its PolarView into an .org file under OUTPUTS.
    '''
    from my.core.common import import_from, import_file

    module_path = ROOT / 'orger/modules/polar.py'
    om = import_file(module_path)
    # reload(om)

    view = om.PolarView() # type: ignore
    # TODO hmm. worth making public?
    OUTPUTS.mkdir(exist_ok=True)
    # output file is named after the fixture parameter, sanitized into a valid file name
    target = OUTPUTS / (get_valid_filename(prepare) + '.org')
    view._run(to=target)
 # Parameters for the 'prepare' fixture: Polar data directories relative to ROOT.
 # The empty string means "do not override polar_dir", i.e. use the module's default.
 PARAMS = [
    # 'data/polar/BojanKV_polar/.polar',
    '',
    # 'data/polar/TheCedarPrince_KnowledgeRepository',
    # 'data/polar/coelias_polardocs',
    # 'data/polar/warkdarrior_polar-document-repository'
 ]
@pytest.fixture(params=PARAMS)
 def prepare(request):
    '''
    Install a 'polar' section into my.config for the current parameter and reload
    my.reading.polar so that it picks the new config up.

    Yields the raw parameter (the data directory relative to ROOT, or '' for the default).
    '''
    dotpolar = request.param
    class user_config:
        if dotpolar != '': # '' means default: leave polar_dir unset
            polar_dir = Path(ROOT / dotpolar)
        # fail on errors instead of tolerating them
        defensive = False
    import my.config
    setattr(my.config, 'polar', user_config)
    import my.reading.polar as polar
    # the module reads my.config.polar at import time, so it has to be re-executed
    reload(polar)
    # TODO hmm... ok, need to document reload()
    yield dotpolar