Merge pull request #50 from karlicoss/polar

polar module updates
This commit is contained in:
karlicoss 2020-05-17 14:01:49 +01:00 committed by GitHub
commit c07ea0a600
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 229 additions and 61 deletions

View file

@ -33,6 +33,7 @@ modules = [
('twint' , 'my.twitter.twint' ), ('twint' , 'my.twitter.twint' ),
('twitter', 'my.twitter.archive' ), ('twitter', 'my.twitter.archive' ),
('lastfm' , 'my.lastfm' ), ('lastfm' , 'my.lastfm' ),
('polar' , 'my.reading.polar' ),
] ]
def indent(s, spaces=4): def indent(s, spaces=4):
@ -117,4 +118,15 @@ for cls, p in modules:
""" """
export_path: Paths export_path: Paths
#+end_src #+end_src
- [[file:../my/reading/polar.py][my.reading.polar]]
[[https://github.com/burtonator/polar-books][Polar]] articles and highlights
#+begin_src python
class polar:
'''
Polar config is optional, you only need it if you want to specify custom 'polar_dir'
'''
polar_dir: Path = Path('~/.polar').expanduser()
#+end_src
:end: :end:

View file

@ -1,4 +1,4 @@
# this file only keeps the most common & critical types/utility functions # this file only keeps the most common & critical types/utility functions
from .common import PathIsh, Paths, Json from .common import PathIsh, Paths, Json
from .common import get_files from .common import get_files, LazyLogger
from .cfg import make_config from .cfg import make_config

View file

@ -134,7 +134,8 @@ def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path,
warnings.warn(f"Treating {ss} as glob path. Explicit glob={glob} argument is ignored!") warnings.warn(f"Treating {ss} as glob path. Explicit glob={glob} argument is ignored!")
paths.extend(map(Path, do_glob(ss))) paths.extend(map(Path, do_glob(ss)))
else: else:
assert src.is_file(), src if not src.is_file():
raise RuntimeError(f"Expected '{src}' to exist")
# todo assert matches glob?? # todo assert matches glob??
paths.append(src) paths.append(src)
@ -245,3 +246,10 @@ def isoparse(s: str) -> tzdatetime:
assert s.endswith('Z'), s assert s.endswith('Z'), s
s = s[:-1] + '+00:00' s = s[:-1] + '+00:00'
return fromisoformat(s) return fromisoformat(s)
import re
# https://stackoverflow.com/a/295466/706389
def get_valid_filename(s: str) -> str:
    """Return a version of ``s`` that is safe to use as a file name.

    The value is converted with ``str()``, surrounding whitespace is
    stripped and remaining spaces become underscores. Every character that
    is not a word character, a dash or a dot is then dropped.

    The result may be empty (e.g. for an empty or all-punctuation input);
    callers that need a non-empty name must check for that themselves.
    """
    s = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)

View file

@ -30,6 +30,7 @@ def setup_config() -> None:
import os import os
import warnings import warnings
from typing import Optional from typing import Optional
import appdirs # type: ignore[import]
# not sure if that's necessary, i.e. could rely on PYTHONPATH instead # not sure if that's necessary, i.e. could rely on PYTHONPATH instead
# on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path? # on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path?
@ -37,9 +38,7 @@ def setup_config() -> None:
if mvar is not None: if mvar is not None:
mycfg_dir = Path(mvar) mycfg_dir = Path(mvar)
else: else:
# TODO use appdir?? mycfg_dir = Path(appdirs.user_config_dir('my'))
cfg_dir = Path('~/.config').expanduser()
mycfg_dir = cfg_dir / 'my'
if not mycfg_dir.exists(): if not mycfg_dir.exists():
warnings.warn(f"my.config package isn't found! (expected at {mycfg_dir}). This is likely to result in issues.") warnings.warn(f"my.config package isn't found! (expected at {mycfg_dir}). This is likely to result in issues.")

View file

@ -11,7 +11,7 @@ def zoom(w, *keys):
# TODO need to support lists # TODO need to support lists
class Zoomable: class Zoomable:
def __init__(self, parent, *args, **kwargs): def __init__(self, parent, *args, **kwargs) -> None:
super().__init__(*args, **kwargs) # type: ignore super().__init__(*args, **kwargs) # type: ignore
self.parent = parent self.parent = parent
@ -21,19 +21,19 @@ class Zoomable:
def dependants(self): def dependants(self):
raise NotImplementedError raise NotImplementedError
def ignore(self): def ignore(self) -> None:
self.consume_all() self.consume_all()
def consume_all(self): def consume_all(self) -> None:
for d in self.dependants: for d in self.dependants:
d.consume_all() d.consume_all()
self.consume() self.consume()
def consume(self): def consume(self) -> None:
assert self.parent is not None assert self.parent is not None
self.parent._remove(self) self.parent._remove(self)
def zoom(self): def zoom(self) -> 'Zoomable':
self.consume() self.consume()
return self return self
@ -56,6 +56,8 @@ class Wdict(Zoomable, OrderedDict):
def this_consumed(self): def this_consumed(self):
return len(self) == 0 return len(self) == 0
# TODO specify mypy type for the index special method?
class Wlist(Zoomable, list): class Wlist(Zoomable, list):
def _remove(self, xx): def _remove(self, xx):
@ -83,7 +85,8 @@ class Wvalue(Zoomable):
def __repr__(self): def __repr__(self):
return 'WValue{' + repr(self.value) + '}' return 'WValue{' + repr(self.value) + '}'
def _wrap(j, parent=None): from typing import Tuple
def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]:
res: Zoomable res: Zoomable
cc: List[Zoomable] cc: List[Zoomable]
if isinstance(j, dict): if isinstance(j, dict):
@ -109,13 +112,14 @@ def _wrap(j, parent=None):
raise RuntimeError(f'Unexpected type: {type(j)} {j}') raise RuntimeError(f'Unexpected type: {type(j)} {j}')
from contextlib import contextmanager from contextlib import contextmanager
from typing import Iterator
class UnconsumedError(Exception): class UnconsumedError(Exception):
pass pass
# TODO think about error policy later... # TODO think about error policy later...
@contextmanager @contextmanager
def wrap(j, throw=True): def wrap(j, throw=True) -> Iterator[Zoomable]:
w, children = _wrap(j) w, children = _wrap(j)
yield w yield w
@ -123,33 +127,41 @@ def wrap(j, throw=True):
for c in children: for c in children:
if not c.this_consumed(): # TODO hmm. how does it figure out if it's consumed??? if not c.this_consumed(): # TODO hmm. how does it figure out if it's consumed???
if throw: if throw:
raise UnconsumedError(str(c)) # TODO need to keep a full path or something...
raise UnconsumedError(f'''
Expected {c} to be fully consumed by the parser.
'''.lstrip())
else: else:
# TODO log? # TODO log?
pass pass
from typing import cast
def test_unconsumed(): def test_unconsumed():
import pytest # type: ignore import pytest # type: ignore
with pytest.raises(UnconsumedError): with pytest.raises(UnconsumedError):
with wrap({'a': 1234}) as w: with wrap({'a': 1234}) as w:
w = cast(Wdict, w)
pass pass
with pytest.raises(UnconsumedError): with pytest.raises(UnconsumedError):
with wrap({'c': {'d': 2222}}) as w: with wrap({'c': {'d': 2222}}) as w:
w = cast(Wdict, w)
d = w['c']['d'].zoom() d = w['c']['d'].zoom()
def test_consumed(): def test_consumed():
with wrap({'a': 1234}) as w: with wrap({'a': 1234}) as w:
w = cast(Wdict, w)
a = w['a'].zoom() a = w['a'].zoom()
with wrap({'c': {'d': 2222}}) as w: with wrap({'c': {'d': 2222}}) as w:
w = cast(Wdict, w)
c = w['c'].zoom() c = w['c'].zoom()
d = c['d'].zoom() d = c['d'].zoom()
def test_types(): def test_types():
# (string, number, object, array, boolean or nul # (string, number, object, array, boolean or nul
with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w: with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w:
w = cast(Wdict, w)
w['string'].zoom() w['string'].zoom()
w['number'].consume() w['number'].consume()
w['boolean'].zoom() w['boolean'].zoom()
@ -159,5 +171,31 @@ def test_types():
def test_consume_all(): def test_consume_all():
with wrap({'aaa': {'bbb': {'hi': 123}}}) as w: with wrap({'aaa': {'bbb': {'hi': 123}}}) as w:
w = cast(Wdict, w)
aaa = w['aaa'].zoom() aaa = w['aaa'].zoom()
aaa['bbb'].consume_all() aaa['bbb'].consume_all()
def test_consume_few():
import pytest
pytest.skip('Will think about it later..')
with wrap({
'important': 123,
'unimportant': 'whatever'
}) as w:
w = cast(Wdict, w)
w['important'].zoom()
w.consume_all()
# TODO hmm, we want smth like this to work..
def test_zoom() -> None:
import pytest # type: ignore
with wrap({'aaa': 'whatever'}) as w:
w = cast(Wdict, w)
with pytest.raises(KeyError):
w['nosuchkey'].zoom()
w['aaa'].zoom()
# TODO type check this...

View file

@ -1,33 +1,56 @@
""" """
[[https://github.com/burtonator/polar-books][Polar]] articles and highlights [[https://github.com/burtonator/polar-books][Polar]] articles and highlights
""" """
from pathlib import Path from pathlib import Path
from typing import Type, Any, cast, TYPE_CHECKING
import my.config
if not TYPE_CHECKING:
user_config = getattr(my.config, 'polar', None)
else:
# mypy can't handle dynamic base classes... https://github.com/python/mypy/issues/2477
user_config = object
# by default, Polar doesn't need any config, so perhaps makes sense to make it defensive here
if user_config is None:
class user_config: # type: ignore[no-redef]
pass
from ..core import PathIsh
from dataclasses import dataclass
@dataclass
class polar(user_config):
'''
Polar config is optional, you only need it if you want to specify custom 'polar_dir'
'''
polar_dir: PathIsh = Path('~/.polar').expanduser()
defensive: bool = True # pass False if you want it to fail faster on errors (useful for debugging)
from ..core import make_config
config = make_config(polar)
# todo not sure where it keeps stuff on Windows?
# https://github.com/burtonator/polar-bookshelf/issues/296
from datetime import datetime from datetime import datetime
from typing import List, Dict, Iterator, NamedTuple, Sequence, Optional from typing import List, Dict, Iterable, NamedTuple, Sequence, Optional
import json import json
import pytz import pytz
from ..common import LazyLogger, get_files from ..core import LazyLogger, Json
from ..core.common import isoparse
from ..error import Res, echain, unwrap, sort_res_by from ..error import Res, echain, sort_res_by
from ..kython.konsume import wrap, zoom, ignore from ..kython.konsume import wrap, zoom, ignore, Zoomable, Wdict
_POLAR_DIR = Path('~').expanduser() / '.polar'
logger = LazyLogger(__name__) logger = LazyLogger(__name__)
# TODO use core.isoparse
def parse_dt(s: str) -> datetime:
return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ'))
Uid = str
# Ok I guess handling comment-level errors is a bit too much.. # Ok I guess handling comment-level errors is a bit too much..
Cid = str Cid = str
class Comment(NamedTuple): class Comment(NamedTuple):
@ -41,18 +64,26 @@ class Highlight(NamedTuple):
created: datetime created: datetime
selection: str selection: str
comments: Sequence[Comment] comments: Sequence[Comment]
tags: Sequence[str]
color: Optional[str] = None
Uid = str
class Book(NamedTuple): class Book(NamedTuple):
uid: Uid
created: datetime created: datetime
filename: str uid: Uid
path: Path
title: Optional[str] title: Optional[str]
# TODO hmmm. I think this needs to be defensive as well... # TODO hmmm. I think this needs to be defensive as well...
# think about it later. # think about it later.
items: Sequence[Highlight] items: Sequence[Highlight]
Error = Exception # for backwards compat with Orger; can remove later tags: Sequence[str]
@property
def filename(self) -> str:
# TODO deprecate
return str(self.path)
Result = Res[Book] Result = Res[Book]
@ -61,37 +92,45 @@ class Loader:
self.path = p self.path = p
self.uid = self.path.parent.name self.uid = self.path.parent.name
def error(self, cause, extra='') -> Exception: def error(self, cause: Exception, extra: str ='') -> Exception:
if len(extra) > 0: if len(extra) > 0:
extra = '\n' + extra extra = '\n' + extra
return echain(Exception(f'while processing {self.path}{extra}'), cause) return echain(Exception(f'while processing {self.path}{extra}'), cause)
def load_item(self, meta) -> Iterator[Highlight]: def load_item(self, meta: Zoomable) -> Iterable[Highlight]:
meta = cast(Wdict, meta)
# TODO this should be destructive zoom? # TODO this should be destructive zoom?
meta['notes'].zoom() meta['notes'].zoom() # TODO ??? is it deliberate?
meta['pagemarks'].zoom()
meta['pagemarks'].consume_all()
if 'notes' in meta: if 'notes' in meta:
# TODO something nicer? # TODO something nicer?
notes = meta['notes'].zoom() notes = meta['notes'].zoom()
else: else:
notes = [] # TODO FIXME dict? notes = [] # TODO FIXME dict?
comments = meta['comments'].zoom() comments = list(meta['comments'].zoom().values()) if 'comments' in meta else []
meta['questions'].zoom() meta['questions'].zoom()
meta['flashcards'].zoom() meta['flashcards'].zoom()
highlights = meta['textHighlights'].zoom() highlights = meta['textHighlights'].zoom()
meta['areaHighlights'].zoom()
# TODO could be useful to at least add a meta bout area highlights/screens
meta['areaHighlights'].consume_all()
meta['screenshots'].zoom() meta['screenshots'].zoom()
meta['thumbnails'].zoom() meta['thumbnails'].zoom()
if 'readingProgress' in meta: if 'readingProgress' in meta:
meta['readingProgress'].zoom() meta['readingProgress'].consume_all()
# TODO want to ignore the whold subtree.. # TODO want to ignore the whole subtree..
pi = meta['pageInfo'].zoom() pi = meta['pageInfo'].zoom()
pi['num'].zoom() pi['num'].zoom()
if 'dimensions' in pi:
pi['dimensions'].consume_all()
# TODO how to make it nicer? # TODO how to make it nicer?
cmap: Dict[Hid, List[Comment]] = {} cmap: Dict[Hid, List[Comment]] = {}
vals = list(comments.values()) vals = list(comments)
for v in vals: for v in vals:
cid = v['id'].zoom() cid = v['id'].zoom()
v['guid'].zoom() v['guid'].zoom()
@ -106,7 +145,7 @@ class Loader:
cmap[hlid] = ccs cmap[hlid] = ccs
ccs.append(Comment( ccs.append(Comment(
cid=cid.value, cid=cid.value,
created=parse_dt(crt.value), created=isoparse(crt.value),
text=html.value, # TODO perhaps coonvert from html to text or org? text=html.value, # TODO perhaps coonvert from html to text or org?
)) ))
v.consume() v.consume()
@ -123,20 +162,32 @@ class Loader:
updated = h['lastUpdated'].zoom().value updated = h['lastUpdated'].zoom().value
h['rects'].ignore() h['rects'].ignore()
# TODO make it more generic..
htags: List[str] = []
if 'tags' in h:
ht = h['tags'].zoom()
for k, v in list(ht.items()):
ctag = v.zoom()
ctag['id'].consume()
ct = ctag['label'].zoom()
htags.append(ct.value)
h['textSelections'].ignore() h['textSelections'].ignore()
h['notes'].consume() h['notes'].consume()
h['questions'].consume() h['questions'].consume()
h['flashcards'].consume() h['flashcards'].consume()
h['color'].consume() color = h['color'].zoom().value
h['images'].ignore() h['images'].ignore()
# TODO eh, quite excessive \ns... # TODO eh, quite excessive \ns...
text = h['text'].zoom()['TEXT'].zoom().value text = h['text'].zoom()['TEXT'].zoom().value
yield Highlight( yield Highlight(
hid=hid, hid=hid,
created=parse_dt(crt), created=isoparse(crt),
selection=text, selection=text,
comments=tuple(comments), comments=tuple(comments),
tags=tuple(htags),
color=color,
) )
h.consume() h.consume()
@ -146,34 +197,41 @@ class Loader:
# TODO sort by date? # TODO sort by date?
def load_items(self, metas) -> Iterator[Highlight]: def load_items(self, metas: Json) -> Iterable[Highlight]:
for p, meta in metas.items(): for p, meta in metas.items():
with wrap(meta, throw=False) as meta: with wrap(meta, throw=not config.defensive) as meta:
yield from self.load_item(meta) yield from self.load_item(meta)
def load(self) -> Iterator[Result]: def load(self) -> Iterable[Result]:
logger.info('processing %s', self.path) logger.info('processing %s', self.path)
j = json.loads(self.path.read_text()) j = json.loads(self.path.read_text())
# TODO konsume here as well? # TODO konsume here as well?
di = j['docInfo'] di = j['docInfo']
added = di['added'] added = di['added']
filename = di['filename'] filename = di['filename'] # TODO here
title = di.get('title', None) title = di.get('title', None)
tags = di['tags'] tags_dict = di['tags']
pm = j['pageMetas'] pm = j['pageMetas'] # TODO FIXME handle this too
# todo defensive?
tags = tuple(t['label'] for t in tags_dict.values())
path = Path(config.polar_dir) / 'stash' / filename
yield Book( yield Book(
created=isoparse(added),
uid=self.uid, uid=self.uid,
created=parse_dt(added), path=path,
filename=filename,
title=title, title=title,
items=list(self.load_items(pm)), items=list(self.load_items(pm)),
tags=tags,
) )
def iter_entries() -> Iterator[Result]: def iter_entries() -> Iterable[Result]:
for d in get_files(_POLAR_DIR, glob='*/state.json'): from ..core import get_files
for d in get_files(config.polar_dir, glob='*/state.json'):
loader = Loader(d) loader = Loader(d)
try: try:
yield from loader.load() yield from loader.load()
@ -185,16 +243,18 @@ def iter_entries() -> Iterator[Result]:
def get_entries() -> List[Result]: def get_entries() -> List[Result]:
# sorting by first annotation is reasonable I guess??? # sorting by first annotation is reasonable I guess???
# todo perhaps worth making it a pattern? X() returns iterable, get_X returns reasonably sorted list?
return list(sort_res_by(iter_entries(), key=lambda e: e.created)) return list(sort_res_by(iter_entries(), key=lambda e: e.created))
def main(): def main():
for entry in iter_entries(): for e in iter_entries():
try: if isinstance(e, Exception):
ee = unwrap(entry)
except Error as e:
logger.exception(e) logger.exception(e)
else: else:
logger.info('processed %s', ee.uid) logger.info('processed %s', e.uid)
for i in ee.items: for i in e.items:
logger.info(i) logger.info(i)
Error = Exception # for backwards compat with Orger; can remove later

51
tests/extra/polar.py Normal file
View file

@ -0,0 +1,51 @@
from pathlib import Path
import sys
from importlib import reload
from my.core.common import get_valid_filename
ROOT = Path(__file__).parent.absolute()
OUTPUTS = ROOT / 'outputs'
import pytest # type: ignore
def test_hpi(prepare: str) -> None:
    """Check that the polar module yields more than one entry for the prepared config."""
    # imported lazily: the 'prepare' fixture has to install the config and
    # reload the module before it is used
    from my.reading.polar import get_entries
    assert len(list(get_entries())) > 1
def test_orger(prepare: str, tmp_path: Path) -> None:
    """Run the Orger 'polar' view against the prepared config.

    The rendered output is written to OUTPUTS/<sanitised param>.org, where
    the name is derived from the 'prepare' fixture value.
    """
    from my.core.common import import_from, import_file
    om = import_file(ROOT / 'orger/modules/polar.py')
    # reload(om)

    pv = om.PolarView() # type: ignore
    # TODO hmm. worth making public?
    OUTPUTS.mkdir(exist_ok=True)
    out = OUTPUTS / (get_valid_filename(prepare) + '.org')
    pv._run(to=out)
# Polar data directories (relative to ROOT) that the 'prepare' fixture is parametrized with.
# The empty string means "do not set polar_dir", i.e. the polar module's own default is used.
PARAMS = [
# 'data/polar/BojanKV_polar/.polar',
'',
# 'data/polar/TheCedarPrince_KnowledgeRepository',
# 'data/polar/coelias_polardocs',
# 'data/polar/warkdarrior_polar-document-repository'
]
@pytest.fixture(params=PARAMS)
def prepare(request):
    """Install a test 'polar' config and reload the polar module.

    For each entry of PARAMS a throwaway config class is attached to
    ``my.config`` as ``polar`` and ``my.reading.polar`` is reloaded, since
    that module reads ``my.config.polar`` at import time.

    Yields the raw parameter (the dataset path, or '' for the default).
    """
    dotpolar = request.param

    class user_config:
        if dotpolar != '':  # default: leave polar_dir unset so the module's default applies
            polar_dir = Path(ROOT / dotpolar)
        # NOTE(review): assumed to apply to every param (fail fast in tests) — confirm nesting
        defensive = False

    import my.config
    setattr(my.config, 'polar', user_config)

    import my.reading.polar as polar
    reload(polar)
    # TODO hmm... ok, need to document reload()
    yield dotpolar