From d24f7e18f268241ec760362559db4c848e239dd1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 3 May 2019 22:53:59 +0100 Subject: [PATCH 1/7] initial work on polar --- reading/polar.py | 172 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100755 reading/polar.py diff --git a/reading/polar.py b/reading/polar.py new file mode 100755 index 0000000..4c381e2 --- /dev/null +++ b/reading/polar.py @@ -0,0 +1,172 @@ +#!/usr/bin/python3 +from pathlib import Path +import logging +from typing import List, Dict, Iterator, NamedTuple, Sequence, Optional +import json + +from kython.kerror import ResT, echain, unwrap, sort_res_by +from kython.klogging import setup_logzero + + +BDIR = Path('/L/zzz_syncthing/data/.polar') + + +def get_logger(): + return logging.getLogger('polar-provider') + + +def _get_datas() -> List[Path]: + return list(sorted(BDIR.glob('*/state.json'))) + + +Uid = str + +class Error(Exception): + def __init__(self, p: Path, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) # type: ignore + self.uid: Uid = p.parent.name + +ResultItem = ResT['Item', Error] +class Item(NamedTuple): + uid: Uid + +ResultBook = ResT['Book', Error] + +class Book(NamedTuple): + uid: Uid + filename: str + title: Optional[str] + items: Sequence[ResultItem] + +from kython.konsume import zoom, akeq + +class Loader: + def __init__(self, p: Path) -> None: + self.path = p + self.uid = self.path.parent.name + self.err = Error(p) + self.logger = get_logger() + + def error(self, cause, extra): + return echain(Error(self.path, extra), cause) + + def load_item(self, meta) -> Iterator[ResultItem]: + # TODO this should be destructive zoom? + try: + meta['notes'].zoom() + meta['pagemarks'].zoom() + if 'notes' in meta: + # TODO something nicer? 
+ meta['notes'].zoom() + meta['comments'].zoom() + meta['questions'].zoom() + meta['flashcards'].zoom() + meta['textHighlights'].zoom() + meta['areaHighlights'].zoom() + meta['screenshots'].zoom() + meta['thumbnails'].zoom() + meta['readingProgress'].zoom() + + # TODO want to ignore the whold subtree.. + pi = meta['pageInfo'].zoom() + pi['num'].zoom() + except Exception as exx: + err = self.error(exx, meta) + self.logger.exception(err) + yield err + from pprint import pprint + # pprint(notes) + # try: + # pm, notes, comm, que, flash, text, area, screens, thumb, rp, pi = zoom( + # meta, + # ) + # except Exception as exx: + # yield echain(self.err, exx) + # return + + # def aempty(x): + # akeq(x) + # try: + # aempty(pm) + # aempty(que) + # aempty(flash) + # aempty(text) + # aempty(area) # TODO these should be yieldy? + # aempty(screens) + # aempty(rp) + # akeq(pi, 'num') + # except Exception as ex: + # # TODO make it a method? + # yield echain(self.err, ex) + + + # aempty(notes) + # yield Item(self.uid) + + + def load_items(self, metas) -> Iterator[ResultItem]: + from kython.konsume import wrap + for p, meta in metas.items(): + with wrap(meta) as meta: + yield from self.load_item(meta) + + def load(self) -> Iterator[ResultBook]: + self.logger.info('processing %s', self.path) + j = json.loads(self.path.read_text()) + + try: + di = j['docInfo'] + filename = di['filename'] + title = di.get('title', None) + tags = di['tags'] + pm = j['pageMetas'] + except Exception as ex: + self.logger.exception(ex) + yield echain(self.err, ex) + return + + # TODO should I group by book??? 
+ yield Book( + uid=self.uid, + filename=filename, + title=title, + items=list(self.load_items(pm)), + ) + # "textHighlights": {}, + # "comments": {}, + # TODO + # "pagemarks": {}, + # "notes": {}, + # "questions": {}, + # "flashcards": {}, + # "areaHighlights": {}, + # "screenshots": {}, + # "thumbnails": {}, + # "readingProgress": {}, + # "pageInfo": { + # "num": 1 + # } + + +def iter_entries() -> Iterator[ResultBook]: + for d in _get_datas(): + yield from Loader(d).load() + + +def main(): + logger = get_logger() + setup_logzero(logger, level=logging.DEBUG) + + for entry in iter_entries(): + logger.info('processed %s', entry.uid) + for i in entry.items: + try: + ii = unwrap(i) + except Error as e: + logger.exception(e) + else: + logger.info(ii) + + +if __name__ == '__main__': + main() From 592fec86bed13e2bb60d90a9ecae2a4af1185d8e Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 3 May 2019 23:21:24 +0100 Subject: [PATCH 2/7] nicer processing --- reading/polar.py | 57 +++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/reading/polar.py b/reading/polar.py index 4c381e2..832e1b5 100755 --- a/reading/polar.py +++ b/reading/polar.py @@ -57,15 +57,18 @@ class Loader: meta['pagemarks'].zoom() if 'notes' in meta: # TODO something nicer? - meta['notes'].zoom() - meta['comments'].zoom() + notes = meta['notes'].zoom() + else: + notes = [] # TODO FIXME dict? + comments = meta['comments'].zoom() meta['questions'].zoom() meta['flashcards'].zoom() - meta['textHighlights'].zoom() + highlights = meta['textHighlights'].zoom() meta['areaHighlights'].zoom() meta['screenshots'].zoom() meta['thumbnails'].zoom() - meta['readingProgress'].zoom() + if 'readingProgress' in meta: + meta['readingProgress'].zoom() # TODO want to ignore the whold subtree.. 
pi = meta['pageInfo'].zoom() @@ -74,34 +77,23 @@ class Loader: err = self.error(exx, meta) self.logger.exception(err) yield err - from pprint import pprint - # pprint(notes) - # try: - # pm, notes, comm, que, flash, text, area, screens, thumb, rp, pi = zoom( - # meta, - # ) - # except Exception as exx: - # yield echain(self.err, exx) - # return + return # TODO ugh, careful with unconsumed? - # def aempty(x): - # akeq(x) - # try: - # aempty(pm) - # aempty(que) - # aempty(flash) - # aempty(text) - # aempty(area) # TODO these should be yieldy? - # aempty(screens) - # aempty(rp) - # akeq(pi, 'num') - # except Exception as ex: - # # TODO make it a method? - # yield echain(self.err, ex) + # TODO how to make it nicer? + vals = list(comments.values()) + for v in vals: + cid = v['id'].zoom() + v['guid'].zoom() + # TODO values should probably be checked by flow analysis?? + crt = v['created'].zoom() + updated = v['lastUpdated'].zoom() + content = v['content'].zoom() + html = content['HTML'].zoom() + v['ref'].zoom() + v.consume() - - # aempty(notes) - # yield Item(self.uid) + highlights.consume_all() # TODO FIXME + # TODO need to process text highlights... def load_items(self, metas) -> Iterator[ResultItem]: @@ -121,8 +113,9 @@ class Loader: tags = di['tags'] pm = j['pageMetas'] except Exception as ex: - self.logger.exception(ex) - yield echain(self.err, ex) + err = self.error(ex, j) + self.logger.exception(err) + yield err return # TODO should I group by book??? 
From fe86efeda87a20c33c27b116af70ebb25b7cecfb Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 3 May 2019 23:30:59 +0100 Subject: [PATCH 3/7] more polar processing --- reading/polar.py | 69 ++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/reading/polar.py b/reading/polar.py index 832e1b5..f637719 100755 --- a/reading/polar.py +++ b/reading/polar.py @@ -26,9 +26,15 @@ class Error(Exception): super().__init__(*args, **kwargs) # type: ignore self.uid: Uid = p.parent.name -ResultItem = ResT['Item', Error] +# TODO not sure if I even need comment? +# Ok I guess handling comment-level errors is a bit too much.. + +Cid = str class Item(NamedTuple): - uid: Uid + cid: Cid + created: str # TODO datetime (parse iso) + comment: str + ResultBook = ResT['Book', Error] @@ -36,7 +42,7 @@ class Book(NamedTuple): uid: Uid filename: str title: Optional[str] - items: Sequence[ResultItem] + items: Sequence[Item] from kython.konsume import zoom, akeq @@ -50,34 +56,28 @@ class Loader: def error(self, cause, extra): return echain(Error(self.path, extra), cause) - def load_item(self, meta) -> Iterator[ResultItem]: + def load_item(self, meta) -> Iterator[Item]: # TODO this should be destructive zoom? - try: - meta['notes'].zoom() - meta['pagemarks'].zoom() - if 'notes' in meta: - # TODO something nicer? - notes = meta['notes'].zoom() - else: - notes = [] # TODO FIXME dict? - comments = meta['comments'].zoom() - meta['questions'].zoom() - meta['flashcards'].zoom() - highlights = meta['textHighlights'].zoom() - meta['areaHighlights'].zoom() - meta['screenshots'].zoom() - meta['thumbnails'].zoom() - if 'readingProgress' in meta: - meta['readingProgress'].zoom() + meta['notes'].zoom() + meta['pagemarks'].zoom() + if 'notes' in meta: + # TODO something nicer? + notes = meta['notes'].zoom() + else: + notes = [] # TODO FIXME dict? 
+ comments = meta['comments'].zoom() + meta['questions'].zoom() + meta['flashcards'].zoom() + highlights = meta['textHighlights'].zoom() + meta['areaHighlights'].zoom() + meta['screenshots'].zoom() + meta['thumbnails'].zoom() + if 'readingProgress' in meta: + meta['readingProgress'].zoom() - # TODO want to ignore the whold subtree.. - pi = meta['pageInfo'].zoom() - pi['num'].zoom() - except Exception as exx: - err = self.error(exx, meta) - self.logger.exception(err) - yield err - return # TODO ugh, careful with unconsumed? + # TODO want to ignore the whold subtree.. + pi = meta['pageInfo'].zoom() + pi['num'].zoom() # TODO how to make it nicer? vals = list(comments.values()) @@ -89,14 +89,19 @@ class Loader: updated = v['lastUpdated'].zoom() content = v['content'].zoom() html = content['HTML'].zoom() - v['ref'].zoom() + v['ref'].zoom() # TODO it actually might be pretty useful.. similar to hypothesis?? + yield Item( + cid=cid.value, + created=crt.value, + comment=html.value, # TODO perhaps coonvert from html to text or org? + ) v.consume() - highlights.consume_all() # TODO FIXME + highlights.consume_all() # TODO need to process text highlights... - def load_items(self, metas) -> Iterator[ResultItem]: + def load_items(self, metas) -> Iterator[Item]: from kython.konsume import wrap for p, meta in metas.items(): with wrap(meta) as meta: From c6a5a9d9bd1fa3be2c893e0c415a60ed89abd2ae Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 3 May 2019 23:54:26 +0100 Subject: [PATCH 4/7] ok, kinda works... --- reading/polar.py | 60 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/reading/polar.py b/reading/polar.py index f637719..a3e9c40 100755 --- a/reading/polar.py +++ b/reading/polar.py @@ -30,11 +30,19 @@ class Error(Exception): # Ok I guess handling comment-level errors is a bit too much.. 
Cid = str -class Item(NamedTuple): +class Comment(NamedTuple): cid: Cid created: str # TODO datetime (parse iso) comment: str +Hid = str +class Highlight(NamedTuple): + hid: Hid + created: str # TODO datetime + selection: str + comments: Sequence[Comment] + + ResultBook = ResT['Book', Error] @@ -42,7 +50,7 @@ class Book(NamedTuple): uid: Uid filename: str title: Optional[str] - items: Sequence[Item] + items: Sequence[Highlight] from kython.konsume import zoom, akeq @@ -56,7 +64,7 @@ class Loader: def error(self, cause, extra): return echain(Error(self.path, extra), cause) - def load_item(self, meta) -> Iterator[Item]: + def load_item(self, meta) -> Iterator[Highlight]: # TODO this should be destructive zoom? meta['notes'].zoom() meta['pagemarks'].zoom() @@ -80,6 +88,7 @@ class Loader: pi['num'].zoom() # TODO how to make it nicer? + cmap: Dict[Hid, List[Comment]] = {} vals = list(comments.values()) for v in vals: cid = v['id'].zoom() @@ -89,19 +98,52 @@ class Loader: updated = v['lastUpdated'].zoom() content = v['content'].zoom() html = content['HTML'].zoom() - v['ref'].zoom() # TODO it actually might be pretty useful.. similar to hypothesis?? - yield Item( + refv = v['ref'].zoom().value + [_, hlid] = refv.split(':') + ccs = cmap.get(hlid, []) + cmap[hlid] = ccs + ccs.append(Comment( cid=cid.value, created=crt.value, comment=html.value, # TODO perhaps coonvert from html to text or org? - ) + )) v.consume() + for h in list(highlights.values()): + hid = h['id'].zoom().value + if hid in cmap: + comments = cmap[hid] + del cmap[hid] + else: + comments = [] - highlights.consume_all() - # TODO need to process text highlights... + h['guid'].consume() + crt = h['created'].zoom().value + updated = h['lastUpdated'].zoom().value + h['rects'].ignore() + + h['textSelections'].ignore() + h['notes'].consume() + h['questions'].consume() + h['flashcards'].consume() + h['color'].consume() + h['images'].ignore() + # TODO eh, quite excessive \ns... 
+ text = h['text'].zoom()['TEXT'].zoom().value + + yield Highlight( + hid=hid, + created=crt, + selection=text, + comments=tuple(comments), + ) + h.consume() + + if len(cmap) > 0: + raise RuntimeError(f'Unconsumed comments: {cmap}') + # TODO sort by date? - def load_items(self, metas) -> Iterator[Item]: + def load_items(self, metas) -> Iterator[Highlight]: from kython.konsume import wrap for p, meta in metas.items(): with wrap(meta) as meta: From 9380a4e8e29b56758e83f644ed0b98e6429edd51 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 4 May 2019 00:32:06 +0100 Subject: [PATCH 5/7] fix mypy --- reading/polar.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/reading/polar.py b/reading/polar.py index a3e9c40..31a96b4 100755 --- a/reading/polar.py +++ b/reading/polar.py @@ -6,6 +6,7 @@ import json from kython.kerror import ResT, echain, unwrap, sort_res_by from kython.klogging import setup_logzero +from kython.konsume import wrap, zoom, ignore BDIR = Path('/L/zzz_syncthing/data/.polar') @@ -52,7 +53,6 @@ class Book(NamedTuple): title: Optional[str] items: Sequence[Highlight] -from kython.konsume import zoom, akeq class Loader: def __init__(self, p: Path) -> None: @@ -144,7 +144,6 @@ class Loader: def load_items(self, metas) -> Iterator[Highlight]: - from kython.konsume import wrap for p, meta in metas.items(): with wrap(meta) as meta: yield from self.load_item(meta) @@ -199,13 +198,13 @@ def main(): for entry in iter_entries(): logger.info('processed %s', entry.uid) - for i in entry.items: - try: - ii = unwrap(i) - except Error as e: - logger.exception(e) - else: - logger.info(ii) + try: + ee = unwrap(entry) + except Error as e: + logger.exception(e) + else: + for i in ee.items: + logger.info(i) if __name__ == '__main__': From d76ba4e77f20dc1af35ae9635c3f20a596a6a572 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 4 May 2019 00:49:20 +0100 Subject: [PATCH 6/7] parse dt --- reading/polar.py | 72 
+++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/reading/polar.py b/reading/polar.py index 31a96b4..3461e04 100755 --- a/reading/polar.py +++ b/reading/polar.py @@ -1,8 +1,10 @@ #!/usr/bin/python3 from pathlib import Path +from datetime import datetime import logging from typing import List, Dict, Iterator, NamedTuple, Sequence, Optional import json +import pytz from kython.kerror import ResT, echain, unwrap, sort_res_by from kython.klogging import setup_logzero @@ -20,6 +22,9 @@ def _get_datas() -> List[Path]: return list(sorted(BDIR.glob('*/state.json'))) +def parse_dt(s: str) -> datetime: + return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ')) + Uid = str class Error(Exception): @@ -27,28 +32,26 @@ class Error(Exception): super().__init__(*args, **kwargs) # type: ignore self.uid: Uid = p.parent.name -# TODO not sure if I even need comment? # Ok I guess handling comment-level errors is a bit too much.. - Cid = str class Comment(NamedTuple): cid: Cid - created: str # TODO datetime (parse iso) + created: datetime comment: str Hid = str class Highlight(NamedTuple): hid: Hid - created: str # TODO datetime + created: datetime selection: str comments: Sequence[Comment] - -ResultBook = ResT['Book', Error] +Result = ResT['Book', Error] class Book(NamedTuple): uid: Uid + created: datetime filename: str title: Optional[str] items: Sequence[Highlight] @@ -61,7 +64,7 @@ class Loader: self.err = Error(p) self.logger = get_logger() - def error(self, cause, extra): + def error(self, cause, extra=''): return echain(Error(self.path, extra), cause) def load_item(self, meta) -> Iterator[Highlight]: @@ -104,7 +107,7 @@ class Loader: cmap[hlid] = ccs ccs.append(Comment( cid=cid.value, - created=crt.value, + created=parse_dt(crt.value), comment=html.value, # TODO perhaps coonvert from html to text or org? 
)) v.consume() @@ -132,7 +135,7 @@ class Loader: yield Highlight( hid=hid, - created=crt, + created=parse_dt(crt), selection=text, comments=tuple(comments), ) @@ -148,49 +151,42 @@ class Loader: with wrap(meta) as meta: yield from self.load_item(meta) - def load(self) -> Iterator[ResultBook]: + def load(self) -> Iterator[Result]: self.logger.info('processing %s', self.path) j = json.loads(self.path.read_text()) - try: - di = j['docInfo'] - filename = di['filename'] - title = di.get('title', None) - tags = di['tags'] - pm = j['pageMetas'] - except Exception as ex: - err = self.error(ex, j) - self.logger.exception(err) - yield err - return + # TODO konsume here as well? + di = j['docInfo'] + added = di['added'] + filename = di['filename'] + title = di.get('title', None) + tags = di['tags'] + pm = j['pageMetas'] - # TODO should I group by book??? yield Book( uid=self.uid, + created=parse_dt(added), filename=filename, title=title, items=list(self.load_items(pm)), ) - # "textHighlights": {}, - # "comments": {}, - # TODO - # "pagemarks": {}, - # "notes": {}, - # "questions": {}, - # "flashcards": {}, - # "areaHighlights": {}, - # "screenshots": {}, - # "thumbnails": {}, - # "readingProgress": {}, - # "pageInfo": { - # "num": 1 - # } -def iter_entries() -> Iterator[ResultBook]: +def iter_entries() -> Iterator[Result]: + logger = get_logger() for d in _get_datas(): - yield from Loader(d).load() + loader = Loader(d) + try: + yield from loader.load() + except Exception as ee: + err = loader.error(ee) + logger.exception(err) + yield err +def get_entries() -> List[Result]: + # sorting by first annotation is reasonable I guess??? 
+ # TODO + return list(sort_res_by(iter_entries(), key=lambda e: e.created)) def main(): logger = get_logger() From 5b914fab6c0ef961b3ec97e5da36b652e1a6f26e Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 4 May 2019 00:59:57 +0100 Subject: [PATCH 7/7] rename field --- reading/polar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reading/polar.py b/reading/polar.py index 3461e04..50034d8 100755 --- a/reading/polar.py +++ b/reading/polar.py @@ -37,7 +37,7 @@ Cid = str class Comment(NamedTuple): cid: Cid created: datetime - comment: str + text: str Hid = str class Highlight(NamedTuple): @@ -108,7 +108,7 @@ class Loader: ccs.append(Comment( cid=cid.value, created=parse_dt(crt.value), - comment=html.value, # TODO perhaps coonvert from html to text or org? + text=html.value, # TODO perhaps coonvert from html to text or org? )) v.consume() for h in list(highlights.values()):