from datetime import datetime
import json
from pathlib import Path
import pytz
from typing import NamedTuple, Optional, List, Dict, Iterator, Tuple
from collections import OrderedDict

from kython import group_by_key

BDIR = Path('/L/backups/instapaper/')

# Identifiers are normalised to strings when the backups are parsed.
Bid = str
Hid = str


class Highlight(NamedTuple):
    """A single highlight, carrying the url/title of the bookmark it belongs to."""
    dt: datetime  # timezone-aware (UTC), built from the backup's 'time' field
    uid: Hid
    bid: Bid
    text: str
    note: Optional[str]
    url: str
    title: str


class Bookmark(NamedTuple):
    """The subset of a backed-up bookmark record that this module uses."""
    bid: Bid
    time: int
    url: str
    title: str


class Page(NamedTuple):
    """A bookmark together with the highlights made on it."""
    bookmark: Bookmark
    highlights: List[Highlight]


def get_files():
    """Return the '.json' files found directly in BDIR, sorted by path."""
    return sorted(f for f in BDIR.iterdir() if f.suffix == '.json')


def dkey(x):
    """Return a key function that looks up ``x`` in its argument."""
    return lambda d: d[x]


def get_stuff(all=True) -> Tuple[Dict[Bid, Bookmark], Dict[Hid, Highlight]]:
    """Merge every backup file into bookmark and highlight mappings.

    Files are processed in the order given by get_files(); within a file,
    bookmarks and highlights are processed in ascending 'time' order.

    The ``all`` parameter is currently unused; it is kept so existing
    callers keep working.

    Returns:
        (bookmarks by bookmark id, highlights by highlight id), both ordered
        by first insertion.

    Raises:
        KeyError: if a highlight refers to a bookmark id that has not been
            seen in the current or an earlier file.
        AssertionError: if a highlight seen before differs from its earlier
            version.
    """
    all_bks: Dict[Bid, Bookmark] = OrderedDict()
    all_hls: Dict[Hid, Highlight] = OrderedDict()
    # TODO can restore url by bookmark id
    for f in get_files():
        with f.open('r') as fo:
            j = json.load(fo)

        for b in sorted(j['bookmarks'], key=dkey('time')):
            bid = str(b['bookmark_id'])
            # Bookmarks are deliberately not asserted equal across backups:
            # fields such as progress can change, so a later file simply
            # overwrites the earlier entry.
            all_bks[bid] = Bookmark(
                bid=bid,
                time=b['time'],
                url=b['url'],
                title=b['title'],
            )

        for raw in sorted(j['highlights'], key=dkey('time')):
            # Normalise to str so the key/uid matches the Hid alias, the same
            # way bookmark ids are normalised above.
            hid = str(raw['highlight_id'])
            bid = str(raw['bookmark_id'])
            # TODO just reference to bookmark in highlight?
            bk = all_bks[bid]
            dt = pytz.utc.localize(datetime.utcfromtimestamp(raw['time']))
            hl = Highlight(
                uid=hid,
                bid=bk.bid,
                dt=dt,
                text=raw['text'],
                note=raw['note'],
                url=bk.url,
                title=bk.title,
            )
            prev = all_hls.get(hid, None)
            assert prev is None or prev == hl, '%s vs %s' % (prev, hl)
            all_hls[hid] = hl

    return all_bks, all_hls


def iter_highlights() -> Iterator[Highlight]:
    """Iterate over all highlights in the order get_stuff() collected them."""
    # Iterate the values: iterating the dict itself would yield highlight ids.
    return iter(get_stuff()[1].values())


def get_highlights() -> List[Highlight]:
    """Return all highlights as a list, in iter_highlights() order."""
    return list(iter_highlights())


def get_todos() -> List[Highlight]:
    """Return the highlights whose note starts with 'todo' (case-insensitive)."""
    def is_todo(h):
        return h.note is not None and h.note.lstrip().lower().startswith('todo')
    return list(filter(is_todo, get_highlights()))


def get_pages() -> List[Page]:
    """Return one Page per bookmark, with its highlights sorted by time.

    Bookmarks without highlights get an empty highlight list.
    """
    bms, hls = get_stuff()
    groups = group_by_key(hls.values(), key=lambda h: h.bid)
    pages = []
    # TODO how to make sure there are no dangling bookmarks?
    for bid, bm in bms.items():
        pages.append(Page(
            bookmark=bm,
            highlights=sorted(groups.get(bid, []), key=lambda hl: hl.dt),
        ))
    return pages


def main():
    """Print every todo highlight."""
    for h in get_todos():
        print(h)