diff --git a/reddit/__init__.py b/reddit/__init__.py
new file mode 100755
index 0000000..4c3169c
--- /dev/null
+++ b/reddit/__init__.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python3
+from typing import List, Dict, Union, Iterable, Iterator, NamedTuple, Any, Sequence
+import json
+from functools import lru_cache
+from collections import OrderedDict
+from pathlib import Path
+import pytz
+import re
+from datetime import datetime
+import logging
+from multiprocessing import Pool
+
+from kython import kompress, cproperty, make_dict, numbers
+from kython.klogging import setup_logzero
+
+# TODO hmm. apparently decompressing takes quite a bit of time...
+
+BPATH = Path("/L/backups/reddit")
+
+
+def get_logger():
+    return logging.getLogger('reddit-provider')
+
+def reddit(suffix: str) -> str:
+    return 'https://reddit.com' + suffix
+
+
+def _get_backups(all_=True) -> Sequence[Path]:
+    bfiles = tuple(sorted(BPATH.glob('reddit-*.json.xz')))  # TODO switch to that new compression format?
+    if all_:
+        return bfiles
+    else:
+        return bfiles[-1:]
+
+Sid = str
+
+class Save(NamedTuple):
+    created: datetime
+    backup_dt: datetime
+    title: str
+    sid: Sid
+    json: Any = None
+    # TODO ugh. not sure how to support this in cachew... could try serializing dicts of simple types automatically.. but json can't be properly typed
+    # TODO why would json be none?
+
+    def __hash__(self):
+        return hash(self.sid)
+
+    @cproperty
+    def save_dt(self) -> datetime:
+        # TODO not exactly precise... but whatever I guess
+        return self.backup_dt
+
+    @cproperty
+    def url(self) -> str:
+        # pylint: disable=unsubscriptable-object
+        pl = self.json['permalink']
+        return reddit(pl)
+
+    @cproperty
+    def text(self) -> str:
+        bb = self.json.get('body', None)
+        st = self.json.get('selftext', None)
+        if bb is not None and st is not None:
+            raise RuntimeError(f'wtf, both body and selftext are not None: {bb}; {st}')
+        return bb or st
+
+    @cproperty
+    def subreddit(self) -> str:
+        assert self.json is not None
+        # pylint: disable=unsubscriptable-object
+        return self.json['subreddit']['display_name']
+
+
+# class Misc(NamedTuple):
+#     pass
+
+# EventKind = Union[Save, Misc]
+
+EventKind = Save
+
+class Event(NamedTuple):
+    dt: datetime
+    text: str
+    kind: EventKind
+    eid: str
+    title: str
+    url: str
+
+    @property
+    def cmp_key(self):
+        # on dt ties, favorited (0) sorts before unfavorited (1)
+        return (self.dt, (1 if 'unfavorited' in self.text else 0))
+
+
+# TODO kython?
+def get_some(d, *keys):
+    # TODO only one should be non None??
+    for k in keys:
+        v = d.get(k, None)
+        if v is not None:
+            return v
+    return None
+
+
+Url = str
+
+def _get_bdate(bfile: Path) -> datetime:
+    # backup files are named reddit-<YYYYmmddHHMMSS>.json.xz
+    RE = re.compile(r'reddit-(\d{14})')
+    match = RE.search(bfile.stem)
+    assert match is not None
+    bdt = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S"))
+    return bdt
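+
+# For example (doctest-style sketch; the filename is taken from a comment in
+# test_disappearing below):
+#   >>> _get_bdate(Path('reddit-20190402005024.json.xz'))
+#   datetime.datetime(2019, 4, 2, 0, 50, 24, tzinfo=<UTC>)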
+
+
+def _get_state(bfile: Path) -> Dict[Sid, Save]:
+    logger = get_logger()
+    logger.debug('handling %s', bfile)
+
+    bdt = _get_bdate(bfile)
+
+    saves: List[Save] = []
+    with kompress.open(bfile) as fo:
+        jj = json.load(fo)
+
+    saved = jj['saved']
+    for s in saved:
+        created = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc']))
+        # TODO need permalink
+        # url = get_some(s, 'link_permalink', 'url') # this was original url...
+        title = get_some(s, 'link_title', 'title')
+        save = Save(
+            created=created,
+            backup_dt=bdt,
+            title=title,
+            sid=s['id'],
+            json=s,
+        )
+        saves.append(save)
+
+    return make_dict(
+        sorted(saves, key=lambda p: p.created),
+        key=lambda s: s.sid,
+    )
+
+# from cachew import cachew
+# TODO hmm. how to combine cachew and lru_cache?....
+# @cachew('/L/data/.cache/reddit-events.cache')
+
+@lru_cache(1)
+def _get_events(backups: Sequence[Path], parallel: bool) -> List[Event]:
+    logger = get_logger()
+
+    events: List[Event] = []
+    prev_saves: Dict[Sid, Save] = {}
+    # TODO suppress first batch??
+    # TODO for initial batch, treat event time as creation time
+
+    states: Iterable[Dict[Sid, Save]]
+    if parallel:
+        with Pool() as p:
+            states = p.map(_get_state, backups)
+    else:
+        # also make it lazy...
+        states = map(_get_state, backups)
+
+    for i, bfile, saves in zip(numbers(), backups, states):
+        bdt = _get_bdate(bfile)
+
+        first = i == 0
+
+        for key in set(prev_saves.keys()).symmetric_difference(set(saves.keys())):
+            ps = prev_saves.get(key, None)
+            if ps is not None:
+                # present in the previous snapshot but not in this one: unfavorited
+                # TODO use backup date, that is more precise...
+                # eh. I guess just take max and it will always be correct?
+                assert not first
+                events.append(Event(
+                    dt=bdt,  # TODO average with ps.save_dt?
+                    text="unfavorited",
+                    kind=ps,
+                    eid=f'unf-{ps.sid}',
+                    url=ps.url,
+                    title=ps.title,
+                ))
+            else:  # in saves
+                s = saves[key]
+                events.append(Event(
+                    dt=s.created if first else s.save_dt,
+                    text=f"favorited{' [initial]' if first else ''}",
+                    kind=s,
+                    eid=f'fav-{s.sid}',
+                    url=s.url,
+                    title=s.title,
+                ))
+        prev_saves = saves
+
+    # TODO a bit awkward, favorited should compare lower than unfavorited?
+    return list(sorted(events, key=lambda e: e.cmp_key))
+
+def get_events(*args, all_=True, parallel=True):
+    backups = _get_backups(all_=all_)
+    assert len(backups) > 0
+    return _get_events(backups=backups, parallel=parallel)
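+
+# Sketch of the snapshot diffing in _get_events, with hypothetical sids:
+#   backup 1 saved {a, b}  ->  favorited a [initial], favorited b [initial]
+#   backup 2 saved {a}     ->  unfavorited b
+#   backup 3 saved {a, c}  ->  favorited c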
+
+def get_saves(**kwargs) -> List[Save]:
+    logger = get_logger()
+
+    events = get_events(**kwargs)
+    saves: Dict[Sid, Save] = OrderedDict()
+    for e in events:
+        if e.text.startswith('favorited'):
+            ss = e.kind
+            assert isinstance(ss, Save)
+            if ss.sid in saves:
+                # apparently we can get duplicates if we saved/unsaved multiple times...
+                logger.warning('ignoring duplicate save %s, title %s, url %s', ss.sid, ss.title, ss.url)
+            else:
+                saves[ss.sid] = ss
+    assert len(saves) > 0
+
+    return list(saves.values())
+
+
+def test():
+    get_events(all_=False)
+    get_saves(all_=False)
+
+
+def test_unfav():
+    events = get_events(all_=True)
+    url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
+    uevents = [e for e in events if e.url == url]
+    assert len(uevents) == 2
+    ff = uevents[0]
+    assert ff.text == 'favorited'
+    uf = uevents[1]
+    assert uf.text == 'unfavorited'
+
+
+def test_get_all_saves():
+    saves = get_saves(all_=True)
+    # just check that they are unique..
+    make_dict(saves, key=lambda s: s.sid)
+
+
+def test_disappearing():
+    # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
+    # but I guess it was just a short glitch... so whatever
+    events = get_events(all_=True)
+    favs = [e.kind for e in events if e.text == 'favorited']
+    [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"']
+    assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc)
+
+
+def test_unfavorite():
+    events = get_events(all_=True)
+    unfavs = [s for s in events if s.text == 'unfavorited']
+    [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
+    assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc)
+
+
+def main():
+    setup_logzero(get_logger(), level=logging.DEBUG)
+    # TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM...
+    events = get_events(parallel=False)
+    print(len(events))
+    for e in events:
+        print(e.text, e.url)
+    # for e in get_
+    # 509 with urls..
+    # for e in get_events():
+    #     print(e)
+
+
+if __name__ == '__main__':
+    main()
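+
+# Usage sketch (hypothetical interactive session; assumes backups exist under BPATH):
+#   >>> from reddit import get_events, get_saves
+#   >>> events = get_events(parallel=False)
+#   >>> saves = get_saves(all_=False)  # deduplicated saves from the latest backup only
+# The test_* functions above are pytest-compatible, so (assuming pytest is
+# installed) they can be run with: pytest reddit/__init__.py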