From 28d2450a214ac927b00209b56d841723ee18e8b6 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Thu, 19 Oct 2023 01:08:50 +0100
Subject: [PATCH] reddit.rexport: some cleanup, move get_events stuff into
 personal overlay

---
 my/reddit/rexport.py | 171 +++++++------------------------------------
 my/tests/reddit.py   |  43 ++---------
 2 files changed, 36 insertions(+), 178 deletions(-)

diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py
index f20d00e..dadfb5a 100644
--- a/my/reddit/rexport.py
+++ b/my/reddit/rexport.py
@@ -5,13 +5,24 @@ REQUIRES = [
     'git+https://github.com/karlicoss/rexport',
 ]
 
-from pathlib import Path
-from my.core.common import Paths
 from dataclasses import dataclass
-from typing import Any
+from pathlib import Path
+from typing import Iterator, Sequence
+
+from my.core import (
+    get_files,
+    make_logger,
+    stat,
+    Paths,
+    Stats,
+)
+from my.core.cfg import make_config, Attrs
+from my.core.common import mcachew
 
 from my.config import reddit as uconfig
 
+logger = make_logger(__name__)
+
 
 @dataclass
 class reddit(uconfig):
@@ -23,7 +34,6 @@ class reddit(uconfig):
     export_path: Paths
 
 
-from my.core.cfg import make_config, Attrs
 # hmm, also nice thing about this is that migration is possible to test without the rest of the config?
 def migration(attrs: Attrs) -> Attrs:
     # new structure, take top-level config and extract 'rexport' class
@@ -33,6 +43,7 @@ def migration(attrs: Attrs) -> Attrs:
         attrs['export_path'] = ex.export_path
     else:
         from my.core.warnings import high
+
         high("""DEPRECATED! Please modify your reddit config to look like:
 
 class reddit:
@@ -45,15 +56,15 @@ class reddit:
         high(f'"{export_dir}" is deprecated! Please use "export_path" instead."')
     return attrs
 
+
 config = make_config(reddit, migration=migration)
 
 ###
-# TODO not sure about the laziness...
-
 try:
     from rexport import dal
 except ModuleNotFoundError as e:
     from my.core.compat import pre_pip_dal_handler
+
     dal = pre_pip_dal_handler('rexport', e, config, requires=REQUIRES)
     # TODO ugh. this would import too early
     # but on the other hand we do want to bring the objects into the scope for easier imports, etc. ugh!
@@ -61,34 +72,28 @@ except ModuleNotFoundError as e:
 # maybe, the config could dynamically detect change and reimport itself? dunno.
 ###
 
-############################
-from typing import List, Sequence, Mapping, Iterator, Any
-from my.core import make_logger
-from my.core.common import mcachew, get_files, make_dict, Stats
-
-
-logger = make_logger(__name__)
-
-
-from pathlib import Path
 def inputs() -> Sequence[Path]:
     return get_files(config.export_path)
 
 
-Uid = dal.Sid  # str
-Save = dal.Save
-Comment = dal.Comment
-Submission = dal.Submission
-Upvote = dal.Upvote
+# fmt: off
+Uid         = dal.Sid  # str
+Save        = dal.Save
+Comment     = dal.Comment
+Submission  = dal.Submission
+Upvote      = dal.Upvote
+# fmt: on
 
 
 def _dal() -> dal.DAL:
     inp = list(inputs())
     return dal.DAL(inp)
 
+
 cache = mcachew(depends_on=inputs)
 
+
 @cache
 def saved() -> Iterator[Save]:
     return _dal().saved()
@@ -109,132 +114,12 @@ def upvoted() -> Iterator[Upvote]:
     return _dal().upvoted()
 
 
-### the rest of the file is some elaborate attempt of restoring favorite/unfavorite times
-
-from typing import Dict, Iterable, Iterator, NamedTuple
-from functools import lru_cache
-import re
-from datetime import datetime, timezone
-from multiprocessing import Pool
-
-# TODO hmm. apparently decompressing takes quite a bit of time...
-
-class SaveWithDt(NamedTuple):
-    save: Save
-    backup_dt: datetime
-
-    def __getattr__(self, x):
-        return getattr(self.save, x)
-
-# TODO for future events?
-EventKind = SaveWithDt
-
-
-class Event(NamedTuple):
-    dt: datetime
-    text: str
-    kind: EventKind
-    eid: str
-    title: str
-    url: str
-
-    @property
-    def cmp_key(self):
-        return (self.dt, (1 if 'unfavorited' in self.text else 0))
-
-
-Url = str
-
-def _get_bdate(bfile: Path) -> datetime:
-    RE = re.compile(r'reddit.(\d{14})')
-    stem = bfile.stem
-    stem = stem.replace('T', '').replace('Z', '')  # adapt for arctee
-    match = RE.search(stem)
-    assert match is not None
-    bdt = datetime.strptime(match.group(1), "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
-    return bdt
-
-
-def _get_state(bfile: Path) -> Dict[Uid, SaveWithDt]:
-    logger.debug('handling %s', bfile)
-
-    bdt = _get_bdate(bfile)
-
-    saves = [SaveWithDt(save, bdt) for save in dal.DAL([bfile]).saved()]
-    return make_dict(
-        sorted(saves, key=lambda p: p.save.created),
-        key=lambda s: s.save.sid,
-    )
-
-# TODO hmm. think about it.. if we set default backups=inputs()
-# it's called early so it ends up as a global variable that we can't monkey patch easily
-@mcachew(lambda backups: backups)
-def _get_events(backups: Sequence[Path], parallel: bool=True) -> Iterator[Event]:
-    # todo cachew: let it transform return type? so you don't have to write a wrapper for lists?
-
-    prev_saves: Mapping[Uid, SaveWithDt] = {}
-    # TODO suppress first batch??
-    # TODO for initial batch, treat event time as creation time
-
-    states: Iterable[Mapping[Uid, SaveWithDt]]
-    if parallel:
-        with Pool() as p:
-            states = p.map(_get_state, backups)
-    else:
-        # also make it lazy...
-        states = map(_get_state, backups)
-    # TODO mm, need to make that iterative too?
-
-    for i, (bfile, saves) in enumerate(zip(backups, states)):
-        bdt = _get_bdate(bfile)
-
-        first = i == 0
-
-        for key in set(prev_saves.keys()).symmetric_difference(set(saves.keys())):
-            ps = prev_saves.get(key, None)
-            if ps is not None:
-                # TODO use backup date, that is more precise...
-                # eh. I guess just take max and it will always be correct?
-                assert not first
-                yield Event(
-                    dt=bdt,  # TODO average with ps.save_dt?
-                    text="unfavorited",
-                    kind=ps,
-                    eid=f'unf-{ps.sid}',
-                    url=ps.url,
-                    title=ps.title,
-                )
-            else:  # already in saves
-                s = saves[key]
-                last_saved = s.backup_dt
-                yield Event(
-                    dt=s.created if first else last_saved,
-                    text=f"favorited{' [initial]' if first else ''}",
-                    kind=s,
-                    eid=f'fav-{s.sid}',
-                    url=s.url,
-                    title=s.title,
-                )
-        prev_saves = saves
-
-    # TODO a bit awkward, favorited should compare lower than unfavorited?
-
-@lru_cache(1)
-def events(*args, **kwargs) -> List[Event]:
-    inp = inputs()
-    # 2.2s for 300 files without cachew
-    # 0.2s for 300 files with cachew
-    evit = _get_events(inp, *args, **kwargs)
-    # todo mypy is confused here and thinks it's iterable of Path? perhaps something to do with mcachew?
-    return list(sorted(evit, key=lambda e: e.cmp_key))
-
-
 def stats() -> Stats:
-    from my.core import stat
     return {
+        # fmt: off
         **stat(saved      ),
         **stat(comments   ),
         **stat(submissions),
         **stat(upvoted    ),
+        # fmt: on
     }
-
diff --git a/my/tests/reddit.py b/my/tests/reddit.py
index 0871041..4af95ae 100644
--- a/my/tests/reddit.py
+++ b/my/tests/reddit.py
@@ -1,5 +1,3 @@
-from datetime import datetime, timezone
-
 from my.core.cfg import tmp_config
 from my.core.common import make_dict
 
@@ -13,34 +11,25 @@ import my.reddit.rexport as my_reddit_rexport
 import my.reddit.all as my_reddit_all
 
 
-def test_basic() -> None:
+def test_basic_1() -> None:
     # todo maybe this should call stat or something instead?
     # would ensure reasonable stat implementation as well and less duplication
     # note: deliberately use old module (instead of my.reddit.all) to test bwd compatibility
-    from my.reddit import saved, events
+    from my.reddit import saved
 
-    assert len(list(events())) > 0
     assert len(list(saved())) > 0
 
 
+def test_basic_2() -> None:
+    # deliberately check call from a different style of import to make sure tmp_config works
+    saves = list(my_reddit_rexport.saved())
+    assert len(saves) > 0
+
+
 def test_comments() -> None:
     assert len(list(my_reddit_all.comments())) > 0
 
 
-def test_unfav() -> None:
-    from my.reddit import events
-
-    ev = events()
-    url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
-    uev = [e for e in ev if e.url == url]
-    assert len(uev) == 2
-    ff = uev[0]
-    # TODO could recover these from takeout perhaps?
-    assert ff.text == 'favorited [initial]'
-    uf = uev[1]
-    assert uf.text == 'unfavorited'
-
-
 def test_saves() -> None:
     from my.reddit.all import saved
 
@@ -51,22 +40,6 @@ def test_saves() -> None:
     make_dict(saves, key=lambda s: s.sid)
 
 
-def test_disappearing() -> None:
-    # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
-    # but I guess it was just a short glitch... so whatever
-    evs = my_reddit_rexport.events()
-    favs = [s.kind for s in evs if s.text == 'favorited']
-    [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"']
-    assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=timezone.utc)
-
-
-def test_unfavorite() -> None:
-    evs = my_reddit_rexport.events()
-    unfavs = [s for s in evs if s.text == 'unfavorited']
-    [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
-    assert xxx.dt == datetime(2019, 1, 29, 10, 10, 20, tzinfo=timezone.utc)
-
-
 def test_preserves_extra_attr() -> None:
     # doesn't strictly belong here (not specific to reddit)
     # but my.reddit does a fair bit of dynamic hacking, so perhaps a good place to check nothing is lost
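Note on the overlay move: the SaveWithDt/Event/_get_events code removed above is meant to be relocated into a personal overlay package, not dropped. A minimal sketch of what such an overlay module could look like, assuming only the public API that my.reddit.rexport keeps after this change; the module name below is illustrative, not something this patch creates:

# reddit_events.py -- hypothetical personal overlay module
from datetime import datetime
from typing import Iterator, NamedTuple

from my.reddit.rexport import Save  # the moved code also uses inputs(), dal and logger


class SaveWithDt(NamedTuple):
    # same shape as the class deleted above; delegates attribute access to the save
    save: Save
    backup_dt: datetime

    def __getattr__(self, x):
        return getattr(self.save, x)


class Event(NamedTuple):
    dt: datetime
    text: str
    kind: SaveWithDt
    eid: str
    title: str
    url: str


def events() -> Iterator[Event]:
    # body: the _get_events/events logic deleted above -- diff consecutive
    # snapshots from inputs() and yield favorited/unfavorited events
    raise NotImplementedError  # move the deleted implementation here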
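For consumers, the kept accessors are unchanged in shape. A quick usage sketch, assuming rexport is installed and reddit.export_path is configured (sid/title are attributes of rexport's Save model, as used by the deleted code above):

from my.reddit.rexport import saved

# repeat calls are cheap: saved() is wrapped with mcachew, keyed on inputs()
for s in saved():
    print(s.sid, s.title)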