reddit.rexport: some cleanup, move get_events stuff into personal overlay

Dima Gerasimov 2023-10-19 01:08:50 +01:00 committed by karlicoss
parent fe26efaea8
commit 28d2450a21
2 changed files with 36 additions and 178 deletions

my/reddit/rexport.py

@@ -5,13 +5,24 @@ REQUIRES = [
     'git+https://github.com/karlicoss/rexport',
 ]
 
-from pathlib import Path
-from my.core.common import Paths
 from dataclasses import dataclass
-from typing import Any
+from pathlib import Path
+from typing import Iterator, Sequence
+
+from my.core import (
+    get_files,
+    make_logger,
+    stat,
+    Paths,
+    Stats,
+)
+from my.core.cfg import make_config, Attrs
+from my.core.common import mcachew
 
 from my.config import reddit as uconfig
 
+logger = make_logger(__name__)
+
 
 @dataclass
 class reddit(uconfig):
@@ -23,7 +34,6 @@ class reddit(uconfig):
     export_path: Paths
 
-from my.core.cfg import make_config, Attrs
 
 # hmm, also nice thing about this is that migration is possible to test without the rest of the config?
 def migration(attrs: Attrs) -> Attrs:
     # new structure, take top-level config and extract 'rexport' class
@@ -33,6 +43,7 @@ def migration(attrs: Attrs) -> Attrs:
         attrs['export_path'] = ex.export_path
     else:
+        from my.core.warnings import high
         high("""DEPRECATED! Please modify your reddit config to look like:
 class reddit:
@@ -45,15 +56,15 @@ class reddit:
         high(f'"{export_dir}" is deprecated! Please use "export_path" instead."')
     return attrs
 
 config = make_config(reddit, migration=migration)
 
 ###
-# TODO not sure about the laziness...
 
 try:
     from rexport import dal
 except ModuleNotFoundError as e:
     from my.core.compat import pre_pip_dal_handler
+
     dal = pre_pip_dal_handler('rexport', e, config, requires=REQUIRES)
 # TODO ugh. this would import too early
 # but on the other hand we do want to bring the objects into the scope for easier imports, etc. ugh!
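As the comment in the hunk above notes, the migration can be exercised without the rest of the config. A minimal sketch of what that could look like, relying only on the branch visible in this diff (the path and the SimpleNamespace stand-in for the rexport sub-config are made up for illustration):

    from types import SimpleNamespace

    # new-style config: a 'rexport' sub-config object carrying export_path
    attrs = {'rexport': SimpleNamespace(export_path='/backups/reddit/*.json')}
    migrated = migration(attrs)
    assert migrated['export_path'] == '/backups/reddit/*.json'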
@@ -61,34 +72,28 @@ except ModuleNotFoundError as e:
 # maybe, the config could dynamically detect change and reimport itself? dunno.
 ###
 
-############################
-
-from typing import List, Sequence, Mapping, Iterator, Any
-from my.core import make_logger
-from my.core.common import mcachew, get_files, make_dict, Stats
-
-logger = make_logger(__name__)
-
-from pathlib import Path
 def inputs() -> Sequence[Path]:
     return get_files(config.export_path)
 
+# fmt: off
 Uid         = dal.Sid  # str
 Save        = dal.Save
 Comment     = dal.Comment
 Submission  = dal.Submission
 Upvote      = dal.Upvote
+# fmt: on
 
 def _dal() -> dal.DAL:
     inp = list(inputs())
     return dal.DAL(inp)
 
 cache = mcachew(depends_on=inputs)
 
 @cache
 def saved() -> Iterator[Save]:
     return _dal().saved()
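Note the caching pattern above: mcachew(depends_on=inputs) keys the cachew cache on the list of export files, so the relatively expensive parsing should only rerun when a new export appears. A minimal consumption sketch, assuming a configured HPI install (the field names are the ones this module itself uses):

    from my.reddit.rexport import saved

    for s in saved():
        print(s.created, s.sid, s.title, s.url)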
@@ -109,132 +114,12 @@ def upvoted() -> Iterator[Upvote]:
     return _dal().upvoted()
 
-### the rest of the file is some elaborate attempt of restoring favorite/unfavorite times
-
-from typing import Dict, Iterable, Iterator, NamedTuple
-from functools import lru_cache
-import re
-from datetime import datetime, timezone
-from multiprocessing import Pool
-
-# TODO hmm. apparently decompressing takes quite a bit of time...
-
-class SaveWithDt(NamedTuple):
-    save: Save
-    backup_dt: datetime
-
-    def __getattr__(self, x):
-        return getattr(self.save, x)
-
-# TODO for future events?
-EventKind = SaveWithDt
-
-class Event(NamedTuple):
-    dt: datetime
-    text: str
-    kind: EventKind
-    eid: str
-    title: str
-    url: str
-
-    @property
-    def cmp_key(self):
-        return (self.dt, (1 if 'unfavorited' in self.text else 0))
-
-Url = str
-
-def _get_bdate(bfile: Path) -> datetime:
-    RE = re.compile(r'reddit.(\d{14})')
-    stem = bfile.stem
-    stem = stem.replace('T', '').replace('Z', '')  # adapt for arctee
-    match = RE.search(stem)
-    assert match is not None
-    bdt = datetime.strptime(match.group(1), "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
-    return bdt
-
-def _get_state(bfile: Path) -> Dict[Uid, SaveWithDt]:
-    logger.debug('handling %s', bfile)
-
-    bdt = _get_bdate(bfile)
-
-    saves = [SaveWithDt(save, bdt) for save in dal.DAL([bfile]).saved()]
-    return make_dict(
-        sorted(saves, key=lambda p: p.save.created),
-        key=lambda s: s.save.sid,
-    )
-
-# TODO hmm. think about it.. if we set default backups=inputs()
-# it's called early so it ends up as a global variable that we can't monkey patch easily
-@mcachew(lambda backups: backups)
-def _get_events(backups: Sequence[Path], parallel: bool=True) -> Iterator[Event]:
-    # todo cachew: let it transform return type? so you don't have to write a wrapper for lists?
-
-    prev_saves: Mapping[Uid, SaveWithDt] = {}
-    # TODO suppress first batch??
-    # TODO for initial batch, treat event time as creation time
-
-    states: Iterable[Mapping[Uid, SaveWithDt]]
-    if parallel:
-        with Pool() as p:
-            states = p.map(_get_state, backups)
-    else:
-        # also make it lazy...
-        states = map(_get_state, backups)
-    # TODO mm, need to make that iterative too?
-
-    for i, (bfile, saves) in enumerate(zip(backups, states)):
-        bdt = _get_bdate(bfile)
-
-        first = i == 0
-
-        for key in set(prev_saves.keys()).symmetric_difference(set(saves.keys())):
-            ps = prev_saves.get(key, None)
-            if ps is not None:
-                # TODO use backup date, that is more precise...
-                # eh. I guess just take max and it will always be correct?
-                assert not first
-                yield Event(
-                    dt=bdt,  # TODO average with ps.save_dt?
-                    text="unfavorited",
-                    kind=ps,
-                    eid=f'unf-{ps.sid}',
-                    url=ps.url,
-                    title=ps.title,
-                )
-            else:  # already in saves
-                s = saves[key]
-                last_saved = s.backup_dt
-                yield Event(
-                    dt=s.created if first else last_saved,
-                    text=f"favorited{' [initial]' if first else ''}",
-                    kind=s,
-                    eid=f'fav-{s.sid}',
-                    url=s.url,
-                    title=s.title,
-                )
-        prev_saves = saves
-
-    # TODO a bit awkward, favorited should compare lower than unfavorited?
-
-@lru_cache(1)
-def events(*args, **kwargs) -> List[Event]:
-    inp = inputs()
-    # 2.2s for 300 files without cachew
-    # 0.2s for 300 files with cachew
-    evit = _get_events(inp, *args, **kwargs)
-    # todo mypy is confused here and thinks it's iterable of Path? perhaps something to do with mcachew?
-    return list(sorted(evit, key=lambda e: e.cmp_key))
-
 def stats() -> Stats:
-    from my.core import stat
     return {
+        # fmt: off
         **stat(saved      ),
         **stat(comments   ),
         **stat(submissions),
         **stat(upvoted    ),
+        # fmt: on
     }
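The block removed above is what the commit title refers to as moving into a personal overlay. For reference, the same snapshot-diffing idea condensed into a hypothetical overlay module; the module path, the slimmed-down Event shape, and the mtime-based timestamp are illustrative simplifications, not the actual overlay code:

    # e.g. my/reddit/events.py in a personal HPI overlay (hypothetical)
    from datetime import datetime, timezone
    from pathlib import Path
    from typing import Dict, Iterator, NamedTuple, Sequence

    from my.reddit.rexport import Save, Uid, dal, inputs

    class Event(NamedTuple):
        dt: datetime
        text: str  # 'favorited' or 'unfavorited'
        eid: str

    def _state(bfile: Path) -> Dict[Uid, Save]:
        # saved posts in a single export snapshot, keyed by id
        return {s.sid: s for s in dal.DAL([bfile]).saved()}

    def events(backups: Sequence[Path]) -> Iterator[Event]:
        prev: Dict[Uid, Save] = {}
        for bfile in backups:
            cur = _state(bfile)
            # an id present in exactly one of two consecutive snapshots changed state
            for key in prev.keys() ^ cur.keys():
                if key in prev:
                    # simplification: the original parsed the backup timestamp from the filename
                    dt = datetime.fromtimestamp(bfile.stat().st_mtime, tz=timezone.utc)
                    yield Event(dt=dt, text='unfavorited', eid=f'unf-{key}')
                else:
                    yield Event(dt=cur[key].created, text='favorited', eid=f'fav-{key}')
            prev = cur

    # usage: sorted(events(inputs()), key=lambda e: e.dt)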

tests/reddit.py

@@ -1,5 +1,3 @@
-from datetime import datetime, timezone
-
 from my.core.cfg import tmp_config
 from my.core.common import make_dict
@@ -13,34 +11,25 @@ import my.reddit.rexport as my_reddit_rexport
 import my.reddit.all as my_reddit_all
 
-def test_basic() -> None:
+def test_basic_1() -> None:
     # todo maybe this should call stat or something instead?
     # would ensure reasonable stat implementation as well and less duplication
     # note: deliberately use old module (instead of my.reddit.all) to test bwd compatibility
-    from my.reddit import saved, events
-    assert len(list(events())) > 0
+    from my.reddit import saved
     assert len(list(saved())) > 0
 
+def test_basic_2() -> None:
+    # deliberately check call from a different style of import to make sure tmp_config works
+    saves = list(my_reddit_rexport.saved())
+    assert len(saves) > 0
+
 def test_comments() -> None:
     assert len(list(my_reddit_all.comments())) > 0
 
-def test_unfav() -> None:
-    from my.reddit import events
-
-    ev = events()
-    url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
-    uev = [e for e in ev if e.url == url]
-    assert len(uev) == 2
-    ff = uev[0]
-    # TODO could recover these from takeout perhaps?
-    assert ff.text == 'favorited [initial]'
-    uf = uev[1]
-    assert uf.text == 'unfavorited'
-
 def test_saves() -> None:
     from my.reddit.all import saved
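The new test_basic_2 leans on tmp_config (imported at the top of this file) to make sure config set up via a different import style still applies. A rough sketch of that pattern, with a hypothetical path and with tmp_config's semantics assumed (a context manager yielding a mutable config namespace) rather than verified here:

    def test_with_tmp_config() -> None:
        class user_config:
            class rexport:
                export_path = '/tmp/reddit-test/*.json'  # hypothetical throwaway export

        with tmp_config() as config:
            config.reddit = user_config
            # inside the block, my.reddit modules should resolve against the temporary config
            files = my_reddit_rexport.inputs()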
@@ -51,22 +40,6 @@ def test_saves() -> None:
     make_dict(saves, key=lambda s: s.sid)
 
-def test_disappearing() -> None:
-    # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
-    # but I guess it was just a short glitch... so whatever
-    evs = my_reddit_rexport.events()
-    favs = [s.kind for s in evs if s.text == 'favorited']
-    [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"']
-    assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=timezone.utc)
-
-def test_unfavorite() -> None:
-    evs = my_reddit_rexport.events()
-    unfavs = [s for s in evs if s.text == 'unfavorited']
-    [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
-    assert xxx.dt == datetime(2019, 1, 29, 10, 10, 20, tzinfo=timezone.utc)
-
 def test_preserves_extra_attr() -> None:
     # doesn't strictly belong here (not specific to reddit)
     # but my.reddit does a fair bit of dynamic hacking, so perhaps a good place to check nothing is lost