cleanup for reddit data provider

Dima Gerasimov 2020-05-06 08:09:20 +01:00
parent 5d3c0bdb1f
commit 069732600c
2 changed files with 78 additions and 58 deletions

my/reddit.py (View file)

@@ -1,8 +1,6 @@
 """
 Reddit data: saved items/comments/upvotes/etc.
 """
-from . import init
-
 from pathlib import Path
 from typing import List, Sequence, Mapping, Iterator
@@ -13,14 +11,14 @@ from my.config import reddit as config
 import my.config.repos.rexport.dal as rexport

-def get_sources() -> Sequence[Path]:
+def inputs() -> Sequence[Path]:
     # TODO rename to export_path?
     files = get_files(config.export_dir)
+    # TODO Cpath better be automatic by get_files...
     res = list(map(CPath, files)); assert len(res) > 0
     # todo move the assert to get_files?
     return tuple(res)

 logger = LazyLogger(__name__, level='debug')
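For orientation: `inputs()` resolves the configured export directory into a non-empty, ordered tuple of backup files, with `CPath` wrapping each path (so compressed exports can be read transparently). A minimal, self-contained sketch of the same shape, using plain pathlib and an illustrative `export_dir` argument instead of the real `my.common`/`my.config` machinery:

    from pathlib import Path
    from typing import Sequence

    def inputs_sketch(export_dir: Path) -> Sequence[Path]:
        # deterministic ordering matters: downstream code diffs consecutive backups
        files = sorted(export_dir.glob('*.json'))
        # fail fast; an empty tuple would silently yield no saves/comments/etc.
        assert len(files) > 0, f'no reddit exports found under {export_dir}'
        return tuple(files)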
@@ -32,30 +30,30 @@ Upvote = rexport.Upvote

 def dal() -> rexport.DAL:
-    # TODO lru cache? but be careful when it runs continuously
-    return rexport.DAL(get_sources())
+    return rexport.DAL(inputs())

-@mcachew(hashf=lambda: get_sources())
+@mcachew(hashf=lambda: inputs())
 def saved() -> Iterator[Save]:
     return dal().saved()

-@mcachew(hashf=lambda: get_sources())
+@mcachew(hashf=lambda: inputs())
 def comments() -> Iterator[Comment]:
     return dal().comments()

-@mcachew(hashf=lambda: get_sources())
+@mcachew(hashf=lambda: inputs())
 def submissions() -> Iterator[Submission]:
     return dal().submissions()

-@mcachew(hashf=lambda: get_sources())
+@mcachew(hashf=lambda: inputs())
 def upvoted() -> Iterator[Upvote]:
     return dal().upvoted()

+### the rest of the file is some elaborate attempt of restoring favorite/unfavorite times

 from typing import Dict, Union, Iterable, Iterator, NamedTuple, Any
 from functools import lru_cache
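The `hashf=lambda: inputs()` pattern is worth spelling out: the decorator uses the current list of export files as the cache key, so cached results are invalidated exactly when a new backup appears, and the lambda defers the `inputs()` call from import time to lookup time. A rough sketch of the idea behind such a decorator (hypothetical; the real `mcachew`/cachew persists to disk and does considerably more):

    from functools import wraps
    from typing import Any, Callable, Dict, Iterator

    def cached_by(hashf: Callable[[], Any]):
        def decorator(fn: Callable[..., Iterator[Any]]):
            cache: Dict[Any, list] = {}
            @wraps(fn)
            def wrapper(*args, **kwargs) -> Iterator[Any]:
                key = hashf()  # evaluated on every call, not at definition time
                if key not in cache:
                    cache[key] = list(fn(*args, **kwargs))  # materialize the iterator once
                yield from cache[key]
            return wrapper
        return decorator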
@@ -115,10 +113,11 @@ def _get_state(bfile: Path) -> Dict[Sid, SaveWithDt]:
         key=lambda s: s.save.sid,
     )

+# TODO hmm. think about it.. if we set default backups=inputs()
+# it's called early so it ends up as a global variable that we can't monkey patch easily
 @mcachew('/L/data/.cache/reddit-events.cache')
-def _get_events(backups: Sequence[Path]=get_sources(), parallel: bool=True) -> Iterator[Event]:
+def _get_events(backups: Sequence[Path], parallel: bool=True) -> Iterator[Event]:
     # TODO cachew: let it transform return type? so you don't have to write a wrapper for lists?
+    # parallel = False # NOTE: eh, not sure if still necessary? I think glumov didn't like it?
     prev_saves: Mapping[Sid, SaveWithDt] = {}
     # TODO suppress first batch??
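The signature change here is the heart of the commit: `backups: Sequence[Path]=get_sources()` ran `get_sources()` once, at function definition (i.e. module import) time, so later changes to `config.export_dir`, such as the monkeypatching done by the test fixture below, were silently ignored. A minimal demonstration of the evaluation-time difference:

    import time

    def eager(ts=time.time()):      # default computed once, at definition time
        return ts

    def lazy(ts=None):              # default computed on each call
        return time.time() if ts is None else ts

    a = eager(); time.sleep(0.01); b = eager()
    assert a == b                   # frozen at import
    c = lazy(); time.sleep(0.01); d = lazy()
    assert c < d                    # reflects call-time state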
@@ -168,55 +167,18 @@ def _get_events(backups: Sequence[Path]=get_sources(), parallel: bool=True) -> Iterator[Event]:
     # TODO a bit awkward, favorited should compare lower than unfavorited?

 @lru_cache(1)
-def get_events(*args, **kwargs) -> List[Event]:
-    evit = _get_events(*args, **kwargs)
+def events(*args, **kwargs) -> List[Event]:
+    evit = _get_events(inputs(), *args, **kwargs)
     return list(sorted(evit, key=lambda e: e.cmp_key))

+##
-
-def test() -> None:
-    get_events(backups=get_sources()[-1:])
-    list(saved())
-
-def test_unfav() -> None:
-    events = get_events()
-    url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
-    uevents = [e for e in events if e.url == url]
-    assert len(uevents) == 2
-    ff = uevents[0]
-    assert ff.text == 'favorited'
-    uf = uevents[1]
-    assert uf.text == 'unfavorited'
-
-# TODO move out..
-def test_get_all_saves() -> None:
-    # TODO not sure if this is necesasry anymore?
-    saves = list(saved())
-    # just check that they are unique..
-    make_dict(saves, key=lambda s: s.sid)
-
-def test_disappearing() -> None:
-    # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
-    # but I guess it was just a short glitch... so whatever
-    saves = get_events()
-    favs = [s.kind for s in saves if s.text == 'favorited']
-    [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"']
-    assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc)
-
-def test_unfavorite() -> None:
-    events = get_events()
-    unfavs = [s for s in events if s.text == 'unfavorited']
-    [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
-    assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc)

 def main() -> None:
     # TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM...
-    events = get_events(parallel=False)
-    print(len(events))
-    for e in events:
+    el = events(parallel=False)
+    print(len(el))
+    for e in el:
         print(e.text, e.url)
     # for e in get_
     # 509 with urls..
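One detail that keeps the `get_events` rename correct: `_get_events` is a generator, and a generator can only be consumed once, so the `@lru_cache(1)` wrapper must materialize it into a list before caching, which `list(sorted(...))` does. A toy illustration of why caching the bare iterator would break:

    from functools import lru_cache
    from typing import Iterator, List

    def _gen() -> Iterator[int]:
        yield from [3, 1, 2]

    @lru_cache(1)
    def cached() -> List[int]:
        return sorted(_gen())          # materialized, safe to return repeatedly

    assert cached() == [1, 2, 3]
    assert cached() == [1, 2, 3]       # second call served from the cache

If `cached` returned the iterator itself, the second call would hand back an already-exhausted iterator.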
@@ -226,3 +188,8 @@ def main() -> None:

 if __name__ == '__main__':
     main()
+
+# TODO deprecate...
+get_sources = inputs
+get_events = events
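The two trailing assignments keep old imports working after the rename; anything that does `from my.reddit import get_events` still resolves. If callers should also get a runtime nudge, a common variant (hypothetical here, not what the commit does) wraps the old name with a `DeprecationWarning`:

    import warnings
    from typing import Any, Callable

    def _deprecated_alias(new_fn: Callable[..., Any], old_name: str) -> Callable[..., Any]:
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            warnings.warn(f'{old_name} is deprecated, use {new_fn.__name__}()',
                          DeprecationWarning, stacklevel=2)
            return new_fn(*args, **kwargs)
        return wrapper

    # e.g.: get_sources = _deprecated_alias(inputs, 'get_sources')
    # (plain aliasing, as in the commit, is the lighter-weight choice)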

tests/reddit.py (View file)

@@ -1,4 +1,57 @@
-# ugh. workaround for https://github.com/pytest-dev/pytest/issues/1927
-from my.reddit import *
-
-# TODO for reddit test, patch up to take every 10th archive or something; but make sure it's deterministic
+from datetime import datetime
+import pytz
+
+from my.reddit import events, inputs, saved
+from my.common import make_dict
+
+
+def test() -> None:
+    list(events())
+    list(saved())
+
+
+def test_unfav() -> None:
+    ev = events()
+    url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
+    uev = [e for e in ev if e.url == url]
+    assert len(uev) == 2
+    ff = uev[0]
+    # TODO could recover these from takeout perhaps?
+    assert ff.text == 'favorited [initial]'
+    uf = uev[1]
+    assert uf.text == 'unfavorited'
+
+
+def test_saves() -> None:
+    # TODO not sure if this is necessary anymore?
+    saves = list(saved())
+    # just check that they are unique..
+    make_dict(saves, key=lambda s: s.sid)
+
+
+def test_disappearing() -> None:
+    # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
+    # but I guess it was just a short glitch... so whatever
+    saves = events()
+    favs = [s.kind for s in saves if s.text == 'favorited']
+    [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"']
+    assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc)
+
+
+def test_unfavorite() -> None:
+    evs = events()
+    unfavs = [s for s in evs if s.text == 'unfavorited']
+    [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
+    assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc)
+
+
+import pytest  # type: ignore
+
+
+@pytest.fixture(autouse=True, scope='module')
+def prepare():
+    from my.common import get_files
+    from my.config import reddit as config
+    files = get_files(config.export_dir)
+    # use fewer files to make the test faster
+    # the first slice is for test_unfavorite, the second is for test_disappearing
+    files = files[300:330] + files[500:520]
+    config.export_dir = files  # type: ignore
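Note the fixture only works because of the `_get_events` change above: `inputs()` is now called inside `events()`, after the autouse fixture has already swapped `config.export_dir` for a small slice of files. One rough edge: it mutates the config module and never restores it, which is fine for a module-scoped run but could leak into other test modules. A sketch of a restoring variant, under the same assumptions about `my.config`:

    import pytest  # type: ignore

    @pytest.fixture(autouse=True, scope='module')
    def prepare():
        from my.common import get_files
        from my.config import reddit as config
        orig = config.export_dir
        files = get_files(config.export_dir)
        config.export_dir = files[300:330] + files[500:520]  # type: ignore
        yield  # run the module's tests against the reduced set
        config.export_dir = orig  # put the original config back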