cleanup for reddit data provider

parent 5d3c0bdb1f
commit 069732600c
2 changed files with 78 additions and 58 deletions

my/reddit.py
@@ -1,8 +1,6 @@
"""
Reddit data: saved items/comments/upvotes/etc.
"""
from . import init

from pathlib import Path
from typing import List, Sequence, Mapping, Iterator

@@ -13,14 +11,14 @@ from my.config import reddit as config
import my.config.repos.rexport.dal as rexport


def get_sources() -> Sequence[Path]:
def inputs() -> Sequence[Path]:
    # TODO rename to export_path?
    files = get_files(config.export_dir)
    # TODO Cpath better be automatic by get_files...
    res = list(map(CPath, files)); assert len(res) > 0
    # todo move the assert to get_files?
    return tuple(res)


logger = LazyLogger(__name__, level='debug')

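A minimal sketch of exercising the renamed accessor, assuming the reddit export is configured via my.config (the loop is only illustrative, not part of the module):

    from my.reddit import inputs

    for path in inputs():
        print(path)  # each entry points at one rexport backup file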
@@ -32,30 +30,30 @@ Upvote = rexport.Upvote


def dal() -> rexport.DAL:
    # TODO lru cache? but be careful when it runs continuously
    return rexport.DAL(get_sources())
    return rexport.DAL(inputs())


@mcachew(hashf=lambda: get_sources())
@mcachew(hashf=lambda: inputs())
def saved() -> Iterator[Save]:
    return dal().saved()


@mcachew(hashf=lambda: get_sources())
@mcachew(hashf=lambda: inputs())
def comments() -> Iterator[Comment]:
    return dal().comments()


@mcachew(hashf=lambda: get_sources())
@mcachew(hashf=lambda: inputs())
def submissions() -> Iterator[Submission]:
    return dal().submissions()


@mcachew(hashf=lambda: get_sources())
@mcachew(hashf=lambda: inputs())
def upvoted() -> Iterator[Upvote]:
    return dal().upvoted()


### the rest of the file is some elaborate attempt of restoring favorite/unfavorite times

from typing import Dict, Union, Iterable, Iterator, NamedTuple, Any
from functools import lru_cache

@@ -115,10 +113,11 @@ def _get_state(bfile: Path) -> Dict[Sid, SaveWithDt]:
        key=lambda s: s.save.sid,
    )

# TODO hmm. think about it.. if we set default backups=inputs()
# it's called early so it ends up as a global variable that we can't monkey patch easily
@mcachew('/L/data/.cache/reddit-events.cache')
def _get_events(backups: Sequence[Path]=get_sources(), parallel: bool=True) -> Iterator[Event]:
def _get_events(backups: Sequence[Path], parallel: bool=True) -> Iterator[Event]:
    # TODO cachew: let it transform return type? so you don't have to write a wrapper for lists?
    # parallel = False # NOTE: eh, not sure if still necessary? I think glumov didn't like it?

    prev_saves: Mapping[Sid, SaveWithDt] = {}
    # TODO suppress first batch??
@@ -168,55 +167,18 @@ def _get_events(backups: Sequence[Path]=get_sources(), parallel: bool=True) -> Iterator[Event]:
        # TODO a bit awkward, favorited should compare lower than unfavorited?

@lru_cache(1)
def get_events(*args, **kwargs) -> List[Event]:
    evit = _get_events(*args, **kwargs)
def events(*args, **kwargs) -> List[Event]:
    evit = _get_events(inputs(), *args, **kwargs)
    return list(sorted(evit, key=lambda e: e.cmp_key))


def test() -> None:
    get_events(backups=get_sources()[-1:])
    list(saved())


def test_unfav() -> None:
    events = get_events()
    url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
    uevents = [e for e in events if e.url == url]
    assert len(uevents) == 2
    ff = uevents[0]
    assert ff.text == 'favorited'
    uf = uevents[1]
    assert uf.text == 'unfavorited'

# TODO move out..
def test_get_all_saves() -> None:
    # TODO not sure if this is necesasry anymore?
    saves = list(saved())
    # just check that they are unique..
    make_dict(saves, key=lambda s: s.sid)


def test_disappearing() -> None:
    # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
    # but I guess it was just a short glitch... so whatever
    saves = get_events()
    favs = [s.kind for s in saves if s.text == 'favorited']
    [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"']
    assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc)


def test_unfavorite() -> None:
    events = get_events()
    unfavs = [s for s in events if s.text == 'unfavorited']
    [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
    assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc)
##


def main() -> None:
    # TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM...
    events = get_events(parallel=False)
    print(len(events))
    for e in events:
    el = events(parallel=False)
    print(len(el))
    for e in el:
        print(e.text, e.url)
    # for e in get_
    # 509 with urls..
@@ -226,3 +188,8 @@ def main() -> None:

if __name__ == '__main__':
    main()

# TODO deprecate...

get_sources = inputs
get_events = events
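The aliases at the end keep the old names importable while the renamed functions become the primary API; a small sketch of what that implies for existing call sites:

    import my.reddit as reddit

    assert reddit.get_sources is reddit.inputs   # the old name is now just an alias
    assert reddit.get_events is reddit.events
    # old-style calls such as reddit.get_events() keep working and delegate to events()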
@@ -1,4 +1,57 @@
# ugh. workaround for https://github.com/pytest-dev/pytest/issues/1927
from my.reddit import *
from datetime import datetime
import pytz

# TODO for reddit test, patch up to take every 10th archive or something; but make sure it's deterministic
from my.reddit import events, inputs, saved
from my.common import make_dict


def test() -> None:
    list(events())
    list(saved())


def test_unfav() -> None:
    ev = events()
    url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
    uev = [e for e in ev if e.url == url]
    assert len(uev) == 2
    ff = uev[0]
    # TODO could recover these from takeout perhaps?
    assert ff.text == 'favorited [initial]'
    uf = uev[1]
    assert uf.text == 'unfavorited'


def test_saves() -> None:
    # TODO not sure if this is necesasry anymore?
    saves = list(saved())
    # just check that they are unique..
    make_dict(saves, key=lambda s: s.sid)


def test_disappearing() -> None:
    # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
    # but I guess it was just a short glitch... so whatever
    saves = events()
    favs = [s.kind for s in saves if s.text == 'favorited']
    [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"']
    assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc)


def test_unfavorite() -> None:
    evs = events()
    unfavs = [s for s in evs if s.text == 'unfavorited']
    [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
    assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc)


import pytest  # type: ignore
@pytest.fixture(autouse=True, scope='module')
def prepare():
    from my.common import get_files
    from my.config import reddit as config
    files = get_files(config.export_dir)
    # use less files for the test to make it faster
    # first bit is for 'test_unfavorite, the second is for test_disappearing
    files = files[300:330] + files[500:520]
    config.export_dir = files  # type: ignore
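The prepare fixture relies on inputs() re-reading config.export_dir on every call (now that _get_events no longer captures it as a default argument), so reassigning the config attribute is enough to shrink the test data. A hedged sketch of the same idea outside pytest, with the subset chosen arbitrarily:

    from my.config import reddit as config
    from my.reddit import inputs

    config.export_dir = list(inputs())[:5]   # hypothetical: keep only a few backups
    print(len(inputs()))                     # subsequent calls see the patched value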