my.reddit: refactor into module that supports pushshift/gdpr (#179)

* initial pushshift/rexport merge implementation, using id for merging * smarter module deprecation warning using regex * add `RedditBase` from promnesia * `import_source` helper for gracefully handling mixin data sources
2021-10-31 13:39:04 -07:00 · 2021-10-31 13:39:04 -07:00 · 8422c6e420
commit 8422c6e420
parent b54ec0d7f1
15 changed files with 374 additions and 58 deletions
--- a/my/reddit/init.py
+++ b/my/reddit/init.py
@ -0,0 +1,41 @@
+"""
+This is here temporarily, for backwards compatibility purposes
+It should be removed in the future, and you should replace any imports
+like:
+from my.reddit import ...
+to:
+from my.reddit.all import ...
+since that allows for easier overriding using namespace packages
+https://github.com/karlicoss/HPI/issues/102
+"""
+
+# For now, including this here, since importing the module
+# causes .rexport to be imported, which requires rexport
+REQUIRES = [
+    'git+https://github.com/karlicoss/rexport',
+]
+
+import re
+import traceback
+
+# Inspect the current call stack (a bit of a hack) to find out whether
+# whoever triggered this import is still using the old import style.
+warn = False
+for f in traceback.extract_stack():
+    line = f.line or ''  # just in case it is None
+
+    # the most common ways this module used to be imported
+    if (
+        'import my.reddit ' in (line + ' ')
+        or 'from my import reddit' in line
+        or re.match(r"from my\.reddit\simport\s(comments|saved|submissions|upvoted)", line)
+    ):
+        warn = True
+
+# TODO: add link to instructions to migrate
+if warn:
+    from my.core import warnings as W
+    W.high("DEPRECATED! Instead of my.reddit, import from my.reddit.all instead.")
+
+
+from .rexport import *
--- a/my/reddit/all.py
+++ b/my/reddit/all.py
@ -0,0 +1,68 @@
+from typing import Iterator
+from my.core.common import Stats
+from my.core.source import import_source
+
+from .common import Save, Upvote, Comment, Submission, _merge_comments
+
+# Man... ideally an all.py file isn't this verbose, but
+# reddit just feels like that much of a complicated source and
+# data acquired by different methods isn't the same
+
+### 'safe importers' -- falls back to empty data if the module couldn't be found
+# Each decorator below is bound to one optional data-source module. The
+# generators it wraps import that module inside their own bodies, so the
+# import only happens once a generator is actually iterated.
+rexport_src = import_source(module_name="my.reddit.rexport")
+pushshift_src = import_source(module_name="my.reddit.pushshift")
+
+@rexport_src
+def _rexport_comments() -> Iterator[Comment]:
+    """Yield comments from the rexport module, importing it on first iteration."""
+    from . import rexport as source
+    items = source.comments()
+    yield from items
+
+@rexport_src
+def _rexport_submissions() -> Iterator[Submission]:
+    """Yield submissions from the rexport module, importing it on first iteration."""
+    from . import rexport as source
+    items = source.submissions()
+    yield from items
+
+@rexport_src
+def _rexport_saved() -> Iterator[Save]:
+    """Yield saved items from the rexport module, importing it on first iteration."""
+    from . import rexport as source
+    items = source.saved()
+    yield from items
+
+@rexport_src
+def _rexport_upvoted() -> Iterator[Upvote]:
+    """Yield upvoted items from the rexport module, importing it on first iteration."""
+    from . import rexport as source
+    items = source.upvoted()
+    yield from items
+
+@pushshift_src
+def _pushshift_comments() -> Iterator[Comment]:
+    """Yield comments from the pushshift module, importing it on first iteration."""
+    from .pushshift import comments as pushshift_comments
+    items = pushshift_comments()
+    yield from items
+
+# Merged functions
+
+def comments() -> Iterator[Comment]:
+    """Comments from rexport and pushshift, passed through ``_merge_comments``."""
+    # TODO: merge gdpr here
+    from_rexport = _rexport_comments()
+    from_pushshift = _pushshift_comments()
+    yield from _merge_comments(from_rexport, from_pushshift)
+
+def submissions() -> Iterator[Submission]:
+    """Submissions; rexport is currently the only source."""
+    # TODO: merge gdpr here
+    rexport_items = _rexport_submissions()
+    yield from rexport_items
+
+@rexport_src
+def saved() -> Iterator[Save]:
+    """Saved items, taken from rexport only."""
+    # aliased so the imported function does not shadow this one by name
+    from .rexport import saved as rexport_saved
+    yield from rexport_saved()
+
+@rexport_src
+def upvoted() -> Iterator[Upvote]:
+    """Upvoted items, taken from rexport only."""
+    # aliased so the imported function does not shadow this one by name
+    from .rexport import upvoted as rexport_upvoted
+    yield from rexport_upvoted()
+
+def stats() -> Stats:
+    """Combine the ``stat`` output of every merged accessor into one mapping."""
+    from my.core import stat
+    combined: Stats = {}
+    # same order as before; on a key clash the later accessor wins
+    for accessor in (saved, comments, submissions, upvoted):
+        combined.update(stat(accessor))
+    return combined
+
--- a/my/reddit/common.py
+++ b/my/reddit/common.py
@ -0,0 +1,72 @@
+"""
+This defines Protocol classes, which make sure that each different
+type of shared models have a standardized interface
+"""
+
+from typing import Dict, Any, Set, Iterator, TYPE_CHECKING
+from itertools import chain
+
+from my.core.common import datetime_aware
+
+Json = Dict[str, Any]
+
+# Protocol is only needed by the type checker. At runtime it is replaced by
+# plain `object`, so neither `typing.Protocol` nor `typing_extensions`
+# has to be importable when this module is loaded.
+if TYPE_CHECKING:
+    try:
+        from typing import Protocol
+    except ImportError:
+        # requirement of mypy
+        from typing_extensions import Protocol  # type: ignore[misc]
+else:
+    Protocol = object
+
+
+# common fields across all the Protocol classes, so generic code can be written
+class RedditBase(Protocol):
+    """Read-only fields shared by every reddit model declared in this module."""
+    @property
+    def raw(self) -> Json: ...
+    @property
+    def created(self) -> datetime_aware: ...
+    @property
+    def id(self) -> str: ...
+    @property
+    def url(self) -> str: ...
+    @property
+    def text(self) -> str: ...
+
+
+# Note: doesn't include GDPR Save's since they don't have the same metadata
+class Save(RedditBase, Protocol):
+    """A saved item: the common reddit fields plus the subreddit it belongs to."""
+    # NOTE: RedditBase has to come before Protocol. RedditBase already derives
+    # from Protocol, so listing Protocol first makes class creation fail with
+    # "Cannot create a consistent method resolution order (MRO)".
+    @property
+    def subreddit(self) -> str: ...
+
+# Note: doesn't include GDPR Upvote's since they don't have the same metadata
+class Upvote(RedditBase, Protocol):
+    """An upvoted item: the common reddit fields plus a title."""
+    # NOTE: RedditBase has to come before Protocol. RedditBase already derives
+    # from Protocol, so listing Protocol first makes class creation fail with
+    # "Cannot create a consistent method resolution order (MRO)".
+    @property
+    def title(self) -> str: ...
+
+
+# From rexport, pushshift and the reddit GDPR export
+class Comment(RedditBase, Protocol):
+    """A comment; it adds no fields beyond the common reddit ones."""
+    # NOTE: RedditBase has to come before Protocol. RedditBase already derives
+    # from Protocol, so listing Protocol first makes class creation fail with
+    # "Cannot create a consistent method resolution order (MRO)".
+    pass
+
+
+# From rexport and the GDPR export
+class Submission(RedditBase, Protocol):
+    """A submission: the common reddit fields plus a title."""
+    # NOTE: RedditBase has to come before Protocol. RedditBase already derives
+    # from Protocol, so listing Protocol first makes class creation fail with
+    # "Cannot create a consistent method resolution order (MRO)".
+    @property
+    def title(self) -> str: ...
+
+
+def _merge_comments(*sources: Iterator[Comment]) -> Iterator[Comment]:
+    """
+    Chain all sources in the order given and drop duplicate comments.
+
+    Two comments are duplicates when their ``id`` is equal; the first one
+    encountered is yielded and later ones are skipped.
+    """
+    # TODO: optionally log which / how many duplicates were ignored
+    seen: Set[str] = set()
+    for comment in chain.from_iterable(sources):
+        comment_id = comment.id
+        if comment_id not in seen:
+            yield comment
+            seen.add(comment_id)
+
--- a/my/reddit/pushshift.py
+++ b/my/reddit/pushshift.py
@ -0,0 +1,48 @@
+"""
+Gives you access to older comments possibly not accessible with rexport
+using pushshift
+See https://github.com/seanbreckenridge/pushshift_comment_export
+"""
+
+REQUIRES = [
+    "git+https://github.com/seanbreckenridge/pushshift_comment_export",
+]
+
+from my.core.common import Paths, Stats
+from dataclasses import dataclass
+from my.core.cfg import make_config
+
+from my.config import reddit as uconfig
+
+# Extends the `pushshift` section of the user's `my.config.reddit`
+@dataclass
+class pushshift_config(uconfig.pushshift):
+    '''
+    Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
+    '''
+
+    # path[s]/glob to the exported JSON data
+    export_path: Paths
+
+# module-level config object; `inputs()` reads `export_path` from it
+config = make_config(pushshift_config)
+
+from my.core import get_files
+from typing import Sequence, Iterator
+from pathlib import Path
+
+from pushshift_comment_export.dal import read_file, PComment
+
+
+def inputs() -> Sequence[Path]:
+    """Resolve ``config.export_path`` into export files via ``get_files``."""
+    export_files = get_files(config.export_path)
+    return export_files
+
+
+def comments() -> Iterator[PComment]:
+    """Yield the comments read from each input file, one file after another."""
+    for export_file in inputs():
+        parsed = read_file(export_file)
+        yield from parsed
+
+def stats() -> Stats:
+    """Return the ``stat`` summary for ``comments``."""
+    from my.core import stat
+    summary: Stats = {}
+    summary.update(stat(comments))
+    return summary
+
--- a/my/reddit/rexport.py
+++ b/my/reddit/rexport.py
@ -0,0 +1,247 @@
+"""
+Reddit data: saved items/comments/upvotes/etc.
+"""
+REQUIRES = [
+    'git+https://github.com/karlicoss/rexport',
+]
+
+from my.core.common import Paths
+from dataclasses import dataclass
+from typing import Any
+
+from my.config import reddit as uconfig
+
+
+# Extends the user's `my.config.reddit`. `migration` (defined below in this
+# file) fills in `export_path` for both the new and the legacy config layout.
+@dataclass
+class reddit(uconfig):
+    '''
+    Uses [[https://github.com/karlicoss/rexport][rexport]] output.
+    '''
+
+    # path[s]/glob to the exported JSON data
+    export_path: Paths
+
+
+from my.core.cfg import make_config, Attrs
+# hmm, also nice thing about this is that migration is possible to test without the rest of the config?
+def migration(attrs: Attrs) -> Attrs:
+    """
+    Map the user's config attributes onto the ``export_path`` field of ``reddit``.
+
+    - new layout: a nested ``rexport`` class is present, and its
+      ``export_path`` is copied to the top level
+    - old layout: a deprecation warning is emitted, and if the legacy
+      ``export_dir`` attribute is present it is copied to ``export_path``
+
+    ``attrs`` is modified in place and also returned.
+    """
+    # new structure, take top-level config and extract 'rexport' class
+    if 'rexport' in attrs:
+        ex: uconfig.rexport = attrs['rexport']
+        attrs['export_path'] = ex.export_path
+    else:
+        from my.core.warnings import high
+        high("""DEPRECATED! Please modify your reddit config to look like:
+
+class reddit:
+    class rexport:
+        export_path: Paths = '/path/to/rexport/data'
+            """)
+        export_dir = 'export_dir'
+        if export_dir in attrs: # legacy name
+            attrs['export_path'] = attrs[export_dir]
+            # the message previously ended with a stray double quote
+            high(f'"{export_dir}" is deprecated! Please use "export_path" instead.')
+    return attrs
+
+# module-level config object; the migration hook is passed to make_config
+config = make_config(reddit, migration=migration)
+
+###
+# TODO not sure about the laziness...
+
+# Prefer the installed `rexport` package. If it cannot be found, the error is
+# handed to pre_pip_dal_handler together with the config and REQUIRES
+# (presumably a fallback for setups predating pip installs -- see my.core.compat).
+try:
+    from rexport import dal
+except ModuleNotFoundError as e:
+    from my.core.compat import pre_pip_dal_handler
+    dal = pre_pip_dal_handler('rexport', e, config, requires=REQUIRES)
+# TODO ugh. this would import too early
+# but on the other hand we do want to bring the objects into the scope for easier imports, etc. ugh!
+# ok, fair enough I suppose. It makes sense to configure something before using it. can always figure it out later..
+# maybe, the config could dynamically detect change and reimport itself? dunno.
+###
+
+############################
+
+from typing import List, Sequence, Mapping, Iterator, Any
+from my.core.common import mcachew, get_files, LazyLogger, make_dict, Stats
+
+
+logger = LazyLogger(__name__, level='debug')
+
+
+from pathlib import Path
+def inputs() -> Sequence[Path]:
+    """Resolve ``config.export_path`` into export files via ``get_files``."""
+    export_files = get_files(config.export_path)
+    return export_files
+
+
+# Re-export the rexport DAL types under this module's namespace
+Uid        = dal.Sid  # str
+Save       = dal.Save
+Comment    = dal.Comment
+Submission = dal.Submission
+Upvote     = dal.Upvote
+
+
+def _dal() -> dal.DAL:
+    """Build a rexport DAL over the current list of input files."""
+    return dal.DAL(list(inputs()))
+# shared cache decorator for the accessors below
+cache = mcachew(depends_on=inputs) # depends on inputs only
+
+
+@cache
+def saved() -> Iterator[Save]:
+    """Saved items from a freshly built DAL (wrapped by ``cache``)."""
+    data = _dal()
+    return data.saved()
+
+
+@cache
+def comments() -> Iterator[Comment]:
+    """Comments from a freshly built DAL (wrapped by ``cache``)."""
+    data = _dal()
+    return data.comments()
+
+
+@cache
+def submissions() -> Iterator[Submission]:
+    """Submissions from a freshly built DAL (wrapped by ``cache``)."""
+    data = _dal()
+    return data.submissions()
+
+
+@cache
+def upvoted() -> Iterator[Upvote]:
+    """Upvoted items from a freshly built DAL (wrapped by ``cache``)."""
+    data = _dal()
+    return data.upvoted()
+
+
+### the rest of the file is some elaborate attempt of restoring favorite/unfavorite times
+
+from typing import Dict, Iterable, Iterator, NamedTuple
+from functools import lru_cache
+import pytz
+import re
+from datetime import datetime
+from multiprocessing import Pool
+
+# TODO hmm. apparently decompressing takes quite a bit of time...
+
+class SaveWithDt(NamedTuple):
+    """A saved item paired with the timestamp of the backup it was read from."""
+    save: Save
+    backup_dt: datetime
+
+    def __getattr__(self, x):
+        # Only reached when normal lookup fails, i.e. for anything other than
+        # the tuple's own fields: forward the attribute to the wrapped save.
+        return getattr(self.save, x)
+
+# TODO for future events?
+EventKind = SaveWithDt
+
+
+class Event(NamedTuple):
+    """A favorited/unfavorited event reconstructed from the backups."""
+    dt: datetime
+    text: str
+    kind: EventKind
+    eid: str
+    title: str
+    url: str
+
+    @property
+    def cmp_key(self):
+        """Sort key: by time, then events mentioning 'unfavorited' after the rest."""
+        is_unfavorite = 'unfavorited' in self.text
+        return (self.dt, int(is_unfavorite))
+
+
+Url = str
+
+def _get_bdate(bfile: Path) -> datetime:
+    """
+    Parse the backup timestamp embedded in a backup file's name.
+
+    The file stem must contain ``reddit``, any one character, then 14 digits
+    (``%Y%m%d%H%M%S``); the result is localized to UTC.
+    """
+    # adapt for arctee: drop the 'T' / 'Z' markers before matching
+    name = bfile.stem.replace('T', '').replace('Z', '')
+    found = re.search(r'reddit.(\d{14})', name)
+    assert found is not None
+    naive = datetime.strptime(found.group(1), "%Y%m%d%H%M%S")
+    return pytz.utc.localize(naive)
+
+
+def _get_state(bfile: Path) -> Dict[Uid, SaveWithDt]:
+    """
+    Read the saved items of a single backup file.
+
+    Each save is tagged with the backup's timestamp; the saves are ordered by
+    creation time and handed to ``make_dict`` keyed by their ``sid``.
+    """
+    logger.debug('handling %s', bfile)
+
+    backup_dt = _get_bdate(bfile)
+    backup_dal = dal.DAL([bfile])
+
+    stamped = []
+    for item in backup_dal.saved():
+        stamped.append(SaveWithDt(item, backup_dt))
+    stamped.sort(key=lambda entry: entry.save.created)
+    return make_dict(stamped, key=lambda entry: entry.save.sid)
+
+# TODO hmm. think about it.. if we set default backups=inputs()
+# it's called early so it ends up as a global variable that we can't monkey patch easily
+@mcachew
+def _get_events(backups: Sequence[Path], parallel: bool=True) -> Iterator[Event]:
+    """
+    Reconstruct favorited/unfavorited events by diffing consecutive backups.
+
+    For every backup, the saved-item ids are compared with those of the
+    previous backup. Ids that disappeared yield an "unfavorited" event dated
+    at the backup time; ids that appeared yield a "favorited" event. For the
+    first backup the favorited events use the item's creation time and are
+    marked "[initial]".
+
+    backups: backup files, compared in the order given.
+    parallel: if true, load the per-backup states with a multiprocessing
+        Pool; otherwise load them lazily with ``map``.
+
+    Events are not sorted here; within one backup they come out in set
+    iteration order.
+    """
+    # todo cachew: let it transform return type? so you don't have to write a wrapper for lists?
+
+    prev_saves: Mapping[Uid, SaveWithDt] = {}
+    # TODO suppress first batch??
+    # TODO for initial batch, treat event time as creation time
+
+    states: Iterable[Mapping[Uid, SaveWithDt]]
+    if parallel:
+        with Pool() as p:
+            states = p.map(_get_state, backups)
+    else:
+        # also make it lazy...
+        states = map(_get_state, backups)
+    # TODO mm, need to make that iterative too?
+
+    for i, (bfile, saves) in enumerate(zip(backups, states)):
+        bdt = _get_bdate(bfile)
+
+        first = i == 0
+
+        # ids present in exactly one of (previous backup, current backup)
+        for key in set(prev_saves.keys()).symmetric_difference(set(saves.keys())):
+            ps = prev_saves.get(key, None)
+            if ps is not None:
+                # was in the previous backup but is gone now
+                # TODO use backup date, that is more precise...
+                # eh. I guess just take max and it will always be correct?
+                assert not first
+                yield Event(
+                    dt=bdt, # TODO average with ps.save_dt?
+                    text="unfavorited",
+                    kind=ps,
+                    eid=f'unf-{ps.sid}',
+                    url=ps.url,
+                    title=ps.title,
+                )
+            else: # already in saves
+                s = saves[key]
+                last_saved = s.backup_dt
+                yield Event(
+                    dt=s.created if first else last_saved,
+                    text=f"favorited{' [initial]' if first else ''}",
+                    kind=s,
+                    eid=f'fav-{s.sid}',
+                    url=s.url,
+                    title=s.title,
+                )
+        prev_saves = saves
+
+    # TODO a bit awkward, favorited should compare lower than unfavorited?
+
+@lru_cache(1)
+def events(*args, **kwargs) -> List[Event]:
+    """
+    Return all reconstructed events, sorted by ``Event.cmp_key``.
+
+    Extra arguments are forwarded to ``_get_events``; the most recent result
+    is memoized by ``lru_cache``.
+    """
+    backups = inputs()
+    # 2.2s for 300 files without cachew
+    # 0.2s for 300 files with cachew
+    unsorted = _get_events(backups, *args, **kwargs) # type: ignore[call-arg]
+    # todo mypy is confused here and thinks it's iterable of Path? perhaps something to do with mcachew?
+    ordered = sorted(unsorted, key=lambda ev: ev.cmp_key) # type: ignore[attr-defined,arg-type]
+    return ordered
+
+
+def stats() -> Stats:
+    """Combine the ``stat`` output of every rexport accessor into one mapping."""
+    from my.core import stat
+    combined: Stats = {}
+    # same order as before; on a key clash the later accessor wins
+    for accessor in (saved, comments, submissions, upvoted):
+        combined.update(stat(accessor))
+    return combined
+
+
+def main() -> None:
+    """Print every reconstructed event, loading backups without multiprocessing."""
+    all_events = events(parallel=False)
+    for event in all_events:
+        print(event)
+
+
+if __name__ == '__main__':
+    main()
+