my.reddit: refactor into module that supports pushshift/gdpr (#179)

* initial pushshift/rexport merge implementation, using id for merging
* smarter module deprecation warning using regex
* add `RedditBase` from promnesia
* `import_source` helper for gracefully handling mixin data sources
This commit is contained in:
Sean Breckenridge 2021-10-31 13:39:04 -07:00 committed by GitHub
parent b54ec0d7f1
commit 8422c6e420
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 374 additions and 58 deletions

48
my/reddit/pushshift.py Normal file
View file

@ -0,0 +1,48 @@
"""
Gives you access to older comments possibly not accessible with rexport
using pushshift
See https://github.com/seanbreckenridge/pushshift_comment_export
"""
# External package(s) this module imports from (see the `pushshift_comment_export.dal` import below),
# given as an installable git URL.
# NOTE(review): presumably read by the project's module/dependency tooling -- confirm against the caller.
REQUIRES = [
    "git+https://github.com/seanbreckenridge/pushshift_comment_export",
]
from my.core.common import Paths, Stats
from dataclasses import dataclass
from my.core.cfg import make_config
from my.config import reddit as uconfig
@dataclass
class pushshift_config(uconfig.pushshift):
    '''
    Configuration for the pushshift comment source, extending the user's ``my.config.reddit.pushshift`` section.

    The data comes from [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]], which gives access to old comments
    '''

    # one or more paths, or a glob, pointing at the exported JSON data
    export_path: Paths
# Module-level config object, built once at import time by passing pushshift_config to make_config.
config = make_config(pushshift_config)
from my.core import get_files
from typing import Sequence, Iterator
from pathlib import Path
from pushshift_comment_export.dal import read_file, PComment
def inputs() -> Sequence[Path]:
    """Return the export files selected by ``config.export_path``.

    The configured value is handed to ``get_files`` unchanged, and its
    result is returned as-is.
    """
    export_location = config.export_path
    return get_files(export_location)
def comments() -> Iterator[PComment]:
    """Lazily yield every comment found in the files listed by ``inputs()``.

    Files are processed one at a time, in the order ``inputs()`` returns
    them; each is parsed with ``read_file``. No merging or de-duplication
    across files happens here.
    """
    for export_file in inputs():
        for comment in read_file(export_file):
            yield comment
def stats() -> Stats:
    """Return this module's statistics: the result of ``stat`` applied to ``comments``."""
    # function-scope import: ``stat`` is only needed when stats are requested
    from my.core import stat

    comment_stats = stat(comments)
    # copy into a fresh dict rather than handing back stat()'s own mapping
    return {**comment_stats}