my.reddit: refactor into module that supports pushshift/gdpr (#179)
* initial pushshift/rexport merge implementation, using id for merging * smarter module deprecation warning using regex * add `RedditBase` from promnesia * `import_source` helper for gracefully handling mixin data sources
This commit is contained in:
parent
b54ec0d7f1
commit
8422c6e420
15 changed files with 374 additions and 58 deletions
48
my/reddit/pushshift.py
Normal file
48
my/reddit/pushshift.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
Gives you access to older comments possibly not accessible with rexport
|
||||
using pushshift
|
||||
See https://github.com/seanbreckenridge/pushshift_comment_export
|
||||
"""
|
||||
|
||||
# Extra package this module depends on; it is imported further down as
# `pushshift_comment_export` (see the `pushshift_comment_export.dal` import).
REQUIRES = [
    "git+https://github.com/seanbreckenridge/pushshift_comment_export",
]
|
||||
|
||||
from my.core.common import Paths, Stats
|
||||
from dataclasses import dataclass
|
||||
from my.core.cfg import make_config
|
||||
|
||||
from my.config import reddit as uconfig
|
||||
|
||||
# Configuration for this module; extends the user's `my.config.reddit.pushshift`
# section (imported above as `uconfig`).
@dataclass
class pushshift_config(uconfig.pushshift):
    '''
    Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
    '''

    # path[s]/glob to the exported JSON data
    # (handed to get_files() by inputs() to locate the files to read)
    export_path: Paths
|
||||
|
||||
# Module-level config object, built by my.core.cfg.make_config from pushshift_config.
config = make_config(pushshift_config)
|
||||
|
||||
from my.core import get_files
|
||||
from typing import Sequence, Iterator
|
||||
from pathlib import Path
|
||||
|
||||
from pushshift_comment_export.dal import read_file, PComment
|
||||
|
||||
|
||||
def inputs() -> Sequence[Path]:
    """Return the paths that ``get_files`` resolves from ``config.export_path``."""
    export_files = get_files(config.export_path)
    return export_files
|
||||
|
||||
|
||||
def comments() -> Iterator[PComment]:
    """Yield every item ``read_file`` produces for each file from ``inputs()``."""
    export_files = inputs()
    for export_file in export_files:
        parsed = read_file(export_file)
        # Delegate to the per-file iterable, one file at a time.
        yield from parsed
|
||||
|
||||
def stats() -> Stats:
    """Return a dict holding whatever ``my.core.stat`` reports for ``comments``."""
    from my.core import stat

    comment_stats = stat(comments)
    return {**comment_stats}
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue