# HPI/my/stackexchange/gdpr.py

# 79 lines
# 3.1 KiB
# Python

"""
Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][official GDPR export]])
"""
# TODO need to merge gdpr and stexport
### config
from my.config import stackexchange as user_config
from ..core import dataclass, PathIsh, make_config
# Configuration for this module: the user's `my.config.stackexchange` class,
# extended with the fields this module reads.
@dataclass
class stackexchange(user_config):
    gdpr_path: PathIsh  # path to GDPR zip file


# Module-wide config object built from the class above; `votes()` reads `config.gdpr_path`.
config = make_config(stackexchange)
# TODO later support unpacked zip too
###
# TODO just merge all of them and then filter?.. not sure
from ..core.common import Json, isoparse
from typing import NamedTuple, Iterable
from datetime import datetime
class Vote(NamedTuple):
    """A single vote record from the GDPR export, kept as its raw JSON object.

    The properties below read the 'eventTime', 'siteId', 'target',
    'commentId' and 'postId' keys of the record.
    """

    j: Json
    # todo ip?

    @property
    def when(self) -> datetime:
        """Time of the vote, parsed from the record's 'eventTime' field."""
        return isoparse(self.j['eventTime'])

    # todo Url return type?
    @property
    def link(self) -> str:
        """URL of the vote target (a comment or a post).

        Raises:
            RuntimeError: if the record's 'target' is neither 'Comment' nor 'Post'.
            KeyError: if a key needed to build the link is missing from the record.
        """
        # vote target
        base = f"https://{self.j['siteId']}/"
        target = self.j['target']

        if target == 'Comment':
            # for comments, these work?
            # - https://meta.stackexchange.com/posts/comments/943975
            # - https://meta.stackexchange.com/questions/5436/direct-link-to-a-comment#comment943975_290757
            #   (5436 is the question id, 943975 the comment id, 290757 the answer id)
            # hmm, this loads very raw comments without the rest of the page?
            # - https://meta.stackexchange.com/posts/27319/comments#comment-57475
            #
            # parentPostId is the original question
            # TODO it is not always present, unfortunately
            # seems like there is no way to get a hierarchical comment link..
            # guess this needs to be handled in Promnesia normalisation...
            # postId is the answer
            return base + f"posts/comments/{self.j['commentId']}"

        if target == 'Post':
            # https://unix.stackexchange.com/q/14841/180307
            # https://unix.stackexchange.com/a/14871/180307
            # https://unix.stackexchange.com/a/16756/180307
            # unfortunately, links generated by stackexchange are not hierarchical
            # on the other hand seems that it works without the last bit (/180307)
            # ok, 'a' works even for questions
            return base + f"a/{self.j['postId']}"

        raise RuntimeError(f'Unexpected type {target}')

    # todo expose vote type?
import json
from ..core.kompress import ZipPath
from ..core.error import Res
def votes() -> Iterable[Res[Vote]]:
    """Yield a `Vote` for every record in the export's vote-submission file.

    Reads the zip archive at `config.gdpr_path` and iterates the stored
    records in reverse order.
    """
    # TODO there is also some site specific stuff in qa/ directory.. not sure if it's more detailed
    # todo should be defensive? not sure if present when user has no votes
    path = ZipPath(config.gdpr_path)
    votes_path = path / 'analytics' / 'qa\\vote.submit.json'  # yes, it does contain a backslash...
    # not sure why, but this encoding seems necessary
    # (presumably the file starts with a UTF-8 BOM, which 'utf-8-sig' skips -- TODO confirm)
    records = json.loads(votes_path.read_text(encoding='utf-8-sig'))
    for record in reversed(records):  # they seem to be in decreasing order by default
        # TODO implement check method that would go through all properties and emit errors?
        yield Vote(record)
from ..core import stat, Stats
def stats() -> Stats:
    """Return the result of `stat` applied to this module's `votes` function."""
    return stat(votes)