my.stackexchange: use GDPR data for votes

This commit is contained in:
Dima Gerasimov 2020-12-04 18:41:16 +00:00 committed by karlicoss
parent ddea816a49
commit 63c825ab81
3 changed files with 98 additions and 2 deletions

View file

@ -9,3 +9,8 @@ from .cfg import make_config
from .util import __NOT_HPI_MODULE__
from .error import Res, unwrap
# just for brevity in modules
from dataclasses import dataclass
from pathlib import Path

81
my/stackexchange/gdpr.py Normal file
View file

@ -0,0 +1,81 @@
"""
Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][official GDPR export]])
"""
# TODO need to merge gdpr and stexport
### config
from my.config import stackexchange as user_config
from ..core import dataclass, PathIsh, make_config
@dataclass
class stackexchange(user_config):
    """
    Settings for the module that reads the official Stackexchange GDPR export.
    """
    # path to GDPR zip file
    gdpr_path: PathIsh


config = make_config(stackexchange)
# TODO later support unpacked zip too
###
# TODO just merge all of them and then filter?.. not sure
from ..core.common import Json, isoparse
from typing import NamedTuple, Iterable
from datetime import datetime
class Vote(NamedTuple):
    """
    A single vote record, kept as the raw JSON object from the export.

    Accessors read the underlying keys lazily, so a missing key only
    surfaces (as KeyError) when the corresponding property is used.
    """
    j: Json
    # todo ip?

    @property
    def when(self) -> datetime:
        """Time of the vote, parsed from the 'eventTime' field."""
        raw_time = self.j['eventTime']
        return isoparse(raw_time)

    # todo Url return type?
    @property
    def link(self) -> str:
        """
        URL of the thing that was voted on.

        Raises RuntimeError when 'target' is neither 'Post' nor 'Comment'.
        """
        site_root = f"https://{self.j['siteId']}/"
        target = self.j['target']

        if target == 'Post':
            # Links generated by stackexchange itself are not hierarchical, e.g.
            # - https://unix.stackexchange.com/q/14841/180307
            # - https://unix.stackexchange.com/a/14871/180307
            # - https://unix.stackexchange.com/a/16756/180307
            # On the other hand it seems to work without the last bit (/180307),
            # and 'a' works even for questions.
            return site_root + f"a/{self.j['postId']}"

        if target == 'Comment':
            # Candidate URL shapes for comments:
            # - https://meta.stackexchange.com/posts/comments/943975
            # - https://meta.stackexchange.com/questions/5436/direct-link-to-a-comment#comment943975_290757
            #   (question id, then comment id, then answer id)
            # - https://meta.stackexchange.com/posts/27319/comments#comment-57475
            #   (hmm, this loads very raw comments without the rest of the page?)
            #
            # parentPostId is the original question; postId is the answer.
            # TODO parentPostId is not always present, so there seems to be no way
            # to build a hierarchical comment link.. guess this needs to be
            # handled in Promnesia normalisation.
            return site_root + f"posts/comments/{self.j['commentId']}"

        raise RuntimeError(f'Unexpected type {target}')
# todo expose vote type?
import json
from ..core.kompress import kopen
from ..core.error import Res
def votes() -> Iterable[Res[Vote]]:
    """
    Yield the votes recorded in the GDPR export, in the reverse of file order.

    The JSON document is parsed completely while the archive member is open,
    and the handle is closed before the first item is yielded. Yielding from
    inside the ``with`` block would keep the archive open for as long as the
    caller holds on to the generator (or forever, if it is abandoned midway).
    """
    # TODO there is also some site specific stuff in qa/ directory.. not sure if it's more detailed
    # todo should be defensive? not sure if present when user has no votes
    with kopen(
        config.gdpr_path,
        'analytics/qa\\vote.submit.json',  # TODO figure out why the export uses a backslash separator here
        encoding='utf-8-sig',  # not sure why, but seems necessary for this data
    ) as fo:
        records = json.load(fo)

    # they seem to be in decreasing order by default
    for r in reversed(records):
        # TODO implement check method that would go through all properties and emit errors?
        yield Vote(r)
from ..core import stat, Stats
def stats() -> Stats:
    """
    Return the result of core's ``stat`` helper applied to the ``votes`` provider.
    """
    summary = stat(votes)
    return summary

View file

@ -5,10 +5,19 @@ REQUIRES = [
'git+https://github.com/karlicoss/stexport',
]
# TODO use GDPR?
### config
from my.config import stackexchange as user_config
from ..core import dataclass, PathIsh, make_config
@dataclass
class stackexchange(user_config):
    '''
    Uses [[https://github.com/karlicoss/stexport][stexport]] outputs
    '''
    # path to the stexport outputs (not the GDPR zip, which a separate module reads)
    export_path: PathIsh


config = make_config(stackexchange)
###
from stexport import dal
from my.config import stackexchange as config
# todo lru cache?
@ -18,6 +27,7 @@ def _dal() -> dal.DAL:
return dal.DAL(inputs)
# TODO not sure if should keep the sites separate.. probably easier to filter after than merge
def site(name: str) -> dal.SiteDAL:
    """
    Return the site-specific DAL for ``name``, obtained from the combined DAL.
    """
    combined = _dal()
    return combined.site_dal(name)