my.stackexchange: use GDPR data for votes
This commit is contained in:
parent
ddea816a49
commit
63c825ab81
3 changed files with 98 additions and 2 deletions
|
@ -9,3 +9,8 @@ from .cfg import make_config
|
|||
from .util import __NOT_HPI_MODULE__
|
||||
|
||||
from .error import Res, unwrap
|
||||
|
||||
|
||||
# just for brevity in modules
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
|
81
my/stackexchange/gdpr.py
Normal file
81
my/stackexchange/gdpr.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
"""
|
||||
Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][official GDPR export]])
|
||||
"""
|
||||
|
||||
# TODO need to merge gdpr and stexport
|
||||
|
||||
### config
|
||||
from my.config import stackexchange as user_config
|
||||
from ..core import dataclass, PathIsh, make_config
|
||||
# User-facing settings for the GDPR-based module; extends whatever the user
# declared under my.config.stackexchange.
@dataclass
class stackexchange(user_config):
    # path to GDPR zip file
    gdpr_path: PathIsh


# module-level config object, produced by make_config from the class above
config = make_config(stackexchange)
|
||||
# TODO later support unpacked zip too
|
||||
###
|
||||
|
||||
# TODO just merge all of them and then filter?.. not sure
|
||||
|
||||
from ..core.common import Json, isoparse
|
||||
from typing import NamedTuple, Iterable
|
||||
from datetime import datetime
|
||||
class Vote(NamedTuple):
    """A single vote record from the GDPR export, wrapping its raw JSON."""

    j: Json
    # todo ip?

    @property
    def when(self) -> datetime:
        """Timestamp of the vote, parsed from the record's 'eventTime' field."""
        raw_time = self.j['eventTime']
        return isoparse(raw_time)

    # todo Url return type?
    @property
    def link(self) -> str:
        """URL of the thing that was voted on.

        Raises RuntimeError when the record's 'target' is neither
        'Comment' nor 'Post'.
        """
        site_root = f"https://{self.j['siteId']}/"
        target = self.j['target']

        if target == 'Comment':
            # URL shapes that seem to work for comments:
            # - https://meta.stackexchange.com/posts/comments/943975
            # - https://meta.stackexchange.com/questions/5436/direct-link-to-a-comment#comment943975_290757
            #   (5436 is the question id, 943975 the comment id, 290757 the answer id)
            # - https://meta.stackexchange.com/posts/27319/comments#comment-57475
            #   (hmm, this one loads very raw comments without the rest of the page?)
            #
            # parentPostId is the original question; postId is the answer.
            # TODO parentPostId is not always present?
            # There seems to be no way to build a hierarchical comment link,
            # so that probably has to be handled in Promnesia normalisation.
            return site_root + f"posts/comments/{self.j['commentId']}"

        if target == 'Post':
            # Links generated by stackexchange look like:
            # - https://unix.stackexchange.com/q/14841/180307
            # - https://unix.stackexchange.com/a/14871/180307
            # - https://unix.stackexchange.com/a/16756/180307
            # They are not hierarchical. On the other hand, they seem to work
            # without the last bit (/180307), and 'a' works even for questions.
            return site_root + f"a/{self.j['postId']}"

        raise RuntimeError(f'Unexpected type {target}')

    # todo expose vote type?
|
||||
|
||||
# todo expose vote type?
|
||||
|
||||
import json
|
||||
from ..core.kompress import kopen
|
||||
from ..core.error import Res
|
||||
def votes() -> Iterable[Res[Vote]]:
    """Yield the votes recorded in the GDPR export, in reverse of file order.

    The export seems to list votes in decreasing order, hence the reversal.
    The archive member is parsed in one go and closed before the first item
    is yielded, so a partially consumed generator does not keep the handle
    open.
    """
    # TODO there is also some site-specific stuff in the qa/ directory.. not sure if it's more detailed
    # todo should be defensive? not sure if present when user has no votes
    with kopen(
        config.gdpr_path,
        'analytics/qa\\vote.submit.json',  # TODO odd separators: the member name really contains a backslash
        encoding='utf-8-sig',  # not sure why, but seems necessary for this data
    ) as fo:
        # json.load consumes the whole stream, so nothing is read lazily later
        records = json.load(fo)

    for record in reversed(records):  # they seem to be in decreasing order by default
        # TODO implement check method that would go through all properties and emit errors?
        yield Vote(record)
|
||||
|
||||
|
||||
from ..core import stat, Stats
|
||||
def stats() -> Stats:
    """Return the result of running the core ``stat`` helper over ``votes``."""
    summary = stat(votes)
    return summary
|
|
@ -5,10 +5,19 @@ REQUIRES = [
|
|||
'git+https://github.com/karlicoss/stexport',
|
||||
]
|
||||
|
||||
# TODO use GDPR?
|
||||
### config
|
||||
from my.config import stackexchange as user_config
|
||||
from ..core import dataclass, PathIsh, make_config
|
||||
@dataclass
class stackexchange(user_config):
    '''
    Uses [[https://github.com/karlicoss/stexport][stexport]] outputs
    '''
    export_path: PathIsh  # path to the stexport outputs (not the GDPR zip)


# module-level config object, produced by make_config from the class above
config = make_config(stackexchange)
|
||||
###
|
||||
|
||||
from stexport import dal
|
||||
from my.config import stackexchange as config
|
||||
|
||||
|
||||
# todo lru cache?
|
||||
|
@ -18,6 +27,7 @@ def _dal() -> dal.DAL:
|
|||
return dal.DAL(inputs)
|
||||
|
||||
|
||||
# TODO not sure if should keep the sites separate.. probably easier to filter after than merge
|
||||
def site(name: str) -> dal.SiteDAL:
    """Return the site-specific DAL for the site called *name*."""
    combined = _dal()
    return combined.site_dal(name)
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue