my.stackexchange: use GDPR data for votes
This commit is contained in:
parent
ddea816a49
commit
63c825ab81
3 changed files with 98 additions and 2 deletions
|
@ -9,3 +9,8 @@ from .cfg import make_config
|
||||||
from .util import __NOT_HPI_MODULE__
|
from .util import __NOT_HPI_MODULE__
|
||||||
|
|
||||||
from .error import Res, unwrap
|
from .error import Res, unwrap
|
||||||
|
|
||||||
|
|
||||||
|
# just for brevity in modules
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
81
my/stackexchange/gdpr.py
Normal file
81
my/stackexchange/gdpr.py
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
"""
|
||||||
|
Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][official GDPR export]])
|
||||||
|
"""
|
||||||
|
|
||||||
|
# TODO need to merge gdpr and stexport
|
||||||
|
|
||||||
|
### config
|
||||||
|
from my.config import stackexchange as user_config
|
||||||
|
from ..core import dataclass, PathIsh, make_config
|
||||||
|
@dataclass
class stackexchange(user_config):
    '''
    Uses [[https://stackoverflow.com/legal/gdpr/request][official GDPR export]]
    '''
    # path to the GDPR export zip file (only the zipped form is read for now)
    gdpr_path: PathIsh

# module-level config object built from the user's settings via make_config
config = make_config(stackexchange)
|
||||||
|
# TODO later support unpacked zip too
|
||||||
|
###
|
||||||
|
|
||||||
|
# TODO just merge all of them and then filter?.. not sure
|
||||||
|
|
||||||
|
from ..core.common import Json, isoparse
|
||||||
|
from typing import NamedTuple, Iterable
|
||||||
|
from datetime import datetime
|
||||||
|
class Vote(NamedTuple):
    """A single vote record from the GDPR export.

    Thin wrapper over the raw JSON object; the properties read the
    'eventTime', 'siteId', 'target', 'commentId' and 'postId' keys.
    """

    j: Json  # raw vote record, exactly as loaded from the export
    # todo ip?

    @property
    def when(self) -> datetime:
        """Time of the vote, parsed from the record's 'eventTime' field."""
        raw_time = self.j['eventTime']
        return isoparse(raw_time)

    # todo Url return type?
    @property
    def link(self) -> str:
        """URL of the thing that was voted on.

        Raises:
            RuntimeError: if 'target' is neither 'Comment' nor 'Post'.
        """
        site_root = f"https://{self.j['siteId']}/"
        target = self.j['target']

        if target == 'Comment':
            # Known URL shapes for comments:
            # - https://meta.stackexchange.com/posts/comments/943975
            # - https://meta.stackexchange.com/questions/5436/direct-link-to-a-comment#comment943975_290757
            #   (5436 is the question id, 943975 the comment id, 290757 the answer id)
            # - https://meta.stackexchange.com/posts/27319/comments#comment-57475
            #   (this one seems to load the raw comments without the rest of the page)
            #
            # parentPostId is the original question, postId is the answer.
            # TODO parentPostId is not always present, so a hierarchical comment
            # link can't be built reliably here; this probably needs to be
            # handled in Promnesia normalisation instead.
            return site_root + f"posts/comments/{self.j['commentId']}"

        if target == 'Post':
            # Links generated by stackexchange are not hierarchical, e.g.
            # - https://unix.stackexchange.com/q/14841/180307
            # - https://unix.stackexchange.com/a/14871/180307
            # - https://unix.stackexchange.com/a/16756/180307
            # They seem to work without the trailing bit (/180307), and the
            # 'a' form works even for questions, so it is used for every post.
            return site_root + f"a/{self.j['postId']}"

        raise RuntimeError(f'Unexpected type {target}')
|
||||||
|
|
||||||
|
# todo expose vote type?
|
||||||
|
|
||||||
|
import json
|
||||||
|
from ..core.kompress import kopen
|
||||||
|
from ..core.error import Res
|
||||||
|
def votes() -> Iterable[Res[Vote]]:
    """Yield the votes recorded in the GDPR export, reversing the file's order.

    The whole JSON document is loaded while the archive member is open;
    records are yielded only after the handle has been released, so a
    slow or abandoned consumer does not keep the archive open.
    """
    # TODO there is also some site specific stuff in qa/ directory.. not sure if it's more detailed
    # todo should be defensive? not sure if present when user has no votes
    with kopen(
            config.gdpr_path,
            # NOTE(review): the member name really contains a backslash
            # separator here; presumably a quirk of how the export is zipped
            'analytics/qa\\vote.submit.json',
            # presumably the file starts with a UTF-8 BOM, which 'utf-8-sig'
            # strips before the JSON parser sees it -- TODO confirm
            encoding='utf-8-sig',
    ) as fo:
        records = json.load(fo)

    # they seem to be in decreasing order by default
    for record in reversed(records):
        # TODO implement check method that would go through all properties and emit errors?
        yield Vote(record)
|
||||||
|
|
||||||
|
|
||||||
|
from ..core import stat, Stats
|
||||||
|
def stats() -> Stats:
    """Return the result of the core ``stat`` helper applied to ``votes``."""
    summary = stat(votes)
    return summary
|
|
@ -5,10 +5,19 @@ REQUIRES = [
|
||||||
'git+https://github.com/karlicoss/stexport',
|
'git+https://github.com/karlicoss/stexport',
|
||||||
]
|
]
|
||||||
|
|
||||||
# TODO use GDPR?
|
### config
|
||||||
|
from my.config import stackexchange as user_config
|
||||||
|
from ..core import dataclass, PathIsh, make_config
|
||||||
|
@dataclass
class stackexchange(user_config):
    '''
    Uses [[https://github.com/karlicoss/stexport][stexport]] outputs
    '''
    # location of the stexport outputs (not the GDPR zip)
    # TODO confirm whether this is a file, a directory or a glob
    export_path: PathIsh

# module-level config object built from the user's settings via make_config
config = make_config(stackexchange)
|
||||||
|
###
|
||||||
|
|
||||||
from stexport import dal
|
from stexport import dal
|
||||||
from my.config import stackexchange as config
|
|
||||||
|
|
||||||
|
|
||||||
# todo lru cache?
|
# todo lru cache?
|
||||||
|
@ -18,6 +27,7 @@ def _dal() -> dal.DAL:
|
||||||
return dal.DAL(inputs)
|
return dal.DAL(inputs)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO not sure if should keep the sites separate.. probably easier to filter after than merge
|
||||||
def site(name: str) -> dal.SiteDAL:
    """Return the per-site DAL for ``name``, obtained via ``_dal()``."""
    root = _dal()
    return root.site_dal(name)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue