"""
Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][official GDPR export]])
"""

# TODO need to merge gdpr and stexport

import json
from datetime import datetime
from typing import Iterable, NamedTuple

from my.config import stackexchange as user_config

from ..core import PathIsh, Stats, dataclass, make_config, stat
from ..core.common import Json, isoparse
from ..core.error import Res
from ..core.kompress import kopen


### config
@dataclass
class stackexchange(user_config):
    gdpr_path: PathIsh  # path to GDPR zip file


config = make_config(stackexchange)
# TODO later support unpacked zip too
###

# TODO just merge all of them and then filter?.. not sure


class Vote(NamedTuple):
    """A single vote, wrapping one raw JSON record from the export."""

    j: Json  # the raw record; the properties below read their fields from it
    # todo ip?

    @property
    def when(self) -> datetime:
        """Timestamp parsed from the record's ``eventTime`` field."""
        return isoparse(self.j['eventTime'])

    # todo Url return type?
    @property
    def link(self) -> str:
        """
        URL of the vote target, built from ``siteId`` plus the comment/post id.

        Raises RuntimeError when ``target`` is neither 'Comment' nor 'Post'.
        """
        # vote target
        base = f"https://{self.j['siteId']}/"
        t = self.j['target']
        if t == 'Comment':
            # for comments, these work?
            # - https://meta.stackexchange.com/posts/comments/943975
            # - https://meta.stackexchange.com/questions/5436/direct-link-to-a-comment#comment943975_290757
            #                                          ^question id                   ^comment id ^answer id
            # hmm, this loads very raw comments without the rest of the page?
            # - https://meta.stackexchange.com/posts/27319/comments#comment-57475
            #
            # parentPostId is the original question
            # TODO it is not always present?
            # seems like there is no way to get a hierarchical comment link..
            # guess this needs to be handled in Promnesia normalisation...
            # postId is the answer
            return base + f"posts/comments/{self.j['commentId']}"
        if t == 'Post':
            # https://unix.stackexchange.com/q/14841/180307
            # https://unix.stackexchange.com/a/14871/180307
            # https://unix.stackexchange.com/a/16756/180307
            # links generated by stackexchange are not hierarchical;
            # on the other hand seems that it works without the last bit (/180307)
            # ok, 'a' works even for questions
            return base + f"a/{self.j['postId']}"
        raise RuntimeError(f'Unexpected type {t}')

    # todo expose vote type?


def votes() -> Iterable[Res[Vote]]:
    """
    Yield a Vote for every record in the export's vote file.

    Records are yielded in the reverse of the order they are stored in.
    """
    # TODO there is also some site specific stuff in qa/ directory.. not sure if it's more detailed
    # todo should be defensive? not sure if present when user has no votes
    with kopen(
        config.gdpr_path,
        'analytics/qa\\vote.submit.json',  # TODO figure out why the entry name uses a backslash separator
        encoding='utf-8-sig',  # not sure why, but seems necessary for this data
    ) as fo:
        records = json.load(fo)
    # json.load has already read the whole entry, so the handle is released
    # before yielding instead of staying open while the consumer iterates
    for r in reversed(records):  # they seem to be in decreasing order by default
        # TODO implement check method that would go through all properties and emit errors?
        yield Vote(r)


def stats() -> Stats:
    """Return whatever ``stat`` computes for the ``votes`` provider."""
    return stat(votes)
### config
from my.config import stackexchange as user_config
from ..core import dataclass, PathIsh, make_config


@dataclass
class stackexchange(user_config):
    '''
    Uses [[https://github.com/karlicoss/stexport][stexport]] outputs
    '''
    export_path: PathIsh  # path to the stexport outputs (see the docstring above), not a GDPR zip


config = make_config(stackexchange)
###

from stexport import dal


# todo lru cache?
# NOTE(review): `_dal()` is defined at this point in the module, but only its
# signature and final `return dal.DAL(inputs)` are visible in this excerpt, so
# it is intentionally not reproduced here.


# TODO not sure if should keep the sites separate.. probably easier to filter after than merge
def site(name: str) -> dal.SiteDAL:
    """Return the stexport ``SiteDAL`` obtained by passing ``name`` to ``site_dal``."""
    return _dal().site_dal(name)