From 642e3b14d5529ad63b7c2c9c83aeb821305b9a30 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Thu, 24 Aug 2023 23:04:36 +0100
Subject: [PATCH] my.github.gdpr: some minor enhancements

- better error context
- handle some unknown files
- handle user=None in some cases
- cleanup imports
---
 my/github/gdpr.py | 68 +++++++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 35 insertions(+), 33 deletions(-)

diff --git a/my/github/gdpr.py b/my/github/gdpr.py
index 3d23565..1ff0f93 100644
--- a/my/github/gdpr.py
+++ b/my/github/gdpr.py
@@ -1,38 +1,33 @@
 """
 Github data (uses [[https://github.com/settings/admin][official GDPR export]])
 """
-
+from dataclasses import dataclass
 import json
 from pathlib import Path
 import tarfile
-from typing import Iterable, Dict, Any, Sequence
+from typing import Iterable, Any, Sequence, Dict, Optional
 
-from ..core import get_files, Res
-from ..core.error import notnone
+from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
+from my.core.cfg import make_config
+from my.core.error import notnone, echain
 
 from .common import Event, parse_dt, EventIds
 
 # TODO later, use a separate user config? (github_gdpr)
 from my.config import github as user_config
 
-from dataclasses import dataclass
-from ..core import PathIsh
 
 @dataclass
 class github(user_config):
     gdpr_dir: PathIsh  # path to unpacked GDPR archive
 
-###
-
-from ..core import make_logger
-logger = make_logger(__name__)
-
-
-from ..core.cfg import make_config
 config = make_config(github)
 
+logger = make_logger(__name__)
+
+
 def inputs() -> Sequence[Path]:
     gdir = config.gdpr_dir
     res = get_files(gdir)
@@ -54,22 +49,22 @@ def events() -> Iterable[Res[Event]]:
     # a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
     # another one is zulip archive
     if last.is_dir():
-        files = list(sorted(last.glob('*.json'))) # looks like all files are in the root
+        files = list(sorted(last.glob('*.json')))  # looks like all files are in the root
         open_file = lambda f: f.open()
     else:
         # treat as .tar.gz
         tfile = tarfile.open(last)
         files = list(sorted(map(Path, tfile.getnames())))
         files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
-        open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
-
+        open_file = lambda p: notnone(tfile.extractfile(f'./{p}'))  # NOTE odd, doesn't work without ./
+
+    # fmt: off
     handler_map = {
         'schema' : None,
-        'issue_events_': None, # eh, doesn't seem to have any useful bodies
-        'attachments_' : None, # not sure if useful
-        'users'        : None, # just contains random users
-        'bots'         : None, # just contains random bots
+        'issue_events_': None,  # eh, doesn't seem to have any useful bodies
+        'attachments_' : None,  # not sure if useful
+        'users'        : None,  # just contains random users
+        'bots'         : None,  # just contains random bots
         'repositories_'  : _parse_repository,
         'issue_comments_': _parse_issue_comment,
         'issues_'        : _parse_issue,
@@ -82,8 +77,11 @@ def events() -> Iterable[Res[Event]]:
         'pull_request_review_threads_': None,
         'pull_request_reviews_': None,
         ##
-        'repository_files_': None, # repository artifacts, probs not very useful
+        'repository_files_': None,  # repository artifacts, probs not very useful
+        'discussion_categories_': None,  # doesn't seem to contain any useful info, just some repo metadata
+        'organizations_': None,  # no useful info, just some org metadata
     }
+    # fmt: on
     for f in files:
         logger.info(f'{f} : processing...')
         handler: Any
@@ -106,11 +104,10 @@
             try:
                 yield handler(r)
             except Exception as e:
-                yield e
+                yield echain(RuntimeError(f'While processing file: {f}'), e)
 
 
-def stats():
-    from ..core import stat
+def stats() -> Stats:
     return {
         **stat(events),
     }
@@ -121,7 +118,7 @@ def _parse_common(d: Dict) -> Dict:
     url = d['url']
     body = d.get('body')
     return {
-        'dt' : parse_dt(d['created_at']),
+        'dt': parse_dt(d['created_at']),
         'link': url,
         'body': body,
     }
@@ -131,8 +128,9 @@ def _parse_repository(d: Dict) -> Event:
     pref = 'https://github.com/'
     url = d['url']
     dts = d['created_at']
-    rt  = d['type']
-    assert url.startswith(pref); name = url[len(pref):]
+    rt = d['type']
+    assert url.startswith(pref)
+    name = url[len(pref) :]
     eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None)
     return Event(
         **_parse_common(d),
@@ -141,26 +139,31 @@
     )
 
 
+# user may be None if the user was deleted
+def _is_bot(user: Optional[str]) -> bool:
+    if user is None:
+        return False
+    return "[bot]" in user
+
+
 def _parse_issue_comment(d: Dict) -> Event:
     url = d['url']
-    is_bot = "[bot]" in d["user"]
     return Event(
         **_parse_common(d),
         summary=f'commented on issue {url}',
         eid='issue_comment_' + url,
-        is_bot=is_bot,
+        is_bot=_is_bot(d['user']),
     )
 
 
 def _parse_issue(d: Dict) -> Event:
     url = d['url']
     title = d['title']
-    is_bot = "[bot]" in d["user"]
     return Event(
         **_parse_common(d),
         summary=f'opened issue {title}',
         eid='issue_comment_' + url,
-        is_bot=is_bot,
+        is_bot=_is_bot(d['user']),
     )
 
 
@@ -168,14 +171,13 @@ def _parse_pull_request(d: Dict) -> Event:
     dts = d['created_at']
     url = d['url']
     title = d['title']
-    is_bot = "[bot]" in d["user"]
     return Event(
         **_parse_common(d),
         # TODO distinguish incoming/outgoing?
         # TODO action? opened/closed??
         summary=f'opened PR {title}',
         eid=EventIds.pr(dts=dts, action='opened', url=url),
-        is_bot=is_bot,
+        is_bot=_is_bot(d['user']),
    )
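
Note: two of the changes above are behavioral and easy to miss among the formatting churn, so here is a small illustrative sketch (not part of the patch). It assumes my.core.error.echain attaches the original exception as __cause__ of the wrapper and returns it, which is what the `yield echain(...)` hunk relies on; the filename and usernames below are made up:

    from typing import Optional

    # stand-in for my.core.error.echain, under the assumption stated above
    def echain(ex: Exception, cause: Exception) -> Exception:
        ex.__cause__ = cause
        return ex

    # mirror of the _is_bot helper added by the patch
    def _is_bot(user: Optional[str]) -> bool:
        if user is None:
            return False
        return "[bot]" in user

    # "better error context": consumers of events() now see *which* GDPR file
    # a parse failure came from, instead of a bare exception
    try:
        raise KeyError('created_at')  # e.g. a malformed record in some json file
    except Exception as e:
        err = echain(RuntimeError('While processing file: issues_000001.json'), e)
    print(err)                  # While processing file: issues_000001.json
    print(repr(err.__cause__))  # KeyError('created_at')

    # "handle user=None": a deleted user no longer crashes the bot check
    assert _is_bot('examplebot[bot]') is True
    assert _is_bot('example-user') is False
    assert _is_bot(None) is False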