""" Github data (uses [[https://github.com/settings/admin][official GDPR export]]) """ from __future__ import annotations import json from abc import abstractmethod from collections.abc import Iterator, Sequence from pathlib import Path from typing import Any from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings from my.core.error import echain from .common import Event, EventIds, parse_dt logger = make_logger(__name__) class config: @property @abstractmethod def gdpr_dir(self) -> Paths: raise NotImplementedError def make_config() -> config: # TODO later, use a separate user config? (github_gdpr) from my.config import github as user_config class combined_config(user_config, config): pass return combined_config() def inputs() -> Sequence[Path]: gdpr_dir = make_config().gdpr_dir res = get_files(gdpr_dir) schema_json = [f for f in res if f.name == 'schema.json'] was_unpacked = len(schema_json) > 0 if was_unpacked: # 'legacy' behaviour, we've been passed an extracted export directory # although in principle nothing wrong with running against a directory with several unpacked archives # so need to think how to support that in the future as well return [schema_json[0].parent] # otherwise, should contain a bunch of archives? # not sure if need to warn if any of them aren't .tar.gz? return res def events() -> Iterator[Res[Event]]: last = max(inputs()) logger.info(f'extracting data from {last}') root: Path | None = None if last.is_dir(): # if it's already CPath, this will match it root = last else: try: from kompress import CPath root = CPath(last) assert len(list(root.iterdir())) > 0 # trigger to check if we have the kompress version with targz support except Exception as e: logger.exception(e) warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. 
            root = None  # make sure we actually fall back to unpacking below

    if root is None:
        from my.core.structure import match_structure

        with match_structure(last, expected=()) as res:  # expected=() matches it regardless of any patterns
            [root] = res
            yield from _process_one(root)
    else:
        yield from _process_one(root)


def _process_one(root: Path) -> Iterator[Res[Event]]:
    files = sorted(root.glob('*.json'))  # looks like all files are in the root

    # fmt: off
    handler_map = {
        'schema'         : None,
        'issue_events_'  : None,  # eh, doesn't seem to have any useful bodies
        'attachments_'   : None,  # not sure if useful
        'users'          : None,  # just contains random users
        'bots'           : None,  # just contains random bots
        'repositories_'  : _parse_repository,
        'issue_comments_': _parse_issue_comment,
        'issues_'        : _parse_issue,
        'pull_requests_' : _parse_pull_request,
        'projects_'      : _parse_project,
        'releases_'      : _parse_release,
        'commit_comments': _parse_commit_comment,
        ## TODO need to handle these
        'pull_request_review_comments_': None,
        'pull_request_review_threads_' : None,
        'pull_request_reviews_'        : None,
        ##
        'repository_files_'     : None,  # repository artifacts, probs not very useful
        'discussion_categories_': None,  # doesn't seem to contain any useful info, just some repo metadata
        'organizations_'        : None,  # no useful info, just some org metadata
    }
    # fmt: on
    for f in files:
        logger.info(f'{f} : processing...')
        handler: Any
        for prefix, h in handler_map.items():
            if not f.name.startswith(prefix):
                continue
            handler = h
            break
        else:
            yield RuntimeError(f'Unhandled file: {f}')
            continue

        if handler is None:
            # ignored
            continue

        j = json.loads(f.read_text())
        for r in j:
            try:
                yield handler(r)
            except Exception as e:
                yield echain(RuntimeError(f'While processing file: {f}'), e)


def stats() -> Stats:
    return {
        **stat(events),
    }


# TODO typing.TypedDict could be handy here..
def _parse_common(d: dict) -> dict:
    url = d['url']
    body = d.get('body')
    return {
        'dt': parse_dt(d['created_at']),
        'link': url,
        'body': body,
    }


def _parse_repository(d: dict) -> Event:
    pref = 'https://github.com/'
    url = d['url']
    dts = d['created_at']
    rt = d['type']
    assert url.startswith(pref)
    name = url[len(pref) :]
    eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None)
    return Event(
        **_parse_common(d),
        summary='created ' + name,
        eid=eid,
    )


# user may be None if the user was deleted
def _is_bot(user: str | None) -> bool:
    if user is None:
        return False
    return "[bot]" in user


def _parse_issue_comment(d: dict) -> Event:
    url = d['url']
    return Event(
        **_parse_common(d),
        summary=f'commented on issue {url}',
        eid='issue_comment_' + url,
        is_bot=_is_bot(d['user']),
    )


def _parse_issue(d: dict) -> Event:
    url = d['url']
    title = d['title']
    return Event(
        **_parse_common(d),
        summary=f'opened issue {title}',
        eid='issue_' + url,
        is_bot=_is_bot(d['user']),
    )


def _parse_pull_request(d: dict) -> Event:
    dts = d['created_at']
    url = d['url']
    title = d['title']
    return Event(
        **_parse_common(d),
        # TODO distinguish incoming/outgoing?
        # TODO action? opened/closed??
        summary=f'opened PR {title}',
        eid=EventIds.pr(dts=dts, action='opened', url=url),
        is_bot=_is_bot(d['user']),
    )


def _parse_project(d: dict) -> Event:
    url = d['url']
    title = d['name']
    is_bot = "[bot]" in d["creator"]
    # TODO: use columns somehow?
    # Doesn't fit with Event schema,
    # is a list of each of the boards
    return Event(
        **_parse_common(d),
        summary=f'created project {title}',
        eid='project_' + url,
        is_bot=is_bot,
    )


def _parse_release(d: dict) -> Event:
    tag = d['tag_name']
    return Event(
        **_parse_common(d),
        summary=f'released {tag}',
        eid='release_' + tag,
    )


def _parse_commit_comment(d: dict) -> Event:
    url = d['url']
    return Event(
        **_parse_common(d),
        summary=f'commented on {url}',
        eid='commit_comment_' + url,
    )
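

## Example configuration and usage (a minimal sketch, kept as a comment; the path below is
## hypothetical -- point `gdpr_dir` at wherever your GDPR export archives actually live):
#
#     # in my.config:
#     class github:
#         gdpr_dir = '~/exports/github/*.tar.gz'
#
#     # elsewhere:
#     from my.github.gdpr import events
#     for e in events():
#         if isinstance(e, Exception):
#             continue  # events() yields errors inline rather than raising
#         print(e.dt, e.summary)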