my.github.gdpr: some minor enhancements

- better error context
- handle some unknown files
- handle user=None in some cases
- cleanup imports
This commit is contained in:
Dima Gerasimov 2023-08-24 23:04:36 +01:00 committed by karlicoss
parent 7ec894807f
commit 642e3b14d5

View file

@ -1,38 +1,33 @@
""" """
Github data (uses [[https://github.com/settings/admin][official GDPR export]]) Github data (uses [[https://github.com/settings/admin][official GDPR export]])
""" """
from dataclasses import dataclass
import json import json
from pathlib import Path from pathlib import Path
import tarfile import tarfile
from typing import Iterable, Dict, Any, Sequence from typing import Iterable, Any, Sequence, Dict, Optional
from ..core import get_files, Res from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
from ..core.error import notnone from my.core.cfg import make_config
from my.core.error import notnone, echain
from .common import Event, parse_dt, EventIds from .common import Event, parse_dt, EventIds
# TODO later, use a separate user config? (github_gdpr) # TODO later, use a separate user config? (github_gdpr)
from my.config import github as user_config from my.config import github as user_config
from dataclasses import dataclass
from ..core import PathIsh
@dataclass @dataclass
class github(user_config): class github(user_config):
gdpr_dir: PathIsh # path to unpacked GDPR archive gdpr_dir: PathIsh # path to unpacked GDPR archive
###
from ..core import make_logger
logger = make_logger(__name__)
from ..core.cfg import make_config
config = make_config(github) config = make_config(github)
logger = make_logger(__name__)
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
gdir = config.gdpr_dir gdir = config.gdpr_dir
res = get_files(gdir) res = get_files(gdir)
@ -54,22 +49,22 @@ def events() -> Iterable[Res[Event]]:
# a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples # a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
# another one is zulip archive # another one is zulip archive
if last.is_dir(): if last.is_dir():
files = list(sorted(last.glob('*.json'))) # looks like all files are in the root files = list(sorted(last.glob('*.json'))) # looks like all files are in the root
open_file = lambda f: f.open() open_file = lambda f: f.open()
else: else:
# treat as .tar.gz # treat as .tar.gz
tfile = tarfile.open(last) tfile = tarfile.open(last)
files = list(sorted(map(Path, tfile.getnames()))) files = list(sorted(map(Path, tfile.getnames())))
files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json'] files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./ open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
# fmt: off
handler_map = { handler_map = {
'schema' : None, 'schema' : None,
'issue_events_': None, # eh, doesn't seem to have any useful bodies 'issue_events_': None, # eh, doesn't seem to have any useful bodies
'attachments_' : None, # not sure if useful 'attachments_' : None, # not sure if useful
'users' : None, # just contains random users 'users' : None, # just contains random users
'bots' : None, # just contains random bots 'bots' : None, # just contains random bots
'repositories_' : _parse_repository, 'repositories_' : _parse_repository,
'issue_comments_': _parse_issue_comment, 'issue_comments_': _parse_issue_comment,
'issues_' : _parse_issue, 'issues_' : _parse_issue,
@ -82,8 +77,11 @@ def events() -> Iterable[Res[Event]]:
'pull_request_review_threads_': None, 'pull_request_review_threads_': None,
'pull_request_reviews_': None, 'pull_request_reviews_': None,
## ##
'repository_files_': None, # repository artifacts, probs not very useful 'repository_files_': None, # repository artifacts, probs not very useful
'discussion_categories_': None, # doesn't seem to contain any useful info, just some repo metadata
'organizations_': None, # no useful info, just some org metadata
} }
# fmt: on
for f in files: for f in files:
logger.info(f'{f} : processing...') logger.info(f'{f} : processing...')
handler: Any handler: Any
@ -106,11 +104,10 @@ def events() -> Iterable[Res[Event]]:
try: try:
yield handler(r) yield handler(r)
except Exception as e: except Exception as e:
yield e yield echain(RuntimeError(f'While processing file: {f}'), e)
def stats() -> Stats:
    """Summary statistics for this module: counts of items yielded by events()."""
    return {
        **stat(events),
    }
@ -121,7 +118,7 @@ def _parse_common(d: Dict) -> Dict:
url = d['url'] url = d['url']
body = d.get('body') body = d.get('body')
return { return {
'dt' : parse_dt(d['created_at']), 'dt': parse_dt(d['created_at']),
'link': url, 'link': url,
'body': body, 'body': body,
} }
@ -131,8 +128,9 @@ def _parse_repository(d: Dict) -> Event:
pref = 'https://github.com/' pref = 'https://github.com/'
url = d['url'] url = d['url']
dts = d['created_at'] dts = d['created_at']
rt = d['type'] rt = d['type']
assert url.startswith(pref); name = url[len(pref):] assert url.startswith(pref)
name = url[len(pref) :]
eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None) eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None)
return Event( return Event(
**_parse_common(d), **_parse_common(d),
@ -141,26 +139,31 @@ def _parse_repository(d: Dict) -> Event:
) )
# user may be None if the user was deleted
def _is_bot(user: Optional[str]) -> bool:
if user is None:
return False
return "[bot]" in "user"
def _parse_issue_comment(d: Dict) -> Event:
    """Convert a raw issue-comment record from the GDPR export into an Event.

    Expects at least 'url', 'created_at' and 'user' keys (via _parse_common);
    'user' may be None for deleted accounts and is handled by _is_bot.
    """
    url = d['url']
    return Event(
        **_parse_common(d),  # shared dt/link/body fields
        summary=f'commented on issue {url}',
        eid='issue_comment_' + url,
        is_bot=_is_bot(d['user']),
    )
def _parse_issue(d: Dict) -> Event:
    """Convert a raw issue record from the GDPR export into an Event.

    Expects at least 'url', 'title', 'created_at' and 'user' keys;
    'user' may be None for deleted accounts and is handled by _is_bot.
    """
    url = d['url']
    title = d['title']
    return Event(
        **_parse_common(d),  # shared dt/link/body fields
        summary=f'opened issue {title}',
        # NOTE(review): the 'issue_comment_' eid prefix looks copy-pasted from
        # _parse_issue_comment (possibly should be 'issue_'); kept as-is to
        # preserve stable event ids — TODO confirm before changing.
        eid='issue_comment_' + url,
        is_bot=_is_bot(d['user']),
    )
@ -168,14 +171,13 @@ def _parse_pull_request(d: Dict) -> Event:
dts = d['created_at'] dts = d['created_at']
url = d['url'] url = d['url']
title = d['title'] title = d['title']
is_bot = "[bot]" in d["user"]
return Event( return Event(
**_parse_common(d), **_parse_common(d),
# TODO distinguish incoming/outgoing? # TODO distinguish incoming/outgoing?
# TODO action? opened/closed?? # TODO action? opened/closed??
summary=f'opened PR {title}', summary=f'opened PR {title}',
eid=EventIds.pr(dts=dts, action='opened', url=url), eid=EventIds.pr(dts=dts, action='opened', url=url),
is_bot=is_bot, is_bot=_is_bot(d['user']),
) )