my.github.gdpr: some minor enhancements

- better error context
- handle some unknown files
- handle user=None in some cases
- cleanup imports
This commit is contained in:
Dima Gerasimov 2023-08-24 23:04:36 +01:00 committed by karlicoss
parent 7ec894807f
commit 642e3b14d5

View file

@ -1,38 +1,33 @@
"""
Github data (uses [[https://github.com/settings/admin][official GDPR export]])
"""
from dataclasses import dataclass
import json
from pathlib import Path
import tarfile
from typing import Iterable, Dict, Any, Sequence
from typing import Iterable, Any, Sequence, Dict, Optional
from ..core import get_files, Res
from ..core.error import notnone
from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
from my.core.cfg import make_config
from my.core.error import notnone, echain
from .common import Event, parse_dt, EventIds
# TODO later, use a separate user config? (github_gdpr)
from my.config import github as user_config
from dataclasses import dataclass
from ..core import PathIsh
@dataclass
class github(user_config):
    # Config for this module: extends the user's `my.config.github` class
    # (imported as `user_config`) with the field(s) this module needs.
    gdpr_dir: PathIsh  # path to unpacked GDPR archive
###
from ..core import make_logger
logger = make_logger(__name__)
from ..core.cfg import make_config
config = make_config(github)
logger = make_logger(__name__)
def inputs() -> Sequence[Path]:
gdir = config.gdpr_dir
res = get_files(gdir)
@ -63,7 +58,7 @@ def events() -> Iterable[Res[Event]]:
files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
# fmt: off
handler_map = {
'schema' : None,
'issue_events_': None, # eh, doesn't seem to have any useful bodies
@ -83,7 +78,10 @@ def events() -> Iterable[Res[Event]]:
'pull_request_reviews_': None,
##
'repository_files_': None, # repository artifacts, probs not very useful
'discussion_categories_': None, # doesn't seem to contain any useful info, just some repo metadata
'organizations_': None, # no useful info, just some org metadata
}
# fmt: on
for f in files:
logger.info(f'{f} : processing...')
handler: Any
@ -106,11 +104,10 @@ def events() -> Iterable[Res[Event]]:
try:
yield handler(r)
except Exception as e:
yield e
yield echain(RuntimeError(f'While processing file: {f}'), e)
def stats() -> Stats:
    """Return statistics for this module, computed by ``stat`` over ``events``."""
    # `stat` and `Stats` come from the module-level `my.core` import, so the
    # old function-local `from ..core import stat` is no longer needed.
    return {
        **stat(events),
    }
@ -132,7 +129,8 @@ def _parse_repository(d: Dict) -> Event:
url = d['url']
dts = d['created_at']
rt = d['type']
assert url.startswith(pref); name = url[len(pref):]
assert url.startswith(pref)
name = url[len(pref) :]
eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None)
return Event(
**_parse_common(d),
@ -141,26 +139,31 @@ def _parse_repository(d: Dict) -> Event:
)
# user may be None if the user was deleted
def _is_bot(user: Optional[str]) -> bool:
if user is None:
return False
return "[bot]" in "user"
def _parse_issue_comment(d: Dict) -> Event:
    """Build an ``Event`` for a single issue-comment record ``d``.

    Reads ``d['url']`` and ``d['user']``; the remaining ``Event`` fields come
    from ``_parse_common(d)``.
    """
    url = d['url']
    return Event(
        **_parse_common(d),
        summary=f'commented on issue {url}',
        eid='issue_comment_' + url,
        # d['user'] may be None (deleted user); _is_bot handles that, whereas
        # a bare `"[bot]" in d["user"]` would raise TypeError.
        is_bot=_is_bot(d['user']),
    )
def _parse_issue(d: Dict) -> Event:
    """Build an ``Event`` for a single issue record ``d``.

    Reads ``d['url']``, ``d['title']`` and ``d['user']``; the remaining
    ``Event`` fields come from ``_parse_common(d)``.
    """
    url = d['url']
    title = d['title']
    return Event(
        **_parse_common(d),
        summary=f'opened issue {title}',
        # NOTE(review): this reuses the 'issue_comment_' prefix also used for
        # issue comments -- kept as is to avoid changing existing event ids;
        # confirm whether that is intended.
        eid='issue_comment_' + url,
        # d['user'] may be None (deleted user); _is_bot handles that, whereas
        # a bare `"[bot]" in d["user"]` would raise TypeError.
        is_bot=_is_bot(d['user']),
    )
@ -168,14 +171,13 @@ def _parse_pull_request(d: Dict) -> Event:
dts = d['created_at']
url = d['url']
title = d['title']
is_bot = "[bot]" in d["user"]
return Event(
**_parse_common(d),
# TODO distinguish incoming/outgoing?
# TODO action? opened/closed??
summary=f'opened PR {title}',
eid=EventIds.pr(dts=dts, action='opened', url=url),
is_bot=is_bot,
is_bot=_is_bot(d['user']),
)