my.github.gdpr: some minor enhancements
- better error context - handle some unknown files - handle user=None in some cases - cleanup imports
This commit is contained in:
parent
7ec894807f
commit
642e3b14d5
1 changed files with 35 additions and 33 deletions
|
@ -1,38 +1,33 @@
|
||||||
"""
|
"""
|
||||||
Github data (uses [[https://github.com/settings/admin][official GDPR export]])
|
Github data (uses [[https://github.com/settings/admin][official GDPR export]])
|
||||||
"""
|
"""
|
||||||
|
from dataclasses import dataclass
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import tarfile
|
import tarfile
|
||||||
from typing import Iterable, Dict, Any, Sequence
|
from typing import Iterable, Any, Sequence, Dict, Optional
|
||||||
|
|
||||||
from ..core import get_files, Res
|
from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
|
||||||
from ..core.error import notnone
|
from my.core.cfg import make_config
|
||||||
|
from my.core.error import notnone, echain
|
||||||
|
|
||||||
from .common import Event, parse_dt, EventIds
|
from .common import Event, parse_dt, EventIds
|
||||||
|
|
||||||
# TODO later, use a separate user config? (github_gdpr)
|
# TODO later, use a separate user config? (github_gdpr)
|
||||||
from my.config import github as user_config
|
from my.config import github as user_config
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from ..core import PathIsh
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class github(user_config):
|
class github(user_config):
|
||||||
gdpr_dir: PathIsh # path to unpacked GDPR archive
|
gdpr_dir: PathIsh # path to unpacked GDPR archive
|
||||||
|
|
||||||
###
|
|
||||||
|
|
||||||
|
|
||||||
from ..core import make_logger
|
|
||||||
logger = make_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
from ..core.cfg import make_config
|
|
||||||
config = make_config(github)
|
config = make_config(github)
|
||||||
|
|
||||||
|
|
||||||
|
logger = make_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
gdir = config.gdpr_dir
|
gdir = config.gdpr_dir
|
||||||
res = get_files(gdir)
|
res = get_files(gdir)
|
||||||
|
@ -54,22 +49,22 @@ def events() -> Iterable[Res[Event]]:
|
||||||
# a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
|
# a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
|
||||||
# another one is zulip archive
|
# another one is zulip archive
|
||||||
if last.is_dir():
|
if last.is_dir():
|
||||||
files = list(sorted(last.glob('*.json'))) # looks like all files are in the root
|
files = list(sorted(last.glob('*.json'))) # looks like all files are in the root
|
||||||
open_file = lambda f: f.open()
|
open_file = lambda f: f.open()
|
||||||
else:
|
else:
|
||||||
# treat as .tar.gz
|
# treat as .tar.gz
|
||||||
tfile = tarfile.open(last)
|
tfile = tarfile.open(last)
|
||||||
files = list(sorted(map(Path, tfile.getnames())))
|
files = list(sorted(map(Path, tfile.getnames())))
|
||||||
files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
|
files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
|
||||||
open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
|
open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
|
||||||
|
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
handler_map = {
|
handler_map = {
|
||||||
'schema' : None,
|
'schema' : None,
|
||||||
'issue_events_': None, # eh, doesn't seem to have any useful bodies
|
'issue_events_': None, # eh, doesn't seem to have any useful bodies
|
||||||
'attachments_' : None, # not sure if useful
|
'attachments_' : None, # not sure if useful
|
||||||
'users' : None, # just contains random users
|
'users' : None, # just contains random users
|
||||||
'bots' : None, # just contains random bots
|
'bots' : None, # just contains random bots
|
||||||
'repositories_' : _parse_repository,
|
'repositories_' : _parse_repository,
|
||||||
'issue_comments_': _parse_issue_comment,
|
'issue_comments_': _parse_issue_comment,
|
||||||
'issues_' : _parse_issue,
|
'issues_' : _parse_issue,
|
||||||
|
@ -82,8 +77,11 @@ def events() -> Iterable[Res[Event]]:
|
||||||
'pull_request_review_threads_': None,
|
'pull_request_review_threads_': None,
|
||||||
'pull_request_reviews_': None,
|
'pull_request_reviews_': None,
|
||||||
##
|
##
|
||||||
'repository_files_': None, # repository artifacts, probs not very useful
|
'repository_files_': None, # repository artifacts, probs not very useful
|
||||||
|
'discussion_categories_': None, # doesn't seem to contain any useful info, just some repo metadata
|
||||||
|
'organizations_': None, # no useful info, just some org metadata
|
||||||
}
|
}
|
||||||
|
# fmt: on
|
||||||
for f in files:
|
for f in files:
|
||||||
logger.info(f'{f} : processing...')
|
logger.info(f'{f} : processing...')
|
||||||
handler: Any
|
handler: Any
|
||||||
|
@ -106,11 +104,10 @@ def events() -> Iterable[Res[Event]]:
|
||||||
try:
|
try:
|
||||||
yield handler(r)
|
yield handler(r)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
yield e
|
yield echain(RuntimeError(f'While processing file: {f}'), e)
|
||||||
|
|
||||||
|
|
||||||
def stats():
|
def stats() -> Stats:
|
||||||
from ..core import stat
|
|
||||||
return {
|
return {
|
||||||
**stat(events),
|
**stat(events),
|
||||||
}
|
}
|
||||||
|
@ -121,7 +118,7 @@ def _parse_common(d: Dict) -> Dict:
|
||||||
url = d['url']
|
url = d['url']
|
||||||
body = d.get('body')
|
body = d.get('body')
|
||||||
return {
|
return {
|
||||||
'dt' : parse_dt(d['created_at']),
|
'dt': parse_dt(d['created_at']),
|
||||||
'link': url,
|
'link': url,
|
||||||
'body': body,
|
'body': body,
|
||||||
}
|
}
|
||||||
|
@ -131,8 +128,9 @@ def _parse_repository(d: Dict) -> Event:
|
||||||
pref = 'https://github.com/'
|
pref = 'https://github.com/'
|
||||||
url = d['url']
|
url = d['url']
|
||||||
dts = d['created_at']
|
dts = d['created_at']
|
||||||
rt = d['type']
|
rt = d['type']
|
||||||
assert url.startswith(pref); name = url[len(pref):]
|
assert url.startswith(pref)
|
||||||
|
name = url[len(pref) :]
|
||||||
eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None)
|
eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None)
|
||||||
return Event(
|
return Event(
|
||||||
**_parse_common(d),
|
**_parse_common(d),
|
||||||
|
@ -141,26 +139,31 @@ def _parse_repository(d: Dict) -> Event:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# user may be None if the user was deleted
|
||||||
|
def _is_bot(user: Optional[str]) -> bool:
|
||||||
|
if user is None:
|
||||||
|
return False
|
||||||
|
return "[bot]" in "user"
|
||||||
|
|
||||||
|
|
||||||
def _parse_issue_comment(d: Dict) -> Event:
|
def _parse_issue_comment(d: Dict) -> Event:
|
||||||
url = d['url']
|
url = d['url']
|
||||||
is_bot = "[bot]" in d["user"]
|
|
||||||
return Event(
|
return Event(
|
||||||
**_parse_common(d),
|
**_parse_common(d),
|
||||||
summary=f'commented on issue {url}',
|
summary=f'commented on issue {url}',
|
||||||
eid='issue_comment_' + url,
|
eid='issue_comment_' + url,
|
||||||
is_bot=is_bot,
|
is_bot=_is_bot(d['user']),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _parse_issue(d: Dict) -> Event:
|
def _parse_issue(d: Dict) -> Event:
|
||||||
url = d['url']
|
url = d['url']
|
||||||
title = d['title']
|
title = d['title']
|
||||||
is_bot = "[bot]" in d["user"]
|
|
||||||
return Event(
|
return Event(
|
||||||
**_parse_common(d),
|
**_parse_common(d),
|
||||||
summary=f'opened issue {title}',
|
summary=f'opened issue {title}',
|
||||||
eid='issue_comment_' + url,
|
eid='issue_comment_' + url,
|
||||||
is_bot=is_bot,
|
is_bot=_is_bot(d['user']),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -168,14 +171,13 @@ def _parse_pull_request(d: Dict) -> Event:
|
||||||
dts = d['created_at']
|
dts = d['created_at']
|
||||||
url = d['url']
|
url = d['url']
|
||||||
title = d['title']
|
title = d['title']
|
||||||
is_bot = "[bot]" in d["user"]
|
|
||||||
return Event(
|
return Event(
|
||||||
**_parse_common(d),
|
**_parse_common(d),
|
||||||
# TODO distinguish incoming/outgoing?
|
# TODO distinguish incoming/outgoing?
|
||||||
# TODO action? opened/closed??
|
# TODO action? opened/closed??
|
||||||
summary=f'opened PR {title}',
|
summary=f'opened PR {title}',
|
||||||
eid=EventIds.pr(dts=dts, action='opened', url=url),
|
eid=EventIds.pr(dts=dts, action='opened', url=url),
|
||||||
is_bot=is_bot,
|
is_bot=_is_bot(d['user']),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue