my.github.gdpr: support reading .tar.gz archives directly (without unpacking)

related to https://github.com/karlicoss/HPI/issues/20
Dima Gerasimov 2022-05-31 21:54:11 +01:00 committed by karlicoss
parent 1b4ca6ad1b
commit 049820c827
2 changed files with 33 additions and 5 deletions

Changed file 1: config stub

@@ -33,6 +33,8 @@ class pocket:
 class github:
     export_path: Paths = ''
+    gdpr_dir: Paths = ''
+
 class reddit:
     class rexport:
         export_path: Paths = ''
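
The new gdpr_dir field is what users point at their export. A minimal sketch of a user config under the new behaviour (the path and glob are hypothetical; Paths accepts directories as well as globs, as elsewhere in HPI):

    class github:
        # can still be the extracted export directory (legacy behaviour),
        # or, after this commit, the .tar.gz archive(s) themselves:
        gdpr_dir = '~/data/github/gdpr/*.tar.gz'  # hypothetical location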

Changed file 2: my/github/gdpr.py

@@ -4,9 +4,11 @@ Github data (uses [[https://github.com/settings/admin][official GDPR export]])
 import json
 from pathlib import Path
+import tarfile
 from typing import Iterable, Dict, Any, Sequence

 from ..core import get_files, Res
+from ..core.error import notnone

 from .common import Event, parse_dt, EventIds
@@ -23,6 +25,10 @@ class github(user_config):
 ###

+from ..core import LazyLogger
+logger = LazyLogger(__name__)
+
 from ..core.cfg import make_config
 config = make_config(github)
@@ -33,18 +39,31 @@ def inputs() -> Sequence[Path]:
     schema_json = [f for f in res if f.name == 'schema.json']
     was_unpacked = len(schema_json) > 0
     if was_unpacked:
-        # legacy behaviour, we've been passed an extracted export directory
+        # 'legacy' behaviour, we've been passed an extracted export directory
+        # although in principle nothing wrong with running against a directory with several unpacked archives
+        # so need to think how to support that in the future as well
         return [schema_json[0].parent]
     # otherwise, should contain a bunch of archives?
     # not sure if need to warn if any of them aren't .tar.gz?
-    assert False, "TODO not implemented yet"
     return res


 def events() -> Iterable[Res[Event]]:
     last = max(inputs())

-    # TODO allow using archive here?
-    files = last.glob('*.json')  # looks like all files are in the root
+    # a bit naughty and ad-hoc, but we will generify reading from tar.gz once we have more examples
+    # another one is the zulip archive
+    if last.is_dir():
+        files = list(sorted(last.glob('*.json')))  # looks like all files are in the root
+        open_file = lambda f: f.open()
+    else:
+        # treat it as a .tar.gz archive
+        tfile = tarfile.open(last)
+        files = list(sorted(map(Path, tfile.getnames())))
+        files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
+        open_file = lambda p: notnone(tfile.extractfile(f'./{p}'))  # NOTE: odd, but doesn't work without the './'

     handler_map = {
         'schema'       : None,
         'issue_events_': None,  # eh, doesn't seem to have any useful bodies
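
To make the archive branch above concrete, here is a self-contained sketch of the same tarfile pattern (the archive name is hypothetical). The './' juggling is needed because, judging by the NOTE in the diff, these exports store member names with a leading './', which Path() silently normalizes away:

    import tarfile
    from pathlib import Path

    with tarfile.open('github-export.tar.gz') as tfile:  # hypothetical archive
        # member names come back like './issues_000001.json'; Path() drops the './'
        names = sorted(map(Path, tfile.getnames()))
        jsons = [p for p in names if len(p.parts) == 1 and p.suffix == '.json']
        for p in jsons:
            # extractfile() wants the exact stored name, hence re-adding the './'
            member = tfile.extractfile(f'./{p}')
            assert member is not None  # extractfile returns None for non-regular members
            data = member.read()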
@@ -58,6 +77,12 @@ def events() -> Iterable[Res[Event]]:
         'projects_'      : _parse_project,
         'releases_'      : _parse_release,
         'commit_comments': _parse_commit_comment,
+        ## TODO need to handle these
+        'pull_request_review_comments_': None,
+        'pull_request_review_threads_': None,
+        'pull_request_reviews_': None,
+        ##
+        'repository_files_': None,  # repository artifacts, probs not very useful
     }
     for f in files:
         handler: Any
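
The code pairing each file with its handler falls between these two hunks and isn't shown; conceptually it is a prefix match over file names, roughly like this hypothetical sketch (a None value means 'recognized but deliberately skipped'):

    from typing import Any, Dict

    def find_handler(fname: str, handler_map: Dict[str, Any]) -> Any:
        # first prefix that the file name starts with wins
        for prefix, handler in handler_map.items():
            if fname.startswith(prefix):
                return handler
        raise RuntimeError(f'unhandled file: {fname}')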
@@ -74,7 +99,8 @@ def events() -> Iterable[Res[Event]]:
             # ignored
             continue

-        j = json.loads(f.read_text())
+        with open_file(f) as fo:
+            j = json.load(fo)
         for r in j:
             try:
                 yield handler(r)
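
Downstream consumption is unchanged: callers iterate events() and filter exceptions out of the Res stream themselves. A minimal usage sketch, assuming the dt and summary fields of Event from my.github.common:

    from my.github.gdpr import events

    for e in events():
        if isinstance(e, Exception):
            print('failed to parse:', e)
        else:
            print(e.dt, e.summary)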