From 049820c82739ce4704e98fe059b115db84e65652 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Tue, 31 May 2022 21:54:11 +0100
Subject: [PATCH] my.github.gdpr: support .tar.gz archives without unpacking

related to https://github.com/karlicoss/HPI/issues/20
---
 my/config.py      |  2 ++
 my/github/gdpr.py | 36 +++++++++++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/my/config.py b/my/config.py
index 1a8e49a..35e22fb 100644
--- a/my/config.py
+++ b/my/config.py
@@ -33,6 +33,8 @@ class pocket:
 class github:
     export_path: Paths = ''
+    gdpr_dir: Paths = ''
+
 
 class reddit:
     class rexport:
         export_path: Paths = ''
diff --git a/my/github/gdpr.py b/my/github/gdpr.py
index 0d75a87..c41fb6c 100644
--- a/my/github/gdpr.py
+++ b/my/github/gdpr.py
@@ -4,9 +4,11 @@ Github data (uses [[https://github.com/settings/admin][official GDPR export]])
 
 import json
 from pathlib import Path
+import tarfile
 from typing import Iterable, Dict, Any, Sequence
 
 from ..core import get_files, Res
+from ..core.error import notnone
 
 from .common import Event, parse_dt, EventIds
 
@@ -23,6 +25,10 @@ class github(user_config):
 
 ###
 
+from ..core import LazyLogger
+logger = LazyLogger(__name__)
+
+
 from ..core.cfg import make_config
 config = make_config(github)
 
@@ -33,18 +39,31 @@ def inputs() -> Sequence[Path]:
     schema_json = [f for f in res if f.name == 'schema.json']
     was_unpacked = len(schema_json) > 0
     if was_unpacked:
-        # legacy behaviour, we've been passed an extracted export directory
+        # 'legacy' behaviour, we've been passed an extracted export directory
+        # although in principle nothing wrong with running against a directory with several unpacked archives
+        # so need to think how to support that in the future as well
         return [schema_json[0].parent]
     # otherwise, should contain a bunch of archives?
     # not sure if need to warn if any of them aren't .tar.gz?
-    assert False, "TODO not implemented yet"
     return res
 
 
 def events() -> Iterable[Res[Event]]:
     last = max(inputs())
-    # TODO allow using archive here?
-    files = last.glob('*.json') # looks like all files are in the root
+
+    # a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
+    # another one is zulip archive
+    if last.is_dir():
+        files = list(sorted(last.glob('*.json'))) # looks like all files are in the root
+        open_file = lambda f: f.open()
+    else:
+        # treat as .tar.gz
+        tfile = tarfile.open(last)
+        files = list(sorted(map(Path, tfile.getnames())))
+        files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
+        open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
+
     handler_map = {
         'schema' : None,
         'issue_events_': None, # eh, doesn't seem to have any useful bodies
@@ -58,6 +77,12 @@ def events() -> Iterable[Res[Event]]:
         'projects_' : _parse_project,
         'releases_' : _parse_release,
         'commit_comments': _parse_commit_comment,
+        ## TODO need to handle these
+        'pull_request_review_comments_': None,
+        'pull_request_review_threads_': None,
+        'pull_request_reviews_': None,
+        ##
+        'repository_files_': None, # repository artifacts, probs not very useful
     }
     for f in files:
         handler: Any
@@ -74,7 +99,8 @@ def events() -> Iterable[Res[Event]]:
             # ignored
             continue
 
-        j = json.loads(f.read_text())
+        with open_file(f) as fo:
+            j = json.load(fo)
         for r in j:
             try:
                 yield handler(r)
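
A few reader notes follow; none of the code below is part of the commit.

On the `notnone` import: `tarfile.TarFile.extractfile` returns `None` for members that are not regular files (e.g. directory entries), which is why the tar branch wraps its result. The helper comes from `my/core/error.py`; outside HPI, a minimal stand-in could look like this:

    from typing import Optional, TypeVar

    T = TypeVar('T')

    def notnone(x: Optional[T]) -> T:
        # narrow Optional[T] down to T; fail loudly instead of letting None escape
        assert x is not None, 'expected non-None value'
        return x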
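
The core trick in `events()` is dispatching once on `last.is_dir()` and hiding the difference behind an `open_file` callable, so the handler loop further down stays untouched. Distilled into a self-contained sketch (the `iter_export_json` name is made up, and it sidesteps the `./`-prefix quirk noted in the patch by looking members up under their raw archive names):

    import json
    import tarfile
    from pathlib import Path
    from typing import Any, Iterator, Tuple

    def iter_export_json(source: Path) -> Iterator[Tuple[str, Any]]:
        # yield (filename, parsed json) for each top-level *.json, whether
        # `source` is an extracted export directory or a .tar.gz archive
        if source.is_dir():
            for p in sorted(source.glob('*.json')):
                with p.open() as fo:
                    yield p.name, json.load(fo)
        else:
            with tarfile.open(source) as tfile:
                # map normalized paths back to raw member names
                # (github's archives store members with a leading './')
                members = {Path(name): name for name in tfile.getnames()}
                for p in sorted(members):
                    if len(p.parts) != 1 or p.suffix != '.json':
                        continue
                    fo = tfile.extractfile(members[p])
                    if fo is None:  # non-file member, e.g. a directory entry
                        continue
                    with fo:
                        yield p.name, json.load(fo)

Usage would be e.g. `for name, j in iter_export_json(Path('github-export.tar.gz')): ...` (path illustrative).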
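
On the consumer side, the new `gdpr_dir: Paths = ''` stub in `my/config.py` mirrors the attribute HPI users set in their own config, which after this patch may point at either form of the export. Presumably something like the following (paths made up):

    # in your HPI config (the `my.config` module):
    class github:
        gdpr_dir = '~/backups/github/*.tar.gz'  # or the directory of an unpacked export

    # then:
    from my.github.gdpr import events
    for e in events():
        print(e)  # each item is an Event or an error value (Res[Event])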