my.github.gdpr: support reading .tar.gz archives directly, without unpacking them
related to https://github.com/karlicoss/HPI/issues/20
This commit is contained in:
parent
1b4ca6ad1b
commit
049820c827
2 changed files with 33 additions and 5 deletions
|
@ -33,6 +33,8 @@ class pocket:
|
||||||
class github:
|
class github:
|
||||||
export_path: Paths = ''
|
export_path: Paths = ''
|
||||||
|
|
||||||
|
gdpr_dir: Paths = ''
|
||||||
|
|
||||||
class reddit:
|
class reddit:
|
||||||
class rexport:
|
class rexport:
|
||||||
export_path: Paths = ''
|
export_path: Paths = ''
|
||||||
|
|
|
@ -4,9 +4,11 @@ Github data (uses [[https://github.com/settings/admin][official GDPR export]])
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import tarfile
|
||||||
from typing import Iterable, Dict, Any, Sequence
|
from typing import Iterable, Dict, Any, Sequence
|
||||||
|
|
||||||
from ..core import get_files, Res
|
from ..core import get_files, Res
|
||||||
|
from ..core.error import notnone
|
||||||
|
|
||||||
from .common import Event, parse_dt, EventIds
|
from .common import Event, parse_dt, EventIds
|
||||||
|
|
||||||
|
@ -23,6 +25,10 @@ class github(user_config):
|
||||||
###
|
###
|
||||||
|
|
||||||
|
|
||||||
|
from ..core import LazyLogger
|
||||||
|
logger = LazyLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
from ..core.cfg import make_config
|
from ..core.cfg import make_config
|
||||||
config = make_config(github)
|
config = make_config(github)
|
||||||
|
|
||||||
|
@ -33,18 +39,31 @@ def inputs() -> Sequence[Path]:
|
||||||
schema_json = [f for f in res if f.name == 'schema.json']
|
schema_json = [f for f in res if f.name == 'schema.json']
|
||||||
was_unpacked = len(schema_json) > 0
|
was_unpacked = len(schema_json) > 0
|
||||||
if was_unpacked:
|
if was_unpacked:
|
||||||
# legacy behaviour, we've been passed an extracted export directory
|
# 'legacy' behaviour, we've been passed an extracted export directory
|
||||||
|
# although in principle nothing wrong with running against a directory with several unpacked archives
|
||||||
|
# so need to think how to support that in the future as well
|
||||||
return [schema_json[0].parent]
|
return [schema_json[0].parent]
|
||||||
# otherwise, should contain a bunch of archives?
|
# otherwise, should contain a bunch of archives?
|
||||||
# not sure if need to warn if any of them aren't .tar.gz?
|
# not sure if need to warn if any of them aren't .tar.gz?
|
||||||
assert False, "TODO not implemented yet"
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
def events() -> Iterable[Res[Event]]:
|
def events() -> Iterable[Res[Event]]:
|
||||||
last = max(inputs())
|
last = max(inputs())
|
||||||
# TODO allow using archive here?
|
|
||||||
files = last.glob('*.json') # looks like all files are in the root
|
# a bit naughty and ad-hoc, but we will generalize reading from tar.gz once we have more examples
|
||||||
|
# another one is zulip archive
|
||||||
|
if last.is_dir():
|
||||||
|
files = list(sorted(last.glob('*.json'))) # looks like all files are in the root
|
||||||
|
open_file = lambda f: f.open()
|
||||||
|
else:
|
||||||
|
# treat as .tar.gz
|
||||||
|
tfile = tarfile.open(last)
|
||||||
|
files = list(sorted(map(Path, tfile.getnames())))
|
||||||
|
files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
|
||||||
|
open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
|
||||||
|
|
||||||
|
|
||||||
handler_map = {
|
handler_map = {
|
||||||
'schema' : None,
|
'schema' : None,
|
||||||
'issue_events_': None, # eh, doesn't seem to have any useful bodies
|
'issue_events_': None, # eh, doesn't seem to have any useful bodies
|
||||||
|
@ -58,6 +77,12 @@ def events() -> Iterable[Res[Event]]:
|
||||||
'projects_' : _parse_project,
|
'projects_' : _parse_project,
|
||||||
'releases_' : _parse_release,
|
'releases_' : _parse_release,
|
||||||
'commit_comments': _parse_commit_comment,
|
'commit_comments': _parse_commit_comment,
|
||||||
|
## TODO need to handle these
|
||||||
|
'pull_request_review_comments_': None,
|
||||||
|
'pull_request_review_threads_': None,
|
||||||
|
'pull_request_reviews_': None,
|
||||||
|
##
|
||||||
|
'repository_files_': None, # repository artifacts, probs not very useful
|
||||||
}
|
}
|
||||||
for f in files:
|
for f in files:
|
||||||
handler: Any
|
handler: Any
|
||||||
|
@ -74,7 +99,8 @@ def events() -> Iterable[Res[Event]]:
|
||||||
# ignored
|
# ignored
|
||||||
continue
|
continue
|
||||||
|
|
||||||
j = json.loads(f.read_text())
|
with open_file(f) as fo:
|
||||||
|
j = json.load(fo)
|
||||||
for r in j:
|
for r in j:
|
||||||
try:
|
try:
|
||||||
yield handler(r)
|
yield handler(r)
|
||||||
|
|
Loading…
Add table
Reference in a new issue