my.github.gdpr/my.zulip.organization: use kompress support for tar.gz if it's available

otherwise, fall back to unpacking into a tmp dir via my.core.structure
Dima Gerasimov authored 2024-09-18 23:03:03 +01:00, committed by karlicoss
parent 201ddd4d7c
commit 6a18f47c37
4 changed files with 135 additions and 82 deletions
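Both reworked modules resolve the export root the same way: if kompress is recent enough to open the .tar.gz in place (via CPath), use it directly; otherwise unpack into a temporary directory with my.core.structure.match_structure and read from there. A condensed sketch of that shared pattern, using a hypothetical helper name (the real logic is inlined in events()/_entities() in the diffs below):

from __future__ import annotations

from pathlib import Path
from typing import Callable, Iterator, TypeVar

T = TypeVar('T')


def _with_export_root(last: Path, process: Callable[[Path], Iterator[T]]) -> Iterator[T]:
    # hypothetical helper sketching the pattern; not part of the actual commit
    root: Path | None = None
    if last.is_dir():  # already unpacked, or already a kompress CPath
        root = last
    else:
        try:
            from kompress import CPath  # .tar.gz support needs kompress>=0.2.20240918

            root = CPath(last)
            assert len(list(root.iterdir())) > 0  # older kompress can't list tar.gz members
        except Exception:
            root = None  # fall back to unpacking below
    if root is None:
        from my.core.structure import match_structure

        # unpack into a tmp dir; expected=() matches regardless of layout
        with match_structure(last, expected=()) as res:
            [root] = res
            yield from process(root)  # must be consumed while the tmp dir still exists
    else:
        yield from process(root)

The github module feeds its _process_one directly with the archive root; the zulip module does the same but then descends into the single subdirectory its export contains.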

my/core/kompress.py

@@ -1,4 +1,5 @@
 from .internal import assert_subpackage; assert_subpackage(__name__)

 from . import warnings

 # do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath)
@@ -8,10 +9,7 @@ try:
     from kompress import *
 except ModuleNotFoundError as e:
     if e.name == 'kompress':
-        warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.')
+        warnings.high('Please install kompress (pip3 install kompress). Falling onto vendorized kompress for now.')
         from ._deprecated.kompress import *  # type: ignore[assignment]
     else:
         raise e
-
-# this is deprecated in compress, keep here for backwards compatibility
-open = kopen  # noqa: F405
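The shim above keeps my.core.kompress importable whether or not the pypi kompress is installed, preferring the pypi package and warning when only the vendored copy is available. What the library provides, and what the two reworked modules below lean on, is a pathlib-style view into archives; with a recent enough kompress this now covers .tar.gz as well. A minimal sketch, assuming kompress>=0.2.20240918 and a made-up archive path:

import json

from kompress import CPath

# illustrative only -- the archive path is made up
root = CPath('/backups/github/github-gdpr-export.tar.gz')
for f in sorted(root.glob('*.json')):  # list members as if the archive were a directory
    data = json.loads(f.read_text())   # read a member without unpacking anything to disk
    print(f.name, len(data))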

my/github/gdpr.py

@@ -1,36 +1,42 @@
 """
 Github data (uses [[https://github.com/settings/admin][official GDPR export]])
 """
-from dataclasses import dataclass
+from __future__ import annotations
+
 import json
+from abc import abstractmethod
 from pathlib import Path
-import tarfile
-from typing import Iterable, Any, Sequence, Dict, Optional
+from typing import Any, Iterator, Sequence

-from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
-from my.core.cfg import make_config
-from my.core.error import notnone, echain
+from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings
+from my.core.error import echain

-from .common import Event, parse_dt, EventIds
-
-# TODO later, use a separate user config? (github_gdpr)
-from my.config import github as user_config
-
-
-@dataclass
-class github(user_config):
-    gdpr_dir: PathIsh  # path to unpacked GDPR archive
-
-
-config = make_config(github)
+from .common import Event, EventIds, parse_dt

 logger = make_logger(__name__)


+class config:
+    @property
+    @abstractmethod
+    def gdpr_dir(self) -> Paths:
+        raise NotImplementedError
+
+
+def make_config() -> config:
+    # TODO later, use a separate user config? (github_gdpr)
+    from my.config import github as user_config
+
+    class combined_config(user_config, config):
+        pass
+
+    return combined_config()
+
+
 def inputs() -> Sequence[Path]:
-    gdir = config.gdpr_dir
-    res = get_files(gdir)
+    gdpr_dir = make_config().gdpr_dir
+    res = get_files(gdpr_dir)
     schema_json = [f for f in res if f.name == 'schema.json']
     was_unpacked = len(schema_json) > 0
     if was_unpacked:
@@ -43,22 +49,37 @@ def inputs() -> Sequence[Path]:
     return res


-def events() -> Iterable[Res[Event]]:
+def events() -> Iterator[Res[Event]]:
     last = max(inputs())
     logger.info(f'extracting data from {last}')

-    # a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
-    # another one is zulip archive
-    if last.is_dir():
-        files = sorted(last.glob('*.json'))  # looks like all files are in the root
-        open_file = lambda f: f.open()
+    root: Path | None = None
+
+    if last.is_dir():  # if it's already CPath, this will match it
+        root = last
     else:
-        # treat as .tar.gz
-        tfile = tarfile.open(last)
-        files = sorted(map(Path, tfile.getnames()))
-        files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
-        open_file = lambda p: notnone(tfile.extractfile(f'./{p}'))  # NOTE odd, doesn't work without ./
+        try:
+            from kompress import CPath
+
+            root = CPath(last)
+            assert len(list(root.iterdir())) > 0  # trigger to check if we have the kompress version with targz support
+        except Exception as e:
+            logger.exception(e)
+            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")
+
+    if root is None:
+        from my.core.structure import match_structure
+
+        with match_structure(last, expected=()) as res:  # expected=() matches it regardless any patterns
+            [root] = res
+            yield from _process_one(root)
+    else:
+        yield from _process_one(root)
+
+
+def _process_one(root: Path) -> Iterator[Res[Event]]:
+    files = sorted(root.glob('*.json'))  # looks like all files are in the root

     # fmt: off
     handler_map = {
@@ -100,8 +121,7 @@ def events() -> Iterable[Res[Event]]:
             # ignored
             continue
-        with open_file(f) as fo:
-            j = json.load(fo)
+        j = json.loads(f.read_text())
         for r in j:
             try:
                 yield handler(r)
@@ -116,7 +136,7 @@ def stats() -> Stats:
 # TODO typing.TypedDict could be handy here..
-def _parse_common(d: Dict) -> Dict:
+def _parse_common(d: dict) -> dict:
     url = d['url']
     body = d.get('body')
     return {
@@ -126,7 +146,7 @@ def _parse_common(d: Dict) -> Dict:
     }

-def _parse_repository(d: Dict) -> Event:
+def _parse_repository(d: dict) -> Event:
     pref = 'https://github.com/'
     url = d['url']
     dts = d['created_at']
@@ -142,13 +162,13 @@ def _parse_repository(d: Dict) -> Event:
 # user may be None if the user was deleted
-def _is_bot(user: Optional[str]) -> bool:
+def _is_bot(user: str | None) -> bool:
     if user is None:
         return False
     return "[bot]" in user

-def _parse_issue_comment(d: Dict) -> Event:
+def _parse_issue_comment(d: dict) -> Event:
     url = d['url']
     return Event(
         **_parse_common(d),
@@ -158,7 +178,7 @@ def _parse_issue_comment(d: Dict) -> Event:
     )

-def _parse_issue(d: Dict) -> Event:
+def _parse_issue(d: dict) -> Event:
     url = d['url']
     title = d['title']
     return Event(
@@ -169,7 +189,7 @@ def _parse_issue(d: Dict) -> Event:
     )

-def _parse_pull_request(d: Dict) -> Event:
+def _parse_pull_request(d: dict) -> Event:
     dts = d['created_at']
     url = d['url']
     title = d['title']
@@ -183,7 +203,7 @@ def _parse_pull_request(d: Dict) -> Event:
     )

-def _parse_project(d: Dict) -> Event:
+def _parse_project(d: dict) -> Event:
     url = d['url']
     title = d['name']
     is_bot = "[bot]" in d["creator"]
@@ -198,7 +218,7 @@ def _parse_project(d: Dict) -> Event:
     )

-def _parse_release(d: Dict) -> Event:
+def _parse_release(d: dict) -> Event:
     tag = d['tag_name']
     return Event(
         **_parse_common(d),
@@ -207,7 +227,7 @@ def _parse_release(d: Dict) -> Event:
     )

-def _parse_commit_comment(d: Dict) -> Event:
+def _parse_commit_comment(d: dict) -> Event:
     url = d['url']
     return Event(
         **_parse_common(d),
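On the config side, the dataclass + my.core.cfg.make_config plumbing is replaced by an abstract config class that make_config() combines with the user's my.config.github at call time. The user-facing shape stays the same: a github class exposing gdpr_dir, which (being Paths now rather than a single PathIsh) may point at the unpacked directory, the .tar.gz export itself, or a glob for get_files to resolve. A hypothetical my.config entry:

# in the user's my.config -- illustrative, the path is made up
class github:
    gdpr_dir = '/backups/github/github-gdpr-export.tar.gz'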

my/zulip/organization.py

@@ -1,38 +1,55 @@
 """
 Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
 """
+from __future__ import annotations
+
+import json
+from abc import abstractmethod
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from itertools import count
-import json
 from pathlib import Path
-from typing import Sequence, Iterator, Dict, Union
+from typing import Iterator, Sequence

 from my.core import (
-    assert_never,
-    datetime_aware,
-    get_files,
-    stat,
     Json,
     Paths,
     Res,
     Stats,
+    assert_never,
+    datetime_aware,
+    get_files,
+    make_logger,
+    stat,
+    warnings,
 )
-from my.core.error import notnone

-import my.config
+logger = make_logger(__name__)


-@dataclass
-class organization(my.config.zulip.organization):
-    # paths[s]/glob to the exported JSON data
-    export_path: Paths
+class config:
+    @property
+    @abstractmethod
+    def export_path(self) -> Paths:
+        """paths[s]/glob to the exported JSON data"""
+        raise NotImplementedError
+
+
+def make_config() -> config:
+    from my.config import zulip as user_config
+
+    class combined_config(user_config.organization, config):
+        pass
+
+    return combined_config()


 def inputs() -> Sequence[Path]:
     # TODO: seems like export ids are kinda random..
     # not sure what's the best way to figure out the last without renaming?
     # could use mtime perhaps?
-    return get_files(organization.export_path, sort=False)
+    return get_files(make_config().export_path, sort=False)


 @dataclass(frozen=True)
@@ -85,19 +102,39 @@ class Message:
 # todo cache it
-def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
+def _entities() -> Iterator[Res[Server | Sender | _Message]]:
     last = max(inputs())
+    logger.info(f'extracting data from {last}')

-    # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
-    # I guess makes sense to have a special implementation for .tar.gz considering how common are they
-    import tarfile
-
-    tfile = tarfile.open(last)
-    subdir = tfile.getnames()[0]  # there is a directory inside tar file, first name should be that
-
-    with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
-        rj = json.load(fo)
+    root: Path | None = None
+
+    if last.is_dir():  # if it's already CPath, this will match it
+        root = last
+    else:
+        try:
+            from kompress import CPath
+
+            root = CPath(last)
+            assert len(list(root.iterdir())) > 0  # trigger to check if we have the kompress version with targz support
+        except Exception as e:
+            logger.exception(e)
+            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")
+
+    if root is None:
+        from my.core.structure import match_structure
+
+        with match_structure(last, expected=()) as res:  # expected=() matches it regardless any patterns
+            [root] = res
+            yield from _process_one(root)
+    else:
+        yield from _process_one(root)
+
+
+def _process_one(root: Path) -> Iterator[Res[Server | Sender | _Message]]:
+    [subdir] = root.iterdir()  # there is a directory inside tar file, first name should be that
+
+    rj = json.loads((subdir / 'realm.json').read_text())

     [sj] = rj['zerver_realm']
     server = Server(
@@ -136,12 +173,10 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
     for idx in count(start=1, step=1):
         fname = f'messages-{idx:06}.json'
-        fpath = f'{subdir}/{fname}'
-        if fpath not in tfile.getnames():
-            # tarfile doesn't have .exists?
+        fpath = subdir / fname
+        if not fpath.exists():
             break
-        with notnone(tfile.extractfile(fpath)) as fo:
-            mj = json.load(fo)
+        mj = json.loads(fpath.read_text())
         # TODO handle zerver_usermessage
         for j in mj['zerver_message']:
             try:
@@ -151,8 +186,8 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
 def messages() -> Iterator[Res[Message]]:
-    id2sender: Dict[int, Sender] = {}
-    id2server: Dict[int, Server] = {}
+    id2sender: dict[int, Sender] = {}
+    id2server: dict[int, Server] = {}
     for x in _entities():
         if isinstance(x, Exception):
             yield x
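my.zulip.organization follows the same scheme: config.export_path is abstract, make_config() mixes in the user's my.config.zulip.organization, and _process_one() expects the layout Zulip's organization export produces, namely a single subdirectory holding realm.json plus messages-000001.json, messages-000002.json, and so on, which the loop reads until one is missing. A hypothetical my.config entry (the glob is made up):

# in the user's my.config -- illustrative
class zulip:
    class organization:
        # path[s]/glob to the exported .tar.gz archives (or unpacked directories)
        export_path = '/backups/zulip/zulip-export-*.tar.gz'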

setup.py

@@ -4,13 +4,13 @@
 from setuptools import setup, find_namespace_packages  # type: ignore

 INSTALL_REQUIRES = [
-    'pytz',  # even though it's not needed by the core, it's so common anyway...
-    'typing-extensions',  # one of the most common pypi packages, ok to depend for core
-    'appdirs',  # very common, and makes it portable
-    'more-itertools',  # it's just too useful and very common anyway
+    'pytz' ,  # even though it's not needed by the core, it's so common anyway...
+    'typing-extensions' ,  # one of the most common pypi packages, ok to depend for core
+    'appdirs' ,  # very common, and makes it portable
+    'more-itertools' ,  # it's just too useful and very common anyway
     'decorator' ,  # less pain in writing correct decorators. very mature and stable, so worth keeping in core
     'click>=8.1' ,  # for the CLI, printing colors, decorator-based - may allow extensions to CLI
-    'kompress' ,  # for transparent access to compressed files via pathlib.Path
+    'kompress>=0.2.20240918' ,  # for transparent access to compressed files via pathlib.Path
 ]