my.github.gdpr/my.zulip.organization: use kompress support for tar.gz if it's available

otherwise fall back to unpacking into a tmp dir via my.core.structure
Dima Gerasimov 2024-09-18 23:03:03 +01:00 committed by karlicoss
parent 201ddd4d7c
commit 6a18f47c37
4 changed files with 135 additions and 82 deletions
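
Both converted modules below follow the same shape. A minimal sketch of the shared idea, assuming kompress is importable (the helper name _resolve_root is made up for illustration and does not appear in the commit):

from __future__ import annotations

from pathlib import Path


def _resolve_root(archive: Path) -> Path | None:
    # already an unpacked directory -- or a CPath, which also matches is_dir()
    if archive.is_dir():
        return archive
    try:
        from kompress import CPath  # pathlib-like access into compressed files
        root = CPath(archive)
        # probe: older kompress releases can't list the contents of a .tar.gz
        assert len(list(root.iterdir())) > 0
        return root
    except Exception:
        # caller falls back to unpacking into a tmp dir via my.core.structure.match_structure
        return None

When the probe fails, both modules unpack the archive into a temporary directory with match_structure(last, expected=()) and process that directory instead.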

View file

@@ -1,4 +1,5 @@
from .internal import assert_subpackage; assert_subpackage(__name__)
from . import warnings
# do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath)
@@ -8,10 +9,7 @@ try:
from kompress import *
except ModuleNotFoundError as e:
if e.name == 'kompress':
warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.')
warnings.high('Please install kompress (pip3 install kompress). Falling onto vendorized kompress for now.')
from ._deprecated.kompress import * # type: ignore[assignment]
else:
raise e
# this is deprecated in kompress, keep here for backwards compatibility
open = kopen # noqa: F405
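
For context, roughly what the kompress dependency provides here, assuming a release recent enough for native .tar.gz support (the archive path is made up):

from kompress import CPath

root = CPath('/path/to/export.tar.gz')  # behaves like a pathlib.Path rooted inside the archive
for f in sorted(root.glob('*.json')):
    print(f.name, len(f.read_text()))   # glob/iterdir/read_text are the same calls the modules below rely on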

View file

@@ -1,36 +1,42 @@
"""
Github data (uses [[https://github.com/settings/admin][official GDPR export]])
"""
from dataclasses import dataclass
from __future__ import annotations
import json
from abc import abstractmethod
from pathlib import Path
import tarfile
from typing import Iterable, Any, Sequence, Dict, Optional
from typing import Any, Iterator, Sequence
from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
from my.core.cfg import make_config
from my.core.error import notnone, echain
from .common import Event, parse_dt, EventIds
# TODO later, use a separate user config? (github_gdpr)
from my.config import github as user_config
@dataclass
class github(user_config):
gdpr_dir: PathIsh # path to unpacked GDPR archive
config = make_config(github)
from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings
from my.core.error import echain
from .common import Event, EventIds, parse_dt
logger = make_logger(__name__)
class config:
@property
@abstractmethod
def gdpr_dir(self) -> Paths:
raise NotImplementedError
def make_config() -> config:
# TODO later, use a separate user config? (github_gdpr)
from my.config import github as user_config
class combined_config(user_config, config):
pass
return combined_config()
def inputs() -> Sequence[Path]:
gdir = config.gdpr_dir
res = get_files(gdir)
gdpr_dir = make_config().gdpr_dir
res = get_files(gdpr_dir)
schema_json = [f for f in res if f.name == 'schema.json']
was_unpacked = len(schema_json) > 0
if was_unpacked:
@@ -43,22 +49,37 @@ def inputs() -> Sequence[Path]:
return res
def events() -> Iterable[Res[Event]]:
def events() -> Iterator[Res[Event]]:
last = max(inputs())
logger.info(f'extracting data from {last}')
# a bit naughty and ad-hoc, but we will generify reading from tar.gz once we have more examples
# another one is zulip archive
if last.is_dir():
files = sorted(last.glob('*.json')) # looks like all files are in the root
open_file = lambda f: f.open()
root: Path | None = None
if last.is_dir(): # if it's already CPath, this will match it
root = last
else:
# treat as .tar.gz
tfile = tarfile.open(last)
files = sorted(map(Path, tfile.getnames()))
files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./
try:
from kompress import CPath
root = CPath(last)
assert len(list(root.iterdir())) > 0 # trigger to check if we have the kompress version with targz support
except Exception as e:
logger.exception(e)
warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")
if root is None:
from my.core.structure import match_structure
with match_structure(last, expected=()) as res: # expected=() matches it regardless of any patterns
[root] = res
yield from _process_one(root)
else:
yield from _process_one(root)
def _process_one(root: Path) -> Iterator[Res[Event]]:
files = sorted(root.glob('*.json')) # looks like all files are in the root
# fmt: off
handler_map = {
@@ -100,8 +121,7 @@ def events() -> Iterable[Res[Event]]:
# ignored
continue
with open_file(f) as fo:
j = json.load(fo)
j = json.loads(f.read_text())
for r in j:
try:
yield handler(r)
@@ -116,7 +136,7 @@ def stats() -> Stats:
# TODO typing.TypedDict could be handy here..
def _parse_common(d: Dict) -> Dict:
def _parse_common(d: dict) -> dict:
url = d['url']
body = d.get('body')
return {
@@ -126,7 +146,7 @@ def _parse_common(d: Dict) -> Dict:
}
def _parse_repository(d: Dict) -> Event:
def _parse_repository(d: dict) -> Event:
pref = 'https://github.com/'
url = d['url']
dts = d['created_at']
@@ -142,13 +162,13 @@ def _parse_repository(d: Dict) -> Event:
# user may be None if the user was deleted
def _is_bot(user: Optional[str]) -> bool:
def _is_bot(user: str | None) -> bool:
if user is None:
return False
return "[bot]" in user
def _parse_issue_comment(d: Dict) -> Event:
def _parse_issue_comment(d: dict) -> Event:
url = d['url']
return Event(
**_parse_common(d),
@@ -158,7 +178,7 @@ def _parse_issue_comment(d: Dict) -> Event:
)
def _parse_issue(d: Dict) -> Event:
def _parse_issue(d: dict) -> Event:
url = d['url']
title = d['title']
return Event(
@@ -169,7 +189,7 @@ def _parse_issue(d: Dict) -> Event:
)
def _parse_pull_request(d: Dict) -> Event:
def _parse_pull_request(d: dict) -> Event:
dts = d['created_at']
url = d['url']
title = d['title']
@@ -183,7 +203,7 @@ def _parse_pull_request(d: Dict) -> Event:
)
def _parse_project(d: Dict) -> Event:
def _parse_project(d: dict) -> Event:
url = d['url']
title = d['name']
is_bot = "[bot]" in d["creator"]
@@ -198,7 +218,7 @@ def _parse_project(d: Dict) -> Event:
)
def _parse_release(d: Dict) -> Event:
def _parse_release(d: dict) -> Event:
tag = d['tag_name']
return Event(
**_parse_common(d),
@@ -207,7 +227,7 @@ def _parse_release(d: Dict) -> Event:
)
def _parse_commit_comment(d: Dict) -> Event:
def _parse_commit_comment(d: dict) -> Event:
url = d['url']
return Event(
**_parse_common(d),
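
With the events() rewrite above, gdpr_dir no longer has to point at an unpacked archive. A hypothetical example of the user config (normally defined in my.config; the path is made up) and of consuming the module:

# in my.config
class github:
    gdpr_dir = '/backups/github/github-gdpr.tar.gz'  # pointing straight at the .tar.gz now works; an unpacked dir still does too

from my.github.gdpr import events

for event in events():
    if not isinstance(event, Exception):  # events() yields Res[Event], i.e. errors are yielded rather than raised
        print(event)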

View file

@@ -1,38 +1,55 @@
"""
Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
"""
from __future__ import annotations
import json
from abc import abstractmethod
from dataclasses import dataclass
from datetime import datetime, timezone
from itertools import count
import json
from pathlib import Path
from typing import Sequence, Iterator, Dict, Union
from typing import Iterator, Sequence
from my.core import (
assert_never,
datetime_aware,
get_files,
stat,
Json,
Paths,
Res,
Stats,
assert_never,
datetime_aware,
get_files,
make_logger,
stat,
warnings,
)
from my.core.error import notnone
import my.config
logger = make_logger(__name__)
@dataclass
class organization(my.config.zulip.organization):
# paths[s]/glob to the exported JSON data
export_path: Paths
class config:
@property
@abstractmethod
def export_path(self) -> Paths:
"""paths[s]/glob to the exported JSON data"""
raise NotImplementedError
def make_config() -> config:
from my.config import zulip as user_config
class combined_config(user_config.organization, config):
pass
return combined_config()
def inputs() -> Sequence[Path]:
# TODO: seems like export ids are kinda random..
# not sure what's the best way to figure out the last without renaming?
# could use mtime perhaps?
return get_files(organization.export_path, sort=False)
return get_files(make_config().export_path, sort=False)
@dataclass(frozen=True)
@@ -85,19 +102,39 @@ class Message:
# todo cache it
def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
def _entities() -> Iterator[Res[Server | Sender | _Message]]:
last = max(inputs())
# todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
# I guess it makes sense to have a special implementation for .tar.gz considering how common they are
import tarfile
logger.info(f'extracting data from {last}')
tfile = tarfile.open(last)
root: Path | None = None
subdir = tfile.getnames()[0] # there is a directory inside tar file, first name should be that
if last.is_dir(): # if it's already CPath, this will match it
root = last
else:
try:
from kompress import CPath
with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
rj = json.load(fo)
root = CPath(last)
assert len(list(root.iterdir())) > 0 # trigger to check if we have the kompress version with targz support
except Exception as e:
logger.exception(e)
warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")
if root is None:
from my.core.structure import match_structure
with match_structure(last, expected=()) as res: # expected=() matches it regardless of any patterns
[root] = res
yield from _process_one(root)
else:
yield from _process_one(root)
def _process_one(root: Path) -> Iterator[Res[Server | Sender | _Message]]:
[subdir] = root.iterdir() # there is a single directory inside the tar file, and it should be the only entry
rj = json.loads((subdir / 'realm.json').read_text())
[sj] = rj['zerver_realm']
server = Server(
@@ -136,12 +173,10 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
for idx in count(start=1, step=1):
fname = f'messages-{idx:06}.json'
fpath = f'{subdir}/{fname}'
if fpath not in tfile.getnames():
# tarfile doesn't have .exists?
fpath = subdir / fname
if not fpath.exists():
break
with notnone(tfile.extractfile(fpath)) as fo:
mj = json.load(fo)
mj = json.loads(fpath.read_text())
# TODO handle zerver_usermessage
for j in mj['zerver_message']:
try:
@@ -151,8 +186,8 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
def messages() -> Iterator[Res[Message]]:
id2sender: Dict[int, Sender] = {}
id2server: Dict[int, Server] = {}
id2sender: dict[int, Sender] = {}
id2server: dict[int, Server] = {}
for x in _entities():
if isinstance(x, Exception):
yield x
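
The non-kompress branch shared by both modules leans on my.core.structure. A sketch of that fallback in isolation (the wrapper name is made up):

from __future__ import annotations

from pathlib import Path
from typing import Iterator


def _roots_via_unpacking(archive: Path) -> Iterator[Path]:
    from my.core.structure import match_structure
    # expected=() accepts any layout; the archive is unpacked into a tmp dir
    # which is only guaranteed to exist inside the with-block
    with match_structure(archive, expected=()) as res:
        [root] = res  # both modules expect exactly one unpacked root
        yield root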

View file

@@ -4,13 +4,13 @@
from setuptools import setup, find_namespace_packages # type: ignore
INSTALL_REQUIRES = [
'pytz', # even though it's not needed by the core, it's so common anyway...
'typing-extensions', # one of the most common pypi packages, ok to depend for core
'appdirs', # very common, and makes it portable
'more-itertools', # it's just too useful and very common anyway
'pytz' , # even though it's not needed by the core, it's so common anyway...
'typing-extensions' , # one of the most common pypi packages, ok to depend for core
'appdirs' , # very common, and makes it portable
'more-itertools' , # it's just too useful and very common anyway
'decorator' , # less pain in writing correct decorators. very mature and stable, so worth keeping in core
'click>=8.1' , # for the CLI, printing colors, decorator-based - may allow extensions to CLI
'kompress' , # for transparent access to compressed files via pathlib.Path
'kompress>=0.2.20240918' , # for transparent access to compressed files via pathlib.Path
]
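
The date-based pin is what the iterdir() probes in the modules above guard against at runtime. A quick stdlib-only way to check what is installed locally:

from importlib.metadata import PackageNotFoundError, version

try:
    # setup.py now requires at least 0.2.20240918 for the native .tar.gz support used above
    print('kompress', version('kompress'))
except PackageNotFoundError:
    print('kompress not installed -- my.core.kompress will fall back to the vendored copy')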