my.github.gdpr/my.zulip.organization: use kompress support for tar.gz if it's available
otherwise fall back onto unpacking into tmp dir via my.core.structure
parent 201ddd4d7c
commit 6a18f47c37
4 changed files with 135 additions and 82 deletions
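In short, both modules now follow the same pattern: try to read the .tar.gz export in place via kompress's CPath, and if that isn't possible (e.g. an older kompress without tar.gz support), fall back to unpacking into a temporary directory with my.core.structure.match_structure. A condensed sketch of the shared logic (not the exact code; `process` stands in for the per-module `_process_one`):

from __future__ import annotations

from pathlib import Path


def process(root: Path):
    # placeholder for the per-module _process_one(root)
    yield from ()


def entities(last: Path):
    root: Path | None = None
    if last.is_dir():  # already unpacked (or already a CPath)
        root = last
    else:
        try:
            from kompress import CPath  # tar.gz support needs a recent kompress
            root = CPath(last)
            assert len(list(root.iterdir())) > 0  # older kompress can't list tar.gz contents
        except Exception:
            root = None  # fall through to the tmp-dir fallback
    if root is None:
        from my.core.structure import match_structure
        with match_structure(last, expected=()) as res:  # unpacks into a tmp dir
            [root] = res
            yield from process(root)
    else:
        yield from process(root)

Either way, the rest of the pipeline only ever sees a directory-like `root`.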
my/core/kompress.py

@@ -1,4 +1,5 @@
 from .internal import assert_subpackage; assert_subpackage(__name__)
 
 from . import warnings
+
 # do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath)
@@ -8,10 +9,7 @@ try:
     from kompress import *
 except ModuleNotFoundError as e:
     if e.name == 'kompress':
-        warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.')
+        warnings.high('Please install kompress (pip3 install kompress). Falling onto vendorized kompress for now.')
         from ._deprecated.kompress import *  # type: ignore[assignment]
     else:
         raise e
-
-# this is deprecated in compress, keep here for backwards compatibility
-open = kopen  # noqa: F405
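For context on the shim above: my.core.kompress just re-exports the kompress API, falling back to the vendored copy when the library isn't installed, so downstream modules keep a single import path either way. A minimal usage sketch (the file path is hypothetical):

from my.core.kompress import CPath  # kompress.CPath if installed, vendored copy otherwise

p = CPath('/path/to/some/export.json.gz')  # hypothetical path; behaves like pathlib.Path
text = p.read_text()  # decompressed transparently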
my/github/gdpr.py

@@ -1,36 +1,42 @@
 """
 Github data (uses [[https://github.com/settings/admin][official GDPR export]])
 """
-from dataclasses import dataclass
+from __future__ import annotations
+
 import json
+from abc import abstractmethod
 from pathlib import Path
-import tarfile
-from typing import Iterable, Any, Sequence, Dict, Optional
+from typing import Any, Iterator, Sequence
 
-from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
-from my.core.cfg import make_config
-from my.core.error import notnone, echain
+from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings
+from my.core.error import echain
 
-from .common import Event, parse_dt, EventIds
-
-# TODO later, use a separate user config? (github_gdpr)
-from my.config import github as user_config
-
-
-@dataclass
-class github(user_config):
-    gdpr_dir: PathIsh  # path to unpacked GDPR archive
-
-
-config = make_config(github)
-
+from .common import Event, EventIds, parse_dt
 
 logger = make_logger(__name__)
 
 
+class config:
+    @property
+    @abstractmethod
+    def gdpr_dir(self) -> Paths:
+        raise NotImplementedError
+
+
+def make_config() -> config:
+    # TODO later, use a separate user config? (github_gdpr)
+    from my.config import github as user_config
+
+    class combined_config(user_config, config):
+        pass
+
+    return combined_config()
+
+
 def inputs() -> Sequence[Path]:
-    gdir = config.gdpr_dir
-    res = get_files(gdir)
+    gdpr_dir = make_config().gdpr_dir
+    res = get_files(gdpr_dir)
     schema_json = [f for f in res if f.name == 'schema.json']
     was_unpacked = len(schema_json) > 0
     if was_unpacked:
@@ -43,22 +49,37 @@ def inputs() -> Sequence[Path]:
     return res
 
 
-def events() -> Iterable[Res[Event]]:
+def events() -> Iterator[Res[Event]]:
     last = max(inputs())
 
     logger.info(f'extracting data from {last}')
 
-    # a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
-    # another one is zulip archive
-    if last.is_dir():
-        files = sorted(last.glob('*.json'))  # looks like all files are in the root
-        open_file = lambda f: f.open()
+    root: Path | None = None
+
+    if last.is_dir():  # if it's already CPath, this will match it
+        root = last
     else:
-        # treat as .tar.gz
-        tfile = tarfile.open(last)
-        files = sorted(map(Path, tfile.getnames()))
-        files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
-        open_file = lambda p: notnone(tfile.extractfile(f'./{p}'))  # NOTE odd, doesn't work without ./
+        try:
+            from kompress import CPath
+
+            root = CPath(last)
+            assert len(list(root.iterdir())) > 0  # trigger to check if we have the kompress version with targz support
+        except Exception as e:
+            logger.exception(e)
+            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")
+
+    if root is None:
+        from my.core.structure import match_structure
+
+        with match_structure(last, expected=()) as res:  # expected=() matches it regardless any patterns
+            [root] = res
+            yield from _process_one(root)
+    else:
+        yield from _process_one(root)
+
+
+def _process_one(root: Path) -> Iterator[Res[Event]]:
+    files = sorted(root.glob('*.json'))  # looks like all files are in the root
 
     # fmt: off
     handler_map = {
@@ -100,8 +121,7 @@ def events() -> Iterable[Res[Event]]:
             # ignored
             continue
 
-        with open_file(f) as fo:
-            j = json.load(fo)
+        j = json.loads(f.read_text())
         for r in j:
             try:
                 yield handler(r)
@@ -116,7 +136,7 @@ def stats() -> Stats:
 
 
 # TODO typing.TypedDict could be handy here..
-def _parse_common(d: Dict) -> Dict:
+def _parse_common(d: dict) -> dict:
     url = d['url']
     body = d.get('body')
     return {
@@ -126,7 +146,7 @@ def _parse_common(d: Dict) -> Dict:
     }
 
 
-def _parse_repository(d: Dict) -> Event:
+def _parse_repository(d: dict) -> Event:
     pref = 'https://github.com/'
     url = d['url']
     dts = d['created_at']
@@ -142,13 +162,13 @@ def _parse_repository(d: Dict) -> Event:
 
 
 # user may be None if the user was deleted
-def _is_bot(user: Optional[str]) -> bool:
+def _is_bot(user: str | None) -> bool:
     if user is None:
         return False
     return "[bot]" in user
 
 
-def _parse_issue_comment(d: Dict) -> Event:
+def _parse_issue_comment(d: dict) -> Event:
     url = d['url']
     return Event(
         **_parse_common(d),
@@ -158,7 +178,7 @@ def _parse_issue_comment(d: Dict) -> Event:
     )
 
 
-def _parse_issue(d: Dict) -> Event:
+def _parse_issue(d: dict) -> Event:
     url = d['url']
     title = d['title']
     return Event(
@@ -169,7 +189,7 @@ def _parse_issue(d: Dict) -> Event:
     )
 
 
-def _parse_pull_request(d: Dict) -> Event:
+def _parse_pull_request(d: dict) -> Event:
     dts = d['created_at']
     url = d['url']
     title = d['title']
@@ -183,7 +203,7 @@ def _parse_pull_request(d: Dict) -> Event:
     )
 
 
-def _parse_project(d: Dict) -> Event:
+def _parse_project(d: dict) -> Event:
     url = d['url']
     title = d['name']
     is_bot = "[bot]" in d["creator"]
@@ -198,7 +218,7 @@ def _parse_project(d: Dict) -> Event:
     )
 
 
-def _parse_release(d: Dict) -> Event:
+def _parse_release(d: dict) -> Event:
     tag = d['tag_name']
     return Event(
         **_parse_common(d),
@@ -207,7 +227,7 @@ def _parse_release(d: Dict) -> Event:
     )
 
 
-def _parse_commit_comment(d: Dict) -> Event:
+def _parse_commit_comment(d: dict) -> Event:
     url = d['url']
     return Event(
         **_parse_common(d),
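Note the new config() / make_config() indirection above: the user-side class in my.config only needs to provide gdpr_dir, and it is mixed into the abstract config at runtime instead of going through the old dataclass + make_config(github) route. A hypothetical my.config entry that satisfies the new property (and can now point straight at the .tar.gz export):

# hypothetical example for my.config
class github:
    gdpr_dir: str = '/backups/github/gdpr-export.tar.gz'  # or a glob / an unpacked directory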
my/zulip/organization.py

@@ -1,38 +1,55 @@
 """
 Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
 """
+
+from __future__ import annotations
+
+import json
+from abc import abstractmethod
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from itertools import count
-import json
 from pathlib import Path
-from typing import Sequence, Iterator, Dict, Union
+from typing import Iterator, Sequence
 
 from my.core import (
-    assert_never,
-    datetime_aware,
-    get_files,
-    stat,
     Json,
     Paths,
     Res,
     Stats,
+    assert_never,
+    datetime_aware,
+    get_files,
+    make_logger,
+    stat,
+    warnings,
 )
-from my.core.error import notnone
-import my.config
+
+logger = make_logger(__name__)
 
 
-@dataclass
-class organization(my.config.zulip.organization):
-    # paths[s]/glob to the exported JSON data
-    export_path: Paths
+class config:
+    @property
+    @abstractmethod
+    def export_path(self) -> Paths:
+        """paths[s]/glob to the exported JSON data"""
+        raise NotImplementedError
+
+
+def make_config() -> config:
+    from my.config import zulip as user_config
+
+    class combined_config(user_config.organization, config):
+        pass
+
+    return combined_config()
 
 
 def inputs() -> Sequence[Path]:
     # TODO: seems like export ids are kinda random..
     # not sure what's the best way to figure out the last without renaming?
     # could use mtime perhaps?
-    return get_files(organization.export_path, sort=False)
+    return get_files(make_config().export_path, sort=False)
 
 
 @dataclass(frozen=True)
@@ -85,19 +102,39 @@ class Message:
 
 
 # todo cache it
-def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
+def _entities() -> Iterator[Res[Server | Sender | _Message]]:
     last = max(inputs())
 
-    # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
-    # I guess makes sense to have a special implementation for .tar.gz considering how common are they
-    import tarfile
+    logger.info(f'extracting data from {last}')
 
-    tfile = tarfile.open(last)
+    root: Path | None = None
 
-    subdir = tfile.getnames()[0]  # there is a directory inside tar file, first name should be that
+    if last.is_dir():  # if it's already CPath, this will match it
+        root = last
+    else:
+        try:
+            from kompress import CPath
 
-    with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
-        rj = json.load(fo)
+            root = CPath(last)
+            assert len(list(root.iterdir())) > 0  # trigger to check if we have the kompress version with targz support
+        except Exception as e:
+            logger.exception(e)
+            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")
+
+    if root is None:
+        from my.core.structure import match_structure
+
+        with match_structure(last, expected=()) as res:  # expected=() matches it regardless any patterns
+            [root] = res
+            yield from _process_one(root)
+    else:
+        yield from _process_one(root)
+
+
+def _process_one(root: Path) -> Iterator[Res[Server | Sender | _Message]]:
+    [subdir] = root.iterdir()  # there is a directory inside tar file, first name should be that
+
+    rj = json.loads((subdir / 'realm.json').read_text())
 
     [sj] = rj['zerver_realm']
     server = Server(
@@ -136,12 +173,10 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
 
     for idx in count(start=1, step=1):
         fname = f'messages-{idx:06}.json'
-        fpath = f'{subdir}/{fname}'
-        if fpath not in tfile.getnames():
-            # tarfile doesn't have .exists?
+        fpath = subdir / fname
+        if not fpath.exists():
             break
-        with notnone(tfile.extractfile(fpath)) as fo:
-            mj = json.load(fo)
+        mj = json.loads(fpath.read_text())
         # TODO handle zerver_usermessage
         for j in mj['zerver_message']:
             try:
@@ -151,8 +186,8 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
 
 
 def messages() -> Iterator[Res[Message]]:
-    id2sender: Dict[int, Sender] = {}
-    id2server: Dict[int, Server] = {}
+    id2sender: dict[int, Sender] = {}
+    id2server: dict[int, Server] = {}
     for x in _entities():
         if isinstance(x, Exception):
             yield x
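my.zulip.organization gets the same config treatment; a hypothetical my.config entry providing export_path for the new make_config():

# hypothetical example for my.config
class zulip:
    class organization:
        export_path: str = '/backups/zulip/*.tar.gz'  # path[s]/glob to the exported JSON data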
setup.py

@@ -10,7 +10,7 @@ INSTALL_REQUIRES = [
     'more-itertools' ,  # it's just too useful and very common anyway
     'decorator'      ,  # less pain in writing correct decorators. very mature and stable, so worth keeping in core
     'click>=8.1'     ,  # for the CLI, printing colors, decorator-based - may allow extensions to CLI
-    'kompress'       ,  # for transparent access to compressed files via pathlib.Path
+    'kompress>=0.2.20240918',  # for transparent access to compressed files via pathlib.Path
 ]
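The bumped lower bound mirrors the warning text in the modules above: kompress>=0.2.20240918 is assumed to be the release whose CPath can iterate .tar.gz archives natively, so existing setups may need a `pip3 install --upgrade kompress` to avoid the tmp-dir fallback path.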