my.github.gdpr/my.zulip.organization: use kompress support for tar.gz if it's available

otherwise, fall back to unpacking into a tmp dir via my.core.structure
Dima Gerasimov authored 2024-09-18 23:03:03 +01:00, committed by karlicoss
parent 201ddd4d7c
commit 6a18f47c37
4 changed files with 135 additions and 82 deletions
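Both reworked modules resolve the export root the same way: if kompress is recent enough to open the .tar.gz in place (via CPath), use it directly; otherwise unpack into a temporary directory with my.core.structure.match_structure and read from there. A condensed sketch of that shared pattern, using a hypothetical helper name (the real logic is inlined in events()/_entities() in the diffs below):

from __future__ import annotations

from pathlib import Path
from typing import Callable, Iterator, TypeVar

T = TypeVar('T')


def _with_export_root(last: Path, process: Callable[[Path], Iterator[T]]) -> Iterator[T]:
    # hypothetical helper sketching the pattern; not part of the actual commit
    root: Path | None = None
    if last.is_dir():  # already unpacked, or already a kompress CPath
        root = last
    else:
        try:
            from kompress import CPath  # .tar.gz support needs kompress>=0.2.20240918

            root = CPath(last)
            assert len(list(root.iterdir())) > 0  # older kompress can't list tar.gz members
        except Exception:
            root = None  # fall back to unpacking below
    if root is None:
        from my.core.structure import match_structure

        # unpack into a tmp dir; expected=() matches regardless of layout
        with match_structure(last, expected=()) as res:
            [root] = res
            yield from process(root)  # must be consumed while the tmp dir still exists
    else:
        yield from process(root)

The github module feeds its _process_one directly with the archive root; the zulip module does the same but then descends into the single subdirectory its export contains.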

my/core/kompress.py

@@ -1,4 +1,5 @@
 from .internal import assert_subpackage; assert_subpackage(__name__)

 from . import warnings

 # do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath)
@@ -8,10 +9,7 @@ try:
     from kompress import *
 except ModuleNotFoundError as e:
     if e.name == 'kompress':
-        warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.')
+        warnings.high('Please install kompress (pip3 install kompress). Falling onto vendorized kompress for now.')
         from ._deprecated.kompress import *  # type: ignore[assignment]
     else:
         raise e
-
-# this is deprecated in compress, keep here for backwards compatibility
-open = kopen  # noqa: F405
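The shim above keeps my.core.kompress importable whether or not the pypi kompress is installed, preferring the pypi package and warning when only the vendored copy is available. What the library provides, and what the two reworked modules below lean on, is a pathlib-style view into archives; with a recent enough kompress this now covers .tar.gz as well. A minimal sketch, assuming kompress>=0.2.20240918 and a made-up archive path:

import json

from kompress import CPath

# illustrative only -- the archive path is made up
root = CPath('/backups/github/github-gdpr-export.tar.gz')
for f in sorted(root.glob('*.json')):  # list members as if the archive were a directory
    data = json.loads(f.read_text())   # read a member without unpacking anything to disk
    print(f.name, len(data))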

my/github/gdpr.py

@@ -1,36 +1,42 @@
 """
 Github data (uses [[https://github.com/settings/admin][official GDPR export]])
 """
-from dataclasses import dataclass
+from __future__ import annotations
+
 import json
+from abc import abstractmethod
 from pathlib import Path
-import tarfile
-from typing import Iterable, Any, Sequence, Dict, Optional
+from typing import Any, Iterator, Sequence

-from my.core import get_files, Res, PathIsh, stat, Stats, make_logger
-from my.core.cfg import make_config
-from my.core.error import notnone, echain
+from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings
+from my.core.error import echain

-from .common import Event, parse_dt, EventIds
-
-# TODO later, use a separate user config? (github_gdpr)
-from my.config import github as user_config
-
-
-@dataclass
-class github(user_config):
-    gdpr_dir: PathIsh  # path to unpacked GDPR archive
-
-
-config = make_config(github)
+from .common import Event, EventIds, parse_dt

 logger = make_logger(__name__)


+class config:
+    @property
+    @abstractmethod
+    def gdpr_dir(self) -> Paths:
+        raise NotImplementedError
+
+
+def make_config() -> config:
+    # TODO later, use a separate user config? (github_gdpr)
+    from my.config import github as user_config
+
+    class combined_config(user_config, config):
+        pass
+
+    return combined_config()
+
+
 def inputs() -> Sequence[Path]:
-    gdir = config.gdpr_dir
-    res = get_files(gdir)
+    gdpr_dir = make_config().gdpr_dir
+    res = get_files(gdpr_dir)
     schema_json = [f for f in res if f.name == 'schema.json']
     was_unpacked = len(schema_json) > 0
     if was_unpacked:
@@ -43,22 +49,37 @@ def inputs() -> Sequence[Path]:
     return res


-def events() -> Iterable[Res[Event]]:
+def events() -> Iterator[Res[Event]]:
     last = max(inputs())
     logger.info(f'extracting data from {last}')

-    # a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples
-    # another one is zulip archive
-    if last.is_dir():
-        files = sorted(last.glob('*.json'))  # looks like all files are in the root
-        open_file = lambda f: f.open()
+    root: Path | None = None
+
+    if last.is_dir():  # if it's already CPath, this will match it
+        root = last
     else:
-        # treat as .tar.gz
-        tfile = tarfile.open(last)
-        files = sorted(map(Path, tfile.getnames()))
-        files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json']
-        open_file = lambda p: notnone(tfile.extractfile(f'./{p}'))  # NOTE odd, doesn't work without ./
+        try:
+            from kompress import CPath
+
+            root = CPath(last)
+            assert len(list(root.iterdir())) > 0  # trigger to check if we have the kompress version with targz support
+        except Exception as e:
+            logger.exception(e)
+            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")
+
+    if root is None:
+        from my.core.structure import match_structure
+
+        with match_structure(last, expected=()) as res:  # expected=() matches it regardless any patterns
+            [root] = res
+            yield from _process_one(root)
+    else:
+        yield from _process_one(root)
+
+
+def _process_one(root: Path) -> Iterator[Res[Event]]:
+    files = sorted(root.glob('*.json'))  # looks like all files are in the root

     # fmt: off
     handler_map = {
@@ -100,8 +121,7 @@ def events() -> Iterable[Res[Event]]:
             # ignored
             continue
-        with open_file(f) as fo:
-            j = json.load(fo)
+        j = json.loads(f.read_text())
         for r in j:
             try:
                 yield handler(r)
@@ -116,7 +136,7 @@ def stats() -> Stats:
 # TODO typing.TypedDict could be handy here..
-def _parse_common(d: Dict) -> Dict:
+def _parse_common(d: dict) -> dict:
     url = d['url']
     body = d.get('body')
     return {
@@ -126,7 +146,7 @@ def _parse_common(d: Dict) -> Dict:
     }

-def _parse_repository(d: Dict) -> Event:
+def _parse_repository(d: dict) -> Event:
     pref = 'https://github.com/'
     url = d['url']
     dts = d['created_at']
@@ -142,13 +162,13 @@ def _parse_repository(d: Dict) -> Event:
 # user may be None if the user was deleted
-def _is_bot(user: Optional[str]) -> bool:
+def _is_bot(user: str | None) -> bool:
     if user is None:
         return False
     return "[bot]" in user

-def _parse_issue_comment(d: Dict) -> Event:
+def _parse_issue_comment(d: dict) -> Event:
     url = d['url']
     return Event(
         **_parse_common(d),
@@ -158,7 +178,7 @@ def _parse_issue_comment(d: Dict) -> Event:
     )

-def _parse_issue(d: Dict) -> Event:
+def _parse_issue(d: dict) -> Event:
     url = d['url']
     title = d['title']
     return Event(
@@ -169,7 +189,7 @@ def _parse_issue(d: Dict) -> Event:
     )

-def _parse_pull_request(d: Dict) -> Event:
+def _parse_pull_request(d: dict) -> Event:
     dts = d['created_at']
     url = d['url']
     title = d['title']
@@ -183,7 +203,7 @@ def _parse_pull_request(d: Dict) -> Event:
     )

-def _parse_project(d: Dict) -> Event:
+def _parse_project(d: dict) -> Event:
     url = d['url']
     title = d['name']
     is_bot = "[bot]" in d["creator"]
@@ -198,7 +218,7 @@ def _parse_project(d: Dict) -> Event:
     )

-def _parse_release(d: Dict) -> Event:
+def _parse_release(d: dict) -> Event:
     tag = d['tag_name']
     return Event(
         **_parse_common(d),
@@ -207,7 +227,7 @@ def _parse_release(d: Dict) -> Event:
     )

-def _parse_commit_comment(d: Dict) -> Event:
+def _parse_commit_comment(d: dict) -> Event:
     url = d['url']
     return Event(
         **_parse_common(d),
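On the config side, the dataclass + my.core.cfg.make_config plumbing is replaced by an abstract config class that make_config() combines with the user's my.config.github at call time. The user-facing shape stays the same: a github class exposing gdpr_dir, which (being Paths now rather than a single PathIsh) may point at the unpacked directory, the .tar.gz export itself, or a glob for get_files to resolve. A hypothetical my.config entry:

# in the user's my.config -- illustrative, the path is made up
class github:
    gdpr_dir = '/backups/github/github-gdpr-export.tar.gz'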

my/zulip/organization.py

@@ -1,38 +1,55 @@
 """
 Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
 """
+from __future__ import annotations
+
+import json
+from abc import abstractmethod
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from itertools import count
-import json
 from pathlib import Path
-from typing import Sequence, Iterator, Dict, Union
+from typing import Iterator, Sequence

 from my.core import (
-    assert_never,
-    datetime_aware,
-    get_files,
-    stat,
     Json,
     Paths,
     Res,
     Stats,
+    assert_never,
+    datetime_aware,
+    get_files,
+    make_logger,
+    stat,
+    warnings,
 )
-from my.core.error import notnone

-import my.config
+logger = make_logger(__name__)


-@dataclass
-class organization(my.config.zulip.organization):
-    # paths[s]/glob to the exported JSON data
-    export_path: Paths
+class config:
+    @property
+    @abstractmethod
+    def export_path(self) -> Paths:
+        """paths[s]/glob to the exported JSON data"""
+        raise NotImplementedError
+
+
+def make_config() -> config:
+    from my.config import zulip as user_config
+
+    class combined_config(user_config.organization, config):
+        pass
+
+    return combined_config()


 def inputs() -> Sequence[Path]:
     # TODO: seems like export ids are kinda random..
     # not sure what's the best way to figure out the last without renaming?
     # could use mtime perhaps?
-    return get_files(organization.export_path, sort=False)
+    return get_files(make_config().export_path, sort=False)


 @dataclass(frozen=True)
@@ -85,19 +102,39 @@ class Message:
 # todo cache it
-def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
+def _entities() -> Iterator[Res[Server | Sender | _Message]]:
     last = max(inputs())
+    logger.info(f'extracting data from {last}')

-    # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
-    # I guess makes sense to have a special implementation for .tar.gz considering how common are they
-    import tarfile
-
-    tfile = tarfile.open(last)
-    subdir = tfile.getnames()[0]  # there is a directory inside tar file, first name should be that
-
-    with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
-        rj = json.load(fo)
+    root: Path | None = None
+
+    if last.is_dir():  # if it's already CPath, this will match it
+        root = last
+    else:
+        try:
+            from kompress import CPath
+
+            root = CPath(last)
+            assert len(list(root.iterdir())) > 0  # trigger to check if we have the kompress version with targz support
+        except Exception as e:
+            logger.exception(e)
+            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")
+
+    if root is None:
+        from my.core.structure import match_structure
+
+        with match_structure(last, expected=()) as res:  # expected=() matches it regardless any patterns
+            [root] = res
+            yield from _process_one(root)
+    else:
+        yield from _process_one(root)
+
+
+def _process_one(root: Path) -> Iterator[Res[Server | Sender | _Message]]:
+    [subdir] = root.iterdir()  # there is a directory inside tar file, first name should be that
+
+    rj = json.loads((subdir / 'realm.json').read_text())

     [sj] = rj['zerver_realm']
     server = Server(
@@ -136,12 +173,10 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
     for idx in count(start=1, step=1):
         fname = f'messages-{idx:06}.json'
-        fpath = f'{subdir}/{fname}'
-        if fpath not in tfile.getnames():
-            # tarfile doesn't have .exists?
+        fpath = subdir / fname
+        if not fpath.exists():
             break
-        with notnone(tfile.extractfile(fpath)) as fo:
-            mj = json.load(fo)
+        mj = json.loads(fpath.read_text())
         # TODO handle zerver_usermessage
         for j in mj['zerver_message']:
             try:
@@ -151,8 +186,8 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
 def messages() -> Iterator[Res[Message]]:
-    id2sender: Dict[int, Sender] = {}
-    id2server: Dict[int, Server] = {}
+    id2sender: dict[int, Sender] = {}
+    id2server: dict[int, Server] = {}
     for x in _entities():
         if isinstance(x, Exception):
             yield x
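my.zulip.organization follows the same scheme: config.export_path is abstract, make_config() mixes in the user's my.config.zulip.organization, and _process_one() expects the layout Zulip's organization export produces, namely a single subdirectory holding realm.json plus messages-000001.json, messages-000002.json, and so on, which the loop reads until one is missing. A hypothetical my.config entry (the glob is made up):

# in the user's my.config -- illustrative
class zulip:
    class organization:
        # path[s]/glob to the exported .tar.gz archives (or unpacked directories)
        export_path = '/backups/zulip/zulip-export-*.tar.gz'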

setup.py

@@ -4,13 +4,13 @@
 from setuptools import setup, find_namespace_packages  # type: ignore

 INSTALL_REQUIRES = [
-    'pytz',  # even though it's not needed by the core, it's so common anyway...
-    'typing-extensions',  # one of the most common pypi packages, ok to depend for core
-    'appdirs',  # very common, and makes it portable
-    'more-itertools',  # it's just too useful and very common anyway
+    'pytz' ,  # even though it's not needed by the core, it's so common anyway...
+    'typing-extensions' ,  # one of the most common pypi packages, ok to depend for core
+    'appdirs' ,  # very common, and makes it portable
+    'more-itertools' ,  # it's just too useful and very common anyway
     'decorator' ,  # less pain in writing correct decorators. very mature and stable, so worth keeping in core
     'click>=8.1' ,  # for the CLI, printing colors, decorator-based - may allow extensions to CLI
-    'kompress' ,  # for transparent access to compressed files via pathlib.Path
+    'kompress>=0.2.20240918' ,  # for transparent access to compressed files via pathlib.Path
 ]