From 201ddd4d7c45f63f3e3196f6b9be22402822680d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 16 Sep 2024 23:41:58 +0100 Subject: [PATCH] my.core.structure: add support for .tar.gz archives this will be useful to migrate .tar.gz processing to kompress in a backwards compatible way, or to run them against unpacked folder structure if user prefers --- my/core/structure.py | 47 +++++++++++------- my/core/tests/structure.py | 7 +-- .../tests/structure_data/gdpr_export.tar.gz | Bin 0 -> 349 bytes 3 files changed, 33 insertions(+), 21 deletions(-) create mode 100644 my/core/tests/structure_data/gdpr_export.tar.gz diff --git a/my/core/structure.py b/my/core/structure.py index be5b307..fa26532 100644 --- a/my/core/structure.py +++ b/my/core/structure.py @@ -1,6 +1,8 @@ import atexit import os import shutil +import sys +import tarfile import tempfile import zipfile from contextlib import contextmanager @@ -34,6 +36,7 @@ def _structure_exists(base_dir: Path, paths: Sequence[str], *, partial: bool = F ZIP_EXT = {".zip"} +TARGZ_EXT = {".tar.gz"} @contextmanager @@ -44,7 +47,7 @@ def match_structure( partial: bool = False, ) -> Generator[Tuple[Path, ...], None, None]: """ - Given a 'base' directory or zipfile, recursively search for one or more paths that match the + Given a 'base' directory or archive (zip/tar.gz), recursively search for one or more paths that match the pattern described in 'expected'. That can be a single string, or a list of relative paths (as strings) you expect at the same directory. @@ -52,12 +55,12 @@ def match_structure( expected be present, not all of them. This reduces the chances of the user misconfiguring gdpr exports, e.g. - if they zipped the folders instead of the parent directory or vice-versa + if they archived the folders instead of the parent directory or vice-versa When this finds a matching directory structure, it stops searching in that subdirectory and continues onto other possible subdirectories which could match - If base is a zipfile, this extracts the zipfile into a temporary directory + If base is an archive, this extracts it into a temporary directory (configured by core_config.config.get_tmp_dir), and then searches the extracted folder for matching structures @@ -93,12 +96,12 @@ def match_structure( This doesn't require an exhaustive list of expected values, but its a good idea to supply a complete picture of the expected structure to avoid false-positives - This does not recursively unzip zipfiles in the subdirectories, - it only unzips into a temporary directory if 'base' is a zipfile + This does not recursively decompress archives in the subdirectories, + it only unpacks into a temporary directory if 'base' is an archive A common pattern for using this might be to use get_files to get a list - of zipfiles or top-level gdpr export directories, and use match_structure - to search the resulting paths for a export structure you're expecting + of archives or top-level gdpr export directories, and use match_structure + to search the resulting paths for an export structure you're expecting """ from . import core_config as CC @@ -108,26 +111,34 @@ def match_structure( expected = (expected,) is_zip: bool = base.suffix in ZIP_EXT + is_targz: bool = any(base.name.endswith(suffix) for suffix in TARGZ_EXT) searchdir: Path = base.absolute() try: - # if the file given by the user is a zipfile, create a temporary - # directory and extract the zipfile to that temporary directory + # if the file given by the user is an archive, create a temporary + # directory and extract it to that temporary directory # # this temporary directory is removed in the finally block - if is_zip: + if is_zip or is_targz: # sanity check before we start creating directories/rm-tree'ing things - assert base.exists(), f"zipfile at {base} doesn't exist" + assert base.exists(), f"archive at {base} doesn't exist" searchdir = Path(tempfile.mkdtemp(dir=tdir)) - # base might already be a ZipPath, and str(base) would end with / - zf = zipfile.ZipFile(str(base).rstrip('/')) - zf.extractall(path=str(searchdir)) - + if is_zip: + # base might already be a ZipPath, and str(base) would end with / + zf = zipfile.ZipFile(str(base).rstrip('/')) + zf.extractall(path=str(searchdir)) + elif is_targz: + with tarfile.open(str(base)) as tar: + # filter is a security feature, will be required param in later python version + mfilter = {'filter': 'data'} if sys.version_info[:2] >= (3, 12) else {} + tar.extractall(path=str(searchdir), **mfilter) # type: ignore[arg-type] + else: + raise RuntimeError("can't happen") else: if not searchdir.is_dir(): - raise NotADirectoryError(f"Expected either a zipfile or a directory, received {searchdir}") + raise NotADirectoryError(f"Expected either a zip/tar.gz archive or a directory, received {searchdir}") matches: List[Path] = [] possible_targets: List[Path] = [searchdir] @@ -150,9 +161,9 @@ def match_structure( finally: - if is_zip: + if is_zip or is_targz: # make sure we're not mistakenly deleting data - assert str(searchdir).startswith(str(tdir)), f"Expected the temporary directory for extracting zip to start with the temporary directory prefix ({tdir}), found {searchdir}" + assert str(searchdir).startswith(str(tdir)), f"Expected the temporary directory for extracting archive to start with the temporary directory prefix ({tdir}), found {searchdir}" shutil.rmtree(str(searchdir)) diff --git a/my/core/tests/structure.py b/my/core/tests/structure.py index 6a94fc4..741e0ea 100644 --- a/my/core/tests/structure.py +++ b/my/core/tests/structure.py @@ -14,8 +14,9 @@ def test_gdpr_structure_exists() -> None: assert results == (structure_data / "gdpr_subdirs" / "gdpr_export",) -def test_gdpr_unzip() -> None: - with match_structure(structure_data / "gdpr_export.zip", expected=gdpr_expected) as results: +@pytest.mark.parametrize("archive", ["gdpr_export.zip", "gdpr_export.tar.gz"]) +def test_gdpr_unpack(archive: str) -> None: + with match_structure(structure_data / archive, expected=gdpr_expected) as results: assert len(results) == 1 extracted = results[0] index_file = extracted / "messages" / "index.csv" @@ -32,6 +33,6 @@ def test_match_partial() -> None: def test_not_directory() -> None: - with pytest.raises(NotADirectoryError, match=r"Expected either a zipfile or a directory"): + with pytest.raises(NotADirectoryError, match=r"Expected either a zip/tar.gz archive or a directory"): with match_structure(structure_data / "messages/index.csv", expected=gdpr_expected): pass diff --git a/my/core/tests/structure_data/gdpr_export.tar.gz b/my/core/tests/structure_data/gdpr_export.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..4f0597cdc7f3aa062ae896786375c5df87d49ec0 GIT binary patch literal 349 zcmV-j0iymNiwFP!000021MQgYZh|lrhWmS0!4+sf=;`GcgpM$UlC~O?W%s_i%*2^Y zXFZ&OZgnulN@i{zdo#NJi2B!+HOBA;|M`W&T_3Tv6*Z`7g2m&BlMz zRr;_f-9Fy`jr_m#&-tr{*NS^|K6I{W~;~O0r%hkN&(*g^YHJq_f1z1i2s=UmlQDnG5?Wd^FP|(pSb=n v1m^!d{15&6^N0Ko&VTw3+YIx63cPkc`*w{t0fHb1f;