From 821bc08a23924955fddd145ec62d0e9785bbef27 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Wed, 7 Jul 2021 16:44:55 -0700 Subject: [PATCH] core/structure: help locate/extract gdpr exports (#175) * core/structure: help locate/extract gdpr exports * ci: add install-types to install stub packages --- my/core/common.py | 1 + my/core/structure.py | 150 ++++++++++++++++++ tests/core/structure_data/gdpr_export.zip | Bin 0 -> 1273 bytes .../broken_export/comments/comments.json | 0 .../broken_export/messages/index.csv | 0 .../gdpr_export/comments/comments.json | 0 .../gdpr_export/messages/index.csv | 1 + .../gdpr_export/profile/settings.json | 0 tests/core/test_structure.py | 37 +++++ tox.ini | 5 +- 10 files changed, 192 insertions(+), 2 deletions(-) create mode 100644 my/core/structure.py create mode 100644 tests/core/structure_data/gdpr_export.zip create mode 100644 tests/core/structure_data/gdpr_subdirs/broken_export/comments/comments.json create mode 100644 tests/core/structure_data/gdpr_subdirs/broken_export/messages/index.csv create mode 100644 tests/core/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json create mode 100644 tests/core/structure_data/gdpr_subdirs/gdpr_export/messages/index.csv create mode 100644 tests/core/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json create mode 100644 tests/core/test_structure.py diff --git a/my/core/common.py b/my/core/common.py index 48bc4c7..ee7533c 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -18,6 +18,7 @@ def import_file(p: PathIsh, name: Optional[str]=None) -> types.ModuleType: name = p.stem import importlib.util spec = importlib.util.spec_from_file_location(name, p) + assert spec is not None, f"Fatal error; Could not create module spec from {name} {p}" foo = importlib.util.module_from_spec(spec) loader = spec.loader; assert loader is not None loader.exec_module(foo) # type: ignore[attr-defined] diff --git a/my/core/structure.py b/my/core/structure.py new file mode 100644 
import os
import shutil
import tempfile
import zipfile

from collections import deque
from contextlib import contextmanager
from pathlib import Path
from typing import Deque, Generator, List, Optional, Sequence, Tuple, Union


def _structure_exists(base_dir: Path, paths: Sequence[str]) -> bool:
    """
    Helper function for match_structure to check if
    all subpaths exist at some base directory

    For example:

    dir1
    ├── index.json
    └── messages
        └── messages.csv

    _structure_exists(Path("dir1"), ["index.json", "messages/messages.csv"])

    Returns True only if every relative path in 'paths' exists under 'base_dir'
    (an empty 'paths' therefore trivially matches).
    """
    return all((base_dir / p).exists() for p in paths)


def _find_matches(searchdir: Path, expected: Sequence[str]) -> List[Path]:
    """
    Breadth-first search of 'searchdir' for directories that contain every
    relative path in 'expected'.

    Once a directory matches, its subdirectories are not searched. Sibling
    directories are visited in name order so the result is deterministic
    (os.scandir yields entries in arbitrary order).
    """
    matches: List[Path] = []
    pending: Deque[Path] = deque([searchdir])

    while pending:
        # deque.popleft is O(1), unlike list.pop(0)
        candidate = pending.popleft()

        if _structure_exists(candidate, expected):
            matches.append(candidate)
            continue

        # no match here; queue up any subdirectories as further candidates.
        # the 'with' closes the scandir iterator even if is_dir() raises
        with os.scandir(candidate) as entries:
            subdirs = sorted(entry.name for entry in entries if entry.is_dir())
        pending.extend(candidate / name for name in subdirs)

    return matches


# file suffixes which are treated as zipfiles and extracted before searching
ZIP_EXT = {".zip"}


@contextmanager
def match_structure(
    base: Path,
    expected: Union[str, Sequence[str]],
) -> Generator[Tuple[Path, ...], None, None]:
    """
    Given a 'base' directory or zipfile, recursively search for one or more paths that match the
    pattern described in 'expected'. That can be a single string, or a list
    of relative paths (as strings) you expect at the same directory.

    This reduces the chances of the user misconfiguring gdpr exports, e.g.
    if they zipped the folders instead of the parent directory or vice-versa

    When this finds a matching directory structure, it stops searching in that subdirectory
    and continues onto other possible subdirectories which could match

    If base is a zipfile, this extracts the zipfile into a temporary directory
    (configured by core_config.config.get_tmp_dir), and then searches the extracted
    folder for matching structures. The temporary directory is removed when the
    'with' block exits, so any paths yielded for a zipfile are only valid inside the block

    This returns the top of every matching folder structure it finds

    As an example:

    export_dir
    ├── exp_2020
    │   ├── channel_data
    │   │   ├── data1
    │   │   └── data2
    │   ├── index.json
    │   ├── messages
    │   │   └── messages.csv
    │   └── profile
    │       └── settings.json
    └── exp_2021
        ├── channel_data
        │   ├── data1
        │   └── data2
        ├── index.json
        ├── messages
        │   └── messages.csv
        └── profile
            └── settings.json

    Giving the top directory as the base, and some expected relative path like:

    with match_structure(Path("export_dir"), expected=("messages/messages.csv", "index.json")) as results:
        # results in this block is (Path("export_dir/exp_2020"), Path("export_dir/exp_2021"))

    This doesn't require an exhaustive list of expected values, but it's a good idea to supply
    a complete picture of the expected structure to avoid false-positives

    This does not recursively unzip zipfiles in the subdirectories,
    it only unzips into a temporary directory if 'base' is a zipfile

    A common pattern for using this might be to use get_files to get a list
    of zipfiles or top-level gdpr export directories, and use match_structure
    to search the resulting paths for an export structure you're expecting

    If nothing matches, a high-priority warning is emitted and an empty tuple is yielded.

    Raises:
        FileNotFoundError: 'base' has a zip suffix but does not exist
        NotADirectoryError: 'base' is neither a zipfile nor a directory
    """
    # imported lazily, like core_config, so importing this module stays cheap
    from . import core_config as CC
    from . import warnings as core_warnings

    tdir = CC.config.get_tmp_dir()

    if isinstance(expected, str):
        expected = (expected,)

    is_zip: bool = base.suffix in ZIP_EXT

    searchdir: Path = base.absolute()

    # set only once the temporary extraction directory has actually been created,
    # so the cleanup below can never touch a path this function did not create
    extracted_dir: Optional[Path] = None

    try:
        # if the file given by the user is a zipfile, create a temporary
        # directory and extract the zipfile to that temporary directory
        #
        # this temporary directory is removed in the finally block
        if is_zip:
            # sanity check before we start creating directories/rm-tree'ing things
            # (an explicit raise, since asserts are stripped under 'python -O')
            if not base.exists():
                raise FileNotFoundError(f"zipfile at {base} doesn't exist")

            extracted_dir = Path(tempfile.mkdtemp(dir=tdir))
            searchdir = extracted_dir

            # context manager closes the archive handle once extraction is done
            with zipfile.ZipFile(base) as zf:
                zf.extractall(path=str(searchdir))

        elif not searchdir.is_dir():
            raise NotADirectoryError(f"Expected either a zipfile or a directory, received {searchdir}")

        matches = _find_matches(searchdir, expected)

        if not matches:
            core_warnings.high(
                f"While searching {base}, could not find a matching folder structure. "
                f"Expected {expected}. "
                "You're probably missing required files in the gdpr/export"
            )

        yield tuple(matches)

    finally:
        # only ever delete the directory created by mkdtemp above; if we failed
        # before creating it (e.g. the zipfile was missing) there is nothing to
        # clean up, and the user's own data is never a candidate for removal
        if extracted_dir is not None:
            shutil.rmtree(str(extracted_dir))
core.orgmode? # todo add tests? - python3 -m mypy -p my.core \ + python3 -m mypy --install-types --non-interactive \ + -p my.core \ --txt-report .coverage.mypy-core \ --html-report .coverage.mypy-core \ {posargs} @@ -89,7 +90,7 @@ commands = # todo fuck. -p my.github isn't checking the subpackages?? wtf... # guess it wants .pyi file?? - python3 -m mypy \ + python3 -m mypy --install-types --non-interactive \ -p my.endomondo \ -p my.github.ghexport \ -p my.hypothesis \