general: use less explicit kompress boilerplate in modules

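Now the get_files/kompress library can handle compressed archives transparently, so modules no longer need to wrap paths in ZipPath or call kopen/kexists explicitly.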
2023-10-14 23:18:01 +01:00 · 2023-10-14 23:18:01 +01:00 · 8c2d1c9463
commit 8c2d1c9463
parent c63e80ce94
9 changed files with 14 additions and 19 deletions
--- a/my/core/_deprecated/kompress.py
+++ b/my/core/_deprecated/kompress.py
@@ -257,4 +257,8 @@ class ZipPath(zipfile_Path):
        )
        return os.stat_result(tuple(params.values()))

+    @property
+    def suffix(self) -> str:
+        return Path(self.parts[-1]).suffix
+
 # fmt: on
--- a/my/core/structure.py
+++ b/my/core/structure.py
@@ -123,7 +123,8 @@ def match_structure(

            searchdir = Path(tempfile.mkdtemp(dir=tdir))

-            zf = zipfile.ZipFile(base)
+            # base might already be a ZipPath, and str(base) would end with /
+            zf = zipfile.ZipFile(str(base).rstrip('/'))
            zf.extractall(path=str(searchdir))

        else:
--- a/my/google/takeout/html.py
+++ b/my/google/takeout/html.py
@@ -146,12 +146,11 @@ class TakeoutHTMLParser(HTMLParser):


 def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
-    from ...core.kompress import kopen
    results: List[Parsed] = []
    def cb(dt: datetime, url: Url, title: Title) -> None:
        results.append((dt, url, title))
    parser = TakeoutHTMLParser(callback=cb)
-    with kopen(tpath, file) as fo:
+    with (tpath / file).open() as fo:
        data = fo.read()
        parser.feed(data)
    return results
--- a/my/google/takeout/parser.py
+++ b/my/google/takeout/parser.py
@@ -94,10 +94,9 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
    for path in reversed(inputs()):
        with ExitStack() as exit_stack:
            if config._use_zippath:
-                from my.core.kompress import ZipPath
                # for later takeouts it's just 'Takeout' dir,
                # but for older (pre 2015) it contains email/date in the subdir name
-                results = tuple(cast(Sequence[Path], ZipPath(path).iterdir()))
+                results = tuple(cast(Sequence[Path], path.iterdir()))
            else:
                results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True))
            for m in results:
--- a/my/google/takeout/paths.py
+++ b/my/google/takeout/paths.py
@@ -23,8 +23,6 @@ config = make_config(google)
 from pathlib import Path
 from typing import Optional, Iterable

-from ...core.kompress import kexists
-

 def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
    """
@@ -33,7 +31,7 @@ def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
    # TODO FIXME zip is not great..
    # allow a lambda expression? that way the user could restrict it
    for takeout in get_files(config.takeout_path, glob='*.zip'):
-        if path is None or kexists(takeout, path):
+        if path is None or (takeout / path).exists():
            yield takeout


--- a/my/instagram/gdpr.py
+++ b/my/instagram/gdpr.py
@@ -17,7 +17,6 @@ from my.core import (
    assert_never,
    make_logger,
 )
-from my.core.kompress import ZipPath

 from my.config import instagram as user_config

@@ -70,7 +69,7 @@ def _decode(s: str) -> str:


 def _entities() -> Iterator[Res[Union[User, _Message]]]:
-    last = ZipPath(max(inputs()))
+    last = max(inputs())
    # TODO make sure it works both with plan directory
    # idelaly get_files should return the right thing, and we won't have to force ZipPath/match_structure here
    # e.g. possible options are:
--- a/my/location/google.py
+++ b/my/location/google.py
@@ -21,7 +21,6 @@ import geopy # type: ignore

 from ..core.common import LazyLogger, mcachew
 from ..core.cachew import cache_dir
-from ..core import kompress

 from my.core.warnings import high

@@ -135,7 +134,7 @@ def _iter_locations(path: Path, start=0, stop=None) -> Iterable[Location]:
        ctx = path.open('r')
    else: # must be a takeout archive
        # todo CPath? although not sure if it can be iterative?
-        ctx = kompress.open(path, _LOCATION_JSON)
+        ctx = (path / _LOCATION_JSON).open()

    if USE_GREP:
        unzip = f'unzip -p "{path}" "{_LOCATION_JSON}"'
--- a/my/stackexchange/gdpr.py
+++ b/my/stackexchange/gdpr.py
@@ -6,7 +6,7 @@ Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][officia

 ### config
 from my.config import stackexchange as user_config
-from ..core import dataclass, PathIsh, make_config
+from ..core import dataclass, PathIsh, make_config, get_files
@dataclass
 class stackexchange(user_config):
    gdpr_path: PathIsh  # path to GDPR zip file
@@ -61,12 +61,11 @@ class Vote(NamedTuple):
    # todo expose vote type?

 import json
-from ..core.kompress import ZipPath
 from ..core.error import Res
 def votes() -> Iterable[Res[Vote]]:
    # TODO there is also some site specific stuff in qa/ directory.. not sure if its' more detailed
    # todo should be defensive? not sure if present when user has no votes
-    path = ZipPath(config.gdpr_path)
+    path = max(get_files(config.gdpr_path))
    votes_path = path / 'analytics' /  'qa\\vote.submit.json'  # yes, it does contain a backslash...
    j = json.loads(votes_path.read_text(encoding='utf-8-sig'))  # not sure why, but this encoding seems necessary
    for r in reversed(j): # they seem to be in decreasing order by default
--- a/my/twitter/archive.py
+++ b/my/twitter/archive.py
@@ -26,7 +26,6 @@ from functools import cached_property
 import html
 from ..core.common import Paths, datetime_aware
 from ..core.error import Res
-from ..core.kompress import ZipPath

@dataclass
 class twitter_archive(user_config):
@@ -164,9 +163,7 @@ class Like(NamedTuple):

 class ZipExport:
    def __init__(self, archive_path: Path) -> None:
-        # todo maybe this should be insude get_files instead, perhps covered with a flag?
-        self.zpath = ZipPath(archive_path)
-
+        self.zpath = archive_path
        if (self.zpath / 'tweets.csv').exists():
            from ..core.warnings import high
            high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")