general: use less explicit kompress boilerplate in modules

now get_files/kompress library can handle it transparently
This commit is contained in:
karlicoss 2023-10-14 23:18:01 +01:00
parent c63e80ce94
commit 8c2d1c9463
9 changed files with 14 additions and 19 deletions

View file

@ -257,4 +257,8 @@ class ZipPath(zipfile_Path):
)
return os.stat_result(tuple(params.values()))
@property
def suffix(self) -> str:
return Path(self.parts[-1]).suffix
# fmt: on

View file

@ -123,7 +123,8 @@ def match_structure(
searchdir = Path(tempfile.mkdtemp(dir=tdir))
zf = zipfile.ZipFile(base)
# base might already be a ZipPath, and str(base) would end with /
zf = zipfile.ZipFile(str(base).rstrip('/'))
zf.extractall(path=str(searchdir))
else:

View file

@ -146,12 +146,11 @@ class TakeoutHTMLParser(HTMLParser):
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
from ...core.kompress import kopen
results: List[Parsed] = []
def cb(dt: datetime, url: Url, title: Title) -> None:
results.append((dt, url, title))
parser = TakeoutHTMLParser(callback=cb)
with kopen(tpath, file) as fo:
with (tpath / file).open() as fo:
data = fo.read()
parser.feed(data)
return results

View file

@ -94,10 +94,9 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
for path in reversed(inputs()):
with ExitStack() as exit_stack:
if config._use_zippath:
from my.core.kompress import ZipPath
# for later takeouts it's just 'Takeout' dir,
# but for older (pre 2015) it contains email/date in the subdir name
results = tuple(cast(Sequence[Path], ZipPath(path).iterdir()))
results = tuple(cast(Sequence[Path], path.iterdir()))
else:
results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True))
for m in results:

View file

@ -23,8 +23,6 @@ config = make_config(google)
from pathlib import Path
from typing import Optional, Iterable
from ...core.kompress import kexists
def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
"""
@ -33,7 +31,7 @@ def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
# TODO FIXME zip is not great..
# allow a lambda expression? that way the user could restrict it
for takeout in get_files(config.takeout_path, glob='*.zip'):
if path is None or kexists(takeout, path):
if path is None or (takeout / path).exists():
yield takeout

View file

@ -17,7 +17,6 @@ from my.core import (
assert_never,
make_logger,
)
from my.core.kompress import ZipPath
from my.config import instagram as user_config
@ -70,7 +69,7 @@ def _decode(s: str) -> str:
def _entities() -> Iterator[Res[Union[User, _Message]]]:
last = ZipPath(max(inputs()))
last = max(inputs())
# TODO make sure it works both with plain directory
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
# e.g. possible options are:

View file

@ -21,7 +21,6 @@ import geopy # type: ignore
from ..core.common import LazyLogger, mcachew
from ..core.cachew import cache_dir
from ..core import kompress
from my.core.warnings import high
@ -135,7 +134,7 @@ def _iter_locations(path: Path, start=0, stop=None) -> Iterable[Location]:
ctx = path.open('r')
else: # must be a takeout archive
# todo CPath? although not sure if it can be iterative?
ctx = kompress.open(path, _LOCATION_JSON)
ctx = (path / _LOCATION_JSON).open()
if USE_GREP:
unzip = f'unzip -p "{path}" "{_LOCATION_JSON}"'

View file

@ -6,7 +6,7 @@ Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][officia
### config
from my.config import stackexchange as user_config
from ..core import dataclass, PathIsh, make_config
from ..core import dataclass, PathIsh, make_config, get_files
@dataclass
class stackexchange(user_config):
gdpr_path: PathIsh # path to GDPR zip file
@ -61,12 +61,11 @@ class Vote(NamedTuple):
# todo expose vote type?
import json
from ..core.kompress import ZipPath
from ..core.error import Res
def votes() -> Iterable[Res[Vote]]:
# TODO there is also some site specific stuff in qa/ directory.. not sure if it's more detailed
# todo should be defensive? not sure if present when user has no votes
path = ZipPath(config.gdpr_path)
path = max(get_files(config.gdpr_path))
votes_path = path / 'analytics' / 'qa\\vote.submit.json' # yes, it does contain a backslash...
j = json.loads(votes_path.read_text(encoding='utf-8-sig')) # not sure why, but this encoding seems necessary
for r in reversed(j): # they seem to be in decreasing order by default

View file

@ -26,7 +26,6 @@ from functools import cached_property
import html
from ..core.common import Paths, datetime_aware
from ..core.error import Res
from ..core.kompress import ZipPath
@dataclass
class twitter_archive(user_config):
@ -164,9 +163,7 @@ class Like(NamedTuple):
class ZipExport:
def __init__(self, archive_path: Path) -> None:
# todo maybe this should be inside get_files instead, perhaps covered with a flag?
self.zpath = ZipPath(archive_path)
self.zpath = archive_path
if (self.zpath / 'tweets.csv').exists():
from ..core.warnings import high
high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")