general: use less explicit kompress boilerplate in modules
now get_files/kompress library can handle it transparently
This commit is contained in:
parent
c63e80ce94
commit
8c2d1c9463
9 changed files with 14 additions and 19 deletions
|
@ -257,4 +257,8 @@ class ZipPath(zipfile_Path):
|
|||
)
|
||||
return os.stat_result(tuple(params.values()))
|
||||
|
||||
@property
|
||||
def suffix(self) -> str:
|
||||
return Path(self.parts[-1]).suffix
|
||||
|
||||
# fmt: on
|
||||
|
|
|
@ -123,7 +123,8 @@ def match_structure(
|
|||
|
||||
searchdir = Path(tempfile.mkdtemp(dir=tdir))
|
||||
|
||||
zf = zipfile.ZipFile(base)
|
||||
# base might already be a ZipPath, and str(base) would end with /
|
||||
zf = zipfile.ZipFile(str(base).rstrip('/'))
|
||||
zf.extractall(path=str(searchdir))
|
||||
|
||||
else:
|
||||
|
|
|
@ -146,12 +146,11 @@ class TakeoutHTMLParser(HTMLParser):
|
|||
|
||||
|
||||
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
|
||||
from ...core.kompress import kopen
|
||||
results: List[Parsed] = []
|
||||
def cb(dt: datetime, url: Url, title: Title) -> None:
|
||||
results.append((dt, url, title))
|
||||
parser = TakeoutHTMLParser(callback=cb)
|
||||
with kopen(tpath, file) as fo:
|
||||
with (tpath / file).open() as fo:
|
||||
data = fo.read()
|
||||
parser.feed(data)
|
||||
return results
|
||||
|
|
|
@ -94,10 +94,9 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
|
|||
for path in reversed(inputs()):
|
||||
with ExitStack() as exit_stack:
|
||||
if config._use_zippath:
|
||||
from my.core.kompress import ZipPath
|
||||
# for later takeouts it's just 'Takeout' dir,
|
||||
# but for older (pre 2015) it contains email/date in the subdir name
|
||||
results = tuple(cast(Sequence[Path], ZipPath(path).iterdir()))
|
||||
results = tuple(cast(Sequence[Path], path.iterdir()))
|
||||
else:
|
||||
results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True))
|
||||
for m in results:
|
||||
|
|
|
@ -23,8 +23,6 @@ config = make_config(google)
|
|||
from pathlib import Path
|
||||
from typing import Optional, Iterable
|
||||
|
||||
from ...core.kompress import kexists
|
||||
|
||||
|
||||
def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
|
||||
"""
|
||||
|
@ -33,7 +31,7 @@ def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
|
|||
# TODO FIXME zip is not great..
|
||||
# allow a lambda expression? that way the user could restrict it
|
||||
for takeout in get_files(config.takeout_path, glob='*.zip'):
|
||||
if path is None or kexists(takeout, path):
|
||||
if path is None or (takeout / path).exists():
|
||||
yield takeout
|
||||
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@ from my.core import (
|
|||
assert_never,
|
||||
make_logger,
|
||||
)
|
||||
from my.core.kompress import ZipPath
|
||||
|
||||
from my.config import instagram as user_config
|
||||
|
||||
|
@ -70,7 +69,7 @@ def _decode(s: str) -> str:
|
|||
|
||||
|
||||
def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||
last = ZipPath(max(inputs()))
|
||||
last = max(inputs())
|
||||
# TODO make sure it works both with plain directory
|
||||
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
|
||||
# e.g. possible options are:
|
||||
|
|
|
@ -21,7 +21,6 @@ import geopy # type: ignore
|
|||
|
||||
from ..core.common import LazyLogger, mcachew
|
||||
from ..core.cachew import cache_dir
|
||||
from ..core import kompress
|
||||
|
||||
from my.core.warnings import high
|
||||
|
||||
|
@ -135,7 +134,7 @@ def _iter_locations(path: Path, start=0, stop=None) -> Iterable[Location]:
|
|||
ctx = path.open('r')
|
||||
else: # must be a takeout archive
|
||||
# todo CPath? although not sure if it can be iterative?
|
||||
ctx = kompress.open(path, _LOCATION_JSON)
|
||||
ctx = (path / _LOCATION_JSON).open()
|
||||
|
||||
if USE_GREP:
|
||||
unzip = f'unzip -p "{path}" "{_LOCATION_JSON}"'
|
||||
|
|
|
@ -6,7 +6,7 @@ Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][officia
|
|||
|
||||
### config
|
||||
from my.config import stackexchange as user_config
|
||||
from ..core import dataclass, PathIsh, make_config
|
||||
from ..core import dataclass, PathIsh, make_config, get_files
|
||||
@dataclass
|
||||
class stackexchange(user_config):
|
||||
gdpr_path: PathIsh # path to GDPR zip file
|
||||
|
@ -61,12 +61,11 @@ class Vote(NamedTuple):
|
|||
# todo expose vote type?
|
||||
|
||||
import json
|
||||
from ..core.kompress import ZipPath
|
||||
from ..core.error import Res
|
||||
def votes() -> Iterable[Res[Vote]]:
|
||||
# TODO there is also some site specific stuff in qa/ directory.. not sure if it's more detailed
|
||||
# todo should be defensive? not sure if present when user has no votes
|
||||
path = ZipPath(config.gdpr_path)
|
||||
path = max(get_files(config.gdpr_path))
|
||||
votes_path = path / 'analytics' / 'qa\\vote.submit.json' # yes, it does contain a backslash...
|
||||
j = json.loads(votes_path.read_text(encoding='utf-8-sig')) # not sure why, but this encoding seems necessary
|
||||
for r in reversed(j): # they seem to be in decreasing order by default
|
||||
|
|
|
@ -26,7 +26,6 @@ from functools import cached_property
|
|||
import html
|
||||
from ..core.common import Paths, datetime_aware
|
||||
from ..core.error import Res
|
||||
from ..core.kompress import ZipPath
|
||||
|
||||
@dataclass
|
||||
class twitter_archive(user_config):
|
||||
|
@ -164,9 +163,7 @@ class Like(NamedTuple):
|
|||
|
||||
class ZipExport:
|
||||
def __init__(self, archive_path: Path) -> None:
|
||||
# todo maybe this should be inside get_files instead, perhaps covered with a flag?
|
||||
self.zpath = ZipPath(archive_path)
|
||||
|
||||
self.zpath = archive_path
|
||||
if (self.zpath / 'tweets.csv').exists():
|
||||
from ..core.warnings import high
|
||||
high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
|
||||
|
|
Loading…
Add table
Reference in a new issue