general: use less explicit kompress boilerplate in modules
now that the get_files/kompress library can handle it transparently
This commit is contained in:
parent
c63e80ce94
commit
8c2d1c9463
9 changed files with 14 additions and 19 deletions
|
@ -257,4 +257,8 @@ class ZipPath(zipfile_Path):
|
||||||
)
|
)
|
||||||
return os.stat_result(tuple(params.values()))
|
return os.stat_result(tuple(params.values()))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def suffix(self) -> str:
|
||||||
|
return Path(self.parts[-1]).suffix
|
||||||
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
|
@ -123,7 +123,8 @@ def match_structure(
|
||||||
|
|
||||||
searchdir = Path(tempfile.mkdtemp(dir=tdir))
|
searchdir = Path(tempfile.mkdtemp(dir=tdir))
|
||||||
|
|
||||||
zf = zipfile.ZipFile(base)
|
# base might already be a ZipPath, and str(base) would end with /
|
||||||
|
zf = zipfile.ZipFile(str(base).rstrip('/'))
|
||||||
zf.extractall(path=str(searchdir))
|
zf.extractall(path=str(searchdir))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -146,12 +146,11 @@ class TakeoutHTMLParser(HTMLParser):
|
||||||
|
|
||||||
|
|
||||||
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
|
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
|
||||||
from ...core.kompress import kopen
|
|
||||||
results: List[Parsed] = []
|
results: List[Parsed] = []
|
||||||
def cb(dt: datetime, url: Url, title: Title) -> None:
|
def cb(dt: datetime, url: Url, title: Title) -> None:
|
||||||
results.append((dt, url, title))
|
results.append((dt, url, title))
|
||||||
parser = TakeoutHTMLParser(callback=cb)
|
parser = TakeoutHTMLParser(callback=cb)
|
||||||
with kopen(tpath, file) as fo:
|
with (tpath / file).open() as fo:
|
||||||
data = fo.read()
|
data = fo.read()
|
||||||
parser.feed(data)
|
parser.feed(data)
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -94,10 +94,9 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
|
||||||
for path in reversed(inputs()):
|
for path in reversed(inputs()):
|
||||||
with ExitStack() as exit_stack:
|
with ExitStack() as exit_stack:
|
||||||
if config._use_zippath:
|
if config._use_zippath:
|
||||||
from my.core.kompress import ZipPath
|
|
||||||
# for later takeouts it's just 'Takeout' dir,
|
# for later takeouts it's just 'Takeout' dir,
|
||||||
# but for older (pre 2015) it contains email/date in the subdir name
|
# but for older (pre 2015) it contains email/date in the subdir name
|
||||||
results = tuple(cast(Sequence[Path], ZipPath(path).iterdir()))
|
results = tuple(cast(Sequence[Path], path.iterdir()))
|
||||||
else:
|
else:
|
||||||
results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True))
|
results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True))
|
||||||
for m in results:
|
for m in results:
|
||||||
|
|
|
@ -23,8 +23,6 @@ config = make_config(google)
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Iterable
|
from typing import Optional, Iterable
|
||||||
|
|
||||||
from ...core.kompress import kexists
|
|
||||||
|
|
||||||
|
|
||||||
def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
|
def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
|
||||||
"""
|
"""
|
||||||
|
@ -33,7 +31,7 @@ def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
|
||||||
# TODO FIXME zip is not great..
|
# TODO FIXME zip is not great..
|
||||||
# allow a lambda expression? that way the user could restrict it
|
# allow a lambda expression? that way the user could restrict it
|
||||||
for takeout in get_files(config.takeout_path, glob='*.zip'):
|
for takeout in get_files(config.takeout_path, glob='*.zip'):
|
||||||
if path is None or kexists(takeout, path):
|
if path is None or (takeout / path).exists():
|
||||||
yield takeout
|
yield takeout
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,6 @@ from my.core import (
|
||||||
assert_never,
|
assert_never,
|
||||||
make_logger,
|
make_logger,
|
||||||
)
|
)
|
||||||
from my.core.kompress import ZipPath
|
|
||||||
|
|
||||||
from my.config import instagram as user_config
|
from my.config import instagram as user_config
|
||||||
|
|
||||||
|
@ -70,7 +69,7 @@ def _decode(s: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||||
last = ZipPath(max(inputs()))
|
last = max(inputs())
|
||||||
# TODO make sure it works both with plain directory
|
# TODO make sure it works both with plain directory
|
||||||
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
|
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
|
||||||
# e.g. possible options are:
|
# e.g. possible options are:
|
||||||
|
|
|
@ -21,7 +21,6 @@ import geopy # type: ignore
|
||||||
|
|
||||||
from ..core.common import LazyLogger, mcachew
|
from ..core.common import LazyLogger, mcachew
|
||||||
from ..core.cachew import cache_dir
|
from ..core.cachew import cache_dir
|
||||||
from ..core import kompress
|
|
||||||
|
|
||||||
from my.core.warnings import high
|
from my.core.warnings import high
|
||||||
|
|
||||||
|
@ -135,7 +134,7 @@ def _iter_locations(path: Path, start=0, stop=None) -> Iterable[Location]:
|
||||||
ctx = path.open('r')
|
ctx = path.open('r')
|
||||||
else: # must be a takeout archive
|
else: # must be a takeout archive
|
||||||
# todo CPath? although not sure if it can be iterative?
|
# todo CPath? although not sure if it can be iterative?
|
||||||
ctx = kompress.open(path, _LOCATION_JSON)
|
ctx = (path / _LOCATION_JSON).open()
|
||||||
|
|
||||||
if USE_GREP:
|
if USE_GREP:
|
||||||
unzip = f'unzip -p "{path}" "{_LOCATION_JSON}"'
|
unzip = f'unzip -p "{path}" "{_LOCATION_JSON}"'
|
||||||
|
|
|
@ -6,7 +6,7 @@ Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][officia
|
||||||
|
|
||||||
### config
|
### config
|
||||||
from my.config import stackexchange as user_config
|
from my.config import stackexchange as user_config
|
||||||
from ..core import dataclass, PathIsh, make_config
|
from ..core import dataclass, PathIsh, make_config, get_files
|
||||||
@dataclass
|
@dataclass
|
||||||
class stackexchange(user_config):
|
class stackexchange(user_config):
|
||||||
gdpr_path: PathIsh # path to GDPR zip file
|
gdpr_path: PathIsh # path to GDPR zip file
|
||||||
|
@ -61,12 +61,11 @@ class Vote(NamedTuple):
|
||||||
# todo expose vote type?
|
# todo expose vote type?
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from ..core.kompress import ZipPath
|
|
||||||
from ..core.error import Res
|
from ..core.error import Res
|
||||||
def votes() -> Iterable[Res[Vote]]:
|
def votes() -> Iterable[Res[Vote]]:
|
||||||
# TODO there is also some site specific stuff in qa/ directory.. not sure if it's more detailed
|
# TODO there is also some site specific stuff in qa/ directory.. not sure if it's more detailed
|
||||||
# todo should be defensive? not sure if present when user has no votes
|
# todo should be defensive? not sure if present when user has no votes
|
||||||
path = ZipPath(config.gdpr_path)
|
path = max(get_files(config.gdpr_path))
|
||||||
votes_path = path / 'analytics' / 'qa\\vote.submit.json' # yes, it does contain a backslash...
|
votes_path = path / 'analytics' / 'qa\\vote.submit.json' # yes, it does contain a backslash...
|
||||||
j = json.loads(votes_path.read_text(encoding='utf-8-sig')) # not sure why, but this encoding seems necessary
|
j = json.loads(votes_path.read_text(encoding='utf-8-sig')) # not sure why, but this encoding seems necessary
|
||||||
for r in reversed(j): # they seem to be in decreasing order by default
|
for r in reversed(j): # they seem to be in decreasing order by default
|
||||||
|
|
|
@ -26,7 +26,6 @@ from functools import cached_property
|
||||||
import html
|
import html
|
||||||
from ..core.common import Paths, datetime_aware
|
from ..core.common import Paths, datetime_aware
|
||||||
from ..core.error import Res
|
from ..core.error import Res
|
||||||
from ..core.kompress import ZipPath
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class twitter_archive(user_config):
|
class twitter_archive(user_config):
|
||||||
|
@ -164,9 +163,7 @@ class Like(NamedTuple):
|
||||||
|
|
||||||
class ZipExport:
|
class ZipExport:
|
||||||
def __init__(self, archive_path: Path) -> None:
|
def __init__(self, archive_path: Path) -> None:
|
||||||
# todo maybe this should be inside get_files instead, perhaps covered with a flag?
|
self.zpath = archive_path
|
||||||
self.zpath = ZipPath(archive_path)
|
|
||||||
|
|
||||||
if (self.zpath / 'tweets.csv').exists():
|
if (self.zpath / 'tweets.csv').exists():
|
||||||
from ..core.warnings import high
|
from ..core.warnings import high
|
||||||
high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
|
high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
|
||||||
|
|
Loading…
Add table
Reference in a new issue