Merge pull request #38 from karlicoss/updates

More uniform handling for compressed files
This commit is contained in:
karlicoss 2020-05-04 08:57:48 +01:00 committed by GitHub
commit 77d557e172
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 97 additions and 28 deletions

View file

@@ -182,15 +182,18 @@ def _magic():
# TODO could reuse in pdf module?
import mimetypes # TODO do I need init()?
def fastermime(path: str) -> str:
import mimetypes # todo do I need init()?
# todo wtf? fastermime thinks it's mime is application/json even if the extension is xz??
# whereas magic detects correctly: application/x-zstd and application/x-xz
def fastermime(path: PathIsh) -> str:
paths = str(path)
# mimetypes is faster
(mime, _) = mimetypes.guess_type(path)
(mime, _) = mimetypes.guess_type(paths)
if mime is not None:
return mime
# magic is slower but returns more stuff
# TODO FIXME Result type; it's inherently racey
return _magic().from_file(path)
# TODO Result type?; it's kinda racey, but perhaps better to let the caller decide?
return _magic().from_file(paths)
Json = Dict[str, Any]

View file

@@ -136,7 +136,6 @@ def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
results.append((dt, url, title))
parser = TakeoutHTMLParser(callback=cb)
with kopen(tpath, file) as fo:
# TODO careful, wht if it's a string already? make asutf method?
data = fo.read().decode('utf8')
data = fo.read()
parser.feed(data)
return results

View file

@@ -3,36 +3,54 @@ Various helpers for compression
"""
import pathlib
from pathlib import Path
from typing import Union
from typing import Union, IO
import io
PathIsh = Union[Path, str]
def _zstd_open(path: Path):
def _zstd_open(path: Path, *args, **kwargs):
import zstandard as zstd # type: ignore
fh = path.open('rb')
fh = path.open(*args, **kwargs)
dctx = zstd.ZstdDecompressor()
reader = dctx.stream_reader(fh)
return reader
def kopen(path: PathIsh, *args, **kwargs): # TODO is it bytes stream??
# TODO returns protocol that we can call 'read' against?
# TODO use the 'dependent type' trick?
def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
# TODO handle mode in *rags?
encoding = kwargs.get('encoding', 'utf8')
kwargs['encoding'] = encoding
pp = Path(path)
suf = pp.suffix
if suf in {'.xz'}:
import lzma
return lzma.open(pp, *args, **kwargs)
return lzma.open(pp, mode, *args, **kwargs)
elif suf in {'.zip'}:
# eh. this behaviour is a bit dodgy...
from zipfile import ZipFile
return ZipFile(pp).open(*args, **kwargs)
zfile = ZipFile(pp)
[subpath] = args # meh?
## oh god... https://stackoverflow.com/a/5639960/706389
ifile = zfile.open(subpath, mode='r')
ifile.readable = lambda: True # type: ignore
ifile.writable = lambda: False # type: ignore
ifile.seekable = lambda: False # type: ignore
ifile.read1 = ifile.read # type: ignore
# TODO pass all kwargs here??
return io.TextIOWrapper(ifile, encoding=encoding)
elif suf in {'.lz4'}:
import lz4.frame # type: ignore
return lz4.frame.open(str(pp))
return lz4.frame.open(str(pp), mode, *args, **kwargs)
elif suf in {'.zstd'}:
return _zstd_open(pp)
return _zstd_open(pp, mode, *args, **kwargs)
else:
kwargs['encoding'] = 'utf-8'
return pp.open(*args, **kwargs)
return pp.open(mode, *args, **kwargs)
import typing
@@ -59,7 +77,7 @@ class CPath(BasePath):
return kopen(str(self))
open = kopen # TODO remove?
open = kopen # TODO deprecate
# meh

View file

@@ -7,7 +7,7 @@ from collections import deque
from datetime import datetime
from itertools import islice
from pathlib import Path
from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence
from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence, IO
import pytz
# pip3 install geopy
@@ -107,6 +107,7 @@ _LOCATION_JSON = 'Takeout/Location History/Location History.json'
# TODO hope they are sorted... (could assert for it)
@mcachew(cache_path, chunk_by=10000, logger=logger)
def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
ctx: IO[str]
if path.suffix == '.json':
ctx = path.open('r')
else: # must be a takeout archive

View file

@@ -14,7 +14,6 @@ import my.config.repos.rexport.dal as rexport
def get_sources() -> Sequence[Path]:
# TODO use zstd?
# TODO rename to export_path?
files = get_files(config.export_dir)
res = list(map(CPath, files)); assert len(res) > 0

View file

@@ -9,7 +9,7 @@ from typing import Dict, List, Optional, Iterator
from datetime import datetime
from .common import LazyLogger, get_files, group_by_key, cproperty, make_dict
from .kython.kompress import open as kopen
from .kython.kompress import CPath
from my.config import rtm as config
@@ -18,7 +18,7 @@ import icalendar # type: ignore
from icalendar.cal import Todo # type: ignore
logger = LazyLogger('my.rtm')
logger = LazyLogger(__name__)
# TODO extract in a module to parse RTM's ical?
@@ -80,7 +80,7 @@ class MyTodo:
class DAL:
def __init__(self, data: bytes, revision=None) -> None:
def __init__(self, data: str, revision=None) -> None:
self.cal = icalendar.Calendar.from_ical(data)
self.revision = revision
@@ -98,9 +98,8 @@ class DAL:
def dal():
last = get_files(config.export_path, glob='*.ical.xz')[-1]
with kopen(last, 'rb') as fo:
data = fo.read()
last = get_files(config.export_path)[-1]
data = CPath(last).read_text() # TODO make it automatic
return DAL(data=data, revision='TODO')

View file

@@ -117,7 +117,7 @@ class ZipExport:
path += '.js'
with kompress.kopen(self.epath, path) as fo:
ddd = fo.read().decode('utf8')
ddd = fo.read()
start = ddd.index('[')
ddd = ddd[start:]
for j in json.loads(ddd):

47
tests/misc.py Normal file
View file

@@ -0,0 +1,47 @@
from pathlib import Path
from subprocess import check_call
import gzip
import lzma
import io
import zipfile
from my.kython.kompress import kopen, kexists, CPath
import pytest # type: ignore
@pytest.fixture
def prepare(tmp_path: Path):
    """Populate tmp_path with one fixture file per compression scheme.

    Creates:
    - 'file'     : plain text ('just plaintext')
    - 'file.xz'  : lzma-compressed bytes (b'compressed text')
    - 'file.zip' : archive with member 'path/in/archive' containing 'data in zip'

    Yields None; tmp_path itself is the handle the tests use.
    """
    (tmp_path / 'file').write_text('just plaintext')
    with (tmp_path / 'file.xz').open('wb') as f:
        with lzma.open(f, 'w') as lzf:
            lzf.write(b'compressed text')
    with zipfile.ZipFile(tmp_path / 'file.zip', 'w') as zf:
        zf.writestr('path/in/archive', 'data in zip')
    # Original wrapped this yield in a no-op try/finally-pass; there is no
    # cleanup to do (pytest removes tmp_path), so the dead scaffolding is gone.
    yield None
def test_kopen(prepare, tmp_path: Path) -> None:
    """kopen reads plaintext and compressed files transparently; zip members
    need the inner path passed as an extra argument."""
    # plaintext and .xz: the suffix alone selects the right opener
    plain = kopen(tmp_path / 'file')
    assert plain.read() == 'just plaintext'

    compressed = kopen(tmp_path / 'file.xz')
    assert compressed.read() == 'compressed text'

    # zip behaviour differs: the member path is a positional argument
    # (not sure about all this, tbh...)
    member = kopen(tmp_path / 'file.zip', 'path/in/archive')
    assert member.read() == 'data in zip'
def test_kexists(prepare, tmp_path: Path) -> None:
    """kexists reports whether a member exists inside an archive."""
    zpath = str(tmp_path / 'file.zip')
    assert kexists(zpath, 'path/in/archive')
    assert not kexists(zpath, 'path/notin/archive')
    # TODO not sure about this? (missing archive treated as 'does not exist')
    assert not kexists(tmp_path / 'nosuchzip.zip', 'path/in/archive')
def test_cpath(prepare, tmp_path: Path) -> None:
    """CPath.read_text decompresses according to the file suffix.

    Bug fix: the original had bare `==` comparisons whose results were
    discarded, so this test could never fail — they must be asserted.
    """
    assert CPath(str(tmp_path / 'file'   )).read_text() == 'just plaintext'
    assert CPath(    tmp_path / 'file.xz' ).read_text() == 'compressed text'
    # TODO not sure about zip files??

View file

@@ -1,2 +1,4 @@
# ugh. workaround for https://github.com/pytest-dev/pytest/issues/1927
from my.reddit import *
# TODO for reddit test, patch up to take every 10th archive or something; but make sure it's deterministic

View file

@@ -8,11 +8,12 @@ passenv = CI CI_*
# deliberately set to nonexistent pathe to check the fallback logic
setenv = MY_CONFIG = nonexistent
commands =
pip install -e .
pip install -e .[testing]
# TODO ??
# python -m pytest {posargs}
python3 -c 'import my.init; from my.config import stub as config; print(config.key)'
python3 -c 'import my.init; import my.config; import my.config.repos' # shouldn't fail at least
python3 -m pytest tests/misc.py tests/get_files.py
# TODO run demo.py? just make sure with_my is a bit cleverer?
# TODO e.g. under CI, rely on installing