Merge pull request #38 from karlicoss/updates

More uniform handling for compressed files
karlicoss 2020-05-04 08:57:48 +01:00 committed by GitHub
commit 77d557e172
11 changed files with 97 additions and 28 deletions

View file

@@ -182,15 +182,18 @@ def _magic():
 # TODO could reuse in pdf module?
-import mimetypes # TODO do I need init()?
-def fastermime(path: str) -> str:
+import mimetypes # todo do I need init()?
+# todo wtf? fastermime thinks it's mime is application/json even if the extension is xz??
+# whereas magic detects correctly: application/x-zstd and application/x-xz
+def fastermime(path: PathIsh) -> str:
+    paths = str(path)
     # mimetypes is faster
-    (mime, _) = mimetypes.guess_type(path)
+    (mime, _) = mimetypes.guess_type(paths)
     if mime is not None:
         return mime
     # magic is slower but returns more stuff
-    # TODO FIXME Result type; it's inherently racey
-    return _magic().from_file(path)
+    # TODO Result type?; it's kinda racey, but perhaps better to let the caller decide?
+    return _magic().from_file(paths)
 Json = Dict[str, Any]
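
The new comment on fastermime points at a real quirk of the mimetypes module: for compressed files the compression is reported as the encoding, not as part of the mime type, so looking at the first element alone is misleading. A stdlib-only sketch of that behaviour (the filename is hypothetical):

import mimetypes
mime, encoding = mimetypes.guess_type('export.json.xz')
# mime == 'application/json', encoding == 'xz': the .xz only shows up as the encoding,
# whereas python-magic inspects the file's bytes and reports application/x-xz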

View file

@@ -136,7 +136,6 @@ def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
         results.append((dt, url, title))
     parser = TakeoutHTMLParser(callback=cb)
     with kopen(tpath, file) as fo:
-        # TODO careful, wht if it's a string already? make asutf method?
-        data = fo.read().decode('utf8')
+        data = fo.read()
     parser.feed(data)
     return results
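
Dropping the explicit decode here only works because kopen (see the kompress changes below) is now expected to hand back a text stream. A minimal sketch of that contract, with hypothetical paths:

from my.kython.kompress import kopen
# kopen is annotated as returning IO[str], so read() already yields str
with kopen('/backups/takeout-20200504.zip', 'Takeout/My Activity/Search/MyActivity.html') as fo:
    data = fo.read()
assert isinstance(data, str)  # no .decode('utf8') needed any more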

View file

@@ -3,36 +3,54 @@ Various helpers for compression
 """
 import pathlib
 from pathlib import Path
-from typing import Union
+from typing import Union, IO
+import io

 PathIsh = Union[Path, str]

-def _zstd_open(path: Path):
+def _zstd_open(path: Path, *args, **kwargs):
     import zstandard as zstd # type: ignore
-    fh = path.open('rb')
+    fh = path.open(*args, **kwargs)
     dctx = zstd.ZstdDecompressor()
     reader = dctx.stream_reader(fh)
     return reader

-def kopen(path: PathIsh, *args, **kwargs): # TODO is it bytes stream??
+# TODO returns protocol that we can call 'read' against?
+# TODO use the 'dependent type' trick?
+def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
+    # TODO handle mode in *rags?
+    encoding = kwargs.get('encoding', 'utf8')
+    kwargs['encoding'] = encoding
     pp = Path(path)
     suf = pp.suffix
     if suf in {'.xz'}:
         import lzma
-        return lzma.open(pp, *args, **kwargs)
+        return lzma.open(pp, mode, *args, **kwargs)
     elif suf in {'.zip'}:
+        # eh. this behaviour is a bit dodgy...
         from zipfile import ZipFile
-        return ZipFile(pp).open(*args, **kwargs)
+        zfile = ZipFile(pp)
+        [subpath] = args # meh?
+        ## oh god... https://stackoverflow.com/a/5639960/706389
+        ifile = zfile.open(subpath, mode='r')
+        ifile.readable = lambda: True  # type: ignore
+        ifile.writable = lambda: False # type: ignore
+        ifile.seekable = lambda: False # type: ignore
+        ifile.read1    = ifile.read    # type: ignore
+        # TODO pass all kwargs here??
+        return io.TextIOWrapper(ifile, encoding=encoding)
     elif suf in {'.lz4'}:
         import lz4.frame # type: ignore
-        return lz4.frame.open(str(pp))
+        return lz4.frame.open(str(pp), mode, *args, **kwargs)
     elif suf in {'.zstd'}:
-        return _zstd_open(pp)
+        return _zstd_open(pp, mode, *args, **kwargs)
     else:
-        kwargs['encoding'] = 'utf-8'
-        return pp.open(*args, **kwargs)
+        return pp.open(mode, *args, **kwargs)

 import typing

@@ -59,7 +77,7 @@ class CPath(BasePath):
         return kopen(str(self))

-    open = kopen # TODO remove?
+    open = kopen # TODO deprecate
     # meh
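
The monkey-patched readable/writable/seekable/read1 attributes in the zip branch are the workaround from the linked StackOverflow answer: on some Python versions ZipFile.open() returns a ZipExtFile that doesn't advertise the full buffered-reader interface io.TextIOWrapper checks for, so those attributes are stubbed in before wrapping the binary member as text. A rough usage sketch of the resulting kopen, assuming only the suffixes dispatched on above (file paths are hypothetical):

from my.kython.kompress import kopen

# each call returns a text stream; for zip archives the member path is passed as an extra argument
plain  = kopen('/backups/notes.txt').read()
xz     = kopen('/backups/rtm/export.ical.xz').read()
member = kopen('/backups/takeout.zip', 'Takeout/archive_browser.html').read()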

View file

@@ -7,7 +7,7 @@ from collections import deque
 from datetime import datetime
 from itertools import islice
 from pathlib import Path
-from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence
+from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence, IO
 import pytz

 # pip3 install geopy
@@ -107,6 +107,7 @@ _LOCATION_JSON = 'Takeout/Location History/Location History.json'
 # TODO hope they are sorted... (could assert for it)
 @mcachew(cache_path, chunk_by=10000, logger=logger)
 def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
+    ctx: IO[str]
     if path.suffix == '.json':
         ctx = path.open('r')
     else: # must be a takeout archive
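
The new ctx: IO[str] annotation is there so mypy accepts both branches assigning a text stream to the same variable (a plain open() in one branch, a decompressing opener in the other). A self-contained toy sketch of the pattern, not the project's code:

import gzip
from pathlib import Path
from typing import IO, Iterator

def iter_lines(path: Path) -> Iterator[str]:
    ctx: IO[str]  # declared once so both assignments below type-check
    if path.suffix == '.gz':
        ctx = gzip.open(path, 'rt')
    else:
        ctx = path.open('r')
    with ctx as fo:
        yield from fo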

View file

@@ -14,7 +14,6 @@ import my.config.repos.rexport.dal as rexport
 def get_sources() -> Sequence[Path]:
-    # TODO use zstd?
     # TODO rename to export_path?
     files = get_files(config.export_dir)
     res = list(map(CPath, files)); assert len(res) > 0

View file

@@ -9,7 +9,7 @@ from typing import Dict, List, Optional, Iterator
 from datetime import datetime
 from .common import LazyLogger, get_files, group_by_key, cproperty, make_dict
-from .kython.kompress import open as kopen
+from .kython.kompress import CPath
 from my.config import rtm as config
@@ -18,7 +18,7 @@ import icalendar # type: ignore
 from icalendar.cal import Todo # type: ignore
-logger = LazyLogger('my.rtm')
+logger = LazyLogger(__name__)
 # TODO extract in a module to parse RTM's ical?
@@ -80,7 +80,7 @@ class MyTodo:
 class DAL:
-    def __init__(self, data: bytes, revision=None) -> None:
+    def __init__(self, data: str, revision=None) -> None:
         self.cal = icalendar.Calendar.from_ical(data)
         self.revision = revision
@@ -98,9 +98,8 @@ class DAL:
 def dal():
-    last = get_files(config.export_path, glob='*.ical.xz')[-1]
-    with kopen(last, 'rb') as fo:
-        data = fo.read()
+    last = get_files(config.export_path)[-1]
+    data = CPath(last).read_text() # TODO make it automatic
     return DAL(data=data, revision='TODO')
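
CPath(last).read_text() does the decompression implicitly: CPath routes open() through kopen (see the kompress hunk above), so pathlib's read_text() ends up returning already-decoded text from the .xz export. A minimal sketch with a hypothetical export path:

from my.kython.kompress import CPath
data = CPath('/backups/rtm/export-20200504.ical.xz').read_text()  # decompressed ical text as str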

View file

@@ -117,7 +117,7 @@ class ZipExport:
         path += '.js'
         with kompress.kopen(self.epath, path) as fo:
-            ddd = fo.read().decode('utf8')
+            ddd = fo.read()
         start = ddd.index('[')
         ddd = ddd[start:]
         for j in json.loads(ddd):

tests/misc.py (new file, 47 lines)
View file

@@ -0,0 +1,47 @@
+from pathlib import Path
+from subprocess import check_call
+import gzip
+import lzma
+import io
+import zipfile
+
+from my.kython.kompress import kopen, kexists, CPath
+
+import pytest # type: ignore
+
+@pytest.fixture
+def prepare(tmp_path: Path):
+    (tmp_path / 'file').write_text('just plaintext')
+    with (tmp_path / 'file.xz').open('wb') as f:
+        with lzma.open(f, 'w') as lzf:
+            lzf.write(b'compressed text')
+    with zipfile.ZipFile(tmp_path / 'file.zip', 'w') as zf:
+        zf.writestr('path/in/archive', 'data in zip')
+    try:
+        yield None
+    finally:
+        pass
+
+def test_kopen(prepare, tmp_path: Path) -> None:
+    "Plaintext handled transparently"
+    assert kopen(tmp_path / 'file'   ).read() == 'just plaintext'
+    assert kopen(tmp_path / 'file.xz').read() == 'compressed text'
+
+    "For zips behaviour is a bit different (not sure about all this, tbh...)"
+    assert kopen(tmp_path / 'file.zip', 'path/in/archive').read() == 'data in zip'
+
+def test_kexists(prepare, tmp_path: Path) -> None:
+    assert     kexists(str(tmp_path / 'file.zip'), 'path/in/archive')
+    assert not kexists(str(tmp_path / 'file.zip'), 'path/notin/archive')
+
+    # TODO not sure about this?
+    assert not kexists(tmp_path / 'nosuchzip.zip', 'path/in/archive')
+
+def test_cpath(prepare, tmp_path: Path) -> None:
+    CPath(str(tmp_path / 'file'   )).read_text() == 'just plaintext'
+    CPath(     tmp_path / 'file.xz').read_text() == 'compressed text'
+    # TODO not sure about zip files??

View file

@@ -1,2 +1,4 @@
 # ugh. workaround for https://github.com/pytest-dev/pytest/issues/1927
 from my.reddit import *
+
+# TODO for reddit test, patch up to take every 10th archive or something; but make sure it's deterministic

View file

@@ -8,11 +8,12 @@ passenv = CI CI_*
 # deliberately set to nonexistent pathe to check the fallback logic
 setenv = MY_CONFIG = nonexistent
 commands =
-    pip install -e .
+    pip install -e .[testing]
     # TODO ??
     # python -m pytest {posargs}
     python3 -c 'import my.init; from my.config import stub as config; print(config.key)'
    python3 -c 'import my.init; import my.config; import my.config.repos' # shouldn't fail at least
+    python3 -m pytest tests/misc.py tests/get_files.py
     # TODO run demo.py? just make sure with_my is a bit cleverer?
     # TODO e.g. under CI, rely on installing