kompress.kopen improvements

- tests
- uniform handling for bytes/str, always return utf8 str by default
This commit is contained in:
Dima Gerasimov 2020-05-04 08:37:36 +01:00
parent c3a77b6256
commit 8b8a85e8c3
7 changed files with 52 additions and 24 deletions
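The upshot of the bytes/str unification: callers get utf8 text back from kopen and no longer decode it themselves. A minimal usage sketch against the new signature (file names are illustrative; behaviour matches the tests further down):

from my.kython.kompress import kopen

# plaintext and .xz both come back as utf8 str now ('rt' mode by default)
text = kopen('takeout.html.xz').read()        # str, no .decode('utf8') needed

# for .zip archives the member path is passed as an extra positional argument
with kopen('export.zip', 'path/in/archive') as fo:
    data = fo.read()                          # also str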

View file

@@ -136,7 +136,6 @@ def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
         results.append((dt, url, title))
     parser = TakeoutHTMLParser(callback=cb)
     with kopen(tpath, file) as fo:
-        # TODO careful, wht if it's a string already? make asutf method?
-        data = fo.read().decode('utf8')
+        data = fo.read()
     parser.feed(data)
     return results

View file

@@ -3,37 +3,54 @@ Various helpers for compression
 """
 import pathlib
 from pathlib import Path
-from typing import Union
+from typing import Union, IO
+import io
 
 PathIsh = Union[Path, str]
 
-def _zstd_open(path: Path):
+def _zstd_open(path: Path, *args, **kwargs):
     import zstandard as zstd # type: ignore
-    fh = path.open('rb')
+    fh = path.open(*args, **kwargs)
     dctx = zstd.ZstdDecompressor()
     reader = dctx.stream_reader(fh)
     return reader
 
-def kopen(path: PathIsh, *args, **kwargs): # TODO is it bytes stream??
-    # TODO allow passing in mode?
+# TODO returns protocol that we can call 'read' against?
+# TODO use the 'dependent type' trick?
+def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
+    # TODO handle mode in *rags?
+    encoding = kwargs.get('encoding', 'utf8')
+    kwargs['encoding'] = encoding
     pp = Path(path)
     suf = pp.suffix
     if suf in {'.xz'}:
         import lzma
-        return lzma.open(pp, *args, **kwargs)
+        return lzma.open(pp, mode, *args, **kwargs)
     elif suf in {'.zip'}:
+        # eh. this behaviour is a bit dodgy...
        from zipfile import ZipFile
-        return ZipFile(pp).open(*args, **kwargs)
+        zfile = ZipFile(pp)
+        [subpath] = args # meh?
+        ## oh god... https://stackoverflow.com/a/5639960/706389
+        ifile = zfile.open(subpath, mode='r')
+        ifile.readable = lambda: True # type: ignore
+        ifile.writable = lambda: False # type: ignore
+        ifile.seekable = lambda: False # type: ignore
+        ifile.read1 = ifile.read # type: ignore
+        # TODO pass all kwargs here??
+        return io.TextIOWrapper(ifile, encoding=encoding)
     elif suf in {'.lz4'}:
         import lz4.frame # type: ignore
-        return lz4.frame.open(str(pp))
+        return lz4.frame.open(str(pp), mode, *args, **kwargs)
     elif suf in {'.zstd'}:
-        return _zstd_open(pp)
+        return _zstd_open(pp, mode, *args, **kwargs)
     else:
-        kwargs['encoding'] = 'utf-8'
-        return pp.open(*args, **kwargs)
 
 import typing
@@ -60,7 +77,7 @@ class CPath(BasePath):
         return kopen(str(self))
 
-open = kopen # TODO remove?
+open = kopen # TODO deprecate
 
 # meh
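The new zip branch wraps the binary member handle in io.TextIOWrapper; the readable/writable/seekable/read1 patches work around zipfile objects that don't expose the full buffered-reader protocol TextIOWrapper expects (hence the StackOverflow link above). A standalone sketch of the same idea, assuming archive.zip and the member path exist:

import io
import zipfile

with zipfile.ZipFile('archive.zip') as zf:
    member = zf.open('path/in/archive', mode='r')             # binary file-like object
    text = io.TextIOWrapper(member, encoding='utf8').read()   # decoded to str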

View file

@@ -7,7 +7,7 @@ from collections import deque
 from datetime import datetime
 from itertools import islice
 from pathlib import Path
-from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence
+from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence, IO
 import pytz
 
 # pip3 install geopy
@@ -107,6 +107,7 @@ _LOCATION_JSON = 'Takeout/Location History/Location History.json'
 # TODO hope they are sorted... (could assert for it)
 @mcachew(cache_path, chunk_by=10000, logger=logger)
 def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
+    ctx: IO[str]
     if path.suffix == '.json':
         ctx = path.open('r')
     else: # must be a takeout archive

View file

@@ -18,7 +18,7 @@ import icalendar # type: ignore
 from icalendar.cal import Todo # type: ignore
 
-logger = LazyLogger('my.rtm')
+logger = LazyLogger(__name__)
 
 # TODO extract in a module to parse RTM's ical?
@@ -98,8 +98,8 @@ class DAL:
 def dal():
-    last = get_files(config.export_path, glob='*.ical.xz')[-1]
-    with kopen(last, 'rb') as fo:
+    last = get_files(config.export_path)[-1]
+    with kopen(last) as fo:
         data = fo.read()
     return DAL(data=data, revision='TODO')

View file

@@ -117,7 +117,7 @@ class ZipExport:
             path += '.js'
 
         with kompress.kopen(self.epath, path) as fo:
-            ddd = fo.read().decode('utf8')
+            ddd = fo.read()
         start = ddd.index('[')
         ddd = ddd[start:]
         for j in json.loads(ddd):

View file

@@ -3,8 +3,9 @@ from subprocess import check_call
 import gzip
 import lzma
 import io
+import zipfile
 
-from my.kython.kompress import kopen
+from my.kython.kompress import kopen, kexists
 
 import pytest # type: ignore
@@ -15,6 +16,8 @@ def prepare(tmp_path: Path):
     with (tmp_path / 'file.xz').open('wb') as f:
         with lzma.open(f, 'w') as lzf:
             lzf.write(b'compressed text')
+    with zipfile.ZipFile(tmp_path / 'file.zip', 'w') as zf:
+        zf.writestr('path/in/archive', 'data in zip')
     try:
         yield None
     finally:
@@ -24,12 +27,18 @@ def prepare(tmp_path: Path):
 def test_kopen(prepare, tmp_path: Path) -> None:
     "Plaintext handled transparently"
     assert kopen(tmp_path / 'file'   ).read() == 'just plaintext'
-    assert kopen(tmp_path / 'file.xz').read() == b'compressed text' # FIXME make this str
+    assert kopen(tmp_path / 'file.xz').read() == 'compressed text'
+
+    "For zips behaviour is a bit different (not sure about all this, tbh...)"
+    assert kopen(tmp_path / 'file.zip', 'path/in/archive').read() == 'data in zip'
 
-def test_kexists(tmp_path: Path) -> None:
-    # TODO
-    raise RuntimeError
+def test_kexists(prepare, tmp_path: Path) -> None:
+    assert kexists(str(tmp_path / 'file.zip'), 'path/in/archive')
+    assert not kexists(str(tmp_path / 'file.zip'), 'path/notin/archive')
+
+    # TODO not sure about this?
+    assert not kexists(tmp_path / 'nosuchzip.zip', 'path/in/archive')
 
 def test_cpath():
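kexists is imported and exercised by the tests but isn't part of this diff; a hypothetical implementation consistent with the assertions above would just probe kopen:

def kexists(path: PathIsh, subpath: str) -> bool:
    # hypothetical sketch: the archive member 'exists' if kopen can open it
    try:
        kopen(path, subpath)
        return True
    except Exception:
        return False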

View file

@@ -1,2 +1,4 @@
 # ugh. workaround for https://github.com/pytest-dev/pytest/issues/1927
 from my.reddit import *
+
+# TODO for reddit test, patch up to take every 10th archive or something; but make sure it's deterministic