kompress.kopen improvements

- tests
- uniform handling for bytes/str, always return utf8 str by default
This commit is contained in:
Dima Gerasimov 2020-05-04 08:37:36 +01:00
parent c3a77b6256
commit 8b8a85e8c3
7 changed files with 52 additions and 24 deletions

View file

@ -136,7 +136,6 @@ def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
results.append((dt, url, title))
parser = TakeoutHTMLParser(callback=cb)
with kopen(tpath, file) as fo:
# TODO careful, what if it's a string already? make asutf method?
data = fo.read().decode('utf8')
data = fo.read()
parser.feed(data)
return results

View file

@ -3,37 +3,54 @@ Various helpers for compression
"""
import pathlib
from pathlib import Path
from typing import Union
from typing import Union, IO
import io
PathIsh = Union[Path, str]
def _zstd_open(path: Path, mode: str='rt', *args, **kwargs):
    """
    Open a zstd-compressed file for reading.

    Returns a text stream by default (mode 'rt'); pass a mode containing 'b'
    to get the raw decompressed byte stream instead.
    """
    import zstandard as zstd # type: ignore
    # the decompressor operates on raw bytes, so the underlying file must
    # always be opened in binary mode regardless of the mode requested by the caller
    fh = path.open('rb')
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fh)
    if 'b' in mode:
        return reader
    # text mode requested: decode the decompressed bytes for the caller
    return io.TextIOWrapper(reader, encoding=kwargs.get('encoding', 'utf8'))
def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
    """
    Open a (possibly compressed) file, picking the decompressor by suffix.

    Defaults to text mode with utf8 encoding, so callers uniformly get str
    regardless of the underlying compression. Pass mode='rb' for raw bytes.
    For .zip archives the single positional argument is the member name.
    """
    # only inject an encoding for text modes: binary opens reject the kwarg
    if 'b' not in mode:
        kwargs.setdefault('encoding', 'utf8')
    encoding = kwargs.get('encoding', 'utf8')
    pp = Path(path)
    suf = pp.suffix
    if suf in {'.xz'}:
        import lzma
        return lzma.open(pp, mode, *args, **kwargs)
    elif suf in {'.zip'}:
        # eh. this behaviour is a bit dodgy...
        from zipfile import ZipFile
        zfile = ZipFile(pp)
        [subpath] = args # meh?
        ## zip members don't implement the full IO protocol... https://stackoverflow.com/a/5639960/706389
        ifile = zfile.open(subpath, mode='r')
        ifile.readable = lambda: True  # type: ignore
        ifile.writable = lambda: False # type: ignore
        ifile.seekable = lambda: False # type: ignore
        ifile.read1    = ifile.read   # type: ignore
        # TODO pass all kwargs here??
        return io.TextIOWrapper(ifile, encoding=encoding)
    elif suf in {'.lz4'}:
        import lz4.frame # type: ignore
        return lz4.frame.open(str(pp), mode, *args, **kwargs)
    elif suf in {'.zstd'}:
        return _zstd_open(pp, mode, *args, **kwargs)
    else:
        return pp.open(mode, *args, **kwargs)
import typing
@ -60,7 +77,7 @@ class CPath(BasePath):
return kopen(str(self))
open = kopen # TODO remove?
open = kopen # TODO deprecate
# meh

View file

@ -7,7 +7,7 @@ from collections import deque
from datetime import datetime
from itertools import islice
from pathlib import Path
from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence
from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence, IO
import pytz
# pip3 install geopy
@ -107,6 +107,7 @@ _LOCATION_JSON = 'Takeout/Location History/Location History.json'
# TODO hope they are sorted... (could assert for it)
@mcachew(cache_path, chunk_by=10000, logger=logger)
def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
ctx: IO[str]
if path.suffix == '.json':
ctx = path.open('r')
else: # must be a takeout archive

View file

@ -18,7 +18,7 @@ import icalendar # type: ignore
from icalendar.cal import Todo # type: ignore
logger = LazyLogger('my.rtm')
logger = LazyLogger(__name__)
# TODO extract in a module to parse RTM's ical?
@ -98,8 +98,8 @@ class DAL:
def dal():
last = get_files(config.export_path, glob='*.ical.xz')[-1]
with kopen(last, 'rb') as fo:
last = get_files(config.export_path)[-1]
with kopen(last) as fo:
data = fo.read()
return DAL(data=data, revision='TODO')

View file

@ -117,7 +117,7 @@ class ZipExport:
path += '.js'
with kompress.kopen(self.epath, path) as fo:
ddd = fo.read().decode('utf8')
ddd = fo.read()
start = ddd.index('[')
ddd = ddd[start:]
for j in json.loads(ddd):