kython.kompress: move to core (with a fallback, used in promnesia)

This commit is contained in:
Dima Gerasimov 2020-10-29 01:35:43 +00:00 committed by karlicoss
parent 655b86bb0a
commit 15789a4149
9 changed files with 102 additions and 101 deletions

View file

@ -183,7 +183,7 @@ def get_files(
traceback.print_stack() traceback.print_stack()
if guess_compression: if guess_compression:
from ..kython.kompress import CPath # todo move to core? from .kompress import CPath
paths = [CPath(p) if _is_compressed(p) else p for p in paths] paths = [CPath(p) if _is_compressed(p) else p for p in paths]
return tuple(paths) return tuple(paths)

94
my/core/kompress.py Normal file
View file

@ -0,0 +1,94 @@
"""
Various helpers for compression
"""
import pathlib
from pathlib import Path
from typing import Union, IO
import io
PathIsh = Union[Path, str]
def _zstd_open(path: Path, *args, **kwargs) -> IO[str]:
    """
    Open the zstd-compressed file at ``path`` and return it as a text stream.

    Positional ``args`` are accepted but ignored (``kopen`` passes the mode here);
    ``kwargs`` are forwarded to ``io.TextIOWrapper`` (e.g. ``encoding``).
    Requires the third-party ``zstandard`` package, imported lazily.
    """
    import zstandard as zstd # type: ignore
    fh = path.open('rb')
    try:
        dctx = zstd.ZstdDecompressor()
        reader = dctx.stream_reader(fh)
        return io.TextIOWrapper(reader, **kwargs) # meh
    except BaseException:
        # don't leak the raw handle if the decompressor/wrapper setup fails
        # (e.g. an unsupported keyword argument or unknown encoding)
        fh.close()
        raise
# TODO returns protocol that we can call 'read' against?
# TODO use the 'dependent type' trick?
def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO:
    """
    Open ``path`` for reading, picking a decompressor from the file suffix.

    - ``.xz``   -> ``lzma.open``
    - ``.zip``  -> the single positional argument names the archive member; always text
    - ``.lz4``  -> ``lz4.frame.open`` (third-party, imported lazily)
    - ``.zstd`` -> ``_zstd_open``; always text
    - anything else -> ``Path.open``

    ``mode`` is keyword-only and defaults to text (``'rt'``). In text modes the encoding
    defaults to utf8. In binary modes (``'b'`` in ``mode``) no encoding is injected,
    because binary streams reject an ``encoding`` argument; the result is then a bytes
    stream for the ``.xz``, ``.lz4`` and plain-file cases.
    """
    # TODO handle mode in *args?
    text_mode = 'b' not in mode
    if text_mode:
        kwargs.setdefault('encoding', 'utf8')
    # .zip and .zstd always decode to text, so they need an encoding regardless of mode
    encoding = kwargs.get('encoding', 'utf8')

    pp = Path(path)
    suf = pp.suffix
    if suf in {'.xz'}:
        import lzma
        r = lzma.open(pp, mode, *args, **kwargs)
        if text_mode:
            # LZMAFile should only come back for binary mode
            # file:///usr/share/doc/python3/html/library/lzma.html?highlight=lzma#lzma.open
            assert not isinstance(r, lzma.LZMAFile), r
        return r
    elif suf in {'.zip'}:
        # eh. this behaviour is a bit dodgy...
        from zipfile import ZipFile
        # NOTE: the ZipFile object itself is never explicitly closed here
        zfile = ZipFile(pp)

        [subpath] = args # meh?

        ## oh god... https://stackoverflow.com/a/5639960/706389
        ifile = zfile.open(subpath, mode='r')
        ifile.readable = lambda: True  # type: ignore
        ifile.writable = lambda: False # type: ignore
        ifile.seekable = lambda: False # type: ignore
        ifile.read1    = ifile.read    # type: ignore
        # TODO pass all kwargs here??
        # todo 'expected "BinaryIO"'??
        return io.TextIOWrapper(ifile, encoding=encoding) # type: ignore[arg-type]
    elif suf in {'.lz4'}:
        import lz4.frame # type: ignore
        return lz4.frame.open(str(pp), mode, *args, **kwargs)
    elif suf in {'.zstd'}:
        return _zstd_open(pp, mode, *args, **{**kwargs, 'encoding': encoding})
    else:
        return pp.open(mode, *args, **kwargs)
import typing
import os
if not typing.TYPE_CHECKING:
    # at runtime, subclass the concrete path class for the current platform
    BasePath = pathlib.WindowsPath if os.name == 'nt' else pathlib.PosixPath
else:
    # otherwise mypy can't figure out that BasePath is a type alias..
    BasePath = pathlib.Path


class CPath(BasePath):
    """
    Path subclass whose ``open`` goes through ``kopen``, as a hacky way to support compressed files.
    Better ideas are welcome: https://github.com/karlicoss/HPI/issues/20

    Why it is done this way:
    - Path can't be overridden directly because of some _flavour thing
    - Path only has _accessor and _closed slots, so the .open method can't be set directly
    - _accessor.open has to return a file descriptor, which doesn't work for compressed stuff
    """
    def open(self, *args, **kwargs):
        # NOTE: args/kwargs are accepted but not forwarded, so kopen's defaults always apply
        # TODO assert read only?
        target = str(self)
        return kopen(target)
open = kopen # TODO deprecate
# meh
def kexists(path: PathIsh, subpath: str) -> bool:
    """
    Report whether ``kopen(path, subpath)`` succeeds.

    Any exception raised while opening is treated as "does not exist" and yields False.
    The probe handle is closed again straight away instead of being left open.
    """
    try:
        # the context manager releases the handle that was opened just for probing
        with kopen(path, subpath):
            return True
    except Exception:
        return False

View file

@ -130,7 +130,7 @@ class TakeoutHTMLParser(HTMLParser):
def read_html(tpath: Path, file: str) -> Iterable[Parsed]: def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
from ...kython.kompress import kopen from ...core.kompress import kopen
results: List[Parsed] = [] results: List[Parsed] = []
def cb(dt: datetime, url: Url, title: Title) -> None: def cb(dt: datetime, url: Url, title: Title) -> None:
results.append((dt, url, title)) results.append((dt, url, title))

View file

@ -23,7 +23,7 @@ config = make_config(google)
from pathlib import Path from pathlib import Path
from typing import Optional, Iterable from typing import Optional, Iterable
from ...kython.kompress import kopen, kexists from ...core.kompress import kopen, kexists
def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]: def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:

View file

@ -1 +1 @@
vendorized kython (https://github.com/karlicoss/kython) stuff deprecated, please use my.core directly

View file

@ -1,94 +0,0 @@
"""
Various helpers for compression
"""
import pathlib
from pathlib import Path
from typing import Union, IO
import io
PathIsh = Union[Path, str]
def _zstd_open(path: Path, *args, **kwargs) -> IO[str]:
    """
    Open the zstd-compressed file at ``path`` and return it as a text stream.

    Positional ``args`` are accepted but ignored (``kopen`` passes the mode here);
    ``kwargs`` are forwarded to ``io.TextIOWrapper`` (e.g. ``encoding``).
    Requires the third-party ``zstandard`` package, imported lazily.
    """
    import zstandard as zstd # type: ignore
    fh = path.open('rb')
    try:
        dctx = zstd.ZstdDecompressor()
        reader = dctx.stream_reader(fh)
        return io.TextIOWrapper(reader, **kwargs) # meh
    except BaseException:
        # don't leak the raw handle if the decompressor/wrapper setup fails
        # (e.g. an unsupported keyword argument or unknown encoding)
        fh.close()
        raise
# TODO returns protocol that we can call 'read' against?
# TODO use the 'dependent type' trick?
def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO:
    """
    Open ``path`` for reading, picking a decompressor from the file suffix.

    - ``.xz``   -> ``lzma.open``
    - ``.zip``  -> the single positional argument names the archive member; always text
    - ``.lz4``  -> ``lz4.frame.open`` (third-party, imported lazily)
    - ``.zstd`` -> ``_zstd_open``; always text
    - anything else -> ``Path.open``

    ``mode`` is keyword-only and defaults to text (``'rt'``). In text modes the encoding
    defaults to utf8. In binary modes (``'b'`` in ``mode``) no encoding is injected,
    because binary streams reject an ``encoding`` argument; the result is then a bytes
    stream for the ``.xz``, ``.lz4`` and plain-file cases.
    """
    # TODO handle mode in *args?
    text_mode = 'b' not in mode
    if text_mode:
        kwargs.setdefault('encoding', 'utf8')
    # .zip and .zstd always decode to text, so they need an encoding regardless of mode
    encoding = kwargs.get('encoding', 'utf8')

    pp = Path(path)
    suf = pp.suffix
    if suf in {'.xz'}:
        import lzma
        r = lzma.open(pp, mode, *args, **kwargs)
        if text_mode:
            # LZMAFile should only come back for binary mode
            # file:///usr/share/doc/python3/html/library/lzma.html?highlight=lzma#lzma.open
            assert not isinstance(r, lzma.LZMAFile), r
        return r
    elif suf in {'.zip'}:
        # eh. this behaviour is a bit dodgy...
        from zipfile import ZipFile
        # NOTE: the ZipFile object itself is never explicitly closed here
        zfile = ZipFile(pp)

        [subpath] = args # meh?

        ## oh god... https://stackoverflow.com/a/5639960/706389
        ifile = zfile.open(subpath, mode='r')
        ifile.readable = lambda: True  # type: ignore
        ifile.writable = lambda: False # type: ignore
        ifile.seekable = lambda: False # type: ignore
        ifile.read1    = ifile.read    # type: ignore
        # TODO pass all kwargs here??
        # todo 'expected "BinaryIO"'??
        return io.TextIOWrapper(ifile, encoding=encoding) # type: ignore[arg-type]
    elif suf in {'.lz4'}:
        import lz4.frame # type: ignore
        return lz4.frame.open(str(pp), mode, *args, **kwargs)
    elif suf in {'.zstd'}:
        return _zstd_open(pp, mode, *args, **{**kwargs, 'encoding': encoding})
    else:
        return pp.open(mode, *args, **kwargs)
import typing
import os
if not typing.TYPE_CHECKING:
    # at runtime, subclass the concrete path class for the current platform
    BasePath = pathlib.WindowsPath if os.name == 'nt' else pathlib.PosixPath
else:
    # otherwise mypy can't figure out that BasePath is a type alias..
    BasePath = pathlib.Path


class CPath(BasePath):
    """
    Path subclass whose ``open`` goes through ``kopen``, as a hacky way to support compressed files.
    Better ideas are welcome: https://github.com/karlicoss/HPI/issues/20

    Why it is done this way:
    - Path can't be overridden directly because of some _flavour thing
    - Path only has _accessor and _closed slots, so the .open method can't be set directly
    - _accessor.open has to return a file descriptor, which doesn't work for compressed stuff
    """
    def open(self, *args, **kwargs):
        # NOTE: args/kwargs are accepted but not forwarded, so kopen's defaults always apply
        # TODO assert read only?
        target = str(self)
        return kopen(target)
open = kopen # TODO deprecate
# meh
def kexists(path: PathIsh, subpath: str) -> bool:
    """
    Report whether ``kopen(path, subpath)`` succeeds.

    Any exception raised while opening is treated as "does not exist" and yields False.
    The probe handle is closed again straight away instead of being left open.
    """
    try:
        # the context manager releases the handle that was opened just for probing
        with kopen(path, subpath):
            return True
    except Exception:
        return False

1
my/kython/kompress.py Symbolic link
View file

@ -0,0 +1 @@
../core/kompress.py

View file

@ -18,7 +18,7 @@ import geopy # type: ignore
from ..core.common import LazyLogger, mcachew from ..core.common import LazyLogger, mcachew
from ..core.cachew import cache_dir from ..core.cachew import cache_dir
from ..kython import kompress from ..core import kompress
# otherwise uses ijson # otherwise uses ijson

View file

@ -40,7 +40,7 @@ import zipfile
import pytz import pytz
from ..common import PathIsh, get_files, LazyLogger, Json from ..common import PathIsh, get_files, LazyLogger, Json
from ..kython import kompress from ..core import kompress

View file

@ -5,7 +5,7 @@ import lzma
import io import io
import zipfile import zipfile
from my.kython.kompress import kopen, kexists, CPath from my.core.kompress import kopen, kexists, CPath
def test_kopen(tmp_path: Path) -> None: def test_kopen(tmp_path: Path) -> None:
"Plaintext handled transparently" "Plaintext handled transparently"