kython.kompress: move to core (with a fallback, used in promnesia)
This commit is contained in:
parent
655b86bb0a
commit
15789a4149
9 changed files with 102 additions and 101 deletions
|
@ -183,7 +183,7 @@ def get_files(
|
||||||
traceback.print_stack()
|
traceback.print_stack()
|
||||||
|
|
||||||
if guess_compression:
|
if guess_compression:
|
||||||
from ..kython.kompress import CPath # todo move to core?
|
from .kompress import CPath
|
||||||
paths = [CPath(p) if _is_compressed(p) else p for p in paths]
|
paths = [CPath(p) if _is_compressed(p) else p for p in paths]
|
||||||
return tuple(paths)
|
return tuple(paths)
|
||||||
|
|
||||||
|
|
94
my/core/kompress.py
Normal file
94
my/core/kompress.py
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
"""
|
||||||
|
Various helpers for compression
|
||||||
|
"""
|
||||||
|
import pathlib
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union, IO
|
||||||
|
import io
|
||||||
|
|
||||||
|
PathIsh = Union[Path, str]
|
||||||
|
|
||||||
|
|
||||||
|
def _zstd_open(path: Path, *args, **kwargs) -> IO[str]:
    """
    Open a zstd-compressed file and return a text stream over its decompressed contents.

    Positional arguments are accepted but not used; keyword arguments are handed
    to io.TextIOWrapper unchanged.
    """
    # imported here rather than at module level, so it's only required when this function is called
    import zstandard # type: ignore
    raw = path.open('rb')
    decompressed = zstandard.ZstdDecompressor().stream_reader(raw)
    return io.TextIOWrapper(decompressed, **kwargs) # meh
|
||||||
|
|
||||||
|
|
||||||
|
# TODO returns protocol that we can call 'read' against?
|
||||||
|
# TODO use the 'dependent type' trick?
|
||||||
|
def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
    """
    Open a possibly compressed file for reading as text; the backend is chosen by file suffix.

    - '.xz'   -- lzma.open
    - '.zip'  -- expects exactly one positional argument: the name of the member inside the archive.
                 'mode' and keyword arguments other than 'encoding' are ignored for archives.
    - '.lz4'  -- lz4.frame.open (imported on demand)
    - '.zstd' -- _zstd_open
    - anything else -- Path.open

    'encoding' defaults to 'utf8' and is always passed on to the backend.

    For '.zip', raises ValueError unless exactly one positional argument is given,
    and KeyError if the archive has no member with that name.
    """
    # TODO handle mode in *args?
    encoding = kwargs.setdefault('encoding', 'utf8')

    pp = Path(path)
    suf = pp.suffix
    if suf == '.xz':
        import lzma
        r = lzma.open(pp, mode, *args, **kwargs)
        # should only happen for binary mode?
        # file:///usr/share/doc/python3/html/library/lzma.html?highlight=lzma#lzma.open
        assert not isinstance(r, lzma.LZMAFile), r
        return r
    elif suf == '.zip':
        # eh. this behaviour is a bit dodgy...
        from zipfile import ZipFile
        zfile = ZipFile(pp)
        try:
            [subpath] = args # meh?
            ifile = zfile.open(subpath, mode='r')
        except BaseException:
            # don't leave the archive open if the member can't be opened
            zfile.close()
            raise

        ## oh god... https://stackoverflow.com/a/5639960/706389
        ifile.readable = lambda: True  # type: ignore
        ifile.writable = lambda: False # type: ignore
        ifile.seekable = lambda: False # type: ignore
        ifile.read1    = ifile.read    # type: ignore
        # TODO pass all kwargs here??
        # todo 'expected "BinaryIO"'??
        return io.TextIOWrapper(ifile, encoding=encoding) # type: ignore[arg-type]
    elif suf == '.lz4':
        import lz4.frame # type: ignore
        return lz4.frame.open(str(pp), mode, *args, **kwargs)
    elif suf == '.zstd':
        return _zstd_open(pp, mode, *args, **kwargs)
    else:
        return pp.open(mode, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
import typing
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Static type checkers get a plain alias of pathlib.Path; at runtime the concrete
# Path class for the current OS is used as the base instead.
if typing.TYPE_CHECKING:
    # otherwise mypy can't figure out that BasePath is a type alias..
    BasePath = pathlib.Path
elif os.name == 'nt':
    BasePath = pathlib.WindowsPath
else:
    BasePath = pathlib.PosixPath
|
||||||
|
|
||||||
|
|
||||||
|
class CPath(BasePath):
    """
    Hacky way to support compressed files.
    If you can think of a better way to do this, please let me know! https://github.com/karlicoss/HPI/issues/20

    Ugh. So, can't override Path because of some _flavour thing.
    Path only has _accessor and _closed slots, so can't directly set .open method
    _accessor.open has to return file descriptor, doesn't work for compressed stuff.
    """
    def open(self, *args, **kwargs):
        """
        Open the path via kopen instead of the regular Path.open.

        Accepts the same arguments as Path.open, but only a non-None 'encoding' is
        passed on to kopen; everything else (including the mode) is ignored, so the
        file is always opened with kopen's default mode.
        """
        # TODO assert read only?
        # Path.open(mode, buffering, encoding, ...): encoding is the third positional argument
        encoding = args[2] if len(args) > 2 else kwargs.get('encoding')
        if encoding is None:
            return kopen(str(self))
        return kopen(str(self), encoding=encoding)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level alias for kopen (note: within this module it shadows the builtin open).
# NOTE(review): presumably kept for callers that use `kompress.open(...)` -- confirm before removing.
open = kopen # TODO deprecate
|
||||||
|
|
||||||
|
|
||||||
|
# meh
|
||||||
|
def kexists(path: PathIsh, subpath: str) -> bool:
    """
    Check whether 'subpath' can be opened inside 'path' by attempting kopen(path, subpath).

    Returns True if the open succeeds and False if kopen raises any Exception
    (missing file, missing archive member, unsupported arguments, ...).
    """
    try:
        handle = kopen(path, subpath)
    except Exception:
        return False
    # we only wanted to know whether opening works, so don't leave the handle dangling
    handle.close()
    return True
|
|
@ -130,7 +130,7 @@ class TakeoutHTMLParser(HTMLParser):
|
||||||
|
|
||||||
|
|
||||||
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
|
def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
|
||||||
from ...kython.kompress import kopen
|
from ...core.kompress import kopen
|
||||||
results: List[Parsed] = []
|
results: List[Parsed] = []
|
||||||
def cb(dt: datetime, url: Url, title: Title) -> None:
|
def cb(dt: datetime, url: Url, title: Title) -> None:
|
||||||
results.append((dt, url, title))
|
results.append((dt, url, title))
|
||||||
|
|
|
@ -23,7 +23,7 @@ config = make_config(google)
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Iterable
|
from typing import Optional, Iterable
|
||||||
|
|
||||||
from ...kython.kompress import kopen, kexists
|
from ...core.kompress import kopen, kexists
|
||||||
|
|
||||||
|
|
||||||
def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
|
def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]:
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
vendorized kython (https://github.com/karlicoss/kython) stuff
|
deprecated, please use my.core directly
|
||||||
|
|
|
@ -1,94 +0,0 @@
|
||||||
"""
|
|
||||||
Various helpers for compression
|
|
||||||
"""
|
|
||||||
import pathlib
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Union, IO
|
|
||||||
import io
|
|
||||||
|
|
||||||
PathIsh = Union[Path, str]
|
|
||||||
|
|
||||||
|
|
||||||
def _zstd_open(path: Path, *args, **kwargs) -> IO[str]:
|
|
||||||
import zstandard as zstd # type: ignore
|
|
||||||
fh = path.open('rb')
|
|
||||||
dctx = zstd.ZstdDecompressor()
|
|
||||||
reader = dctx.stream_reader(fh)
|
|
||||||
return io.TextIOWrapper(reader, **kwargs) # meh
|
|
||||||
|
|
||||||
|
|
||||||
# TODO returns protocol that we can call 'read' against?
|
|
||||||
# TODO use the 'dependent type' trick?
|
|
||||||
def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
|
|
||||||
# TODO handle mode in *rags?
|
|
||||||
encoding = kwargs.get('encoding', 'utf8')
|
|
||||||
kwargs['encoding'] = encoding
|
|
||||||
|
|
||||||
pp = Path(path)
|
|
||||||
suf = pp.suffix
|
|
||||||
if suf in {'.xz'}:
|
|
||||||
import lzma
|
|
||||||
r = lzma.open(pp, mode, *args, **kwargs)
|
|
||||||
# should only happen for binary mode?
|
|
||||||
# file:///usr/share/doc/python3/html/library/lzma.html?highlight=lzma#lzma.open
|
|
||||||
assert not isinstance(r, lzma.LZMAFile), r
|
|
||||||
return r
|
|
||||||
elif suf in {'.zip'}:
|
|
||||||
# eh. this behaviour is a bit dodgy...
|
|
||||||
from zipfile import ZipFile
|
|
||||||
zfile = ZipFile(pp)
|
|
||||||
|
|
||||||
[subpath] = args # meh?
|
|
||||||
|
|
||||||
## oh god... https://stackoverflow.com/a/5639960/706389
|
|
||||||
ifile = zfile.open(subpath, mode='r')
|
|
||||||
ifile.readable = lambda: True # type: ignore
|
|
||||||
ifile.writable = lambda: False # type: ignore
|
|
||||||
ifile.seekable = lambda: False # type: ignore
|
|
||||||
ifile.read1 = ifile.read # type: ignore
|
|
||||||
# TODO pass all kwargs here??
|
|
||||||
# todo 'expected "BinaryIO"'??
|
|
||||||
return io.TextIOWrapper(ifile, encoding=encoding) # type: ignore[arg-type]
|
|
||||||
elif suf in {'.lz4'}:
|
|
||||||
import lz4.frame # type: ignore
|
|
||||||
return lz4.frame.open(str(pp), mode, *args, **kwargs)
|
|
||||||
elif suf in {'.zstd'}:
|
|
||||||
return _zstd_open(pp, mode, *args, **kwargs)
|
|
||||||
else:
|
|
||||||
return pp.open(mode, *args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
import typing
|
|
||||||
import os
|
|
||||||
|
|
||||||
if typing.TYPE_CHECKING:
|
|
||||||
# otherwise mypy can't figure out that BasePath is a type alias..
|
|
||||||
BasePath = pathlib.Path
|
|
||||||
else:
|
|
||||||
BasePath = pathlib.WindowsPath if os.name == 'nt' else pathlib.PosixPath
|
|
||||||
|
|
||||||
|
|
||||||
class CPath(BasePath):
|
|
||||||
"""
|
|
||||||
Hacky way to support compressed files.
|
|
||||||
If you can think of a better way to do this, please let me know! https://github.com/karlicoss/HPI/issues/20
|
|
||||||
|
|
||||||
Ugh. So, can't override Path because of some _flavour thing.
|
|
||||||
Path only has _accessor and _closed slots, so can't directly set .open method
|
|
||||||
_accessor.open has to return file descriptor, doesn't work for compressed stuff.
|
|
||||||
"""
|
|
||||||
def open(self, *args, **kwargs):
|
|
||||||
# TODO assert read only?
|
|
||||||
return kopen(str(self))
|
|
||||||
|
|
||||||
|
|
||||||
open = kopen # TODO deprecate
|
|
||||||
|
|
||||||
|
|
||||||
# meh
|
|
||||||
def kexists(path: PathIsh, subpath: str) -> bool:
|
|
||||||
try:
|
|
||||||
kopen(path, subpath)
|
|
||||||
return True
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
1
my/kython/kompress.py
Symbolic link
1
my/kython/kompress.py
Symbolic link
|
@ -0,0 +1 @@
|
||||||
|
../core/kompress.py
|
|
@ -18,7 +18,7 @@ import geopy # type: ignore
|
||||||
|
|
||||||
from ..core.common import LazyLogger, mcachew
|
from ..core.common import LazyLogger, mcachew
|
||||||
from ..core.cachew import cache_dir
|
from ..core.cachew import cache_dir
|
||||||
from ..kython import kompress
|
from ..core import kompress
|
||||||
|
|
||||||
|
|
||||||
# otherwise uses ijson
|
# otherwise uses ijson
|
||||||
|
|
|
@ -40,7 +40,7 @@ import zipfile
|
||||||
import pytz
|
import pytz
|
||||||
|
|
||||||
from ..common import PathIsh, get_files, LazyLogger, Json
|
from ..common import PathIsh, get_files, LazyLogger, Json
|
||||||
from ..kython import kompress
|
from ..core import kompress
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ import lzma
|
||||||
import io
|
import io
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
from my.kython.kompress import kopen, kexists, CPath
|
from my.core.kompress import kopen, kexists, CPath
|
||||||
|
|
||||||
def test_kopen(tmp_path: Path) -> None:
|
def test_kopen(tmp_path: Path) -> None:
|
||||||
"Plaintext handled transparently"
|
"Plaintext handled transparently"
|
||||||
|
|
Loading…
Add table
Reference in a new issue