Merge pull request #38 from karlicoss/updates
More uniform handling for compressed files
commit 77d557e172
11 changed files with 97 additions and 28 deletions
my/common.py (13 lines changed)
@@ -182,15 +182,18 @@ def _magic():
 # TODO could reuse in pdf module?
-import mimetypes # TODO do I need init()?
-def fastermime(path: str) -> str:
+import mimetypes # todo do I need init()?
+# todo wtf? fastermime thinks it's mime is application/json even if the extension is xz??
+# whereas magic detects correctly: application/x-zstd and application/x-xz
+def fastermime(path: PathIsh) -> str:
+    paths = str(path)
     # mimetypes is faster
-    (mime, _) = mimetypes.guess_type(path)
+    (mime, _) = mimetypes.guess_type(paths)
     if mime is not None:
         return mime
     # magic is slower but returns more stuff
-    # TODO FIXME Result type; it's inherently racey
-    return _magic().from_file(path)
+    # TODO Result type?; it's kinda racey, but perhaps better to let the caller decide?
+    return _magic().from_file(paths)


 Json = Dict[str, Any]
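A rough illustration of the comments above (not part of the commit; the filenames are made up). mimetypes guesses from the name alone and treats .xz as an encoding rather than a mime type, which is why the fallback to libmagic matters for compressed exports:

import mimetypes

print(mimetypes.guess_type('takeout.json.xz'))  # ('application/json', 'xz') -- .xz is reported as an encoding
print(mimetypes.guess_type('export.zstd'))      # (None, None) on most Python versions, so magic takes over

import magic  # python-magic, assumed to be installed
print(magic.Magic(mime=True).from_file('takeout.json.xz'))  # sniffs the content, e.g. 'application/x-xz'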
@@ -136,7 +136,6 @@ def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
         results.append((dt, url, title))
     parser = TakeoutHTMLParser(callback=cb)
     with kopen(tpath, file) as fo:
-        # TODO careful, wht if it's a string already? make asutf method?
-        data = fo.read().decode('utf8')
+        data = fo.read()
     parser.feed(data)
     return results
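The manual .decode('utf8') drops out because kopen (changed below) now opens archives in text mode and returns str.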
@@ -3,36 +3,54 @@ Various helpers for compression
 """
 import pathlib
 from pathlib import Path
-from typing import Union
+from typing import Union, IO
+import io

 PathIsh = Union[Path, str]


-def _zstd_open(path: Path):
+def _zstd_open(path: Path, *args, **kwargs):
     import zstandard as zstd # type: ignore
-    fh = path.open('rb')
+    fh = path.open(*args, **kwargs)
     dctx = zstd.ZstdDecompressor()
     reader = dctx.stream_reader(fh)
     return reader


-def kopen(path: PathIsh, *args, **kwargs): # TODO is it bytes stream??
+# TODO returns protocol that we can call 'read' against?
+# TODO use the 'dependent type' trick?
+def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
+    # TODO handle mode in *rags?
+    encoding = kwargs.get('encoding', 'utf8')
+    kwargs['encoding'] = encoding
+
     pp = Path(path)
     suf = pp.suffix
     if suf in {'.xz'}:
         import lzma
-        return lzma.open(pp, *args, **kwargs)
+        return lzma.open(pp, mode, *args, **kwargs)
     elif suf in {'.zip'}:
+        # eh. this behaviour is a bit dodgy...
         from zipfile import ZipFile
-        return ZipFile(pp).open(*args, **kwargs)
+        zfile = ZipFile(pp)
+
+        [subpath] = args # meh?
+
+        ## oh god... https://stackoverflow.com/a/5639960/706389
+        ifile = zfile.open(subpath, mode='r')
+        ifile.readable = lambda: True  # type: ignore
+        ifile.writable = lambda: False # type: ignore
+        ifile.seekable = lambda: False # type: ignore
+        ifile.read1 = ifile.read       # type: ignore
+        # TODO pass all kwargs here??
+        return io.TextIOWrapper(ifile, encoding=encoding)
     elif suf in {'.lz4'}:
         import lz4.frame # type: ignore
-        return lz4.frame.open(str(pp))
+        return lz4.frame.open(str(pp), mode, *args, **kwargs)
     elif suf in {'.zstd'}:
-        return _zstd_open(pp)
+        return _zstd_open(pp, mode, *args, **kwargs)
     else:
-        kwargs['encoding'] = 'utf-8'
-        return pp.open(*args, **kwargs)
+        return pp.open(mode, *args, **kwargs)


 import typing
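A standalone sketch of the zip branch above, outside of kopen (the archive name and member path are hypothetical). The monkey-patched methods mirror what the commit does: the object returned by ZipFile.open() hasn't always exposed the full io API that io.TextIOWrapper expects, per the StackOverflow link in the diff:

import io
import zipfile

zf = zipfile.ZipFile('archive.zip')             # hypothetical archive
member = zf.open('path/in/archive', mode='r')   # binary file-like member
# pretend it is a plain readable stream so TextIOWrapper accepts it
member.readable = lambda: True   # type: ignore
member.writable = lambda: False  # type: ignore
member.seekable = lambda: False  # type: ignore
member.read1 = member.read       # type: ignore
text = io.TextIOWrapper(member, encoding='utf8').read()  # decoded str, same as the other kopen branches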
@@ -59,7 +77,7 @@ class CPath(BasePath):
         return kopen(str(self))


-open = kopen # TODO remove?
+open = kopen # TODO deprecate


 # meh
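For context, CPath is a pathlib.Path subclass whose open() goes through kopen, so read_text() transparently decompresses based on the suffix. A hypothetical usage sketch (the path is made up):

from my.kython.kompress import CPath

text = CPath('/backups/rtm/2020-01-01.ical.xz').read_text()  # decompressed and decoded to str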
@@ -7,7 +7,7 @@ from collections import deque
 from datetime import datetime
 from itertools import islice
 from pathlib import Path
-from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence
+from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence, IO
 import pytz

 # pip3 install geopy

@@ -107,6 +107,7 @@ _LOCATION_JSON = 'Takeout/Location History/Location History.json'
 # TODO hope they are sorted... (could assert for it)
 @mcachew(cache_path, chunk_by=10000, logger=logger)
 def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
+    ctx: IO[str]
     if path.suffix == '.json':
         ctx = path.open('r')
     else: # must be a takeout archive
@@ -14,7 +14,6 @@ import my.config.repos.rexport.dal as rexport


 def get_sources() -> Sequence[Path]:
-    # TODO use zstd?
     # TODO rename to export_path?
     files = get_files(config.export_dir)
     res = list(map(CPath, files)); assert len(res) > 0
my/rtm.py (11 lines changed)
@@ -9,7 +9,7 @@ from typing import Dict, List, Optional, Iterator
 from datetime import datetime

 from .common import LazyLogger, get_files, group_by_key, cproperty, make_dict
-from .kython.kompress import open as kopen
+from .kython.kompress import CPath

 from my.config import rtm as config


@@ -18,7 +18,7 @@ import icalendar # type: ignore
 from icalendar.cal import Todo # type: ignore


-logger = LazyLogger('my.rtm')
+logger = LazyLogger(__name__)


 # TODO extract in a module to parse RTM's ical?

@@ -80,7 +80,7 @@ class MyTodo:


 class DAL:
-    def __init__(self, data: bytes, revision=None) -> None:
+    def __init__(self, data: str, revision=None) -> None:
         self.cal = icalendar.Calendar.from_ical(data)
         self.revision = revision

@@ -98,9 +98,8 @@ class DAL:


 def dal():
-    last = get_files(config.export_path, glob='*.ical.xz')[-1]
-    with kopen(last, 'rb') as fo:
-        data = fo.read()
+    last = get_files(config.export_path)[-1]
+    data = CPath(last).read_text() # TODO make it automatic
     return DAL(data=data, revision='TODO')

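The bytes-to-str change in DAL.__init__ works because icalendar parses either; a minimal sanity check, not part of the commit:

import icalendar

cal = icalendar.Calendar.from_ical('BEGIN:VCALENDAR\r\nVERSION:2.0\r\nEND:VCALENDAR\r\n')
print(cal['VERSION'])  # 2.0 -- from_ical accepts a decoded str as well as bytes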
@@ -117,7 +117,7 @@ class ZipExport:
             path += '.js'

         with kompress.kopen(self.epath, path) as fo:
-            ddd = fo.read().decode('utf8')
+            ddd = fo.read()
         start = ddd.index('[')
         ddd = ddd[start:]
         for j in json.loads(ddd):
tests/misc.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+from pathlib import Path
+from subprocess import check_call
+import gzip
+import lzma
+import io
+import zipfile
+
+from my.kython.kompress import kopen, kexists, CPath
+
+
+import pytest # type: ignore
+
+@pytest.fixture
+def prepare(tmp_path: Path):
+    (tmp_path / 'file').write_text('just plaintext')
+    with (tmp_path / 'file.xz').open('wb') as f:
+        with lzma.open(f, 'w') as lzf:
+            lzf.write(b'compressed text')
+    with zipfile.ZipFile(tmp_path / 'file.zip', 'w') as zf:
+        zf.writestr('path/in/archive', 'data in zip')
+    try:
+        yield None
+    finally:
+        pass
+
+
+def test_kopen(prepare, tmp_path: Path) -> None:
+    "Plaintext handled transparently"
+    assert kopen(tmp_path / 'file'   ).read() == 'just plaintext'
+    assert kopen(tmp_path / 'file.xz').read() == 'compressed text'
+
+    "For zips behaviour is a bit different (not sure about all this, tbh...)"
+    assert kopen(tmp_path / 'file.zip', 'path/in/archive').read() == 'data in zip'
+
+
+def test_kexists(prepare, tmp_path: Path) -> None:
+    assert kexists(str(tmp_path / 'file.zip'), 'path/in/archive')
+    assert not kexists(str(tmp_path / 'file.zip'), 'path/notin/archive')
+
+    # TODO not sure about this?
+    assert not kexists(tmp_path / 'nosuchzip.zip', 'path/in/archive')
+
+
+def test_cpath(prepare, tmp_path: Path) -> None:
+    CPath(str(tmp_path / 'file'   )).read_text() == 'just plaintext'
+    CPath(    tmp_path / 'file.xz').read_text() == 'compressed text'
+    # TODO not sure about zip files??
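These tests can be run on their own with pytest; the tox.ini change below adds tests/misc.py (together with tests/get_files.py) to the default test run.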
@@ -1,2 +1,4 @@
 # ugh. workaround for https://github.com/pytest-dev/pytest/issues/1927
 from my.reddit import *
+
+# TODO for reddit test, patch up to take every 10th archive or something; but make sure it's deterministic
tox.ini (3 lines changed)
@@ -8,11 +8,12 @@ passenv = CI CI_*
 # deliberately set to nonexistent pathe to check the fallback logic
 setenv = MY_CONFIG = nonexistent
 commands =
-    pip install -e .
+    pip install -e .[testing]
     # TODO ??
     # python -m pytest {posargs}
     python3 -c 'import my.init; from my.config import stub as config; print(config.key)'
     python3 -c 'import my.init; import my.config; import my.config.repos' # shouldn't fail at least
+    python3 -m pytest tests/misc.py tests/get_files.py
     # TODO run demo.py? just make sure with_my is a bit cleverer?
     # TODO e.g. under CI, rely on installing
