Merge pull request #38 from karlicoss/updates

More uniform handling for compressed files
This commit is contained in:
karlicoss 2020-05-04 08:57:48 +01:00 committed by GitHub
commit 77d557e172
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 97 additions and 28 deletions

View file

@@ -182,15 +182,18 @@ def _magic():
# TODO could reuse in pdf module?
import mimetypes # TODO do I need init()?
def fastermime(path: str) -> str:
import mimetypes # todo do I need init()?
# todo wtf? fastermime thinks it's mime is application/json even if the extension is xz??
# whereas magic detects correctly: application/x-zstd and application/x-xz
def fastermime(path: PathIsh) -> str:
paths = str(path)
# mimetypes is faster
(mime, _) = mimetypes.guess_type(path)
(mime, _) = mimetypes.guess_type(paths)
if mime is not None:
return mime
# magic is slower but returns more stuff
# TODO FIXME Result type; it's inherently racey
return _magic().from_file(path)
# TODO Result type?; it's kinda racey, but perhaps better to let the caller decide?
return _magic().from_file(paths)
Json = Dict[str, Any]

View file

@@ -136,7 +136,6 @@ def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
results.append((dt, url, title))
parser = TakeoutHTMLParser(callback=cb)
with kopen(tpath, file) as fo:
# TODO careful, wht if it's a string already? make asutf method?
data = fo.read().decode('utf8')
data = fo.read()
parser.feed(data)
return results

View file

@@ -3,36 +3,54 @@ Various helpers for compression
"""
import pathlib
from pathlib import Path
from typing import Union
from typing import Union, IO
import io
PathIsh = Union[Path, str]
def _zstd_open(path: Path):
def _zstd_open(path: Path, *args, **kwargs):
import zstandard as zstd # type: ignore
fh = path.open('rb')
fh = path.open(*args, **kwargs)
dctx = zstd.ZstdDecompressor()
reader = dctx.stream_reader(fh)
return reader
def kopen(path: PathIsh, *args, **kwargs): # TODO is it bytes stream??
# TODO returns protocol that we can call 'read' against?
# TODO use the 'dependent type' trick?
def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
# TODO handle mode in *rags?
encoding = kwargs.get('encoding', 'utf8')
kwargs['encoding'] = encoding
pp = Path(path)
suf = pp.suffix
if suf in {'.xz'}:
import lzma
return lzma.open(pp, *args, **kwargs)
return lzma.open(pp, mode, *args, **kwargs)
elif suf in {'.zip'}:
# eh. this behaviour is a bit dodgy...
from zipfile import ZipFile
return ZipFile(pp).open(*args, **kwargs)
zfile = ZipFile(pp)
[subpath] = args # meh?
## oh god... https://stackoverflow.com/a/5639960/706389
ifile = zfile.open(subpath, mode='r')
ifile.readable = lambda: True # type: ignore
ifile.writable = lambda: False # type: ignore
ifile.seekable = lambda: False # type: ignore
ifile.read1 = ifile.read # type: ignore
# TODO pass all kwargs here??
return io.TextIOWrapper(ifile, encoding=encoding)
elif suf in {'.lz4'}:
import lz4.frame # type: ignore
return lz4.frame.open(str(pp))
return lz4.frame.open(str(pp), mode, *args, **kwargs)
elif suf in {'.zstd'}:
return _zstd_open(pp)
return _zstd_open(pp, mode, *args, **kwargs)
else:
kwargs['encoding'] = 'utf-8'
return pp.open(*args, **kwargs)
return pp.open(mode, *args, **kwargs)
import typing
@@ -59,7 +77,7 @@ class CPath(BasePath):
return kopen(str(self))
open = kopen # TODO remove?
open = kopen # TODO deprecate
# meh

View file

@@ -7,7 +7,7 @@ from collections import deque
from datetime import datetime
from itertools import islice
from pathlib import Path
from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence
from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence, IO
import pytz
# pip3 install geopy
@@ -107,6 +107,7 @@ _LOCATION_JSON = 'Takeout/Location History/Location History.json'
# TODO hope they are sorted... (could assert for it)
@mcachew(cache_path, chunk_by=10000, logger=logger)
def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
ctx: IO[str]
if path.suffix == '.json':
ctx = path.open('r')
else: # must be a takeout archive

View file

@@ -14,7 +14,6 @@ import my.config.repos.rexport.dal as rexport
def get_sources() -> Sequence[Path]:
# TODO use zstd?
# TODO rename to export_path?
files = get_files(config.export_dir)
res = list(map(CPath, files)); assert len(res) > 0

View file

@@ -9,7 +9,7 @@ from typing import Dict, List, Optional, Iterator
from datetime import datetime
from .common import LazyLogger, get_files, group_by_key, cproperty, make_dict
from .kython.kompress import open as kopen
from .kython.kompress import CPath
from my.config import rtm as config
@@ -18,7 +18,7 @@ import icalendar # type: ignore
from icalendar.cal import Todo # type: ignore
logger = LazyLogger('my.rtm')
logger = LazyLogger(__name__)
# TODO extract in a module to parse RTM's ical?
@@ -80,7 +80,7 @@ class MyTodo:
class DAL:
def __init__(self, data: bytes, revision=None) -> None:
def __init__(self, data: str, revision=None) -> None:
self.cal = icalendar.Calendar.from_ical(data)
self.revision = revision
@@ -98,9 +98,8 @@ class DAL:
def dal():
last = get_files(config.export_path, glob='*.ical.xz')[-1]
with kopen(last, 'rb') as fo:
data = fo.read()
last = get_files(config.export_path)[-1]
data = CPath(last).read_text() # TODO make it automatic
return DAL(data=data, revision='TODO')

View file

@@ -117,7 +117,7 @@ class ZipExport:
path += '.js'
with kompress.kopen(self.epath, path) as fo:
ddd = fo.read().decode('utf8')
ddd = fo.read()
start = ddd.index('[')
ddd = ddd[start:]
for j in json.loads(ddd):

47
tests/misc.py Normal file
View file

@@ -0,0 +1,47 @@
from pathlib import Path
from subprocess import check_call
import gzip
import lzma
import io
import zipfile
from my.kython.kompress import kopen, kexists, CPath
import pytest # type: ignore
@pytest.fixture
def prepare(tmp_path: Path):
    """Populate tmp_path with one fixture file per compression scheme.

    Creates:
    - 'file'     : plain text ('just plaintext')
    - 'file.xz'  : lzma-compressed bytes (b'compressed text')
    - 'file.zip' : archive with member 'path/in/archive' containing 'data in zip'

    Yields None; tmp_path itself is the handle the tests use.
    """
    (tmp_path / 'file').write_text('just plaintext')
    with (tmp_path / 'file.xz').open('wb') as f:
        with lzma.open(f, 'w') as lzf:
            lzf.write(b'compressed text')
    with zipfile.ZipFile(tmp_path / 'file.zip', 'w') as zf:
        zf.writestr('path/in/archive', 'data in zip')
    # Original wrapped this yield in a no-op try/finally-pass; there is no
    # cleanup to do (pytest removes tmp_path), so the dead scaffolding is gone.
    yield None
def test_kopen(prepare, tmp_path: Path) -> None:
    """kopen reads plaintext and compressed files transparently; zip members
    need the inner path passed as an extra argument."""
    # plaintext and .xz: the suffix alone selects the right opener
    plain = kopen(tmp_path / 'file')
    assert plain.read() == 'just plaintext'

    compressed = kopen(tmp_path / 'file.xz')
    assert compressed.read() == 'compressed text'

    # zip behaviour differs: the member path is a positional argument
    # (not sure about all this, tbh...)
    member = kopen(tmp_path / 'file.zip', 'path/in/archive')
    assert member.read() == 'data in zip'
def test_kexists(prepare, tmp_path: Path) -> None:
    """kexists reports whether a member exists inside an archive."""
    zpath = str(tmp_path / 'file.zip')
    assert kexists(zpath, 'path/in/archive')
    assert not kexists(zpath, 'path/notin/archive')
    # TODO not sure about this? (missing archive treated as 'does not exist')
    assert not kexists(tmp_path / 'nosuchzip.zip', 'path/in/archive')
def test_cpath(prepare, tmp_path: Path) -> None:
    """CPath.read_text decompresses according to the file suffix.

    Bug fix: the original had bare `==` comparisons whose results were
    discarded, so this test could never fail — they must be asserted.
    """
    assert CPath(str(tmp_path / 'file'   )).read_text() == 'just plaintext'
    assert CPath(    tmp_path / 'file.xz' ).read_text() == 'compressed text'
    # TODO not sure about zip files??

View file

@@ -1,2 +1,4 @@
# ugh. workaround for https://github.com/pytest-dev/pytest/issues/1927
from my.reddit import *
# TODO for reddit test, patch up to take every 10th archive or something; but make sure it's deterministic

View file

@@ -8,11 +8,12 @@ passenv = CI CI_*
# deliberately set to nonexistent pathe to check the fallback logic
setenv = MY_CONFIG = nonexistent
commands =
pip install -e .
pip install -e .[testing]
# TODO ??
# python -m pytest {posargs}
python3 -c 'import my.init; from my.config import stub as config; print(config.key)'
python3 -c 'import my.init; import my.config; import my.config.repos' # shouldn't fail at least
python3 -m pytest tests/misc.py tests/get_files.py
# TODO run demo.py? just make sure with_my is a bit cleverer?
# TODO e.g. under CI, rely on installing