From 8b8a85e8c361fad6252d2966b86bfbc4320ccf30 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Mon, 4 May 2020 08:37:36 +0100
Subject: [PATCH] kompress.kopen improvements

- tests
- uniform handling for bytes/str, always return utf8 str by default
---
 my/google/takeout/html.py |  3 +--
 my/kython/kompress.py     | 41 +++++++++++++++++++++++++++------------
 my/location/takeout.py    |  3 ++-
 my/rtm.py                 |  6 +++---
 my/twitter/archive.py     |  2 +-
 tests/misc.py             | 19 +++++++++++++-----
 tests/reddit.py           |  2 ++
 7 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/my/google/takeout/html.py b/my/google/takeout/html.py
index b6c55d3..a87dd04 100644
--- a/my/google/takeout/html.py
+++ b/my/google/takeout/html.py
@@ -136,7 +136,6 @@ def read_html(tpath: Path, file: str) -> Iterable[Parsed]:
         results.append((dt, url, title))
     parser = TakeoutHTMLParser(callback=cb)
     with kopen(tpath, file) as fo:
-        # TODO careful, wht if it's a string already? make asutf method?
-        data = fo.read().decode('utf8')
+        data = fo.read()
         parser.feed(data)
     return results
diff --git a/my/kython/kompress.py b/my/kython/kompress.py
index 73181ea..35c6e4e 100644
--- a/my/kython/kompress.py
+++ b/my/kython/kompress.py
@@ -3,37 +3,54 @@ Various helpers for compression
 """
 import pathlib
 from pathlib import Path
-from typing import Union
+from typing import Union, IO
+import io
 
 PathIsh = Union[Path, str]
 
 
-def _zstd_open(path: Path):
+def _zstd_open(path: Path, *args, **kwargs):
     import zstandard as zstd # type: ignore
-    fh = path.open('rb')
+    fh = path.open(*args, **kwargs) # NOTE(review): kopen forwards mode='rt' plus encoding, so fh is a text handle here; stream_reader presumably needs a binary source -- confirm
     dctx = zstd.ZstdDecompressor()
     reader = dctx.stream_reader(fh)
     return reader
 
 
-def kopen(path: PathIsh, *args, **kwargs): # TODO is it bytes stream??
-    # TODO allow passing in mode?
+# TODO returns protocol that we can call 'read' against?
+# TODO use the 'dependent type' trick?
+def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
+    # TODO handle mode in *args? ('mode' is keyword-only, so a positional mode like kopen(p, 'rb') lands in args instead)
+    encoding = kwargs.get('encoding', 'utf8')
+    kwargs['encoding'] = encoding # NOTE(review): set even when the caller asks for a binary mode; lzma.open presumably rejects encoding in binary mode -- confirm
+
     pp = Path(path)
     suf = pp.suffix
     if suf in {'.xz'}:
         import lzma
-        return lzma.open(pp, *args, **kwargs)
+        return lzma.open(pp, mode, *args, **kwargs)
     elif suf in {'.zip'}:
+        # eh. this behaviour is a bit dodgy...
         from zipfile import ZipFile
-        return ZipFile(pp).open(*args, **kwargs)
+        zfile = ZipFile(pp) # NOTE(review): zfile itself is never closed in this function
+
+        [subpath] = args # meh? for zips the single positional arg is the member path inside the archive
+
+        ## oh god... https://stackoverflow.com/a/5639960/706389
+        ifile = zfile.open(subpath, mode='r')
+        ifile.readable = lambda: True  # type: ignore
+        ifile.writable = lambda: False # type: ignore
+        ifile.seekable = lambda: False # type: ignore
+        ifile.read1    = ifile.read    # type: ignore
+        # TODO pass all kwargs here?? (currently only encoding is forwarded; 'mode' is ignored for zips)
+        return io.TextIOWrapper(ifile, encoding=encoding)
     elif suf in {'.lz4'}:
         import lz4.frame # type: ignore
-        return lz4.frame.open(str(pp))
+        return lz4.frame.open(str(pp), mode, *args, **kwargs)
     elif suf in {'.zstd'}:
-        return _zstd_open(pp)
+        return _zstd_open(pp, mode, *args, **kwargs)
     else:
-        kwargs['encoding'] = 'utf-8'
-        return pp.open(*args, **kwargs)
+        return pp.open(mode, *args, **kwargs)
 
 
 import typing
@@ -60,7 +77,7 @@ class CPath(BasePath):
         return kopen(str(self))
 
 
-open = kopen # TODO remove?
+open = kopen # TODO deprecate
 
 
 # meh
diff --git a/my/location/takeout.py b/my/location/takeout.py
index bb76292..a7cfb9f 100644
--- a/my/location/takeout.py
+++ b/my/location/takeout.py
@@ -7,7 +7,7 @@ from collections import deque
 from datetime import datetime
 from itertools import islice
 from pathlib import Path
-from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence
+from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence, IO
 import pytz
 
 # pip3 install geopy
@@ -107,6 +107,7 @@ _LOCATION_JSON = 'Takeout/Location History/Location History.json'
 # TODO hope they are sorted... (could assert for it)
 @mcachew(cache_path, chunk_by=10000, logger=logger)
 def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
+    ctx: IO[str]
     if path.suffix == '.json':
         ctx = path.open('r')
     else: # must be a takeout archive
diff --git a/my/rtm.py b/my/rtm.py
index 55ed7a0..dfaaf02 100755
--- a/my/rtm.py
+++ b/my/rtm.py
@@ -18,7 +18,7 @@ import icalendar # type: ignore
 from icalendar.cal import Todo # type: ignore
 
 
-logger = LazyLogger('my.rtm')
+logger = LazyLogger(__name__)
 
 
 # TODO extract in a module to parse RTM's ical?
@@ -98,8 +98,8 @@ class DAL:
 
 
 def dal():
-    last = get_files(config.export_path, glob='*.ical.xz')[-1]
-    with kopen(last, 'rb') as fo:
+    last = get_files(config.export_path)[-1] # NOTE(review): the '*.ical.xz' glob filter is dropped; assumes export_path only yields RTM exports -- confirm
+    with kopen(last) as fo: # no explicit 'rb' any more: kopen's default mode is 'rt', so data below is str, not bytes
         data = fo.read()
     return DAL(data=data, revision='TODO')
 
diff --git a/my/twitter/archive.py b/my/twitter/archive.py
index afc1c8c..96a0f5a 100755
--- a/my/twitter/archive.py
+++ b/my/twitter/archive.py
@@ -117,7 +117,7 @@ class ZipExport:
         path += '.js'
 
         with kompress.kopen(self.epath, path) as fo:
-            ddd = fo.read().decode('utf8')
+            ddd = fo.read()
         start = ddd.index('[')
         ddd = ddd[start:]
         for j in json.loads(ddd):
diff --git a/tests/misc.py b/tests/misc.py
index dbb3fa9..e8c4fc2 100644
--- a/tests/misc.py
+++ b/tests/misc.py
@@ -3,8 +3,9 @@ from subprocess import check_call
 import gzip
 import lzma
 import io
+import zipfile
 
-from my.kython.kompress import kopen
+from my.kython.kompress import kopen, kexists
 
 
 import pytest # type: ignore
@@ -15,6 +16,8 @@ def prepare(tmp_path: Path):
     with (tmp_path / 'file.xz').open('wb') as f:
         with lzma.open(f, 'w') as lzf:
             lzf.write(b'compressed text')
+    with zipfile.ZipFile(tmp_path / 'file.zip', 'w') as zf:
+        zf.writestr('path/in/archive', 'data in zip')
     try:
         yield None
     finally:
@@ -24,12 +27,18 @@ def prepare(tmp_path: Path):
 def test_kopen(prepare, tmp_path: Path) -> None:
     "Plaintext handled transparently"
     assert kopen(tmp_path / 'file'   ).read() == 'just plaintext'
-    assert kopen(tmp_path / 'file.xz').read() == b'compressed text' # FIXME make this str
+    assert kopen(tmp_path / 'file.xz').read() == 'compressed text' # compressed input now comes back as str, same as plaintext
+
+    "For zips behaviour is a bit different (not sure about all this, tbh...)"
+    assert kopen(tmp_path / 'file.zip', 'path/in/archive').read() == 'data in zip' # second positional arg is the member path inside the zip
 
 
-def test_kexists(tmp_path: Path) -> None:
-    # TODO
-    raise RuntimeError
+def test_kexists(prepare, tmp_path: Path) -> None:
+    assert     kexists(str(tmp_path / 'file.zip'), 'path/in/archive')
+    assert not kexists(str(tmp_path / 'file.zip'), 'path/notin/archive')
+
+    # TODO not sure about this? (nosuchzip.zip is presumably absent, so a missing archive is expected to yield False rather than raise)
+    assert not kexists(tmp_path / 'nosuchzip.zip', 'path/in/archive')
 
 
 def test_cpath():
diff --git a/tests/reddit.py b/tests/reddit.py
index 30f2353..45be487 100644
--- a/tests/reddit.py
+++ b/tests/reddit.py
@@ -1,2 +1,4 @@
 # ugh. workaround for https://github.com/pytest-dev/pytest/issues/1927
 from my.reddit import *
+
+# TODO for reddit test, patch up to take every 10th archive or something; but make sure it's deterministic