# HPI/my/core/konsume.py
'''
Some experimental JSON parsing, basically to ensure that all data is consumed.
This can potentially allow both for safer defensive parsing, and for letting you know when a data source starts returning more data than you handle.
TODO perhaps need to get some inspiration from linear logic to decide on a nice API...
'''
from __future__ import annotations

from collections import OrderedDict
from collections.abc import Iterator
from contextlib import contextmanager
from typing import Any, cast


def ignore(w, *keys):
    for k in keys:
        w[k].ignore()


def zoom(w, *keys):
    return [w[k].zoom() for k in keys]
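

# For instance (illustrative): if w wraps {'a': 1, 'b': 2, 'junk': [...]}, then
# zoom(w, 'a', 'b') consumes and returns the nodes under 'a' and 'b', while
# ignore(w, 'junk') marks the whole 'junk' subtree as consumed without inspecting it.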
# TODO need to support lists
class Zoomable:
    def __init__(self, parent, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.parent = parent

    # TODO not sure, maybe do it via del??
    # TODO need to make sure they are in proper order? object should be last..
    @property
    def dependants(self):
        raise NotImplementedError

    def ignore(self) -> None:
        self.consume_all()

    def consume_all(self) -> None:
        for d in self.dependants:
            d.consume_all()
        self.consume()

    def consume(self) -> None:
        assert self.parent is not None
        self.parent._remove(self)

    def zoom(self) -> Zoomable:
        self.consume()
        return self

    def _remove(self, xx):
        raise NotImplementedError

    def this_consumed(self):
        raise NotImplementedError


class Wdict(Zoomable, OrderedDict):
    def _remove(self, xx):
        keys = [k for k, v in self.items() if v is xx]
        assert len(keys) == 1
        del self[keys[0]]

    @property
    def dependants(self):
        return list(self.values())

    def this_consumed(self):
        return len(self) == 0


# TODO specify mypy type for the index special method?
class Wlist(Zoomable, list):
    def _remove(self, xx):
        self.remove(xx)

    @property
    def dependants(self):
        return list(self)

    def this_consumed(self):
        return len(self) == 0


class Wvalue(Zoomable):
    def __init__(self, parent, value: Any) -> None:
        super().__init__(parent)
        self.value = value

    @property
    def dependants(self):
        return []

    def this_consumed(self):
        return True  # TODO not sure..

    def __repr__(self):
        return 'Wvalue{' + repr(self.value) + '}'


def _wrap(j, parent=None) -> tuple[Zoomable, list[Zoomable]]:
    res: Zoomable
    cc: list[Zoomable]
    if isinstance(j, dict):
        res = Wdict(parent)
        cc = [res]
        for k, v in j.items():
            vv, c = _wrap(v, parent=res)
            res[k] = vv
            cc.extend(c)
        return res, cc
    elif isinstance(j, list):
        res = Wlist(parent)
        cc = [res]
        for i in j:
            ii, c = _wrap(i, parent=res)
            res.append(ii)
            cc.extend(c)
        return res, cc
    elif isinstance(j, (int, float, str, type(None))):
        res = Wvalue(parent, j)
        return res, [res]
    else:
        raise RuntimeError(f'Unexpected type: {type(j)} {j}')
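

# For instance (illustrative): _wrap({'a': 1}) returns the root Wdict plus a flat
# list of every node created -- here the Wdict and one Wvalue -- which wrap()
# below walks to verify that everything was consumed.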


class UnconsumedError(Exception):
    pass


# TODO think about error policy later...
@contextmanager
def wrap(j, *, throw=True) -> Iterator[Zoomable]:
    w, children = _wrap(j)
    yield w
    for c in children:
        if not c.this_consumed():  # TODO hmm. how does it figure out if it's consumed???
            if throw:
                # TODO need to keep a full path or something...
                raise UnconsumedError(f'''
Expected {c} to be fully consumed by the parser.
'''.lstrip())
            else:
                # TODO log?
                pass
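

# Example (illustrative): zoom into the fields you expect, consume the rest;
# wrap() raises UnconsumedError if anything was silently left over:
#
#     with wrap({'status': 200, 'items': [1, 2]}) as w:
#         w = cast(Wdict, w)
#         assert w['status'].zoom().value == 200
#         for x in list(w['items'].zoom()):
#             x.consume()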


def test_unconsumed() -> None:
    import pytest

    with pytest.raises(UnconsumedError):
        with wrap({'a': 1234}) as w:
            w = cast(Wdict, w)
            pass

    with pytest.raises(UnconsumedError):
        with wrap({'c': {'d': 2222}}) as w:
            w = cast(Wdict, w)
            d = w['c']['d'].zoom()


def test_consumed() -> None:
    with wrap({'a': 1234}) as w:
        w = cast(Wdict, w)
        a = w['a'].zoom()

    with wrap({'c': {'d': 2222}}) as w:
        w = cast(Wdict, w)
        c = w['c'].zoom()
        d = c['d'].zoom()


def test_types() -> None:
    # json types: string, number, object, array, boolean or null
    with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w:
        w = cast(Wdict, w)
        w['string'].zoom()
        w['number'].consume()
        w['boolean'].zoom()
        w['null'].zoom()
        for x in list(w['list'].zoom()):  # TODO eh. how to avoid the extra list thing?
            x.consume()


def test_consume_all() -> None:
    with wrap({'aaa': {'bbb': {'hi': 123}}}) as w:
        w = cast(Wdict, w)
        aaa = w['aaa'].zoom()
        aaa['bbb'].consume_all()


def test_consume_few() -> None:
    import pytest

    pytest.skip('Will think about it later..')
    with wrap({'important': 123, 'unimportant': 'whatever'}) as w:
        w = cast(Wdict, w)
        w['important'].zoom()
        w.consume_all()
    # TODO hmm, we want something like this to work..


def test_zoom() -> None:
    import pytest

    with wrap({'aaa': 'whatever'}) as w:
        w = cast(Wdict, w)
        with pytest.raises(KeyError):
            w['nosuchkey'].zoom()
        w['aaa'].zoom()


# TODO type check this...
# TODO feels like the whole thing is kind of unnecessarily complex
# - cons:
#   - in most cases this is not even needed? who cares if we miss a few attributes?
# - pro: on the other hand it could be interesting to know about new attributes in the data,
#   and without this kind of processing we wouldn't even know
#
# alternatives (hedged sketches of the first two are at the end of this file):
# - manually process data,
#   e.g. use asserts, dict.pop and dict.values() methods to unpack things
#   - pros:
#     - very simple, since it uses built-in syntax
#     - very performant, as fast as it gets
#     - very flexible, easy to adjust behaviour
#   - cons:
#     - can forget to assert about extra entities etc, so error prone
#     - if we do something like =assert j.pop('status') == 200, j=, by the time the assert fires we've already popped the item -- makes error handling harder
#     - a bit verbose.. so probably requires some helper functions (though they could be much leaner than the current konsume)
#     - if we assert, parsing terminates too early; if we're defensive, the code inflates a lot with if statements
#       - TODO perhaps combine warnings somehow or at least only emit once per module?
#     - hmm, actually, if we carefully go through everything and don't make copies, then it only requires one assert at the very end?
#   - TODO this is kinda useful? https://discuss.python.org/t/syntax-for-dictionnary-unpacking-to-variables/18718
#     operator.itemgetter?
# - TODO can we use the match operator in python for this? quite nice actually! and allows for dynamic behaviour
#   only from 3.10 though, and it's gonna be tricky to do dynamic defensive behaviour with it
# - TODO in a sense, bleanser would already hint if some meaningful fields aren't being processed? only if they are changing though
# - define a "schema" for the data, then just recursively match the data against the schema?
#   possibly pydantic already does something like that? not sure about performance though
#   - pros:
#     - much simpler to extend and understand what's going on
#   - cons:
#     - more rigid, so it becomes tricky to do dynamic stuff (e.g. if the schema actually changes)
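

# Below are minimal sketches of the first two alternatives discussed above.
# They are illustrative only: the 'status'/'body' fields and the function names
# are made up for the example, not part of this module's API.


def _sketch_manual_pop(j: dict) -> tuple[int, str]:
    # "manually process data": pop every field you care about, then a single
    # assert at the very end confirms nothing was left unconsumed
    j = dict(j)  # work on a copy so the caller's dict isn't mutated
    status = j.pop('status')
    body = j.pop('body')
    assert not j, j  # anything left over means the data grew new fields
    return status, body


def _sketch_match(j: dict) -> tuple[int, str]:
    # the match-statement alternative (python 3.10+): the mapping pattern
    # destructures, and the guard rejects unexpected extra keys in one place
    match j:
        case {'status': int(status), 'body': str(body), **rest} if not rest:
            return status, body
        case _:
            raise RuntimeError(f'Unexpected shape: {j}')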