From f1d23c5e96d95819d383485f22b480d8d190fe98 Mon Sep 17 00:00:00 2001 From: purarue <7804791+purarue@users.noreply.github.com> Date: Sun, 22 Dec 2024 21:50:03 -0800 Subject: [PATCH] smscalls: allow large XML files as input Once XML files grow past a certain size (about 220MB for me), the parser throws an error because the tree is too large (IIRC this limit exists for security reasons). Parsing with huge_tree=True lifts that limit. Could maybe look at using iterparse in the future to parse it without loading the whole file, but this seems to fix it fine for me. --- my/smscalls.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/my/smscalls.py b/my/smscalls.py index 0ff2553..27d08be 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -57,9 +57,12 @@ class Call(NamedTuple): # The '(Unknown)' is just what my android phone does, not sure if there are others UNKNOWN: set[str] = {'(Unknown)'} +def _parse_xml(xml: Path) -> Any: + return etree.parse(str(xml), parser=etree.XMLParser(huge_tree=True)) + def _extract_calls(path: Path) -> Iterator[Res[Call]]: - tr = etree.parse(str(path)) + tr = _parse_xml(path) for cxml in tr.findall('call'): dt = cxml.get('date') dt_readable = cxml.get('readable_date') @@ -133,7 +136,7 @@ def messages() -> Iterator[Res[Message]]: def _extract_messages(path: Path) -> Iterator[Res[Message]]: - tr = etree.parse(str(path)) + tr = _parse_xml(path) for mxml in tr.findall('sms'): dt = mxml.get('date') dt_readable = mxml.get('readable_date') @@ -225,8 +228,7 @@ def _resolve_null_str(value: str | None) -> str | None: def _extract_mms(path: Path) -> Iterator[Res[MMS]]: - tr = etree.parse(str(path)) - + tr = _parse_xml(path) for mxml in tr.findall('mms'): dt = mxml.get('date') dt_readable = mxml.get('readable_date') @@ -271,10 +273,7 @@ def _extract_mms(path: Path) -> Iterator[Res[MMS]]: # # This seems pretty useless, so we should try and skip it, and just return the # text/images/data - # - # man, attrib is some internal cpython ._Attrib type which can't - # be typed by
any sort of mappingproxy. maybe a protocol could work..? - part_data: dict[str, Any] = part.attrib # type: ignore + part_data: dict[str, Any] = part.attrib seq: str | None = part_data.get('seq') if seq == '-1': continue