smscalls: allow large XML files as input
Once XML files grow past a certain size (about 220MB for me), the parser just throws an error because the tree is too large (IIRC, the limit exists for security reasons). We could maybe look at using iterparse in the future to parse the file without loading it all at once, but this seems to fix it fine for me.
This commit is contained in:
parent
d8c53bde34
commit
f1d23c5e96
1 changed files with 7 additions and 8 deletions
|
@ -57,9 +57,12 @@ class Call(NamedTuple):
|
||||||
# The '(Unknown)' is just what my android phone does, not sure if there are others
|
# The '(Unknown)' is just what my android phone does, not sure if there are others
|
||||||
UNKNOWN: set[str] = {'(Unknown)'}
|
UNKNOWN: set[str] = {'(Unknown)'}
|
||||||
|
|
||||||
|
def _parse_xml(xml: Path) -> Any:
    """Parse the XML file at *xml* and return the resulting lxml tree.

    The parser is created with ``huge_tree=True`` so that very large
    backup files (a couple of hundred MB) can be loaded; with the default
    parser settings such files fail to parse because the tree is too large.
    """
    # NOTE: the whole document is still loaded into memory; a streaming
    # approach (e.g. iterparse) might be worth looking at in the future.
    big_tree_parser = etree.XMLParser(huge_tree=True)
    return etree.parse(str(xml), parser=big_tree_parser)
|
||||||
|
|
||||||
|
|
||||||
def _extract_calls(path: Path) -> Iterator[Res[Call]]:
|
def _extract_calls(path: Path) -> Iterator[Res[Call]]:
|
||||||
tr = etree.parse(str(path))
|
tr = _parse_xml(path)
|
||||||
for cxml in tr.findall('call'):
|
for cxml in tr.findall('call'):
|
||||||
dt = cxml.get('date')
|
dt = cxml.get('date')
|
||||||
dt_readable = cxml.get('readable_date')
|
dt_readable = cxml.get('readable_date')
|
||||||
|
@ -133,7 +136,7 @@ def messages() -> Iterator[Res[Message]]:
|
||||||
|
|
||||||
|
|
||||||
def _extract_messages(path: Path) -> Iterator[Res[Message]]:
|
def _extract_messages(path: Path) -> Iterator[Res[Message]]:
|
||||||
tr = etree.parse(str(path))
|
tr = _parse_xml(path)
|
||||||
for mxml in tr.findall('sms'):
|
for mxml in tr.findall('sms'):
|
||||||
dt = mxml.get('date')
|
dt = mxml.get('date')
|
||||||
dt_readable = mxml.get('readable_date')
|
dt_readable = mxml.get('readable_date')
|
||||||
|
@ -225,8 +228,7 @@ def _resolve_null_str(value: str | None) -> str | None:
|
||||||
|
|
||||||
|
|
||||||
def _extract_mms(path: Path) -> Iterator[Res[MMS]]:
|
def _extract_mms(path: Path) -> Iterator[Res[MMS]]:
|
||||||
tr = etree.parse(str(path))
|
tr = _parse_xml(path)
|
||||||
|
|
||||||
for mxml in tr.findall('mms'):
|
for mxml in tr.findall('mms'):
|
||||||
dt = mxml.get('date')
|
dt = mxml.get('date')
|
||||||
dt_readable = mxml.get('readable_date')
|
dt_readable = mxml.get('readable_date')
|
||||||
|
@ -271,10 +273,7 @@ def _extract_mms(path: Path) -> Iterator[Res[MMS]]:
|
||||||
#
|
#
|
||||||
# This seems pretty useless, so we should try and skip it, and just return the
|
# This seems pretty useless, so we should try and skip it, and just return the
|
||||||
# text/images/data
|
# text/images/data
|
||||||
#
|
part_data: dict[str, Any] = part.attrib
|
||||||
# man, attrib is some internal cpython ._Attrib type which can't
|
|
||||||
# be typed by any sort of mappingproxy. maybe a protocol could work..?
|
|
||||||
part_data: dict[str, Any] = part.attrib # type: ignore
|
|
||||||
seq: str | None = part_data.get('seq')
|
seq: str | None = part_data.get('seq')
|
||||||
if seq == '-1':
|
if seq == '-1':
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Add table
Reference in a new issue