my.instagram.gdpr: handle missing message content defensively

This commit is contained in:
Dima Gerasimov 2023-03-22 00:29:21 +00:00 committed by karlicoss
parent 347cd1ef77
commit e7be680841
2 changed files with 26 additions and 15 deletions

View file

@ -12,6 +12,7 @@ class Message(Protocol):
created: datetime created: datetime
text: str text: str
# TODO add some sort of thread id # TODO add some sort of thread id
# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
@warn_if_empty @warn_if_empty

View file

@ -3,13 +3,28 @@ Instagram data (uses [[https://www.instagram.com/download/request][official GDPR
""" """
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from typing import Iterator, Any, Sequence, Dict import json
from pathlib import Path
from my.config import instagram as user_config from typing import Iterator, Sequence, Dict, Union
from more_itertools import bucket from more_itertools import bucket
from ..core import Paths from my.core import (
get_files,
Paths,
datetime_naive,
Res,
assert_never,
LazyLogger,
)
from my.core.kompress import ZipPath
from my.config import instagram as user_config
logger = LazyLogger(__name__, level='debug')
@dataclass @dataclass
class config(user_config.gdpr): class config(user_config.gdpr):
# path[s]/glob to the exported zip archives # path[s]/glob to the exported zip archives
@ -17,8 +32,6 @@ class config(user_config.gdpr):
# TODO later also support unpacked directories? # TODO later also support unpacked directories?
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
return get_files(config.export_path) return get_files(config.export_path)
@ -31,7 +44,6 @@ class User:
full_name: str full_name: str
from ..core import datetime_naive
@dataclass @dataclass
class _BaseMessage: class _BaseMessage:
# ugh, this is insane, but does look like it's just keeping local device time??? # ugh, this is insane, but does look like it's just keeping local device time???
@ -57,11 +69,7 @@ def _decode(s: str) -> str:
return s.encode('latin-1').decode('utf8') return s.encode('latin-1').decode('utf8')
import json
from typing import Union
from ..core import Res, assert_never
def _entities() -> Iterator[Res[Union[User, _Message]]]: def _entities() -> Iterator[Res[Union[User, _Message]]]:
from ..core.kompress import ZipPath
last = ZipPath(max(inputs())) last = ZipPath(max(inputs()))
# TODO make sure it works both with plain directory # TODO make sure it works both with plain directory
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here # ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
@ -128,9 +136,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
# todo "thread_type": "Regular" ? # todo "thread_type": "Regular" ?
for jm in j['messages']: for jm in j['messages']:
# todo defensive?
try: try:
mtype = jm['type'] # Generic/Share?
content = None content = None
if 'content' in jm: if 'content' in jm:
content = _decode(jm['content']) content = _decode(jm['content'])
@ -141,7 +147,12 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
cc = share or photos or videos cc = share or photos or videos
if cc is not None: if cc is not None:
content = str(cc) content = str(cc)
assert content is not None, jm
if content is None:
# not sure what it means.. perhaps likes or something?
logger.warning(f'content is None: {jm}')
continue
timestamp_ms = jm['timestamp_ms'] timestamp_ms = jm['timestamp_ms']
sender_name = _decode(jm['sender_name']) sender_name = _decode(jm['sender_name'])
@ -153,7 +164,6 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
thread_id=fname, # meh.. but no better way? thread_id=fname, # meh.. but no better way?
) )
except Exception as e: except Exception as e:
# TODO sometimes messages are just missing content?? even with Generic type
yield e yield e