my.instagram.gdpr: handle missing message content defensively
This commit is contained in:
parent
347cd1ef77
commit
e7be680841
2 changed files with 26 additions and 15 deletions
|
@ -12,6 +12,7 @@ class Message(Protocol):
|
||||||
created: datetime
|
created: datetime
|
||||||
text: str
|
text: str
|
||||||
# TODO add some sort of thread id
|
# TODO add some sort of thread id
|
||||||
|
# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
|
||||||
|
|
||||||
|
|
||||||
@warn_if_empty
|
@warn_if_empty
|
||||||
|
|
|
@ -3,13 +3,28 @@ Instagram data (uses [[https://www.instagram.com/download/request][official GDPR
|
||||||
"""
|
"""
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Iterator, Any, Sequence, Dict
|
import json
|
||||||
|
from pathlib import Path
|
||||||
from my.config import instagram as user_config
|
from typing import Iterator, Sequence, Dict, Union
|
||||||
|
|
||||||
from more_itertools import bucket
|
from more_itertools import bucket
|
||||||
|
|
||||||
from ..core import Paths
|
from my.core import (
|
||||||
|
get_files,
|
||||||
|
Paths,
|
||||||
|
datetime_naive,
|
||||||
|
Res,
|
||||||
|
assert_never,
|
||||||
|
LazyLogger,
|
||||||
|
)
|
||||||
|
from my.core.kompress import ZipPath
|
||||||
|
|
||||||
|
from my.config import instagram as user_config
|
||||||
|
|
||||||
|
|
||||||
|
logger = LazyLogger(__name__, level='debug')
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class config(user_config.gdpr):
|
class config(user_config.gdpr):
|
||||||
# path[s]/glob to the exported zip archives
|
# path[s]/glob to the exported zip archives
|
||||||
|
@ -17,8 +32,6 @@ class config(user_config.gdpr):
|
||||||
# TODO later also support unpacked directories?
|
# TODO later also support unpacked directories?
|
||||||
|
|
||||||
|
|
||||||
from ..core import get_files
|
|
||||||
from pathlib import Path
|
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
return get_files(config.export_path)
|
return get_files(config.export_path)
|
||||||
|
|
||||||
|
@ -31,7 +44,6 @@ class User:
|
||||||
full_name: str
|
full_name: str
|
||||||
|
|
||||||
|
|
||||||
from ..core import datetime_naive
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class _BaseMessage:
|
class _BaseMessage:
|
||||||
# ugh, this is insane, but does look like it's just keeping local device time???
|
# ugh, this is insane, but does look like it's just keeping local device time???
|
||||||
|
@ -57,11 +69,7 @@ def _decode(s: str) -> str:
|
||||||
return s.encode('latin-1').decode('utf8')
|
return s.encode('latin-1').decode('utf8')
|
||||||
|
|
||||||
|
|
||||||
import json
|
|
||||||
from typing import Union
|
|
||||||
from ..core import Res, assert_never
|
|
||||||
def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||||
from ..core.kompress import ZipPath
|
|
||||||
last = ZipPath(max(inputs()))
|
last = ZipPath(max(inputs()))
|
||||||
# TODO make sure it works with a plain directory as well
|
# TODO make sure it works with a plain directory as well
|
||||||
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
|
# ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
|
||||||
|
@ -128,9 +136,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||||
|
|
||||||
# todo "thread_type": "Regular" ?
|
# todo "thread_type": "Regular" ?
|
||||||
for jm in j['messages']:
|
for jm in j['messages']:
|
||||||
# todo defensive?
|
|
||||||
try:
|
try:
|
||||||
mtype = jm['type'] # Generic/Share?
|
|
||||||
content = None
|
content = None
|
||||||
if 'content' in jm:
|
if 'content' in jm:
|
||||||
content = _decode(jm['content'])
|
content = _decode(jm['content'])
|
||||||
|
@ -141,7 +147,12 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||||
cc = share or photos or videos
|
cc = share or photos or videos
|
||||||
if cc is not None:
|
if cc is not None:
|
||||||
content = str(cc)
|
content = str(cc)
|
||||||
assert content is not None, jm
|
|
||||||
|
if content is None:
|
||||||
|
# not sure what it means.. perhaps likes or something?
|
||||||
|
logger.warning(f'content is None: {jm}')
|
||||||
|
continue
|
||||||
|
|
||||||
timestamp_ms = jm['timestamp_ms']
|
timestamp_ms = jm['timestamp_ms']
|
||||||
sender_name = _decode(jm['sender_name'])
|
sender_name = _decode(jm['sender_name'])
|
||||||
|
|
||||||
|
@ -153,7 +164,6 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||||
thread_id=fname, # meh.. but no better way?
|
thread_id=fname, # meh.. but no better way?
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# TODO sometimes messages are just missing content?? even with Generic type
|
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue