my.instagram: somewhat mad merging mechanism to correlate gdpr and android exports

This commit is contained in:
Dima Gerasimov 2023-03-22 02:19:57 +00:00 committed by karlicoss
parent e7be680841
commit 8f7d14e7c6
2 changed files with 53 additions and 6 deletions

View file

@ -22,7 +22,11 @@ def _messages_android() -> Iterator[Res[Message]]:
def messages() -> Iterator[Res[Message]]:
# TODO in general best to prefer android, it has more data
# but for now prefer gdpr prefix until we figure out how to correlate conversation threads
# - message ids
# - usernames are correct for Android data
# - thread ids more meaninful?
# but for now prefer gdpr prefix since it makes a bit things a bit more consistent?
# e.g. a new batch of android exports can throw off ids if we rely on it for mapping
yield from _merge_messages(
_messages_gdpr(),
_messages_android(),

View file

@ -1,22 +1,31 @@
from dataclasses import replace
from datetime import datetime
from itertools import chain
from typing import Iterator
from typing import Iterator, Dict, Any
from my.core import warn_if_empty, Res
from my.core.compat import Protocol
from more_itertools import unique_everseen
class User(Protocol):
id: str
username: str
full_name: str
class Message(Protocol):
created: datetime
text: str
# TODO add some sort of thread id
# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
thread_id: str
# property because it's more mypy friendly
@property
def user(self) -> User: ...
@warn_if_empty
def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
# TODO double check it works w.r.t. naive/aware timestamps?
def key(r: Res[Message]):
if isinstance(r, Exception):
# NOTE: using str() against Exception is nice so exceptions with same args are treated the same..
@ -28,4 +37,38 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
without_us = r.created.replace(microsecond=round_us)
# using text as key is a bit crap.. but atm there are no better shared fields
return (without_us, r.text)
return unique_everseen(chain(*sources), key=key)
# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
# so the only way to correlate is to try and match messages
# we also can't use unique_everseen here, otherwise will never get a chance to unify threads
mmap: Dict[str, Message] = {}
thread_map = {}
user_map = {}
for m in chain(*sources):
if isinstance(m, Exception):
yield m
continue
k = key(m)
mm = mmap.get(k)
if mm is not None:
# already emitted, we get a chance to populate mappings
if m.thread_id not in thread_map:
thread_map[m.thread_id] = mm.thread_id
if m.user.id not in user_map:
user_map[m.user.id] = mm.user
else:
# not emitted yet, need to emit
repls: Dict[str, Any] = {}
tid = thread_map.get(m.thread_id)
if tid is not None:
repls['thread_id'] = tid
user = user_map.get(m.user.id)
if user is not None:
repls['user'] = user
if len(repls) > 0:
m = replace(m, **repls) # type: ignore[type-var] # ugh mypy is confused because of Protocol?
mmap[k] = m
yield m