my.instagram: somewhat mad merging mechanism to correlate gdpr and android exports

This commit is contained in:
Dima Gerasimov 2023-03-22 02:19:57 +00:00 committed by karlicoss
parent e7be680841
commit 8f7d14e7c6
2 changed files with 53 additions and 6 deletions

View file

@ -22,7 +22,11 @@ def _messages_android() -> Iterator[Res[Message]]:
def messages() -> Iterator[Res[Message]]: def messages() -> Iterator[Res[Message]]:
# TODO in general best to prefer android, it has more data # TODO in general best to prefer android, it has more data
# but for now prefer gdpr prefix until we figure out how to correlate conversation threads # - message ids
# - usernames are correct for Android data
# - thread ids more meaningful?
# but for now prefer gdpr prefix since it makes things a bit more consistent?
# e.g. a new batch of android exports can throw off ids if we rely on it for mapping
yield from _merge_messages( yield from _merge_messages(
_messages_gdpr(), _messages_gdpr(),
_messages_android(), _messages_android(),

View file

@ -1,22 +1,31 @@
from dataclasses import replace
from datetime import datetime from datetime import datetime
from itertools import chain from itertools import chain
from typing import Iterator from typing import Iterator, Dict, Any
from my.core import warn_if_empty, Res from my.core import warn_if_empty, Res
from my.core.compat import Protocol from my.core.compat import Protocol
from more_itertools import unique_everseen
class User(Protocol):
    # Structural (Protocol) interface for the author of a message: any object
    # exposing these three string attributes satisfies it.
    # `id` is what _merge_messages uses as the lookup key when mapping a user
    # seen in one export onto the same user seen in another.
    id: str
    username: str
    full_name: str
class Message(Protocol): class Message(Protocol):
created: datetime created: datetime
text: str text: str
# TODO add some sort of thread id thread_id: str
# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
# property because it's more mypy friendly
@property
def user(self) -> User: ...
@warn_if_empty @warn_if_empty
def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
# TODO double check it works w.r.t. naive/aware timestamps?
def key(r: Res[Message]): def key(r: Res[Message]):
if isinstance(r, Exception): if isinstance(r, Exception):
# NOTE: using str() against Exception is nice so exceptions with same args are treated the same.. # NOTE: using str() against Exception is nice so exceptions with same args are treated the same..
@ -28,4 +37,38 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
without_us = r.created.replace(microsecond=round_us) without_us = r.created.replace(microsecond=round_us)
# using text as key is a bit crap.. but atm there are no better shared fields # using text as key is a bit crap.. but atm there are no better shared fields
return (without_us, r.text) return (without_us, r.text)
return unique_everseen(chain(*sources), key=key)
# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
# so the only way to correlate is to try and match messages
# we also can't use unique_everseen here, otherwise we will never get a chance to unify threads
mmap: Dict[str, Message] = {}
thread_map = {}
user_map = {}
for m in chain(*sources):
if isinstance(m, Exception):
yield m
continue
k = key(m)
mm = mmap.get(k)
if mm is not None:
# already emitted, we get a chance to populate mappings
if m.thread_id not in thread_map:
thread_map[m.thread_id] = mm.thread_id
if m.user.id not in user_map:
user_map[m.user.id] = mm.user
else:
# not emitted yet, need to emit
repls: Dict[str, Any] = {}
tid = thread_map.get(m.thread_id)
if tid is not None:
repls['thread_id'] = tid
user = user_map.get(m.user.id)
if user is not None:
repls['user'] = user
if len(repls) > 0:
m = replace(m, **repls) # type: ignore[type-var] # ugh mypy is confused because of Protocol?
mmap[k] = m
yield m