From 8f7d14e7c6b20405ae6dad71e8274af29ae4f024 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 22 Mar 2023 02:19:57 +0000 Subject: [PATCH] my.instagram: somewhat mad merging mechanism to correlate gdpr and android exports --- my/instagram/all.py | 6 ++++- my/instagram/common.py | 53 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/my/instagram/all.py b/my/instagram/all.py index 4be2b5b..8007399 100644 --- a/my/instagram/all.py +++ b/my/instagram/all.py @@ -22,7 +22,11 @@ def _messages_android() -> Iterator[Res[Message]]: def messages() -> Iterator[Res[Message]]: # TODO in general best to prefer android, it has more data - # but for now prefer gdpr prefix until we figure out how to correlate conversation threads + # - message ids + # - usernames are correct for Android data + # - thread ids more meaningful? + # but for now prefer gdpr prefix since it makes things a bit more consistent? + # e.g. a new batch of android exports can throw off ids if we rely on it for mapping yield from _merge_messages( _messages_gdpr(), _messages_android(), diff --git a/my/instagram/common.py b/my/instagram/common.py index b345b8e..a172ac8 100644 --- a/my/instagram/common.py +++ b/my/instagram/common.py @@ -1,22 +1,31 @@ +from dataclasses import replace from datetime import datetime from itertools import chain -from typing import Iterator +from typing import Iterator, Dict, Any from my.core import warn_if_empty, Res from my.core.compat import Protocol -from more_itertools import unique_everseen + +class User(Protocol): + id: str + username: str + full_name: str class Message(Protocol): created: datetime text: str - # TODO add some sort of thread id - # ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump) + thread_id: str + + # property because it's more mypy friendly + @property + def user(self) -> User: ... 
@warn_if_empty def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: + # TODO double check it works w.r.t. naive/aware timestamps? def key(r: Res[Message]): if isinstance(r, Exception): # NOTE: using str() against Exception is nice so exceptions with same args are treated the same.. @@ -28,4 +37,38 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: without_us = r.created.replace(microsecond=round_us) # using text as key is a bit crap.. but atm there are no better shared fields return (without_us, r.text) - return unique_everseen(chain(*sources), key=key) + + # ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump) + # so the only way to correlate is to try and match messages + # we also can't use unique_everseen here, otherwise will never get a chance to unify threads + mmap: Dict[str, Message] = {} + thread_map = {} + user_map = {} + + for m in chain(*sources): + if isinstance(m, Exception): + yield m + continue + + k = key(m) + mm = mmap.get(k) + + if mm is not None: + # already emitted, we get a chance to populate mappings + if m.thread_id not in thread_map: + thread_map[m.thread_id] = mm.thread_id + if m.user.id not in user_map: + user_map[m.user.id] = mm.user + else: + # not emitted yet, need to emit + repls: Dict[str, Any] = {} + tid = thread_map.get(m.thread_id) + if tid is not None: + repls['thread_id'] = tid + user = user_map.get(m.user.id) + if user is not None: + repls['user'] = user + if len(repls) > 0: + m = replace(m, **repls) # type: ignore[type-var] # ugh mypy is confused because of Protocol? + mmap[k] = m + yield m