my.instagram: somewhat mad merging mechanism to correlate gdpr and android exports
This commit is contained in:
parent
e7be680841
commit
8f7d14e7c6
2 changed files with 53 additions and 6 deletions
|
@ -22,7 +22,11 @@ def _messages_android() -> Iterator[Res[Message]]:
|
||||||
|
|
||||||
def messages() -> Iterator[Res[Message]]:
|
def messages() -> Iterator[Res[Message]]:
|
||||||
# TODO in general best to prefer android, it has more data
|
# TODO in general best to prefer android, it has more data
|
||||||
# but for now prefer gdpr prefix until we figure out how to correlate conversation threads
|
# - message ids
|
||||||
|
# - usernames are correct for Android data
|
||||||
|
# - thread ids more meaningful?
|
||||||
|
# but for now prefer gdpr prefix since it makes things a bit more consistent?
|
||||||
|
# e.g. a new batch of android exports can throw off ids if we rely on it for mapping
|
||||||
yield from _merge_messages(
|
yield from _merge_messages(
|
||||||
_messages_gdpr(),
|
_messages_gdpr(),
|
||||||
_messages_android(),
|
_messages_android(),
|
||||||
|
|
|
@ -1,22 +1,31 @@
|
||||||
|
from dataclasses import replace
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from typing import Iterator
|
from typing import Iterator, Dict, Any
|
||||||
|
|
||||||
from my.core import warn_if_empty, Res
|
from my.core import warn_if_empty, Res
|
||||||
from my.core.compat import Protocol
|
from my.core.compat import Protocol
|
||||||
|
|
||||||
from more_itertools import unique_everseen
|
|
||||||
|
class User(Protocol):
|
||||||
|
id: str
|
||||||
|
username: str
|
||||||
|
full_name: str
|
||||||
|
|
||||||
|
|
||||||
class Message(Protocol):
|
class Message(Protocol):
|
||||||
created: datetime
|
created: datetime
|
||||||
text: str
|
text: str
|
||||||
# TODO add some sort of thread id
|
thread_id: str
|
||||||
# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
|
|
||||||
|
# property because it's more mypy friendly
|
||||||
|
@property
|
||||||
|
def user(self) -> User: ...
|
||||||
|
|
||||||
|
|
||||||
@warn_if_empty
|
@warn_if_empty
|
||||||
def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
|
def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
|
||||||
|
# TODO double check it works w.r.t. naive/aware timestamps?
|
||||||
def key(r: Res[Message]):
|
def key(r: Res[Message]):
|
||||||
if isinstance(r, Exception):
|
if isinstance(r, Exception):
|
||||||
# NOTE: using str() against Exception is nice so exceptions with same args are treated the same..
|
# NOTE: using str() against Exception is nice so exceptions with same args are treated the same..
|
||||||
|
@ -28,4 +37,38 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
|
||||||
without_us = r.created.replace(microsecond=round_us)
|
without_us = r.created.replace(microsecond=round_us)
|
||||||
# using text as key is a bit crap.. but atm there are no better shared fields
|
# using text as key is a bit crap.. but atm there are no better shared fields
|
||||||
return (without_us, r.text)
|
return (without_us, r.text)
|
||||||
return unique_everseen(chain(*sources), key=key)
|
|
||||||
|
# ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump)
|
||||||
|
# so the only way to correlate is to try and match messages
|
||||||
|
# we also can't use unique_everseen here, otherwise we will never get a chance to unify threads
|
||||||
|
mmap: Dict[str, Message] = {}
|
||||||
|
thread_map = {}
|
||||||
|
user_map = {}
|
||||||
|
|
||||||
|
for m in chain(*sources):
|
||||||
|
if isinstance(m, Exception):
|
||||||
|
yield m
|
||||||
|
continue
|
||||||
|
|
||||||
|
k = key(m)
|
||||||
|
mm = mmap.get(k)
|
||||||
|
|
||||||
|
if mm is not None:
|
||||||
|
# already emitted, we get a chance to populate mappings
|
||||||
|
if m.thread_id not in thread_map:
|
||||||
|
thread_map[m.thread_id] = mm.thread_id
|
||||||
|
if m.user.id not in user_map:
|
||||||
|
user_map[m.user.id] = mm.user
|
||||||
|
else:
|
||||||
|
# not emitted yet, need to emit
|
||||||
|
repls: Dict[str, Any] = {}
|
||||||
|
tid = thread_map.get(m.thread_id)
|
||||||
|
if tid is not None:
|
||||||
|
repls['thread_id'] = tid
|
||||||
|
user = user_map.get(m.user.id)
|
||||||
|
if user is not None:
|
||||||
|
repls['user'] = user
|
||||||
|
if len(repls) > 0:
|
||||||
|
m = replace(m, **repls) # type: ignore[type-var] # ugh mypy is confused because of Protocol?
|
||||||
|
mmap[k] = m
|
||||||
|
yield m
|
||||||
|
|
Loading…
Add table
Reference in a new issue