my.instagram.gdpr: process all historic archives + better normalising

This commit is contained in:
Dima Gerasimov 2023-10-23 02:19:23 +01:00 committed by karlicoss
parent f9a1050ceb
commit f355a55e06

View file

@ -7,7 +7,7 @@ import json
from pathlib import Path
from typing import Iterator, Sequence, Dict, Union
from more_itertools import bucket
from more_itertools import bucket, unique_everseen
from my.core import (
get_files,
@ -69,7 +69,20 @@ def _decode(s: str) -> str:
def _entities() -> Iterator[Res[Union[User, _Message]]]:
last = max(inputs())
    # it's worth processing all previous exports -- sometimes instagram removes some metadata from newer ones
# NOTE: here there are basically two options
# - process inputs as is (from oldest to newest)
# this would be more stable wrt newer exports (e.g. existing thread ids won't change)
# the downside is that newer exports seem to have better thread ids, so might be preferrable to use them
# - process inputs reversed (from newest to oldest)
# the upside is that thread ids/usernames might be better
# the downside is that if for example the user renames, thread ids will change _a lot_, might be undesirable..
# (from newest to oldest)
for path in inputs():
yield from _entitites_from_path(path)
def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]:
    # TODO make sure it works both with a plain directory
    # ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
# e.g. possible options are:
@ -84,10 +97,10 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
# whereas here I don't need it..
# so for now will just implement this adhoc thing and think about properly fixing later
personal_info = last / 'personal_information'
personal_info = path / 'personal_information'
if not personal_info.exists():
# old path, used up to somewhere between feb-aug 2022
personal_info = last / 'account_information'
personal_info = path / 'account_information'
j = json.loads((personal_info / 'personal_information.json').read_text())
[profile] = j['profile_user']
@ -104,8 +117,8 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
)
yield self_user
files = list(last.rglob('messages/inbox/*/message_*.json'))
assert len(files) > 0, last
files = list(path.rglob('messages/inbox/*/message_*.json'))
assert len(files) > 0, path
buckets = bucket(files, key=lambda p: p.parts[-2])
file_map = {k: list(buckets[k]) for k in buckets}
@ -135,7 +148,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
)
# todo "thread_type": "Regular" ?
for jm in j['messages']:
for jm in reversed(j['messages']): # in json, they are in reverse order for some reason
try:
content = None
if 'content' in jm:
@ -144,7 +157,15 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
# ugh. for some reason these contain an extra space and that messes up message merging..
content = content.strip()
else:
share = jm.get('share')
if (share := jm.get('share')) is not None:
if (share_link := share.get('link')) is not None:
# somewhere around 20231007, instagram removed these from gdpr links and they show up a lot in various diffs
share_link = share_link.replace('feed_type=reshare_chaining&', '')
share_link = share_link.replace('?feed_type=reshare_chaining', '')
share['link'] = share_link
if (share_text := share.get('share_text')) is not None:
share['share_text'] = _decode(share_text)
photos = jm.get('photos')
videos = jm.get('videos')
cc = share or photos or videos
@ -175,7 +196,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
# TODO basically copy pasted from android.py... hmm
def messages() -> Iterator[Res[Message]]:
id2user: Dict[str, User] = {}
for x in _entities():
for x in unique_everseen(_entities()):
if isinstance(x, Exception):
yield x
continue