my.instagram.gdpr: process all historic archives + better normalising
This commit is contained in:
parent
f9a1050ceb
commit
f355a55e06
1 changed file with 32 additions and 11 deletions
my/instagram/gdpr.py

@@ -7,7 +7,7 @@ import json
 from pathlib import Path
 from typing import Iterator, Sequence, Dict, Union

-from more_itertools import bucket
+from more_itertools import bucket, unique_everseen

 from my.core import (
     get_files,
@@ -69,7 +69,20 @@ def _decode(s: str) -> str:


 def _entities() -> Iterator[Res[Union[User, _Message]]]:
-    last = max(inputs())
+    # it's worth processing all previous exports -- sometimes instagram removes some metadata from newer ones
+    # NOTE: here there are basically two options
+    # - process inputs as is (from oldest to newest)
+    #   this would be more stable wrt newer exports (e.g. existing thread ids won't change)
+    #   the downside is that newer exports seem to have better thread ids, so it might be preferable to use them
+    # - process inputs reversed (from newest to oldest)
+    #   the upside is that thread ids/usernames might be better
+    #   the downside is that if, for example, the user renames, thread ids will change _a lot_, which might be undesirable
+    # (for now, going with the first option: from oldest to newest)
+    for path in inputs():
+        yield from _entitites_from_path(path)
+
+
+def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]:
     # TODO make sure it works both with a plain directory
     # ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
     # e.g. possible options are:
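For a concrete feel of the tradeoff, here is a minimal sketch (made-up data, not the module's real types): every archive is processed oldest to newest, so entries that instagram later dropped still come through from the older export, while the duplicates that overlapping archives emit are collapsed downstream in messages() (see the last hunk).

    # hypothetical example: consecutive exports overlap, and the newer one
    # no longer contains 'msg1'
    archives = [
        ['msg1', 'msg2'],  # 2022 export
        ['msg2', 'msg3'],  # 2023 export
    ]
    entities = [m for archive in archives for m in archive]
    # 'msg1' survives; the duplicate 'msg2' is deduplicated later
    assert entities == ['msg1', 'msg2', 'msg2', 'msg3']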
@@ -84,10 +97,10 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
     # whereas here I don't need it..
     # so for now will just implement this adhoc thing and think about properly fixing later

-    personal_info = last / 'personal_information'
+    personal_info = path / 'personal_information'
     if not personal_info.exists():
         # old path, used up to somewhere between feb-aug 2022
-        personal_info = last / 'account_information'
+        personal_info = path / 'account_information'

     j = json.loads((personal_info / 'personal_information.json').read_text())
     [profile] = j['profile_user']
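The same two-layout probe, as a self-contained sketch (the helper name is hypothetical; the directory names are the ones from the hunk above):

    from pathlib import Path

    def _personal_info_dir(path: Path) -> Path:
        # newer exports use 'personal_information'; exports up to somewhere
        # between feb and aug 2022 used 'account_information'
        for candidate in ('personal_information', 'account_information'):
            d = path / candidate
            if d.exists():
                return d
        raise RuntimeError(f'no personal information directory under {path}')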
@@ -104,8 +117,8 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
     )
     yield self_user

-    files = list(last.rglob('messages/inbox/*/message_*.json'))
-    assert len(files) > 0, last
+    files = list(path.rglob('messages/inbox/*/message_*.json'))
+    assert len(files) > 0, path

     buckets = bucket(files, key=lambda p: p.parts[-2])
     file_map = {k: list(buckets[k]) for k in buckets}
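more_itertools.bucket, used above, groups an iterable by a key function; iterating the bucket object yields the keys it has seen, and indexing yields that key's items. A standalone demo with made-up paths:

    from pathlib import Path
    from more_itertools import bucket

    files = [
        Path('messages/inbox/alice_123/message_1.json'),
        Path('messages/inbox/alice_123/message_2.json'),
        Path('messages/inbox/bob_456/message_1.json'),
    ]
    buckets = bucket(files, key=lambda p: p.parts[-2])  # group by thread directory
    file_map = {k: list(buckets[k]) for k in buckets}
    assert sorted(file_map) == ['alice_123', 'bob_456']
    assert len(file_map['alice_123']) == 2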
@@ -126,7 +139,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
         # so I feel like there is just no guaranteed way to correlate :(
         other_id = fname[-id_len:]
         # NOTE: no match in android db?
-        other_username = fname[:-id_len - 1]
+        other_username = fname[: -id_len - 1]
         other_full_name = _decode(j['title'])
         yield User(
             id=other_id,
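To make the slicing concrete: inbox directory names look like '<username>_<id>', and since usernames may themselves contain '_', splitting on the underscore is unreliable; slicing by the known id length (id_len in the surrounding code) is. A worked example with a made-up thread name and an assumed 17-digit id:

    fname = 'some_user.name_17843756789123456'  # hypothetical '<username>_<id>'
    id_len = 17                                 # assumed length of the trailing id
    other_id = fname[-id_len:]
    other_username = fname[: -id_len - 1]       # the extra -1 drops the joining '_'
    assert (other_username, other_id) == ('some_user.name', '17843756789123456')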
@@ -135,7 +148,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
         )

         # todo "thread_type": "Regular" ?
-        for jm in j['messages']:
+        for jm in reversed(j['messages']):  # in json, they are in reverse order for some reason
             try:
                 content = None
                 if 'content' in jm:
@@ -144,7 +157,15 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
                     # ugh. for some reason these contain an extra space and that messes up message merging..
                     content = content.strip()
                 else:
-                    share = jm.get('share')
+                    if (share := jm.get('share')) is not None:
+                        if (share_link := share.get('link')) is not None:
+                            # somewhere around 20231007, instagram removed these from gdpr links and they show up a lot in various diffs
+                            share_link = share_link.replace('feed_type=reshare_chaining&', '')
+                            share_link = share_link.replace('?feed_type=reshare_chaining', '')
+                            share['link'] = share_link
+                        if (share_text := share.get('share_text')) is not None:
+                            share['share_text'] = _decode(share_text)
+
                     photos = jm.get('photos')
                     videos = jm.get('videos')
                     cc = share or photos or videos
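The two str.replace calls cover the parameter appearing at the start of the query string ('?feed_type=reshare_chaining') or followed by other parameters ('feed_type=reshare_chaining&'). Should more query-string noise turn up later, the same normalisation can be done structurally; a sketch with a hypothetical helper name:

    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    def _drop_reshare_chaining(url: str) -> str:
        parts = urlsplit(url)
        query = [(k, v) for k, v in parse_qsl(parts.query)
                 if (k, v) != ('feed_type', 'reshare_chaining')]
        return urlunsplit(parts._replace(query=urlencode(query)))

    assert _drop_reshare_chaining(
        'https://www.instagram.com/reel/abc/?feed_type=reshare_chaining'
    ) == 'https://www.instagram.com/reel/abc/'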
@@ -166,7 +187,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
                     created=datetime.fromtimestamp(timestamp_ms / 1000),
                     text=content,
                     user_id=user_id,
-                    thread_id=fname, # meh.. but no better way?
+                    thread_id=fname,  # meh.. but no better way?
                 )
             except Exception as e:
                 yield e
@@ -175,7 +196,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
 # TODO basically copy pasted from android.py... hmm
 def messages() -> Iterator[Res[Message]]:
     id2user: Dict[str, User] = {}
-    for x in _entities():
+    for x in unique_everseen(_entities()):
         if isinstance(x, Exception):
             yield x
             continue
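The dedup semantics relied on here, demonstrated with a tiny stand-in type: unique_everseen yields each element the first time it appears, preserving order, so entities repeated across overlapping archives come through once. It tracks seen elements by hash, falling back to a slower list scan for unhashable ones, which makes hashable entity types (e.g. NamedTuples) convenient.

    from typing import NamedTuple
    from more_itertools import unique_everseen

    class Entity(NamedTuple):  # hypothetical stand-in for User/_Message
        id: str

    # Entity('1') shows up in two overlapping archives
    stream = [Entity('1'), Entity('2'), Entity('1'), Entity('3')]
    assert list(unique_everseen(stream)) == [Entity('1'), Entity('2'), Entity('3')]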