my.instagram.gdpr: process all historic archives + better normalising
parent f9a1050ceb
commit f355a55e06
1 changed file with 32 additions and 11 deletions
@@ -7,7 +7,7 @@ import json
 from pathlib import Path
 from typing import Iterator, Sequence, Dict, Union
 
-from more_itertools import bucket
+from more_itertools import bucket, unique_everseen
 
 from my.core import (
     get_files,
@@ -69,7 +69,20 @@ def _decode(s: str) -> str:
 
 
 def _entities() -> Iterator[Res[Union[User, _Message]]]:
-    last = max(inputs())
+    # it's worth processing all previous exports -- sometimes instagram removes some metadata from newer ones
+    # NOTE: there are basically two options here
+    # - process inputs as is (from oldest to newest)
+    #   this would be more stable wrt newer exports (e.g. existing thread ids won't change)
+    #   the downside is that newer exports seem to have better thread ids, so it might be preferable to use them
+    # - process inputs reversed (from newest to oldest)
+    #   the upside is that thread ids/usernames might be better
+    #   the downside is that if, for example, the user renames, thread ids will change _a lot_, which might be undesirable..
+    # (from newest to oldest)
+    for path in inputs():
+        yield from _entitites_from_path(path)
+
+
+def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]:
     # TODO make sure it works both with a plain directory and an archive
     # ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
     # e.g. possible options are:
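Note: the ordering choice discussed above interacts with the deduplication added in messages() (last hunk below): unique_everseen keeps the first occurrence of each element and drops later duplicates, so whichever archive is processed first "wins" for records that appear in several archives. A minimal sketch of that first-wins behaviour, with made-up tuples standing in for real User/_Message values:

    from more_itertools import unique_everseen

    # hypothetical entities parsed from two overlapping archives
    older = [('thread-a', 'msg-1'), ('thread-a', 'msg-2')]
    newer = [('thread-a', 'msg-2'), ('thread-a', 'msg-3')]

    # oldest-to-newest processing: the older archive's copy of a
    # duplicated record is the one that survives
    merged = list(unique_everseen(older + newer))
    assert merged == [('thread-a', 'msg-1'), ('thread-a', 'msg-2'), ('thread-a', 'msg-3')]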
@@ -84,10 +97,10 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
     # whereas here I don't need it..
     # so for now will just implement this adhoc thing and think about properly fixing later
 
-    personal_info = last / 'personal_information'
+    personal_info = path / 'personal_information'
     if not personal_info.exists():
         # old path, used up to somewhere between feb-aug 2022
-        personal_info = last / 'account_information'
+        personal_info = path / 'account_information'
 
     j = json.loads((personal_info / 'personal_information.json').read_text())
     [profile] = j['profile_user']
@@ -104,8 +117,8 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
     )
     yield self_user
 
-    files = list(last.rglob('messages/inbox/*/message_*.json'))
-    assert len(files) > 0, last
+    files = list(path.rglob('messages/inbox/*/message_*.json'))
+    assert len(files) > 0, path
 
     buckets = bucket(files, key=lambda p: p.parts[-2])
     file_map = {k: list(buckets[k]) for k in buckets}
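Note: more_itertools.bucket groups an iterable lazily by a key function; here the key p.parts[-2] is the per-thread directory name under messages/inbox, and iterating a bucket yields its distinct keys, which is what the dict comprehension relies on. A small sketch with hypothetical paths:

    from pathlib import Path
    from more_itertools import bucket

    # hypothetical layout inside an extracted archive
    files = [
        Path('export/messages/inbox/somefriend_abc123/message_1.json'),
        Path('export/messages/inbox/somefriend_abc123/message_2.json'),
        Path('export/messages/inbox/otherchat_xyz789/message_1.json'),
    ]

    # group message files by their thread directory
    buckets = bucket(files, key=lambda p: p.parts[-2])
    file_map = {k: list(buckets[k]) for k in buckets}
    assert sorted(file_map) == ['otherchat_xyz789', 'somefriend_abc123']
    assert len(file_map['somefriend_abc123']) == 2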
@@ -135,7 +148,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
         )
 
         # todo "thread_type": "Regular" ?
-        for jm in j['messages']:
+        for jm in reversed(j['messages']):  # in json, they are in reverse order for some reason
             try:
                 content = None
                 if 'content' in jm:
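Note: reversed() restores chronological order only as long as the export really lists messages newest-first. If that assumption is a worry, sorting by timestamp is a more defensive alternative; a sketch, assuming each message record carries a 'timestamp_ms' field as facebook-style GDPR message JSON does:

    # hypothetical alternative to reversed(): sort explicitly by timestamp
    for jm in sorted(j['messages'], key=lambda m: m['timestamp_ms']):
        ...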
@@ -144,7 +157,15 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
                     # ugh. for some reason these contain an extra space and that messes up message merging..
                     content = content.strip()
                 else:
-                    share = jm.get('share')
+                    if (share := jm.get('share')) is not None:
+                        if (share_link := share.get('link')) is not None:
+                            # somewhere around 20231007, instagram removed these from gdpr links and they show up a lot in various diffs
+                            share_link = share_link.replace('feed_type=reshare_chaining&', '')
+                            share_link = share_link.replace('?feed_type=reshare_chaining', '')
+                            share['link'] = share_link
+                        if (share_text := share.get('share_text')) is not None:
+                            share['share_text'] = _decode(share_text)
+
                     photos = jm.get('photos')
                     videos = jm.get('videos')
                     cc = share or photos or videos
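Note: the two replace() calls cover the query parameter in both positions: followed by another parameter (the trailing '&' is consumed) or as the only parameter (the leading '?' is consumed). A quick check with made-up links, wrapped in a hypothetical helper:

    def _normalise_link(link: str) -> str:
        # mirrors the two replacements in the commit
        link = link.replace('feed_type=reshare_chaining&', '')
        link = link.replace('?feed_type=reshare_chaining', '')
        return link

    assert _normalise_link('https://instagram.com/reel/abc/?feed_type=reshare_chaining&igsh=xyz') == 'https://instagram.com/reel/abc/?igsh=xyz'
    assert _normalise_link('https://instagram.com/reel/abc/?feed_type=reshare_chaining') == 'https://instagram.com/reel/abc/'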
@@ -175,7 +196,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
 # TODO basically copy pasted from android.py... hmm
 def messages() -> Iterator[Res[Message]]:
     id2user: Dict[str, User] = {}
-    for x in _entities():
+    for x in unique_everseen(_entities()):
         if isinstance(x, Exception):
             yield x
             continue
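Note: errors are yielded as Res values rather than raised (the isinstance check above), so a broken record in one archive doesn't abort the whole stream, and unique_everseen now also collapses the duplicates that processing every historic archive produces. A usage sketch, assuming the module is configured:

    from my.instagram.gdpr import messages

    good = []
    for m in messages():
        if isinstance(m, Exception):
            # skip broken records; the rest of the stream is still usable
            print('skipping:', m)
            continue
        good.append(m)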