226 lines
8.8 KiB
Python
226 lines
8.8 KiB
Python
"""
|
|
Instagram data (uses [[https://www.instagram.com/download/request][official GDPR export]])
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Iterator, Sequence, Dict, Union
|
|
|
|
from more_itertools import bucket
|
|
|
|
from my.core import (
|
|
get_files,
|
|
Paths,
|
|
datetime_naive,
|
|
Res,
|
|
assert_never,
|
|
make_logger,
|
|
)
|
|
from my.core.common import unique_everseen
|
|
|
|
from my.config import instagram as user_config
|
|
|
|
|
|
logger = make_logger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class config(user_config.gdpr):
|
|
# paths[s]/glob to the exported zip archives
|
|
export_path: Paths
|
|
# TODO later also support unpacked directories?
|
|
|
|
|
|
def inputs() -> Sequence[Path]:
|
|
return get_files(config.export_path)
|
|
|
|
|
|
# TODO think about unifying with stuff from android.py
|
|
@dataclass(unsafe_hash=True)
|
|
class User:
|
|
id: str
|
|
username: str
|
|
full_name: str
|
|
|
|
|
|
@dataclass
|
|
class _BaseMessage:
|
|
# ugh, this is insane, but does look like it's just keeping local device time???
|
|
# checked against a message sent on 3 June, which should be UTC+1, but timestamp seems local
|
|
created: datetime_naive
|
|
text: str
|
|
thread_id: str
|
|
# NOTE: doesn't look like there aren't any meaningful message ids in the export
|
|
|
|
|
|
@dataclass(unsafe_hash=True)
|
|
class _Message(_BaseMessage):
|
|
user_id: str
|
|
|
|
|
|
@dataclass(unsafe_hash=True)
|
|
class Message(_BaseMessage):
|
|
user: User
|
|
|
|
|
|
def _decode(s: str) -> str:
|
|
# yeah... idk why they do that
|
|
return s.encode('latin-1').decode('utf8')
|
|
|
|
|
|
def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
|
# it's worth processing all previous export -- sometimes instagram removes some metadata from newer ones
|
|
# NOTE: here there are basically two options
|
|
# - process inputs as is (from oldest to newest)
|
|
# this would be more stable wrt newer exports (e.g. existing thread ids won't change)
|
|
# the downside is that newer exports seem to have better thread ids, so might be preferrable to use them
|
|
# - process inputs reversed (from newest to oldest)
|
|
# the upside is that thread ids/usernames might be better
|
|
# the downside is that if for example the user renames, thread ids will change _a lot_, might be undesirable..
|
|
# (from newest to oldest)
|
|
for path in inputs():
|
|
yield from _entitites_from_path(path)
|
|
|
|
|
|
def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]:
|
|
# TODO make sure it works both with plan directory
|
|
# idelaly get_files should return the right thing, and we won't have to force ZipPath/match_structure here
|
|
# e.g. possible options are:
|
|
# - if packed things are detected, just return ZipPath
|
|
# - if packed things are detected, possibly return match_structure_wrapper
|
|
# it might be a bit tricky because it's a context manager -- who will recycle it?
|
|
# - if unpacked things are detected, just return the dir as it is
|
|
# (possibly detect them via match_structure? e.g. what if we have a bunch of unpacked dirs)
|
|
#
|
|
# I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
|
|
# https://github.com/karlicoss/HPI/pull/175
|
|
# whereas here I don't need it..
|
|
# so for now will just implement this adhoc thing and think about properly fixing later
|
|
|
|
personal_info = path / 'personal_information'
|
|
if not personal_info.exists():
|
|
# old path, used up to somewhere between feb-aug 2022
|
|
personal_info = path / 'account_information'
|
|
|
|
personal_info_json = personal_info / 'personal_information.json'
|
|
if not personal_info_json.exists():
|
|
# new path, started somewhere around april 2024
|
|
personal_info_json = personal_info / 'personal_information' / 'personal_information.json'
|
|
|
|
j = json.loads(personal_info_json.read_text())
|
|
[profile] = j['profile_user']
|
|
pdata = profile['string_map_data']
|
|
username = pdata['Username']['value']
|
|
full_name = _decode(pdata['Name']['value'])
|
|
|
|
# just make up something :shrug:
|
|
self_id = username
|
|
self_user = User(
|
|
id=self_id,
|
|
username=username,
|
|
full_name=full_name,
|
|
)
|
|
yield self_user
|
|
|
|
files = list(path.rglob('messages/inbox/*/message_*.json'))
|
|
assert len(files) > 0, path
|
|
|
|
buckets = bucket(files, key=lambda p: p.parts[-2])
|
|
file_map = {k: list(buckets[k]) for k in buckets}
|
|
|
|
for fname, ffiles in file_map.items():
|
|
for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
|
|
logger.info(f'{ffile} : processing...')
|
|
j = json.loads(ffile.read_text())
|
|
|
|
id_len = 10
|
|
# NOTE: I'm not actually sure it's other user's id.., since it corresponds to the whole converstation
|
|
# but I stared a bit at these ids vs database ids and can't see any way to find the correspondence :(
|
|
# so basically the only way to merge is to actually try some magic and correlate timestamps/message texts?
|
|
# another option is perhaps to query user id from username with some free API
|
|
# it's still fragile: e.g. if user deletes themselves there is no more username (it becomes "instagramuser")
|
|
# if we use older exports we might be able to figure it out though... so think about it?
|
|
# it also names grouped ones like instagramuserchrisfoodishblogand25others_einihreoog
|
|
# so I feel like there is just not guaranteed way to correlate :(
|
|
other_id = fname[-id_len:]
|
|
# NOTE: no match in android db?
|
|
other_username = fname[: -id_len - 1]
|
|
other_full_name = _decode(j['title'])
|
|
yield User(
|
|
id=other_id,
|
|
username=other_username,
|
|
full_name=other_full_name,
|
|
)
|
|
|
|
# todo "thread_type": "Regular" ?
|
|
for jm in reversed(j['messages']): # in json, they are in reverse order for some reason
|
|
try:
|
|
content = None
|
|
if 'content' in jm:
|
|
content = _decode(jm['content'])
|
|
if content.endswith(' to your message '):
|
|
# ugh. for some reason these contain an extra space and that messes up message merging..
|
|
content = content.strip()
|
|
else:
|
|
if (share := jm.get('share')) is not None:
|
|
if (share_link := share.get('link')) is not None:
|
|
# somewhere around 20231007, instagram removed these from gdpr links and they show up a lot in various diffs
|
|
share_link = share_link.replace('feed_type=reshare_chaining&', '')
|
|
share_link = share_link.replace('?feed_type=reshare_chaining', '')
|
|
share['link'] = share_link
|
|
if (share_text := share.get('share_text')) is not None:
|
|
share['share_text'] = _decode(share_text)
|
|
|
|
photos = jm.get('photos')
|
|
videos = jm.get('videos')
|
|
cc = share or photos or videos
|
|
if cc is not None:
|
|
content = str(cc)
|
|
|
|
if content is None:
|
|
# this happens e.g. on reel shares..
|
|
# not sure what we can do properly, GPDR has literally no other info in this case
|
|
# on android in this case at the moment we have as content ''
|
|
# so for consistency let's do that too
|
|
content = ''
|
|
|
|
timestamp_ms = jm['timestamp_ms']
|
|
sender_name = _decode(jm['sender_name'])
|
|
|
|
user_id = other_id if sender_name == other_full_name else self_id
|
|
yield _Message(
|
|
created=datetime.fromtimestamp(timestamp_ms / 1000),
|
|
text=content,
|
|
user_id=user_id,
|
|
thread_id=fname, # meh.. but no better way?
|
|
)
|
|
except Exception as e:
|
|
yield e
|
|
|
|
|
|
# TODO basically copy pasted from android.py... hmm
|
|
def messages() -> Iterator[Res[Message]]:
|
|
id2user: Dict[str, User] = {}
|
|
for x in unique_everseen(_entities):
|
|
if isinstance(x, Exception):
|
|
yield x
|
|
continue
|
|
if isinstance(x, User):
|
|
id2user[x.id] = x
|
|
continue
|
|
if isinstance(x, _Message):
|
|
try:
|
|
user = id2user[x.user_id]
|
|
except Exception as e:
|
|
yield e
|
|
continue
|
|
yield Message(
|
|
created=x.created,
|
|
text=x.text,
|
|
thread_id=x.thread_id,
|
|
user=user,
|
|
)
|
|
continue
|
|
assert_never(x)
|