HPI/my/instagram/gdpr.py

"""
Instagram data (uses [[https://www.instagram.com/download/request][official GDPR export]])
"""
from dataclasses import dataclass
from datetime import datetime
import json
from pathlib import Path
from typing import Iterator, Sequence, Dict, Union

from more_itertools import bucket

from my.core import (
    get_files,
    Paths,
    datetime_naive,
    Res,
    assert_never,
    make_logger,
)
from my.core.kompress import ZipPath

from my.config import instagram as user_config


logger = make_logger(__name__)


@dataclass
class config(user_config.gdpr):
    # path[s]/glob to the exported zip archives
    export_path: Paths
    # TODO later also support unpacked directories?


def inputs() -> Sequence[Path]:
    return get_files(config.export_path)
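

# An illustrative sketch of the corresponding user config (class nesting follows the
# user_config.gdpr import above; the path itself is hypothetical -- adjust to your setup):
#
#     # in my.config
#     class instagram:
#         class gdpr:
#             export_path: Paths = '/path/to/instagram/exports/*.zip'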


# TODO think about unifying with stuff from android.py
@dataclass(unsafe_hash=True)
class User:
    id: str
    username: str
    full_name: str


@dataclass
class _BaseMessage:
    # ugh, this is insane, but does look like it's just keeping local device time???
    # checked against a message sent on 3 June, which should be UTC+1, but timestamp seems local
    created: datetime_naive
    text: str
    thread_id: str


# NOTE: doesn't look like there are any meaningful message ids in the export
@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
    user_id: str


@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
    user: User


def _decode(s: str) -> str:
    # yeah... idk why they do that
    return s.encode('latin-1').decode('utf8')
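# Illustrative example of the mojibake _decode reverses (made-up string, not from a real export):
#   _decode('caf\xc3\xa9') == 'café'
# i.e. the export stores UTF-8 bytes as if they were latin-1 text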


def _entities() -> Iterator[Res[Union[User, _Message]]]:
    last = ZipPath(max(inputs()))
    # TODO make sure it works both with plain directory
    # ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
    # e.g. possible options are:
    # - if packed things are detected, just return ZipPath
    # - if packed things are detected, possibly return match_structure_wrapper
    #   it might be a bit tricky because it's a context manager -- who will recycle it?
    # - if unpacked things are detected, just return the dir as it is
    #   (possibly detect them via match_structure? e.g. what if we have a bunch of unpacked dirs)
    #
    # I guess the goal for the core.structure module was to pass it to other functions that expect unpacked structure
    # https://github.com/karlicoss/HPI/pull/175
    # whereas here I don't need it..
    # so for now will just implement this adhoc thing and think about properly fixing it later

    personal_info = last / 'personal_information'
    if not personal_info.exists():
        # old path, used up to somewhere between feb-aug 2022
        personal_info = last / 'account_information'

    j = json.loads((personal_info / 'personal_information.json').read_text())
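    # rough shape of personal_information.json, inferred from the accesses below
    # (a sketch rather than a complete schema):
    #   {"profile_user": [{"string_map_data": {"Username": {"value": ...},
    #                                          "Name":     {"value": ...}}}]}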
    [profile] = j['profile_user']
    pdata = profile['string_map_data']
    username = pdata['Username']['value']
    full_name = _decode(pdata['Name']['value'])

    # just make up something :shrug:
    self_id = username
    self_user = User(
        id=self_id,
        username=username,
        full_name=full_name,
    )
    yield self_user

    files = list(last.rglob('messages/inbox/*/message_*.json'))
    assert len(files) > 0, last

    buckets = bucket(files, key=lambda p: p.parts[-2])
    file_map = {k: list(buckets[k]) for k in buckets}
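    # NOTE: p.parts[-2] is the thread directory name (the '*' in the glob above),
    # so file_map groups the chunked message_1.json, message_2.json, ... per thread,
    # e.g. (hypothetical): {'somethread_1234567890': [<message_1.json>, <message_2.json>]}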

    for fname, ffiles in file_map.items():
        for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
            logger.info(f'{ffile} : processing...')
            j = json.loads(ffile.read_text())
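            # rough shape of message_*.json, inferred from the fields used below (a sketch, not a schema):
            #   {"title": ..., "thread_type": "Regular",
            #    "messages": [{"sender_name": ..., "timestamp_ms": ...,
            #                  "content"/"share"/"photos"/"videos": ...}, ...]}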

            id_len = 10
            # NOTE: I'm not actually sure it's the other user's id.., since it corresponds to the whole conversation
            # but I stared a bit at these ids vs database ids and can't see any way to find the correspondence :(
            # so basically the only way to merge is to actually try some magic and correlate timestamps/message texts?
            # another option is perhaps to query user id from username with some free API
            # it's still fragile: e.g. if the user deletes themselves there is no more username (it becomes "instagramuser")
            # if we use older exports we might be able to figure it out though... so think about it?
            # it also names grouped ones like instagramuserchrisfoodishblogand25others_einihreoog
            # so I feel like there is just no guaranteed way to correlate :(
            other_id = fname[-id_len:]
            # NOTE: no match in android db?
            other_username = fname[:-id_len - 1]
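            # e.g. for the grouped thread mentioned above, fname 'instagramuserchrisfoodishblogand25others_einihreoog'
            # splits into other_username='instagramuserchrisfoodishblogand25others' and other_id='einihreoog'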
            other_full_name = _decode(j['title'])
            yield User(
                id=other_id,
                username=other_username,
                full_name=other_full_name,
            )

            # todo "thread_type": "Regular" ?
            for jm in j['messages']:
                try:
                    content = None
                    if 'content' in jm:
                        content = _decode(jm['content'])
                        if content.endswith(' to your message '):
                            # ugh. for some reason these contain an extra space and that messes up message merging..
                            content = content.strip()
                    else:
                        share = jm.get('share')
                        photos = jm.get('photos')
                        videos = jm.get('videos')
                        cc = share or photos or videos
                        if cc is not None:
                            content = str(cc)
                    if content is None:
                        # this happens e.g. on reel shares..
                        # not sure what we can do properly, the GDPR export has literally no other info in this case
                        # on android in this case at the moment we have '' as content
                        # so for consistency let's do that too
                        content = ''
                    timestamp_ms = jm['timestamp_ms']
                    sender_name = _decode(jm['sender_name'])
                    user_id = other_id if sender_name == other_full_name else self_id
                    yield _Message(
                        created=datetime.fromtimestamp(timestamp_ms / 1000),
                        text=content,
                        user_id=user_id,
                        thread_id=fname,  # meh.. but no better way?
                    )
                except Exception as e:
                    yield e


# TODO basically copy pasted from android.py... hmm
def messages() -> Iterator[Res[Message]]:
    id2user: Dict[str, User] = {}
    for x in _entities():
        if isinstance(x, Exception):
            yield x
            continue
        if isinstance(x, User):
            id2user[x.id] = x
            continue
        if isinstance(x, _Message):
            try:
                user = id2user[x.user_id]
            except Exception as e:
                yield e
                continue
            yield Message(
                created=x.created,
                text=x.text,
                thread_id=x.thread_id,
                user=user,
            )
            continue
        assert_never(x)
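

# A minimal usage sketch (illustrative only, not part of the module itself):
#
#     from my.instagram.gdpr import messages
#     for m in messages():
#         if isinstance(m, Exception):
#             continue  # errors are yielded inline rather than raised
#         print(m.created, m.user.username, m.text)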