HPI/my/instagram/gdpr.py

"""
Instagram data (uses [[https://www.instagram.com/download/request][official GDPR export]])
"""

from dataclasses import dataclass
from datetime import datetime
import json
from pathlib import Path
from typing import Iterator, Sequence, Dict, Union

from more_itertools import bucket

from my.core import (
    get_files,
    Paths,
    datetime_naive,
    Res,
    assert_never,
    make_logger,
)
from my.core.common import unique_everseen

from my.config import instagram as user_config


logger = make_logger(__name__)


@dataclass
class config(user_config.gdpr):
    # paths[s]/glob to the exported zip archives
    export_path: Paths
    # TODO later also support unpacked directories?


def inputs() -> Sequence[Path]:
    return get_files(config.export_path)


# TODO think about unifying with stuff from android.py
@dataclass(unsafe_hash=True)
class User:
    id: str
    username: str
    full_name: str


@dataclass
class _BaseMessage:
    # ugh, this is insane, but does look like it's just keeping local device time???
    # checked against a message sent on 3 June, which should be UTC+1, but timestamp seems local
    created: datetime_naive
    text: str
    thread_id: str
    # NOTE: doesn't look like there aren't any meaningful message ids in the export


@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
    user_id: str


@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
    user: User


def _decode(s: str) -> str:
    # yeah... idk why they do that
    return s.encode('latin-1').decode('utf8')


def _entities() -> Iterator[Res[Union[User, _Message]]]:
    # it's worth processing all previous export -- sometimes instagram removes some metadata from newer ones
    # NOTE: here there are basically two options
    # - process inputs as is (from oldest to newest)
    #   this would be more stable wrt newer exports (e.g. existing thread ids won't change)
    #   the downside is that newer exports seem to have better thread ids, so might be preferrable to use them
    # - process inputs reversed (from newest to oldest)
    #   the upside is that thread ids/usernames might be better
    #   the downside is that if for example the user renames, thread ids will change _a lot_, might be undesirable..
    # (from newest to oldest)
    for path in inputs():
        yield from _entitites_from_path(path)


def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]:
    # TODO make sure it works both with plan directory
    # idelaly get_files should return the right thing, and we won't have to force ZipPath/match_structure here
    # e.g. possible options are:
    # - if packed things are detected, just return ZipPath
    # - if packed things are detected, possibly return match_structure_wrapper
    #   it might be a bit tricky because it's a context manager -- who will recycle it?
    # - if unpacked things are detected, just return the dir as it is
    #   (possibly detect them via match_structure? e.g. what if we have a bunch of unpacked dirs)
    #
    # I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
    # https://github.com/karlicoss/HPI/pull/175
    # whereas here I don't need it..
    # so for now will just implement this adhoc thing and think about properly fixing later

    personal_info = path / 'personal_information'
    if not personal_info.exists():
        # old path, used up to somewhere between feb-aug 2022
        personal_info = path / 'account_information'

    personal_info_json = personal_info / 'personal_information.json'
    if not personal_info_json.exists():
        # new path, started somewhere around april 2024
        personal_info_json = personal_info / 'personal_information' / 'personal_information.json'

    j = json.loads(personal_info_json.read_text())
    [profile] = j['profile_user']
    pdata = profile['string_map_data']
    username = pdata['Username']['value']
    full_name = _decode(pdata['Name']['value'])

    # just make up something :shrug:
    self_id = username
    self_user = User(
        id=self_id,
        username=username,
        full_name=full_name,
    )
    yield self_user

    files = list(path.rglob('messages/inbox/*/message_*.json'))
    assert len(files) > 0, path

    buckets = bucket(files, key=lambda p: p.parts[-2])
    file_map = {k: list(buckets[k]) for k in buckets}

    for fname, ffiles in file_map.items():
        for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
            logger.info(f'{ffile} : processing...')
            j = json.loads(ffile.read_text())

            id_len = 10
            # NOTE: I'm not actually sure it's other user's id.., since it corresponds to the whole converstation
            # but I stared a bit at these ids vs database ids and can't see any way to find the correspondence :(
            # so basically the only way to merge is to actually try some magic and correlate timestamps/message texts?
            # another option is perhaps to query user id from username with some free API
            # it's still fragile: e.g. if user deletes themselves there is no more username (it becomes "instagramuser")
            # if we use older exports we might be able to figure it out though... so think about it?
            # it also names grouped ones like instagramuserchrisfoodishblogand25others_einihreoog
            # so I feel like there is just not guaranteed way to correlate :(
            other_id = fname[-id_len:]
            # NOTE: no match in android db?
            other_username = fname[: -id_len - 1]
            other_full_name = _decode(j['title'])
            yield User(
                id=other_id,
                username=other_username,
                full_name=other_full_name,
            )

            # todo "thread_type": "Regular" ?
            for jm in reversed(j['messages']):  # in json, they are in reverse order for some reason
                try:
                    content = None
                    if 'content' in jm:
                        content = _decode(jm['content'])
                        if content.endswith(' to your message '):
                            # ugh. for some reason these contain an extra space and that messes up message merging..
                            content = content.strip()
                    else:
                        if (share := jm.get('share')) is not None:
                            if (share_link := share.get('link')) is not None:
                                # somewhere around 20231007, instagram removed these from gdpr links and they show up a lot in various diffs
                                share_link = share_link.replace('feed_type=reshare_chaining&', '')
                                share_link = share_link.replace('?feed_type=reshare_chaining', '')
                                share['link'] = share_link
                            if (share_text := share.get('share_text')) is not None:
                                share['share_text'] = _decode(share_text)

                        photos = jm.get('photos')
                        videos = jm.get('videos')
                        cc = share or photos or videos
                        if cc is not None:
                            content = str(cc)

                    if content is None:
                        # this happens e.g. on reel shares..
                        # not sure what we can do properly, GPDR has literally no other info in this case
                        # on android in this case at the moment we have as content ''
                        # so for consistency let's do that too
                        content = ''

                    timestamp_ms = jm['timestamp_ms']
                    sender_name = _decode(jm['sender_name'])

                    user_id = other_id if sender_name == other_full_name else self_id
                    yield _Message(
                        created=datetime.fromtimestamp(timestamp_ms / 1000),
                        text=content,
                        user_id=user_id,
                        thread_id=fname,  # meh.. but no better way?
                    )
                except Exception as e:
                    yield e


# TODO basically copy pasted from android.py... hmm
def messages() -> Iterator[Res[Message]]:
    id2user: Dict[str, User] = {}
    for x in unique_everseen(_entities):
        if isinstance(x, Exception):
            yield x
            continue
        if isinstance(x, User):
            id2user[x.id] = x
            continue
        if isinstance(x, _Message):
            try:
                user = id2user[x.user_id]
            except Exception as e:
                yield e
                continue
            yield Message(
                created=x.created,
                text=x.text,
                thread_id=x.thread_id,
                user=user,
            )
            continue
        assert_never(x)