"""
Instagram data (uses [[https://www.instagram.com/download/request][official GDPR export]])
"""
# NOTE: this module (my/instagram/gdpr.py) relies on the user config (my/config.py)
# exposing the following section next to the existing `instagram.android` one:
#
#     class instagram:
#         class gdpr:
#             export_path: Paths

import json
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path, PurePosixPath
from typing import Any, Dict, Iterator, Sequence, Union
from zipfile import ZipFile

from more_itertools import bucket

from my.config import instagram as user_config

from ..core import Paths, get_files
from ..core.error import Res
from ..core.kompress import kopen
from ..core.structure import match_structure  # not used yet, see the TODO in _entities


@dataclass
class config(user_config.gdpr):
    # paths[s]/glob to the exported zip archives
    export_path: Paths
    # TODO later also support unpacked directories?


def inputs() -> Sequence[Path]:
    """Return the export archives selected by ``config.export_path``."""
    return get_files(config.export_path)


# TODO think about unifying with stuff from android.py
@dataclass(unsafe_hash=True)
class User:
    """A participant of a conversation (the account owner or the other side)."""
    id: str
    username: str
    full_name: str


@dataclass
class _BaseMessage:
    """Fields shared by the intermediate and the public message types."""
    # TODO id is missing?
    created: datetime
    text: str
    thread_id: str


@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
    """Intermediate message: the sender is referenced by id only."""
    user_id: str


@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
    """Public message: the sender is resolved to a User object."""
    user: User


# glob (relative to the archive root) of the per-thread message files
_MESSAGE_FILE_GLOB = 'messages/inbox/*/message_*.json'

# number of trailing characters of a thread directory name that are used as the
# other user's id; the part before them (minus one separator char) is the username
_THREAD_ID_LEN = 10


def _decode(s: str) -> str:
    """
    Re-interpret ``s`` by encoding it as latin-1 and decoding the bytes as UTF-8.

    Presumably the export stores UTF-8 text mis-read as latin-1 (the original
    author was not sure why). Raises UnicodeError if ``s`` does not round-trip.
    """
    # yeah... idk why they do that
    return s.encode('latin-1').decode('utf8')


def _read_json(archive: Path, member: str) -> Any:
    """Load the JSON document stored under ``member`` inside ``archive``."""
    with kopen(archive, member) as fo:
        return json.load(fo)


def _message_text(jm: Dict[str, Any]) -> str:
    """
    Extract the text of a single raw message.

    Uses the decoded 'content' when present, otherwise the string form of the
    first truthy of 'share' / 'photos' / 'videos'.
    Raises AssertionError (carrying the raw message) when none is available.
    """
    if 'content' in jm:
        return _decode(jm['content'])
    attachment = jm.get('share') or jm.get('photos') or jm.get('videos')
    if attachment is None:
        # TODO sometimes messages are just missing content?? even with Generic type
        # explicit raise (rather than `assert`) so the check survives `python -O`
        raise AssertionError(jm)
    return str(attachment)


def _parse_message(
    jm: Dict[str, Any],
    *,
    thread_id: str,
    self_id: str,
    other_id: str,
    other_full_name: str,
) -> _Message:
    """
    Convert one raw message dict from a thread file into a _Message.

    The sender is attributed to the other user when 'sender_name' equals the
    thread title, and to the account owner otherwise.
    Raises on malformed input (missing keys, undecodable text, no content).
    """
    # Generic/Share? -- the value is unused, but a missing key is still reported
    jm['type']
    content = _message_text(jm)
    timestamp_ms = jm['timestamp_ms']
    sender_name = _decode(jm['sender_name'])

    user_id = other_id if sender_name == other_full_name else self_id
    return _Message(
        # NOTE: naive datetime in the local timezone
        created=datetime.fromtimestamp(timestamp_ms / 1000),
        text=content,
        user_id=user_id,
        thread_id=thread_id,  # meh.. but no better way?
    )


def _entities() -> Iterator[Res[Union[User, _Message]]]:
    """
    Yield users and messages found in the most recent export archive.

    Only ``max(inputs())`` is processed. The account owner is yielded first,
    then for every thread file the other user followed by that file's messages.
    A message that fails to parse is yielded as the exception instead of
    aborting the iteration; problems with the archive itself are raised.
    """
    last = max(inputs())

    j = _read_json(last, 'account_information/personal_information.json')
    [profile] = j['profile_user']
    pdata = profile['string_map_data']
    username = pdata['Username']['value']
    full_name = _decode(pdata['Name']['value'])

    # just make up something :shrug:
    self_id = username
    yield User(
        id=self_id,
        username=username,
        full_name=full_name,
    )

    # TODO maybe move it to kompress/match_structure?
    # would be nice to support it without unpacking
    # I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
    # https://github.com/karlicoss/HPI/pull/175
    # whereas here I don't need it..
    # so for now will just implement this adhoc thing and think about properly fixing later

    # only the listing is needed, so close the archive right away
    with ZipFile(last) as z:
        names = z.namelist()
    # zip member names always use '/', hence PurePosixPath rather than the OS-native Path
    files = [p for p in map(PurePosixPath, names) if p.match(_MESSAGE_FILE_GLOB)]
    if not files:
        # explicit raise (rather than `assert`) so the check survives `python -O`
        raise AssertionError(last)

    # group message files by thread directory (the component right above the file)
    buckets = bucket(files, key=lambda p: p.parent.name)
    file_map = {k: list(buckets[k]) for k in buckets}

    for fname, ffiles in file_map.items():
        # NOTE: no match in android db/api responses?
        other_id = fname[-_THREAD_ID_LEN:]
        # NOTE: no match in android db?
        other_username = fname[:-_THREAD_ID_LEN - 1]

        # sort by file number (.../message_N.json)
        for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
            j = _read_json(last, str(ffile))

            other_full_name = _decode(j['title'])
            yield User(
                id=other_id,
                username=other_username,
                full_name=other_full_name,
            )

            # todo "thread_type": "Regular" ?
            for jm in j['messages']:
                # todo defensive?
                try:
                    message = _parse_message(
                        jm,
                        thread_id=fname,
                        self_id=self_id,
                        other_id=other_id,
                        other_full_name=other_full_name,
                    )
                except Exception as e:
                    # deliberately broad: one bad message must not stop the export
                    yield e
                else:
                    yield message


# TODO basically copy pasted from android.py... hmm
def messages() -> Iterator[Res[Message]]:
    """
    Yield messages from the export with their sender resolved to a User.

    Errors coming from _entities() are passed through; a message whose sender
    id has not been seen yet is yielded as the corresponding KeyError.
    """
    id2user: Dict[str, User] = {}
    for x in _entities():
        if isinstance(x, Exception):
            yield x
        elif isinstance(x, User):
            id2user[x.id] = x
        elif isinstance(x, _Message):
            try:
                user = id2user[x.user_id]
            except KeyError as e:
                yield e
                continue
            yield Message(
                created=x.created,
                text=x.text,
                thread_id=x.thread_id,
                user=user,
            )
        else:
            # should not happen
            raise AssertionError(type(x))