instagram: initial module for GDPR export
still somewhat WIP, unclear how to correlate it with android data
This commit is contained in:
parent
0e891a267f
commit
1e635502a2
2 changed files with 170 additions and 0 deletions
|
@ -110,3 +110,5 @@ class bumble:
|
|||
class instagram:
    # presumably read by the android-based instagram module (not visible here) -- TODO confirm
    class android:
        export_path: Paths

    # read by my.instagram.gdpr: path[s]/glob to the exported zip archives
    class gdpr:
        export_path: Paths
|
||||
|
|
168
my/instagram/gdpr.py
Normal file
168
my/instagram/gdpr.py
Normal file
|
@ -0,0 +1,168 @@
|
|||
"""
|
||||
Instagram data (uses [[https://www.instagram.com/download/request][official GDPR export]])
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Iterator, Any, Sequence, Dict
|
||||
|
||||
from my.config import instagram as user_config
|
||||
|
||||
from more_itertools import bucket
|
||||
|
||||
from ..core import Paths
|
||||
@dataclass
class config(user_config.gdpr):
    # Settings for this module, layered on top of the user's `instagram.gdpr` config section.

    # path[s]/glob to the exported zip archives
    export_path: Paths

    # TODO later also support unpacked directories?
|
||||
|
||||
|
||||
from ..core import get_files
|
||||
from pathlib import Path
|
||||
def inputs() -> Sequence[Path]:
    """Resolve ``config.export_path`` into the matching export files via ``get_files``."""
    export_path = config.export_path
    return get_files(export_path)
|
||||
|
||||
|
||||
# TODO think about unifying with stuff from android.py
|
||||
@dataclass(unsafe_hash=True)
class User:
    """An Instagram account taking part in a conversation (hashable, compared by value)."""

    # key used to link a message to its sender
    id: str
    username: str
    # human-readable name, as opposed to the account handle above
    full_name: str
|
||||
|
||||
|
||||
@dataclass
class _BaseMessage:
    """Fields shared by the raw (``_Message``) and the resolved (``Message``) message types."""

    # TODO id is missing?

    # when the message was sent
    created: datetime
    # message text, or a stringified attachment payload when there is no text
    text: str
    # name of the conversation the message belongs to
    thread_id: str
|
||||
|
||||
|
||||
@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
    """Message as parsed from the export: the sender is known only by id."""

    # matches `User.id` of the sender
    user_id: str
|
||||
|
||||
|
||||
@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
    """Message whose sender has been resolved to a full ``User``."""

    # the sender
    user: User
|
||||
|
||||
|
||||
def _decode(s: str) -> str:
    """
    Repair a string from the export: re-encode it as Latin-1 and decode those bytes as UTF-8.

    Raises ``UnicodeEncodeError`` / ``UnicodeDecodeError`` if ``s`` is not in that form.
    """
    # yeah... idk why they do that
    raw = s.encode('latin-1')
    return raw.decode('utf8')
|
||||
|
||||
|
||||
import json
|
||||
from typing import Union
|
||||
from ..core.kompress import kopen
|
||||
from ..core.error import Res
|
||||
from ..core.structure import match_structure
|
||||
def _entities() -> Iterator[Res[Union[User, _Message]]]:
    """
    Yield the users and raw messages found in the most recent export archive.

    Only ``max(inputs())`` is read. The archive owner is yielded first as a
    ``User``. Then, for every ``messages/inbox/<thread>/message_<N>.json`` file
    (threads in archive order, files by ascending ``<N>``), the other
    participant is yielded as a ``User``, followed by one ``_Message`` per
    entry of that file. A message that fails to parse is yielded as the
    exception itself instead of aborting the iteration.

    Raises:
        RuntimeError: if the archive contains no inbox message files.
    """
    # local import, as in the original code
    from zipfile import ZipFile

    last = max(inputs())

    with kopen(last, 'account_information/personal_information.json') as fo:
        j = json.load(fo)
    [profile] = j['profile_user']
    pdata = profile['string_map_data']
    username = pdata['Username']['value']
    full_name = _decode(pdata['Name']['value'])

    # just make up something :shrug:
    self_id = username
    self_user = User(
        id=self_id,
        username=username,
        full_name=full_name,
    )
    yield self_user

    # TODO maybe move it to kompress/match_structure?
    # would be nice to support it without unpacking
    # I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
    # https://github.com/karlicoss/HPI/pull/175
    # whereas here I don't need it..
    # so for now will just implement this adhoc thing and think about properly fixing later

    # only the listing is needed, so the archive handle is closed right away
    with ZipFile(last) as z:
        names = z.namelist()
    files = [Path(p) for p in names if Path(p).match('messages/inbox/*/message_*.json')]
    if not files:
        # explicit raise rather than `assert`, so the check survives `python -O`
        raise RuntimeError(f'no messages/inbox/*/message_*.json files in {last}')

    # group the files by thread directory: messages/inbox/<thread>/...
    buckets = bucket(files, key=lambda p: p.parts[2])
    file_map = {k: list(buckets[k]) for k in buckets}

    # the thread directory name is treated as '<username>_<id>', the id being the last 10 characters
    id_len = 10
    for fname, ffiles in file_map.items():
        # NOTE: no match in android db/api responses?
        other_id = fname[-id_len:]
        # NOTE: no match in android db?
        other_username = fname[:-id_len - 1]

        # sort by file number (.../message_<number>.json)
        for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
            with kopen(last, str(ffile)) as fo:
                j = json.load(fo)

            other_full_name = _decode(j['title'])
            yield User(
                id=other_id,
                username=other_username,
                full_name=other_full_name,
            )

            # todo "thread_type": "Regular" ?
            for jm in j['messages']:
                # todo defensive?
                try:
                    # unused for now (Generic/Share?); kept so that a message
                    # without a type is still reported as an error
                    mtype = jm['type']
                    if 'content' in jm:
                        content = _decode(jm['content'])
                    else:
                        # no text: fall back to the first non-empty attachment payload
                        cc = jm.get('share') or jm.get('photos') or jm.get('videos')
                        content = None if cc is None else str(cc)
                    if content is None:
                        # explicit raise rather than `assert`, so the check survives `python -O`
                        raise RuntimeError(f'message has no content: {jm}')
                    timestamp_ms = jm['timestamp_ms']
                    sender_name = _decode(jm['sender_name'])

                    # anything not sent under the thread title's name is attributed to the archive owner
                    user_id = other_id if sender_name == other_full_name else self_id
                    yield _Message(
                        # NOTE: naive datetime in the local timezone
                        created=datetime.fromtimestamp(timestamp_ms / 1000),
                        text=content,
                        user_id=user_id,
                        thread_id=fname,  # meh.. but no better way?
                    )
                except Exception as e:
                    # TODO sometimes messages are just missing content?? even with Generic type
                    yield e
|
||||
|
||||
|
||||
# TODO basically copy pasted from android.py... hmm
|
||||
def messages() -> Iterator[Res[Message]]:
    """
    Yield the messages of the export with their sender resolved to a ``User``.

    Consumes ``_entities()``: every ``User`` is remembered by its id, and every
    raw ``_Message`` is turned into a ``Message`` carrying the matching
    ``User``. Errors coming from ``_entities()`` are passed through unchanged;
    a message whose ``user_id`` has not been seen yet is reported as a yielded
    ``KeyError``.

    Raises:
        AssertionError: if ``_entities()`` produces an item of an unexpected type.
    """
    id2user: Dict[str, User] = {}
    for x in _entities():
        if isinstance(x, Exception):
            yield x
        elif isinstance(x, User):
            id2user[x.id] = x
        elif isinstance(x, _Message):
            try:
                user = id2user[x.user_id]
            except KeyError as e:
                # the sender was not among the users seen so far
                yield e
                continue
            yield Message(
                created=x.created,
                text=x.text,
                thread_id=x.thread_id,
                user=user,
            )
        else:
            # should not happen; raised explicitly so that an unknown entity
            # is not silently skipped under `python -O`
            raise AssertionError(type(x))
|
Loading…
Add table
Reference in a new issue