fbmessenger: process Android app data
for now, no merging, will figure it out later
This commit is contained in:
parent
fcd7ca6480
commit
403ca9c111
2 changed files with 175 additions and 0 deletions
|
@ -117,3 +117,8 @@ class instagram:
|
||||||
class hackernews:
|
class hackernews:
|
||||||
class dogsheep:
|
class dogsheep:
|
||||||
export_path: Paths
|
export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
|
class fbmessenger:
    # settings for the Android app data source (consumed by my/fbmessenger/android.py)
    class android:
        # path[s]/glob to the exported sqlite databases
        export_path: Paths
|
||||||
|
|
170
my/fbmessenger/android.py
Normal file
170
my/fbmessenger/android.py
Normal file
|
@ -0,0 +1,170 @@
|
||||||
|
"""
|
||||||
|
Messenger data from Android app database (in =/data/data/com.facebook.orca/databases/threads_db2=)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Iterator, Sequence, Optional, Dict
|
||||||
|
|
||||||
|
|
||||||
|
from my.config import fbmessenger as user_config
|
||||||
|
|
||||||
|
|
||||||
|
from ..core import Paths
|
||||||
|
@dataclass
class config(user_config.android):
    """Module configuration, layered on top of the user's ``fbmessenger.android`` config."""

    # path[s]/glob to the exported sqlite databases
    export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
|
from ..core import get_files
|
||||||
|
from pathlib import Path
|
||||||
|
def inputs() -> Sequence[Path]:
    """Return the files that ``get_files`` resolves from ``config.export_path``."""
    export_path = config.export_path
    return get_files(export_path)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True)
class Sender:
    """A message author; hashable so duplicates can be dropped and used as set/dict members."""

    # user id: the message's sender 'user_key' with the 'FACEBOOK:' prefix stripped
    id: str
    # name taken from the message's sender JSON
    name: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True)
class Thread:
    """A conversation; hashable so duplicates can be dropped and used as set/dict members."""

    # second ':'-separated component of the database 'thread_key'
    id: str
    # thread name from the database; may be absent
    name: Optional[str]
|
||||||
|
|
||||||
|
# todo not sure about order of fields...
|
||||||
|
@dataclass
class _BaseMessage:
    """Fields shared by the raw (_Message) and the resolved (Message) representation."""

    # todo nice, ids are same as in fbchat??
    id: str
    # message timestamp
    dt: datetime
    # is_incoming: bool
    # message text; may be absent
    text: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
    """Raw message: refers to its thread, sender and replied-to message by id only."""

    # id of the containing thread (see Thread.id)
    thread_id: str
    # id of the author (see Sender.id)
    sender_id: str
    # id of the message this one replies to, if any
    reply_to_id: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
    """Resolved message: thread, sender and replied-to message are objects rather than ids."""

    # conversation the message belongs to
    thread: Thread
    # author of the message
    sender: Sender
    # message this one replies to, if any
    reply_to: Optional[Message]
|
||||||
|
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import Union
|
||||||
|
from ..core.error import Res
|
||||||
|
from ..core.dataset import connect_readonly
|
||||||
|
Entity = Union[Sender, Thread, _Message]
|
||||||
|
def _entities() -> Iterator[Res[Entity]]:
    """Yield the entities (or errors) of every input database, one input after another."""
    for db_path in inputs():
        # the connection is only held while this particular database is being read
        with connect_readonly(db_path) as database:
            yield from _process_db(database)
|
||||||
|
|
||||||
|
|
||||||
|
def _process_db(db) -> Iterator[Res[Entity]]:
    """
    Extract raw entities from a single database.

    Yields a Thread for every row of the 'threads' table. Then, for every row of
    the 'messages' table (ordered by 'timestamp_ms'), yields the row's Sender
    followed by the _Message itself. Rows with msg_type == -1 are skipped.

    A failure while converting a row is yielded as the exception instead of being
    raised, so a single bad row doesn't abort processing of the rest of the database.
    If the sender of a message row can't be parsed, the message itself is skipped too.
    """
    def threadkey2id(key: str) -> str:
        # works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user
        return key.split(':')[1]

    for r in db['threads']:
        try:
            thread = Thread(
                id=threadkey2id(r['thread_key']),
                name=r['name'],
            )
        except Exception as e:
            yield e
        else:
            yield thread

    prefix = 'FACEBOOK:'
    for r in db['messages'].all(order_by='timestamp_ms'):
        if r['msg_type'] == -1:
            # likely immediately deleted or something? doesn't have any data at all
            continue

        try:
            # todo could use thread_users?
            sj = json.loads(r['sender'])
            ukey = sj['user_key']
            if not ukey.startswith(prefix):
                # raised explicitly rather than via `assert`, so the check is kept under `python -O`
                # (same exception type and payload as the assert it replaces)
                raise AssertionError(ukey)
            user_id = ukey[len(prefix):]
            sender = Sender(
                id=user_id,
                name=sj['name'],
            )
        except Exception as e:
            yield e
            continue
        yield sender

        try:
            thread_id = threadkey2id(r['thread_key'])
            message = _Message(
                id=r['msg_id'],
                # NOTE: fromtimestamp without tz gives a naive datetime in the local timezone
                dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000),
                # is_incoming=False, TODO??
                text=r['text'],
                thread_id=thread_id,
                sender_id=user_id,
                reply_to_id=r['message_replied_to_id'],
            )
        except Exception as e:
            yield e
            continue
        yield message
|
||||||
|
|
||||||
|
|
||||||
|
from more_itertools import unique_everseen
|
||||||
|
def messages() -> Iterator[Res[Message]]:
    """
    Yield resolved messages from all inputs.

    Entities coming from ``_entities()`` are deduplicated with ``unique_everseen``;
    senders and threads are collected into lookup tables, and each raw ``_Message``
    is turned into a ``Message`` with its ids replaced by the corresponding objects.

    Errors are yielded rather than raised: both the ones passed through from
    ``_entities()`` and lookup failures for a message whose sender, thread or
    replied-to message hasn't been seen (such a message is not yielded).

    Raises:
        AssertionError: if ``_entities()`` produces an object of an unexpected type.
    """
    senders: Dict[str, Sender] = {}
    msgs: Dict[str, Message] = {}
    threads: Dict[str, Thread] = {}
    for x in unique_everseen(_entities()):
        if isinstance(x, Exception):
            yield x
        elif isinstance(x, Sender):
            senders[x.id] = x
        elif isinstance(x, Thread):
            threads[x.id] = x
        elif isinstance(x, _Message):
            reply_to_id = x.reply_to_id
            try:
                sender = senders[x.sender_id]
                # hmm, reply_to may be missing due to the synthetic nature of export
                # also would be interesting to merge together entities rather than resulting messages from different sources..
                # then the merging thing could be moved to common?
                reply_to = None if reply_to_id is None else msgs[reply_to_id]
                thread = threads[x.thread_id]
            except Exception as e:
                yield e
                continue
            m = Message(
                id=x.id,
                dt=x.dt,
                text=x.text,
                thread=thread,
                sender=sender,
                reply_to=reply_to,
            )
            # remember the message so that later replies can refer to it
            msgs[m.id] = m
            yield m
        else:
            # should be unreachable; raised explicitly because `assert False` is removed under `python -O`
            raise AssertionError(type(x))
|
Loading…
Add table
Reference in a new issue