HPI/my/whatsapp/android.py
2024-11-26 21:51:40 +00:00

238 lines
7.1 KiB
Python

"""
Whatsapp data from Android app database (in =/data/data/com.whatsapp/databases/msgstore.db=)
"""
from __future__ import annotations
import sqlite3
from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Union
from my.core import Paths, Res, datetime_aware, get_files, make_config, make_logger
from my.core.common import unique_everseen
from my.core.error import echain, notnone
from my.core.sqlite import sqlite_connection
import my.config # isort: skip
logger = make_logger(__name__)
@dataclass
class Config(my.config.whatsapp.android):
# paths[s]/glob to the exported sqlite databases
export_path: Paths
my_user_id: str | None = None
config = make_config(Config)
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
@dataclass(unsafe_hash=True)
class Chat:
id: str
# todo not sure how to support renames?
# could change Chat object itself, but this won't work well with incremental processing..
name: str | None
@dataclass(unsafe_hash=True)
class Sender:
id: str
name: str | None
@dataclass(unsafe_hash=True)
class Message:
chat: Chat
id: str
dt: datetime_aware
sender: Sender
text: str | None
Entity = Union[Chat, Sender, Message]
def _process_db(db: sqlite3.Connection) -> Iterator[Entity]:
# TODO later, split out Chat/Sender objects separately to safe on object creation, similar to other android data sources
try:
db.execute('SELECT jid_row_id FROM chat_view')
except sqlite3.OperationalError as oe:
if 'jid_row_id' not in str(oe):
raise oe
new_version_202410 = False
else:
new_version_202410 = True
if new_version_202410:
chat_id_col = 'jid.raw_string'
jid_join = 'JOIN jid ON jid._id == chat_view.jid_row_id'
else:
chat_id_col = 'chat_view.raw_string_jid'
jid_join = ''
chats = {}
for r in db.execute(
f'''
SELECT {chat_id_col} AS chat_id, subject
FROM chat_view {jid_join}
WHERE chat_id IS NOT NULL /* seems that it might be null for chats that are 'recycled' (the db is more like an LRU cache) */
'''
):
chat_id = r['chat_id']
subject = r['subject']
chat = Chat(
id=chat_id,
name=subject,
)
yield chat
chats[chat.id] = chat
senders = {}
for r in db.execute(
'''
SELECT _id, raw_string
FROM jid
'''
):
# TODO seems that msgstore.db doesn't have contact names
# perhaps should extract from wa.db and match against wa_contacts.jid?
# TODO these can also be chats? not sure if need to include...
s = Sender(
id=r['raw_string'],
name=None,
)
yield s
senders[r['_id']] = s
# NOTE: hmm, seems that message_view or available_message_view use lots of NULL as ...
# so even if it seems as if it has a column (e.g. for attachment path), there is actually no such data
# so makes more sense to just query message column directly
for r in db.execute(
f'''
SELECT
{chat_id_col} AS chat_id,
M.key_id, M.timestamp,
sender_jid_row_id,
M.from_me,
M.text_data,
MM.file_path,
MM.file_size,
M.message_type
FROM message AS M
LEFT JOIN chat_view ON M.chat_row_id = chat_view._id
{jid_join}
left JOIN message_media AS MM ON M._id = MM.message_row_id
WHERE M.key_id != -1 /* key_id -1 is some sort of fake message where everything is null */
/* type 7 seems to be some dummy system message.
sometimes contain chat name, but usually null, so ignore them
for normal messages it's 0
*/
AND M.message_type != 7
ORDER BY M.timestamp
'''
):
msg_id: str = notnone(r['key_id'])
ts: int = notnone(r['timestamp'])
dt = datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
text: str | None = r['text_data']
media_file_path: str | None = r['file_path']
media_file_size: int | None = r['file_size']
message_type = r['message_type']
if text is None:
# fmt: off
text = {
5 : '[MAP LOCATION]',
10: '[MISSED VOICE CALL]',
15: '[DELETED]',
16: '[LIVE LOCATION]',
64: '[DELETED]', # seems like 'deleted by admin'?
}.get(message_type)
# fmt: on
# check against known msg types
# fmt: off
if text is None and message_type not in {
0, # normal
1, # image
2, # voice note
3, # video
7, # "system" message, e.g. chat name
8, # document
9, # also document?
13, # animated gif?
20, # webp/sticker?
}:
text = f"[UNKNOWN TYPE {message_type}]"
# fmt: on
if media_file_size is not None:
# this is always not null for message_media table
# however media_file_path sometimes may be none
mm = f'MEDIA: {media_file_path}'
if text is None:
text = mm
else:
text = text + '\n' + mm
from_me = r['from_me'] == 1
chat_id = r['chat_id']
if chat_id is None:
# ugh, I think these might have been edited messages? unclear..
logger.warning(f"CHAT ID IS NONE, WTF?? {dt} {ts} {text}")
continue
chat = chats[chat_id]
sender_row_id = r['sender_jid_row_id']
if sender_row_id == 0:
# seems that it's always 0 for 1-1 chats
# for group chats our own id is still 0, but other ids are properly set
if from_me:
myself_user_id = config.my_user_id or 'MYSELF_USER_ID'
sender = Sender(id=myself_user_id, name=None) # TODO set my own name as well?
else:
sender = Sender(id=chat.id, name=None)
else:
sender = senders[sender_row_id]
m = Message(chat=chat, id=msg_id, dt=dt, sender=sender, text=text)
yield m
def _entities() -> Iterator[Res[Entity]]:
paths = inputs()
total = len(paths)
width = len(str(total))
for idx, path in enumerate(paths):
logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
with sqlite_connection(path, immutable=True, row_factory='row') as db:
try:
yield from _process_db(db)
except Exception as e:
yield echain(RuntimeError(f'While processing {path}'), cause=e)
def entities() -> Iterator[Res[Entity]]:
return unique_everseen(_entities)
def messages() -> Iterator[Res[Message]]:
# TODO hmm, specify key=lambda m: m.id?
# not sure since might be useful to keep track of sender changes etc
# probably best not to, or maybe query messages/senders separately and merge later?
for e in entities():
if isinstance(e, (Exception, Message)):
yield e