my.whatsapp.android: adapt to new db format

This commit is contained in:
Dima Gerasimov 2024-10-22 20:50:37 +01:00 committed by karlicoss
parent 8496d131e7
commit a2b397ec4a
2 changed files with 27 additions and 8 deletions

View file

@ -3,4 +3,4 @@ from my.core import warnings
warnings.high('my.books.kobo is deprecated! Please use my.kobo instead!') warnings.high('my.books.kobo is deprecated! Please use my.kobo instead!')
from my.core.util import __NOT_HPI_MODULE__ from my.core.util import __NOT_HPI_MODULE__
from my.kobo import * # type: ignore[no-redef] from my.kobo import *

View file

@ -1,6 +1,7 @@
""" """
Whatsapp data from Android app database (in =/data/data/com.whatsapp/databases/msgstore.db=) Whatsapp data from Android app database (in =/data/data/com.whatsapp/databases/msgstore.db=)
""" """
from __future__ import annotations from __future__ import annotations
import sqlite3 import sqlite3
@ -63,11 +64,27 @@ Entity = Union[Chat, Sender, Message]
def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: def _process_db(db: sqlite3.Connection) -> Iterator[Entity]:
# TODO later, split out Chat/Sender objects separately to save on object creation, similar to other android data sources # TODO later, split out Chat/Sender objects separately to save on object creation, similar to other android data sources
try:
db.execute('SELECT jid_row_id FROM chat_view')
except sqlite3.OperationalError as oe:
if 'jid_row_id' not in str(oe):
raise oe
new_version_202410 = False
else:
new_version_202410 = True
if new_version_202410:
chat_id_col = 'jid.raw_string'
jid_join = 'JOIN jid ON jid._id == chat_view.jid_row_id'
else:
chat_id_col = 'chat_view.raw_string_jid'
jid_join = ''
chats = {} chats = {}
for r in db.execute( for r in db.execute(
''' f'''
SELECT raw_string_jid AS chat_id, subject SELECT {chat_id_col} AS chat_id, subject
FROM chat_view FROM chat_view {jid_join}
WHERE chat_id IS NOT NULL /* seems that it might be null for chats that are 'recycled' (the db is more like an LRU cache) */ WHERE chat_id IS NOT NULL /* seems that it might be null for chats that are 'recycled' (the db is more like an LRU cache) */
''' '''
): ):
@ -89,6 +106,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]:
): ):
# TODO seems that msgstore.db doesn't have contact names # TODO seems that msgstore.db doesn't have contact names
# perhaps should extract from wa.db and match against wa_contacts.jid? # perhaps should extract from wa.db and match against wa_contacts.jid?
# TODO these can also be chats? not sure if they need to be included...
s = Sender( s = Sender(
id=r['raw_string'], id=r['raw_string'],
name=None, name=None,
@ -100,9 +118,9 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]:
# so even if it seems as if it has a column (e.g. for attachment path), there is actually no such data # so even if it seems as if it has a column (e.g. for attachment path), there is actually no such data
# so makes more sense to just query message column directly # so makes more sense to just query message column directly
for r in db.execute( for r in db.execute(
''' f'''
SELECT SELECT
C.raw_string_jid AS chat_id, {chat_id_col} AS chat_id,
M.key_id, M.timestamp, M.key_id, M.timestamp,
sender_jid_row_id, sender_jid_row_id,
M.from_me, M.from_me,
@ -111,8 +129,9 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]:
MM.file_size, MM.file_size,
M.message_type M.message_type
FROM message AS M FROM message AS M
LEFT JOIN chat_view AS C ON M.chat_row_id = C._id LEFT JOIN chat_view ON M.chat_row_id = chat_view._id
LEFT JOIN message_media AS MM ON M._id = MM.message_row_id {jid_join}
left JOIN message_media AS MM ON M._id = MM.message_row_id
WHERE M.key_id != -1 /* key_id -1 is some sort of fake message where everything is null */ WHERE M.key_id != -1 /* key_id -1 is some sort of fake message where everything is null */
/* type 7 seems to be some dummy system message. /* type 7 seems to be some dummy system message.
sometimes contain chat name, but usually null, so ignore them sometimes contain chat name, but usually null, so ignore them