"""
Telegram data via [fabianonline/telegram_backup](https://github.com/fabianonline/telegram_backup) tool
"""

from dataclasses import dataclass
from datetime import datetime, timezone
from struct import unpack_from, calcsize
import sqlite3
from typing import Dict, Iterator, Optional

from my.core import datetime_aware, PathIsh
from my.core.sqlite import sqlite_connection

from my.config import telegram as user_config


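# Sketch (not part of the original module): the config class below pulls export_path
# from the user's own config. In my.config this might look roughly like the following;
# the path is just an illustrative placeholder:
#
#     class telegram:
#         class telegram_backup:
#             export_path = '/path/to/telegram_backup/database.sqlite'

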
@dataclass
class config(user_config.telegram_backup):
    # path to the export database.sqlite
    export_path: PathIsh


@dataclass
class Chat:
    id: str
    name: Optional[str]
    # not all users have a short handle, and groups don't seem to have them either?
    # TODO hmm, some groups do have one -- is it just that the tool doesn't dump it?
    handle: Optional[str]
    # not sure if we need the chat type?


@dataclass
class User:
    id: str
    name: Optional[str]


@dataclass
class Message:
    # NOTE: message id is NOT unique globally -- only with respect to chat!
    id: int
    time: datetime_aware
    chat: Chat
    sender: User
    text: str
    extra_media_info: Optional[str] = None

    @property
    def permalink(self) -> str:
        handle = self.chat.handle
        if handle is None:
            clink = str(self.chat.id)
        else:
            # FIXME add c/
            clink = f'{handle}'

        # NOTE: don't think deep links to messages work for private conversations sadly https://core.telegram.org/api/links#message-links
        # NOTE: doesn't look like this works with private groups at all, doesn't even jump into it
        return f'https://t.me/{clink}/{self.id}'


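# For illustration (not in the original source): for a chat with handle 'some_channel'
# and message id 123, Message.permalink would be https://t.me/some_channel/123; for
# chats without a handle the numeric chat id is used instead, which per the NOTEs above
# generally won't resolve for private chats or groups.

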
Chats = Dict[str, Chat]


def _message_from_row(r: sqlite3.Row, *, chats: Chats, with_extra_media_info: bool) -> Message:
    ts = r['time']
    # desktop export uses UTC (checked by exporting in winter time vs summer time)
    # and telegram_backup timestamps seem to be the same as in the desktop export
    time = datetime.fromtimestamp(ts, tz=timezone.utc)
    chat = chats[r['source_id']]
    sender = chats[r['sender_id']]

    extra_media_info: Optional[str] = None
    if with_extra_media_info and r['has_media'] == 1:
        # defensive because it's a bit tricky to parse correctly without a proper api parser..
        # it's also quite hacky, so at least for now it's just an optional attribute behind a flag
        # maybe later we'll improve it
        try:
            extra_media_info = _extract_extra_media_info(data=r['data'])
        except Exception:
            pass

    return Message(
        id=r['message_id'],
        time=time,
        chat=chat,
        sender=User(id=sender.id, name=sender.name),
        text=r['text'],
        extra_media_info=extra_media_info,
    )


def messages(*, extra_where: Optional[str] = None, with_extra_media_info: bool = False) -> Iterator[Message]:
    messages_query = 'SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message")'
    if extra_where is not None:
        messages_query += ' AND ' + extra_where
    messages_query += ' ORDER BY time'

    with sqlite_connection(config.export_path, immutable=True, row_factory='row') as db:
        chats: Chats = {}
        for r in db.execute('SELECT * FROM chats ORDER BY id'):
            chat = Chat(id=r['id'], name=r['name'], handle=None)
            assert chat.id not in chats
            chats[chat.id] = chat

        for r in db.execute('SELECT * FROM users ORDER BY id'):
            first = r["first_name"]
            last = r["last_name"]
            name: Optional[str]
            if first is not None and last is not None:
                name = f'{first} {last}'
            else:
                name = first or last

            chat = Chat(id=r['id'], name=name, handle=r['username'])
            assert chat.id not in chats
            chats[chat.id] = chat

        for r in db.execute(messages_query):
            # seems like the only remaining ones have message_type = 'message'
            yield _message_from_row(r, chats=chats, with_extra_media_info=with_extra_media_info)


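# Usage sketch (not part of the original module). Assuming this file is importable as
# my.telegram.telegram_backup and the export database exists at config.export_path:
#
#     from my.telegram.telegram_backup import messages
#     for m in messages(extra_where='has_media = 1', with_extra_media_info=True):
#         print(m.time, m.chat.name, m.permalink)
#
# extra_where is spliced verbatim into the SQL WHERE clause, so it should reference
# columns of the messages table (e.g. has_media, sender_id, source_id).

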
def _extract_extra_media_info(data: bytes) -> Optional[str]:
    # ugh... very hacky, but it does manage to extract from 90% of messages that have media
    pos = 0

    def skip(count: int) -> None:
        nonlocal pos
        pos += count

    def getstring() -> str:
        # jesus
        # https://core.telegram.org/type/string
        if data[pos] == 254:
            skip(1)
            (sz1, sz2, sz3) = unpack_from('BBB', data, offset=pos)
            skip(3)
            sz = 256 ** 2 * sz3 + 256 * sz2 + sz1
            short = 0
        else:
            (sz, ) = unpack_from('B', data, offset=pos)
            skip(1)
            short = 1
        assert sz > 0, sz

        padding = 0 if (sz + short) % 4 == 0 else 4 - (sz + short) % 4

        (ss,) = unpack_from(f'{sz}s{padding}x', data, offset=pos)
        skip(sz + padding)
        try:
            return ss.decode('utf8')
        except UnicodeDecodeError as e:
            raise RuntimeError(f'Failed to decode {ss}') from e

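    # Worked example (added for clarity, not in the original): TL bare strings in the
    # short form are encoded as one length byte L (< 254), then L bytes of payload,
    # then zero padding so that 1 + L is a multiple of 4. So b'\x05photo\x00\x00'
    # decodes as 'photo' (L=5, 2 padding bytes). The long form starts with the marker
    # byte 254 followed by a 3-byte little-endian length, with the payload itself
    # padded to a multiple of 4. See https://core.telegram.org/type/string
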
    def debug(count: int = 10) -> None:
        print([hex(x) for x in data[pos: pos + count]])
        print([chr(x) for x in data[pos: pos + count]])

    # fixed-size message header: flags, message id, source id, timestamp
    header = 'H2xII8xI'
    (flags, mid, src, ts) = unpack_from(header, data, offset=pos)
    pos += calcsize(header)

    # see https://core.telegram.org/constructor/message
    has_media = (flags >> 9) & 1
    if has_media == 0:
        return None

    # the intermediate strings below are read mostly to advance pos to the link description
    msg_body = getstring()
    skip(20)
    url1 = getstring()
    url2 = getstring()
    ss_type = getstring()
    # not sure if the assert is really necessary here
    # assert ss_type in {
    #     'article',
    #     'photo',
    #     'app',
    #     'video',
    # }, ss_type
    link_title = getstring()
    link_title_2 = getstring()
    link_description = getstring()
    return link_description


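# Debugging sketch (not part of the original module): inspect the raw blob for one
# media message straight from the export database -- the table/column names match
# the ones queried above, the path is a placeholder:
#
#     import sqlite3
#     with sqlite3.connect('/path/to/database.sqlite') as conn:
#         (blob,) = conn.execute('SELECT data FROM messages WHERE has_media = 1 LIMIT 1').fetchone()
#     print(_extract_extra_media_info(data=blob))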