''' VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) '''
# note: could reuse the original repo, but little point I guess since VK closed their API

from datetime import datetime
from dataclasses import dataclass
import json
from typing import Dict, Iterator

import pytz

from my.core import stat, Stats, Json, Res, datetime_aware, get_files
from my.core.common import unique_everseen
from my.config import vk_messages_backup as config

# I think vk_messages_backup used this tz?
# not sure if vk actually used to return this tz in api?
TZ = pytz.timezone('Europe/Moscow')


Uid = int


@dataclass(frozen=True)
class User:
    '''A VK user as stored in a ``user_*.json`` file of the backup.'''

    id: Uid
    first_name: str
    last_name: str


@dataclass(frozen=True)
class Chat:
    '''A conversation (one-to-one or group) that messages belong to.'''

    # NOTE(review): annotated as str, but _parse_chat stores a value it compares
    # against the integer GROUP_CHAT_MIN_ID, i.e. a number -- confirm before relying on str
    chat_id: str
    title: str


@dataclass(frozen=True)
class Message:
    '''A single message, already attributed to its chat and its author.'''

    dt: datetime_aware
    chat: Chat
    # NOTE(review): taken verbatim from the message's 'id' field; presumably an int
    # in the export rather than a str -- confirm
    id: str  # todo not sure it's unique?
    user: User
    body: str


Users = Dict[Uid, User]


def users() -> Users:
    '''
    Load every ``user_*.json`` file under ``config.storage_path``.

    Returns a dict mapping the ``id`` field of each file to a User.
    Files are processed in sorted path order, so if two files carry the same id the later one wins.
    '''
    res: Users = {}
    for f in sorted(config.storage_path.glob('user_*.json')):
        j = json.loads(f.read_text())
        uid = j['id']
        res[uid] = User(
            id=uid,
            first_name=j['first_name'],
            last_name=j['last_name'],
        )
    return res


# chat ids at or above this value are treated as group chats (see _parse_chat)
GROUP_CHAT_MIN_ID = 2000000000


def _parse_chat(*, msg: Json, udict: Users) -> Chat:
    '''
    Work out which chat a raw message belongs to.

    The chat id is ``peer_id`` when present, otherwise ``GROUP_CHAT_MIN_ID + chat_id``
    when ``chat_id`` is present, otherwise ``user_id``.
    Group chats take their title from the message's ``title`` field; other chats are
    titled with the first and last name of the user looked up in ``udict``.

    Raises KeyError when a required field or the user is missing, and AssertionError
    when neither ``user_id`` nor ``from_id`` is available for a non-group chat.
    '''
    # exported with newer api, peer_id is a proper identifier both for users and chats
    peer_id = msg.get('peer_id')
    group_chat_id = None if peer_id is not None else msg.get('chat_id')

    if peer_id is not None:
        chat_id = peer_id
    elif group_chat_id is not None:
        chat_id = GROUP_CHAT_MIN_ID + group_chat_id
    else:
        chat_id = msg['user_id']

    if chat_id >= GROUP_CHAT_MIN_ID:
        title = msg['title']
    else:
        # NOTE(review): when only from_id is present this is the sender of this particular
        # message, which may not be the other party of the conversation -- confirm
        user_id = msg.get('user_id') or msg.get('from_id')
        assert user_id is not None
        user = udict[user_id]
        title = f'{user.first_name} {user.last_name}'

    return Chat(
        chat_id=chat_id,
        title=title,
    )


def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
    '''
    Convert a raw message dict into a Message attached to ``chat``.

    The text is read from ``body``, falling back to ``text``.
    Messages with ``out == 1`` are attributed to ``config.user_id``, all others to
    ``user_id`` (falling back to ``from_id``).

    Raises KeyError when a required field or the user is missing, and AssertionError
    when the message has no text field or no author id.
    '''
    mid = msg['id']
    md = msg['date']

    # 'date' is passed to fromtimestamp, i.e. it is interpreted as a unix timestamp
    dt = datetime.fromtimestamp(md, tz=TZ)

    # todo attachments? e.g. url could be an attachment
    # todo might be forwarded?
    mb = msg.get('body')
    if mb is None:
        mb = msg.get('text')
    assert mb is not None, msg

    out = msg['out'] == 1
    if out:
        user = udict[config.user_id]
    else:
        mu = msg.get('user_id') or msg.get('from_id')
        assert mu is not None, msg
        user = udict[mu]

    return Message(
        dt=dt,
        chat=chat,
        id=mid,
        user=user,
        body=mb,
    )


def _messages() -> Iterator[Res[Message]]:
    '''
    Yield messages from every ``userchat_*.json`` and then every ``groupchat_*.json`` file.

    Failures are yielded as exception objects instead of being raised: one exception for
    a file whose chat cannot be determined from any of its messages, and one exception
    per message that fails to parse. Files that contain no messages yield nothing.
    '''
    udict = users()

    storage = config.storage_path
    uchats = sorted(storage.glob('userchat_*.json')) + sorted(storage.glob('groupchat_*.json'))
    for f in uchats:
        j = json.loads(f.read_text())
        if not j:
            # No messages at all: there is nothing to emit and no parse error to report.
            # Without this guard the 'assert ex is not None' below would raise out of the
            # generator and abort processing of all remaining chats.
            continue

        # ugh. very annoying, sometimes not possible to extract title from last message
        # due to newer api...
        # so just do in defensively until we succeed...
        # NOTE(review): there is no early exit here, so every message is tried and the
        # chat ends up being built from the first message (in file order) that parses,
        # not from the last one -- confirm this is intended before adding a break
        chat = None
        ex = None
        for m in reversed(j):
            try:
                chat = _parse_chat(msg=m, udict=udict)
            except Exception as e:
                ex = e
                continue

        if chat is None:
            # j is non-empty here, so at least one attempt was made and it failed
            assert ex is not None
            yield ex
            continue

        for msg in j:
            try:
                yield _parse_msg(msg=msg, chat=chat, udict=udict)
            except Exception as e:
                yield e


def messages() -> Iterator[Res[Message]]:
    '''Yield the items produced by ``_messages``, filtered through ``unique_everseen``.'''
    # seems that during backup messages were sometimes duplicated..
    # note: the function itself (not its result) is handed over to unique_everseen
    yield from unique_everseen(_messages)


def stats() -> Stats:
    '''Combine the ``stat`` summaries of ``users`` and ``messages`` into one dict.'''
    return {
        **stat(users),
        **stat(messages),
    }