# HPI/my/vk/vk_messages_backup.py
# 157 lines, 3.9 KiB, Python

'''
VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]])
'''
# note: could reuse the original repo, but little point I guess since VK closed their API
from datetime import datetime
from dataclasses import dataclass
import json
from typing import Dict, Iterator, NamedTuple
from more_itertools import unique_everseen
import pytz
from my.core import stat, Stats, Json, Res, datetime_aware
from my.config import vk_messages_backup as config
# Timezone that _parse_msg passes to datetime.fromtimestamp to build aware datetimes.
# Unverified guess: I think vk_messages_backup used this tz?
# It is also unclear whether the VK API itself used to return timestamps in this tz.
TZ = pytz.timezone('Europe/Moscow')
# User identifiers are plain ints; used for User.id and as the key type of Users.
Uid = int
@dataclass(frozen=True)
class User:
    '''A VK user, as loaded from a ``user_*.json`` file by users().'''
    # value of the 'id' field of the user JSON; also the key under which users() stores it
    id: Uid
    first_name: str
    last_name: str
@dataclass(frozen=True)
class Chat:
    '''
    A conversation: either a private dialog with one user or a group chat.

    Frozen, so instances are immutable and hashable.
    '''
    # Numeric peer identifier. _parse_chat derives it from integer JSON fields
    # and compares it against GROUP_CHAT_MIN_ID, so it is an int, not a str.
    chat_id: int
    # Group chat title, or "<first_name> <last_name>" for a private dialog.
    title: str
@dataclass(frozen=True)
class Message:
    '''
    A single message within a chat, as built by _parse_msg().

    Frozen, so instances are hashable; messages() passes them through
    unique_everseen to drop duplicates.
    '''
    # timezone-aware timestamp, built from the message's 'date' field using TZ
    dt: datetime_aware
    # conversation this message was found in
    chat: Chat
    # taken verbatim from the message's 'id' field
    # NOTE(review): annotated as str, but the JSON value may well be numeric -- confirm
    id: str # todo not sure it's unique?
    # author: the configured account owner for outgoing messages, otherwise the sender
    user: User
    # message text: the 'body' field, falling back to 'text'
    body: str
# Mapping from user id to the corresponding User record.
Users = Dict[Uid, User]


def users() -> Users:
    '''
    Load every ``user_*.json`` file under ``config.storage_path``.

    Files are visited in sorted path order; if two files carry the same id,
    the later one replaces the earlier one.

    Returns a dict of User records keyed by user id.
    '''
    by_id = {}
    for path in sorted(config.storage_path.glob('user_*.json')):
        data = json.loads(path.read_text())
        user_id = data['id']
        by_id[user_id] = User(
            id=user_id,
            first_name=data['first_name'],
            last_name=data['last_name'],
        )
    return by_id
# Chat ids at or above this value are treated as group chats by _parse_chat;
# legacy group 'chat_id' values are shifted by this offset to form the chat id.
GROUP_CHAT_MIN_ID = 2_000_000_000
def _parse_chat(*, msg: Json, udict: Users) -> Chat:
    '''
    Work out which chat a raw message belongs to.

    msg: one message object from an exported chat file.
    udict: known users keyed by id, used to name private dialogs.

    Returns a Chat whose title is the message's 'title' for group chats,
    or "<first_name> <last_name>" of the looked-up user otherwise.

    Raises KeyError when a required field or user is missing, and
    AssertionError when no user id can be found for a private dialog.
    '''
    # exported with newer api, peer_id is a proper identifier both for users and chats
    chat_id = msg.get('peer_id')
    if chat_id is None:
        # older export: group chats carry 'chat_id', private dialogs only 'user_id'
        legacy_group_id = msg.get('chat_id')
        if legacy_group_id is None:
            chat_id = msg['user_id']
        else:
            chat_id = GROUP_CHAT_MIN_ID + legacy_group_id

    if chat_id >= GROUP_CHAT_MIN_ID:
        return Chat(chat_id=chat_id, title=msg['title'])

    # NOTE(review): for outgoing messages that only have 'from_id', this may
    # resolve to the account owner rather than the dialog peer -- confirm
    peer_id = msg.get('user_id') or msg.get('from_id')
    assert peer_id is not None
    peer = udict[peer_id]
    return Chat(chat_id=chat_id, title=f'{peer.first_name} {peer.last_name}')
def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
    '''
    Convert one raw message object into a Message.

    msg: one message object from an exported chat file.
    chat: the chat already resolved for the file this message came from.
    udict: known users keyed by id, used to resolve the author.

    Raises KeyError when 'id', 'date', 'out' or the author is missing, and
    AssertionError when neither a text field nor a sender id is present.
    '''
    message_id = msg['id']
    timestamp = msg['date']
    when = datetime.fromtimestamp(timestamp, tz=TZ)
    # todo attachments? e.g. url could be an attachment
    # todo might be forwarded?

    # the text is stored under 'body' or, failing that, under 'text'
    text = msg.get('body')
    if text is None:
        text = msg.get('text')
    assert text is not None, msg

    is_outgoing = msg['out'] == 1
    if is_outgoing:
        # outgoing messages are attributed to the configured account owner
        author = udict[config.user_id]
    else:
        sender_id = msg.get('user_id') or msg.get('from_id')
        assert sender_id is not None, msg
        author = udict[sender_id]

    return Message(dt=when, chat=chat, id=message_id, user=author, body=text)
def _messages() -> Iterator[Res[Message]]:
    '''
    Yield every message from every exported chat file, parse errors included.

    Reads ``userchat_*.json`` files first and ``groupchat_*.json`` files
    second, each group in sorted path order. For each file the chat is
    resolved once, from the most recent message that _parse_chat accepts,
    and then every message in the file is parsed against that chat.

    Yields Message objects, or the Exception raised while parsing:
    - if no message in a file yields a chat, the last exception is yielded
      and the file is skipped;
    - a file containing no messages is skipped silently;
    - a message that fails to parse is yielded as its exception and
      iteration carries on with the next one.
    '''
    udict = users()
    chat_files = (
        sorted(config.storage_path.glob('userchat_*.json'))
        + sorted(config.storage_path.glob('groupchat_*.json'))
    )
    for f in chat_files:
        j = json.loads(f.read_text())
        # ugh. very annoying, sometimes not possible to extract title from last message
        # due to newer api...
        # so just do it defensively, walking backwards until we succeed...
        chat = None
        ex = None
        for m in reversed(j):
            try:
                chat = _parse_chat(msg=m, udict=udict)
            except Exception as e:
                ex = e
            else:
                # stop at the first success so the most recent parseable
                # message decides the chat, and older ones are not re-parsed
                break
        if chat is None:
            # ex stays None only when the file held no messages at all;
            # there is nothing to report or yield for it
            if ex is not None:
                yield ex
            continue
        for msg in j:
            try:
                yield _parse_msg(msg=msg, chat=chat, udict=udict)
            except Exception as e:
                yield e
def messages() -> Iterator[Res[Message]]:
    '''
    Yield the items produced by _messages(), each distinct item only once.

    Order of first occurrence is preserved.
    '''
    # seems that during backup messages were sometimes duplicated..
    deduplicated = unique_everseen(_messages())
    yield from deduplicated
def stats() -> Stats:
    '''
    Merge the results of stat(users) and stat(messages) into a single dict.

    On a key clash the entry from stat(messages) wins.
    '''
    combined = {}
    combined.update(stat(users))
    combined.update(stat(messages))
    return combined