diff --git a/my/config.py b/my/config.py index 58aadfc..7075d1d 100644 --- a/my/config.py +++ b/my/config.py @@ -196,6 +196,7 @@ class simple: class vk_messages_backup: storage_path: Path + user_id: int class kobo: diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index 0e8dc45..089605b 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -2,95 +2,155 @@ VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) ''' # note: could reuse the original repo, but little point I guess since VK closed their API - - from datetime import datetime +from dataclasses import dataclass import json -from typing import Dict, Iterable, NamedTuple +from typing import Dict, Iterator, NamedTuple +from more_itertools import unique_everseen import pytz -from ..core import Json +from my.core import stat, Stats, Json, Res, datetime_aware from my.config import vk_messages_backup as config -Uid = str -Name = str +# I think vk_messages_backup used this tz? +# not sure if vk actually used to return this tz in api? +TZ = pytz.timezone('Europe/Moscow') -Users = Dict[Uid, Name] +Uid = int +@dataclass(frozen=True) +class User: + id: Uid + first_name: str + last_name: str + + +@dataclass(frozen=True) +class Chat: + chat_id: str + title: str + + +@dataclass(frozen=True) +class Message: + dt: datetime_aware + chat: Chat + id: str # todo not sure it's unique? + user: User + body: str + + +Users = Dict[Uid, User] def users() -> Users: - # todo cache? 
files = list(sorted(config.storage_path.glob('user_*.json'))) res = {} for f in files: j = json.loads(f.read_text()) uid = j['id'] - uf = j['first_name'] - ul = j['last_name'] - res[uid] = f'{uf} {ul}' + res[uid] = User( + id=uid, + first_name=j['first_name'], + last_name=j['last_name'], + ) return res -class Message(NamedTuple): - chat_id: str - dt: datetime - user: Name - body: str +GROUP_CHAT_MIN_ID = 2000000000 +def _parse_chat(*, msg: Json, udict: Users) -> Chat: + # exported with newer api, peer_id is a proper identifier both for users and chats + peer_id = msg.get('peer_id') + if peer_id is not None: + chat_id = peer_id + else: + group_chat_id = msg.get('chat_id') + if group_chat_id is not None: + chat_id = GROUP_CHAT_MIN_ID + group_chat_id + else: + chat_id = msg['user_id'] + + is_group_chat = chat_id >= GROUP_CHAT_MIN_ID + if is_group_chat: + title = msg['title'] + else: + user_id = msg.get('user_id') or msg.get('from_id') + assert user_id is not None + user = udict[user_id] + title = f'{user.first_name} {user.last_name}' + return Chat( + chat_id=chat_id, + title=title, + ) -msk_tz = pytz.timezone('Europe/Moscow') -# todo hmm, vk_messages_backup used this tz? not sure if vk actually used to return this tz in api? +def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: + mid = msg['id'] + md = msg['date'] -def _parse(x: Json, chat_id: str, udict: Users) -> Message: - mid = x['id'] # todo not sure if useful? - md = x['date'] - - dt = datetime.fromtimestamp(md, msk_tz) + dt = datetime.fromtimestamp(md, tz=TZ) # todo attachments? e.g. url could be an attachment # todo might be forwarded? - mb = x.get('body') + mb = msg.get('body') if mb is None: - mb = x.get('text') - assert mb is not None - - mu = x.get('user_id') or x.get('peer_id') - assert mu is not None - out = x['out'] == 1 - # todo use name from the config? - user = 'you' if out else udict[mu] - - # todo conversation id?? 
+ mb = msg.get('text') + assert mb is not None, msg + out = msg['out'] == 1 + if out: + user = udict[config.user_id] + else: + mu = msg.get('user_id') or msg.get('from_id') + assert mu is not None, msg + user = udict[mu] return Message( - chat_id=chat_id, dt=dt, + chat=chat, + id=mid, user=user, body=mb, ) -from ..core.error import Res -def messages() -> Iterable[Res[Message]]: +def _messages() -> Iterator[Res[Message]]: udict = users() uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \ list(sorted(config.storage_path.glob('groupchat_*.json'))) for f in uchats: - chat_id = f.stem.split('_')[-1] j = json.loads(f.read_text()) - for x in j: + # ugh. very annoying, sometimes not possible to extract title from last message + # due to newer api... + # so just do it defensively until we succeed... + chat = None + ex = None + for m in reversed(j): try: - yield _parse(x, chat_id=chat_id, udict=udict) + chat = _parse_chat(msg=m, udict=udict) + except Exception as e: + ex = e + continue + if chat is None: + assert ex is not None + yield ex + continue + + for msg in j: + try: + yield _parse_msg(msg=msg, chat=chat, udict=udict) except Exception as e: yield e -def stats(): - from ..core import stat +def messages() -> Iterator[Res[Message]]: + # seems that during backup messages were sometimes duplicated.. + yield from unique_everseen(_messages()) + + +def stats() -> Stats: return { **stat(users), **stat(messages),