From 02c98143d51509d028ac6496476bab24a39eb7dd Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 28 Feb 2023 02:49:14 +0000 Subject: [PATCH 1/3] vk_messages_backup: better structure & exract richer information --- my/config.py | 1 + my/vk/vk_messages_backup.py | 117 ++++++++++++++++++++++++------------ 2 files changed, 78 insertions(+), 40 deletions(-) diff --git a/my/config.py b/my/config.py index bfae86e..e9eafec 100644 --- a/my/config.py +++ b/my/config.py @@ -194,6 +194,7 @@ class simple: class vk_messages_backup: storage_path: Path + user_id: int class kobo: diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index 0e8dc45..df1d18e 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -2,95 +2,132 @@ VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) ''' # note: could reuse the original repo, but little point I guess since VK closed their API - - from datetime import datetime +from dataclasses import dataclass import json from typing import Dict, Iterable, NamedTuple import pytz -from ..core import Json +from my.core import stat, Stats, Json, Res, datetime_aware from my.config import vk_messages_backup as config -Uid = str -Name = str +# I think vk_messages_backup used this tz? +# not sure if vk actually used to return this tz in api? +TZ = pytz.timezone('Europe/Moscow') -Users = Dict[Uid, Name] +Uid = int +@dataclass(frozen=True) +class User: + id: Uid + first_name: str + last_name: str + + +@dataclass(frozen=True) +class Chat: + chat_id: str + title: str + + +@dataclass +class Message: + dt: datetime_aware + chat: Chat + id: str # todo not sure it's unique? + user: User + body: str + + +Users = Dict[Uid, User] def users() -> Users: - # todo cache? files = list(sorted(config.storage_path.glob('user_*.json'))) res = {} for f in files: j = json.loads(f.read_text()) uid = j['id'] - uf = j['first_name'] - ul = j['last_name'] - res[uid] = f'{uf} {ul}' + res[uid] = User( + id=uid, + first_name=j['first_name'], + last_name=j['last_name'], + ) return res -class Message(NamedTuple): - chat_id: str - dt: datetime - user: Name - body: str +# USERCHAT_TITLE = " ... " +def _parse_chat(*, msg: Json, udict: Users) -> Chat: + group_chat_id = msg.get('chat_id') + if group_chat_id is not None: + chat_id = group_chat_id + title = msg['title'] + else: + user_id = msg.get('user_id') or msg.get('from_id') + assert user_id is not None + user = udict[user_id] + chat_id = user_id + title = f'{user.first_name} {user.last_name}' + return Chat( + chat_id=chat_id, + title=title, + ) -msk_tz = pytz.timezone('Europe/Moscow') -# todo hmm, vk_messages_backup used this tz? not sure if vk actually used to return this tz in api? +def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: + mid = msg['id'] + md = msg['date'] -def _parse(x: Json, chat_id: str, udict: Users) -> Message: - mid = x['id'] # todo not sure if useful? - md = x['date'] - - dt = datetime.fromtimestamp(md, msk_tz) + dt = datetime.fromtimestamp(md, tz=TZ) # todo attachments? e.g. url could be an attachment # todo might be forwarded? - mb = x.get('body') + mb = msg.get('body') if mb is None: - mb = x.get('text') - assert mb is not None - - mu = x.get('user_id') or x.get('peer_id') - assert mu is not None - out = x['out'] == 1 - # todo use name from the config? - user = 'you' if out else udict[mu] - - # todo conversation id?? + mb = msg.get('text') + assert mb is not None, msg + out = msg['out'] == 1 + if out: + user = udict[config.user_id] + else: + mu = msg.get('user_id') or msg.get('from_id') + assert mu is not None, msg + user = udict[mu] return Message( - chat_id=chat_id, dt=dt, + chat=chat, + id=mid, user=user, body=mb, ) -from ..core.error import Res def messages() -> Iterable[Res[Message]]: udict = users() uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \ list(sorted(config.storage_path.glob('groupchat_*.json'))) for f in uchats: - chat_id = f.stem.split('_')[-1] j = json.loads(f.read_text()) - for x in j: + # extract chat from last message + try: + last = j[-1] + chat = _parse_chat(msg=last, udict=udict) + except Exception as e: + yield e + continue + + for msg in j: try: - yield _parse(x, chat_id=chat_id, udict=udict) + yield _parse_msg(msg=msg, chat=chat, udict=udict) except Exception as e: yield e -def stats(): - from ..core import stat +def stats() -> Stats: return { **stat(users), **stat(messages), From a7099e2efcc989b4d420489c2a6dca830988956c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 28 Feb 2023 03:38:11 +0000 Subject: [PATCH 2/3] vk_messages_backup: more correct handling of group chats & better chat ids --- my/vk/vk_messages_backup.py | 39 ++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index df1d18e..78b595e 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -58,17 +58,26 @@ def users() -> Users: return res -# USERCHAT_TITLE = " ... " +GROUP_CHAT_MIN_ID = 2000000000 def _parse_chat(*, msg: Json, udict: Users) -> Chat: - group_chat_id = msg.get('chat_id') - if group_chat_id is not None: - chat_id = group_chat_id + # exported with newer api, peer_id is a proper identifier both for users and chats + peer_id = msg.get('peer_id') + if peer_id is not None: + chat_id = peer_id + else: + group_chat_id = msg.get('chat_id') + if group_chat_id is not None: + chat_id = GROUP_CHAT_MIN_ID + group_chat_id + else: + chat_id = msg['user_id'] + + is_group_chat = chat_id >= GROUP_CHAT_MIN_ID + if is_group_chat: title = msg['title'] else: user_id = msg.get('user_id') or msg.get('from_id') assert user_id is not None user = udict[user_id] - chat_id = user_id title = f'{user.first_name} {user.last_name}' return Chat( chat_id=chat_id, @@ -112,12 +121,20 @@ def messages() -> Iterable[Res[Message]]: list(sorted(config.storage_path.glob('groupchat_*.json'))) for f in uchats: j = json.loads(f.read_text()) - # extract chat from last message - try: - last = j[-1] - chat = _parse_chat(msg=last, udict=udict) - except Exception as e: - yield e + # ugh. very annoying, sometimes not possible to extract title from last message + # due to newer api... + # so just do in defensively until we succeed... + chat = None + ex = None + for m in reversed(j): + try: + chat = _parse_chat(msg=m, udict=udict) + except Exception as e: + ex = e + continue + if chat is None: + assert ex is not None + yield ex continue for msg in j: From 6dc5e7575ffa8ffee3c4aa3cedcb70e99ad6a7dd Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 28 Feb 2023 03:44:10 +0000 Subject: [PATCH 3/3] vk_messages_backup: add unique_everseen to prevent duplicate messages --- my/vk/vk_messages_backup.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index 78b595e..089605b 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -5,8 +5,9 @@ VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totkton from datetime import datetime from dataclasses import dataclass import json -from typing import Dict, Iterable, NamedTuple +from typing import Dict, Iterator, NamedTuple +from more_itertools import unique_everseen import pytz from my.core import stat, Stats, Json, Res, datetime_aware @@ -34,7 +35,7 @@ class Chat: title: str -@dataclass +@dataclass(frozen=True) class Message: dt: datetime_aware chat: Chat @@ -114,7 +115,7 @@ def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: ) -def messages() -> Iterable[Res[Message]]: +def _messages() -> Iterator[Res[Message]]: udict = users() uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \ @@ -144,6 +145,11 @@ def messages() -> Iterable[Res[Message]]: yield e +def messages() -> Iterator[Res[Message]]: + # seems that during backup messages were sometimes duplicated.. + yield from unique_everseen(_messages()) + + def stats() -> Stats: return { **stat(users),