vk_messages_backup: some cleanup + switch to get_files

This commit is contained in:
karlicoss 2023-10-31 00:47:31 +00:00
parent 24da04f142
commit 105928238f

View file

@ -2,8 +2,8 @@
VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]])
''' '''
# note: could reuse the original repo, but little point I guess since VK closed their API # note: could reuse the original repo, but little point I guess since VK closed their API
from datetime import datetime
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime
import json import json
from typing import Dict, Iterator from typing import Dict, Iterator
@ -22,6 +22,7 @@ TZ = pytz.timezone('Europe/Moscow')
Uid = int Uid = int
@dataclass(frozen=True) @dataclass(frozen=True)
class User: class User:
id: Uid id: Uid
@ -45,8 +46,10 @@ class Message:
Users = Dict[Uid, User] Users = Dict[Uid, User]
def users() -> Users: def users() -> Users:
files = list(sorted(config.storage_path.glob('user_*.json'))) files = get_files(config.storage_path, glob='user_*.json')
res = {} res = {}
for f in files: for f in files:
j = json.loads(f.read_text()) j = json.loads(f.read_text())
@ -60,6 +63,8 @@ def users() -> Users:
GROUP_CHAT_MIN_ID = 2000000000 GROUP_CHAT_MIN_ID = 2000000000
def _parse_chat(*, msg: Json, udict: Users) -> Chat: def _parse_chat(*, msg: Json, udict: Users) -> Chat:
# exported with newer api, peer_id is a proper identifier both for users and chats # exported with newer api, peer_id is a proper identifier both for users and chats
peer_id = msg.get('peer_id') peer_id = msg.get('peer_id')
@ -88,13 +93,13 @@ def _parse_chat(*, msg: Json, udict: Users) -> Chat:
def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
mid = msg['id'] mid = msg['id']
md = msg['date'] md = msg['date']
dt = datetime.fromtimestamp(md, tz=TZ) dt = datetime.fromtimestamp(md, tz=TZ)
# todo attachments? e.g. url could be an attachment # todo attachments? e.g. url could be an attachment
# todo might be forwarded? # todo might be forwarded?
mb = msg.get('body') mb = msg.get('body')
if mb is None: if mb is None:
mb = msg.get('text') mb = msg.get('text')
assert mb is not None, msg assert mb is not None, msg
@ -103,7 +108,7 @@ def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
if out: if out:
user = udict[config.user_id] user = udict[config.user_id]
else: else:
mu = msg.get('user_id') or msg.get('from_id') mu = msg.get('user_id') or msg.get('from_id')
assert mu is not None, msg assert mu is not None, msg
user = udict[mu] user = udict[mu]
return Message( return Message(
@ -118,8 +123,7 @@ def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
def _messages() -> Iterator[Res[Message]]: def _messages() -> Iterator[Res[Message]]:
udict = users() udict = users()
uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \ uchats = get_files(config.storage_path, glob='userchat_*.json') + get_files(config.storage_path, glob='groupchat_*.json')
list(sorted(config.storage_path.glob('groupchat_*.json')))
for f in uchats: for f in uchats:
j = json.loads(f.read_text()) j = json.loads(f.read_text())
# ugh. very annoying, sometimes not possible to extract title from last message # ugh. very annoying, sometimes not possible to extract title from last message