vk_messages_backup: better structure & extract richer information

This commit is contained in:
Dima Gerasimov 2023-02-28 02:49:14 +00:00 committed by karlicoss
parent 130c273513
commit 02c98143d5
2 changed files with 78 additions and 40 deletions

View file

@@ -194,6 +194,7 @@ class simple:
class vk_messages_backup: class vk_messages_backup:
storage_path: Path storage_path: Path
user_id: int
class kobo: class kobo:

View file

@@ -2,95 +2,132 @@
VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]])
''' '''
# note: could reuse the original repo, but little point I guess since VK closed their API # note: could reuse the original repo, but little point I guess since VK closed their API
from datetime import datetime from datetime import datetime
from dataclasses import dataclass
import json import json
from typing import Dict, Iterable, NamedTuple from typing import Dict, Iterable, NamedTuple
import pytz import pytz
from ..core import Json from my.core import stat, Stats, Json, Res, datetime_aware
from my.config import vk_messages_backup as config from my.config import vk_messages_backup as config
Uid = str # I think vk_messages_backup used this tz?
Name = str # not sure if vk actually used to return this tz in api?
TZ = pytz.timezone('Europe/Moscow')
Users = Dict[Uid, Name] Uid = int
@dataclass(frozen=True)
class User:
id: Uid
first_name: str
last_name: str
@dataclass(frozen=True)
class Chat:
chat_id: str
title: str
@dataclass
class Message:
dt: datetime_aware
chat: Chat
id: str # todo not sure it's unique?
user: User
body: str
Users = Dict[Uid, User]
def users() -> Users: def users() -> Users:
# todo cache?
files = list(sorted(config.storage_path.glob('user_*.json'))) files = list(sorted(config.storage_path.glob('user_*.json')))
res = {} res = {}
for f in files: for f in files:
j = json.loads(f.read_text()) j = json.loads(f.read_text())
uid = j['id'] uid = j['id']
uf = j['first_name'] res[uid] = User(
ul = j['last_name'] id=uid,
res[uid] = f'{uf} {ul}' first_name=j['first_name'],
last_name=j['last_name'],
)
return res return res
class Message(NamedTuple): # USERCHAT_TITLE = " ... "
chat_id: str def _parse_chat(*, msg: Json, udict: Users) -> Chat:
dt: datetime group_chat_id = msg.get('chat_id')
user: Name if group_chat_id is not None:
body: str chat_id = group_chat_id
title = msg['title']
else:
user_id = msg.get('user_id') or msg.get('from_id')
assert user_id is not None
user = udict[user_id]
chat_id = user_id
title = f'{user.first_name} {user.last_name}'
return Chat(
chat_id=chat_id,
title=title,
)
msk_tz = pytz.timezone('Europe/Moscow') def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
# todo hmm, vk_messages_backup used this tz? not sure if vk actually used to return this tz in api? mid = msg['id']
md = msg['date']
def _parse(x: Json, chat_id: str, udict: Users) -> Message: dt = datetime.fromtimestamp(md, tz=TZ)
mid = x['id'] # todo not sure if useful?
md = x['date']
dt = datetime.fromtimestamp(md, msk_tz)
# todo attachments? e.g. url could be an attachment # todo attachments? e.g. url could be an attachment
# todo might be forwarded? # todo might be forwarded?
mb = x.get('body') mb = msg.get('body')
if mb is None: if mb is None:
mb = x.get('text') mb = msg.get('text')
assert mb is not None assert mb is not None, msg
mu = x.get('user_id') or x.get('peer_id')
assert mu is not None
out = x['out'] == 1
# todo use name from the config?
user = 'you' if out else udict[mu]
# todo conversation id??
out = msg['out'] == 1
if out:
user = udict[config.user_id]
else:
mu = msg.get('user_id') or msg.get('from_id')
assert mu is not None, msg
user = udict[mu]
return Message( return Message(
chat_id=chat_id,
dt=dt, dt=dt,
chat=chat,
id=mid,
user=user, user=user,
body=mb, body=mb,
) )
from ..core.error import Res
def messages() -> Iterable[Res[Message]]: def messages() -> Iterable[Res[Message]]:
udict = users() udict = users()
uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \ uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \
list(sorted(config.storage_path.glob('groupchat_*.json'))) list(sorted(config.storage_path.glob('groupchat_*.json')))
for f in uchats: for f in uchats:
chat_id = f.stem.split('_')[-1]
j = json.loads(f.read_text()) j = json.loads(f.read_text())
for x in j: # extract chat from last message
try:
last = j[-1]
chat = _parse_chat(msg=last, udict=udict)
except Exception as e:
yield e
continue
for msg in j:
try: try:
yield _parse(x, chat_id=chat_id, udict=udict) yield _parse_msg(msg=msg, chat=chat, udict=udict)
except Exception as e: except Exception as e:
yield e yield e
def stats(): def stats() -> Stats:
from ..core import stat
return { return {
**stat(users), **stat(users),
**stat(messages), **stat(messages),