vk_messages_backup: better structure & extract richer information

This commit is contained in:
Dima Gerasimov 2023-02-28 02:49:14 +00:00 committed by karlicoss
parent 130c273513
commit 02c98143d5
2 changed files with 78 additions and 40 deletions

View file

@ -194,6 +194,7 @@ class simple:
class vk_messages_backup:
    # Settings read by the vk_messages_backup data module.

    # Directory that is globbed for 'user_*.json', 'userchat_*.json' and
    # 'groupchat_*.json' export files.
    storage_path: Path
    # Looked up in the users() mapping to attribute outgoing messages
    # (those whose 'out' field equals 1).
    user_id: int
class kobo:

View file

@ -2,95 +2,132 @@
VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]])
'''
# note: could reuse the original repo, but little point I guess since VK closed their API
from datetime import datetime
from dataclasses import dataclass
import json
from typing import Dict, Iterable, NamedTuple
import pytz
from ..core import Json
from my.core import stat, Stats, Json, Res, datetime_aware
from my.config import vk_messages_backup as config
# NOTE(review): Uid is bound twice in this run (str here, int at the bottom)
# and Users is bound here and again further down as Dict[Uid, User]; the last
# binding wins at import time. Presumably the earlier ones are pre-change
# leftovers -- confirm and remove.
Uid = str
Name = str

# Timezone passed to datetime.fromtimestamp when parsing message dates.
# I think vk_messages_backup used this tz?
# not sure if vk actually used to return this tz in api?
TZ = pytz.timezone('Europe/Moscow')
Users = Dict[Uid, Name]
Uid = int
@dataclass(frozen=True)
class User:
    # Immutable record of one exported profile, built by users() from a
    # 'user_*.json' file.

    # value of the 'id' key; also used as the key in the users() mapping
    id: Uid
    # value of the 'first_name' key
    first_name: str
    # value of the 'last_name' key
    last_name: str
@dataclass(frozen=True)
class Chat:
    # Immutable description of a conversation, built by _parse_chat from a
    # single message.

    # The message's 'chat_id' when present, otherwise the id of the other user.
    # NOTE(review): _parse_chat passes the raw JSON value through without
    # converting it, so this may not actually be a str at runtime -- confirm.
    chat_id: str
    # The message's 'title' for group chats, otherwise '<first_name> <last_name>'
    # of the other user.
    title: str
# NOTE(review): unlike User and Chat this dataclass is not frozen -- confirm
# whether that is intentional.
@dataclass
class Message:
    # One parsed message, as produced by _parse_msg.

    # built with datetime.fromtimestamp(<'date' field>, tz=TZ)
    dt: datetime_aware
    # conversation the message belongs to (shared by every message of one file)
    chat: Chat
    # value of the message's 'id' field
    id: str  # todo not sure it's unique?
    # author: users()[config.user_id] for outgoing messages, otherwise the
    # user referenced by 'user_id' / 'from_id'
    user: User
    # the 'body' field, falling back to 'text' when 'body' is absent
    body: str
Users = Dict[Uid, User]
def users() -> Users:
    '''
    Load every exported user profile from the backup directory.

    Reads each ``user_*.json`` file under ``config.storage_path`` (in sorted
    path order) and builds a ``User`` from its ``id``, ``first_name`` and
    ``last_name`` keys.

    Returns a dict mapping each profile's ``id`` value to its ``User``.
    Raises ``KeyError`` when a profile lacks one of those keys; errors from
    reading a file or decoding its JSON propagate unchanged.
    '''
    # todo cache?
    res: Users = {}
    # sorted() already returns a list, no extra list() wrapper needed
    for f in sorted(config.storage_path.glob('user_*.json')):
        j = json.loads(f.read_text())
        uid = j['id']
        res[uid] = User(
            id=uid,
            first_name=j['first_name'],
            last_name=j['last_name'],
        )
    return res
# NOTE(review): a dataclass that is also named Message is declared earlier in
# this file. Being defined later, this NamedTuple would rebind the name, and it
# does not accept the chat=/id= keywords that _parse_msg passes. Presumably this
# is the pre-change shape of the record -- confirm and remove.
class Message(NamedTuple):
    # identifier of the conversation
    chat_id: str
    # time of the message
    dt: datetime
    # author, as a plain display name
    user: Name
    # message text
    body: str
# USERCHAT_TITLE = " ... "
def _parse_chat(*, msg: Json, udict: Users) -> Chat:
    '''
    Derive the conversation description from a single message.

    A message carrying a ``chat_id`` is treated as a group chat and must also
    carry a ``title``. Otherwise the chat is identified by the other user
    (``user_id``, falling back to ``from_id``) and titled with that user's
    first and last name taken from ``udict``.

    Raises ``KeyError`` for a group message without ``title`` or a user id
    missing from ``udict``, and ``AssertionError`` when neither ``user_id``
    nor ``from_id`` is present.
    '''
    group_id = msg.get('chat_id')
    if group_id is not None:
        # group chat: id and title come straight from the message
        return Chat(chat_id=group_id, title=msg['title'])

    # direct chat: identified by the other participant
    peer_id = msg.get('user_id') or msg.get('from_id')
    assert peer_id is not None
    peer = udict[peer_id]
    return Chat(chat_id=peer_id, title=f'{peer.first_name} {peer.last_name}')
msk_tz = pytz.timezone('Europe/Moscow')
# todo hmm, vk_messages_backup used this tz? not sure if vk actually used to return this tz in api?
def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
    '''
    Convert one exported message dict into a ``Message``.

    msg:   a single message object from a chat export file
    chat:  the conversation this message belongs to
    udict: mapping of user id to ``User``, as returned by ``users()``

    The text is taken from ``body``, falling back to ``text``. Outgoing
    messages (``out == 1``) are attributed to ``udict[config.user_id]``;
    incoming ones to the user named by ``user_id`` (falling back to
    ``from_id``).

    Raises ``KeyError`` when ``id``, ``date`` or ``out`` is missing or the
    author is not in ``udict``, and ``AssertionError`` (carrying the offending
    message) when no text or no author id can be found.
    '''
    mid = msg['id']
    md = msg['date']

    # 'date' is passed to fromtimestamp together with TZ to get an aware datetime
    dt = datetime.fromtimestamp(md, tz=TZ)

    # todo attachments? e.g. url could be an attachment
    # todo might be forwarded?
    mb = msg.get('body')
    if mb is None:
        mb = msg.get('text')
    assert mb is not None, msg

    out = msg['out'] == 1
    if out:
        # outgoing message: the author is the configured account owner
        user = udict[config.user_id]
    else:
        mu = msg.get('user_id') or msg.get('from_id')
        assert mu is not None, msg
        user = udict[mu]
    return Message(
        dt=dt,
        chat=chat,
        id=mid,
        user=user,
        body=mb,
    )
from ..core.error import Res
def messages() -> Iterable[Res[Message]]:
    '''
    Yield every message found in the exported chat files.

    Processes all ``userchat_*.json`` files and then all ``groupchat_*.json``
    files under ``config.storage_path``, each group in sorted path order.

    Errors are yielded rather than raised:
    - if the chat cannot be derived from a file's last message (for example
      because the file holds no messages), the exception is yielded and the
      whole file is skipped;
    - if a single message fails to parse, the exception is yielded and
      iteration continues with the next message.
    Errors from reading a file or decoding its JSON are not caught here.
    '''
    udict = users()

    uchats = [
        *sorted(config.storage_path.glob('userchat_*.json')),
        *sorted(config.storage_path.glob('groupchat_*.json')),
    ]
    for f in uchats:
        j = json.loads(f.read_text())
        # extract chat from last message
        try:
            last = j[-1]
            chat = _parse_chat(msg=last, udict=udict)
        except Exception as e:
            yield e
            continue

        for msg in j:
            try:
                yield _parse_msg(msg=msg, chat=chat, udict=udict)
            except Exception as e:
                yield e
def stats():
from ..core import stat
def stats() -> Stats:
return {
**stat(users),
**stat(messages),