vk_messages_backup: better structure & extract richer information
This commit is contained in:
parent
130c273513
commit
02c98143d5
2 changed files with 78 additions and 40 deletions
|
@ -194,6 +194,7 @@ class simple:
|
|||
|
||||
class vk_messages_backup:
|
||||
storage_path: Path
|
||||
user_id: int
|
||||
|
||||
|
||||
class kobo:
|
||||
|
|
|
@ -2,95 +2,132 @@
|
|||
VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]])
|
||||
'''
|
||||
# note: could reuse the original repo, but little point I guess since VK closed their API
|
||||
|
||||
|
||||
from datetime import datetime
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
from typing import Dict, Iterable, NamedTuple
|
||||
|
||||
import pytz
|
||||
|
||||
from ..core import Json
|
||||
from my.core import stat, Stats, Json, Res, datetime_aware
|
||||
|
||||
from my.config import vk_messages_backup as config
|
||||
|
||||
|
||||
Uid = str
|
||||
Name = str
|
||||
# I think vk_messages_backup used this tz?
|
||||
# not sure if vk actually used to return this tz in api?
|
||||
TZ = pytz.timezone('Europe/Moscow')
|
||||
|
||||
|
||||
Users = Dict[Uid, Name]
|
||||
Uid = int
|
||||
|
||||
@dataclass(frozen=True)
class User:
    """A VK user record: numeric id plus first and last name.

    Instances are immutable (and therefore hashable), so they can be shared
    freely between the messages that reference them.
    """

    # user identifier, used as the key of the ``Users`` mapping
    id: Uid
    first_name: str
    last_name: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Chat:
    """A conversation (group chat or one-on-one dialogue) that messages belong to.

    Instances are immutable, so a single ``Chat`` can be attached to every
    message parsed from the same export file.
    """

    # Group chat id, or the peer's user id for a one-on-one dialogue.
    # NOTE(review): annotated as str, but _parse_chat passes the raw JSON value
    # through unchanged (presumably an int) -- confirm and align the annotation.
    chat_id: str
    # Group chat title, or "<first_name> <last_name>" of the peer for a dialogue.
    title: str
|
||||
|
||||
|
||||
@dataclass
class Message:
    """A single message extracted from a VK backup, with its chat and author."""

    # timezone-aware timestamp of the message
    dt: datetime_aware
    # conversation this message belongs to
    chat: Chat
    # NOTE(review): taken verbatim from the message's 'id' field; annotated as
    # str although the raw JSON value is presumably an int -- confirm.
    id: str  # todo not sure it's unique?
    # author of the message
    user: User
    # message text
    body: str
|
||||
|
||||
|
||||
Users = Dict[Uid, User]
|
||||
def users() -> Users:
|
||||
# todo cache?
|
||||
files = list(sorted(config.storage_path.glob('user_*.json')))
|
||||
res = {}
|
||||
for f in files:
|
||||
j = json.loads(f.read_text())
|
||||
uid = j['id']
|
||||
uf = j['first_name']
|
||||
ul = j['last_name']
|
||||
res[uid] = f'{uf} {ul}'
|
||||
res[uid] = User(
|
||||
id=uid,
|
||||
first_name=j['first_name'],
|
||||
last_name=j['last_name'],
|
||||
)
|
||||
return res
|
||||
|
||||
|
||||
class Message(NamedTuple):
|
||||
chat_id: str
|
||||
dt: datetime
|
||||
user: Name
|
||||
body: str
|
||||
# USERCHAT_TITLE = " ... "
|
||||
def _parse_chat(*, msg: Json, udict: Users) -> Chat:
    """Derive the conversation a message belongs to from the message itself.

    A message carrying a non-null ``chat_id`` is treated as a group chat
    message: the chat takes that id and the message's ``title``. Otherwise it
    is treated as a one-on-one dialogue: the chat id is the message's
    ``user_id`` (falling back to ``from_id``) and the title is that user's
    full name looked up in ``udict``.

    Args:
        msg: raw message JSON object.
        udict: known users keyed by user id.

    Returns:
        The ``Chat`` describing the conversation.

    Raises:
        KeyError: a group message has no ``title``, or the dialogue peer is
            missing from ``udict``.
        AssertionError: neither ``user_id`` nor ``from_id`` is present.
    """
    group_chat_id = msg.get('chat_id')
    if group_chat_id is not None:
        chat_id = group_chat_id
        title = msg['title']
    else:
        user_id = msg.get('user_id') or msg.get('from_id')
        # include the offending message so a failure is diagnosable
        assert user_id is not None, msg
        user = udict[user_id]
        chat_id = user_id
        title = f'{user.first_name} {user.last_name}'
    return Chat(
        chat_id=chat_id,
        title=title,
    )
|
||||
|
||||
|
||||
msk_tz = pytz.timezone('Europe/Moscow')
|
||||
# todo hmm, vk_messages_backup used this tz? not sure if vk actually used to return this tz in api?
|
||||
def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
|
||||
mid = msg['id']
|
||||
md = msg['date']
|
||||
|
||||
def _parse(x: Json, chat_id: str, udict: Users) -> Message:
|
||||
mid = x['id'] # todo not sure if useful?
|
||||
md = x['date']
|
||||
|
||||
dt = datetime.fromtimestamp(md, msk_tz)
|
||||
dt = datetime.fromtimestamp(md, tz=TZ)
|
||||
|
||||
# todo attachments? e.g. url could be an attachment
|
||||
# todo might be forwarded?
|
||||
mb = x.get('body')
|
||||
mb = msg.get('body')
|
||||
if mb is None:
|
||||
mb = x.get('text')
|
||||
assert mb is not None
|
||||
|
||||
mu = x.get('user_id') or x.get('peer_id')
|
||||
assert mu is not None
|
||||
out = x['out'] == 1
|
||||
# todo use name from the config?
|
||||
user = 'you' if out else udict[mu]
|
||||
|
||||
# todo conversation id??
|
||||
mb = msg.get('text')
|
||||
assert mb is not None, msg
|
||||
|
||||
out = msg['out'] == 1
|
||||
if out:
|
||||
user = udict[config.user_id]
|
||||
else:
|
||||
mu = msg.get('user_id') or msg.get('from_id')
|
||||
assert mu is not None, msg
|
||||
user = udict[mu]
|
||||
return Message(
|
||||
chat_id=chat_id,
|
||||
dt=dt,
|
||||
chat=chat,
|
||||
id=mid,
|
||||
user=user,
|
||||
body=mb,
|
||||
)
|
||||
|
||||
|
||||
from ..core.error import Res
|
||||
def messages() -> Iterable[Res[Message]]:
|
||||
udict = users()
|
||||
|
||||
uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \
|
||||
list(sorted(config.storage_path.glob('groupchat_*.json')))
|
||||
for f in uchats:
|
||||
chat_id = f.stem.split('_')[-1]
|
||||
j = json.loads(f.read_text())
|
||||
for x in j:
|
||||
# extract chat from last message
|
||||
try:
|
||||
last = j[-1]
|
||||
chat = _parse_chat(msg=last, udict=udict)
|
||||
except Exception as e:
|
||||
yield e
|
||||
continue
|
||||
|
||||
for msg in j:
|
||||
try:
|
||||
yield _parse(x, chat_id=chat_id, udict=udict)
|
||||
yield _parse_msg(msg=msg, chat=chat, udict=udict)
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
||||
|
||||
def stats():
|
||||
from ..core import stat
|
||||
def stats() -> Stats:
|
||||
return {
|
||||
**stat(users),
|
||||
**stat(messages),
|
||||
|
|
Loading…
Add table
Reference in a new issue