Merge branch 'master' into location-fallback

This commit is contained in:
Sean Breckenridge 2023-02-27 20:04:16 -08:00
commit 7c3457f073
2 changed files with 103 additions and 42 deletions

View file

@ -196,6 +196,7 @@ class simple:
class vk_messages_backup:
    # directory that is globbed for user_*.json, userchat_*.json and groupchat_*.json
    storage_path: Path
    # id of the user that outgoing messages are attributed to
    user_id: int
class kobo: class kobo:

View file

@ -2,95 +2,155 @@
VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]])
''' '''
# note: could reuse the original repo, but little point I guess since VK closed their API # note: could reuse the original repo, but little point I guess since VK closed their API
from datetime import datetime from datetime import datetime
from dataclasses import dataclass
import json import json
from typing import Dict, Iterable, NamedTuple from typing import Dict, Iterator, NamedTuple
from more_itertools import unique_everseen
import pytz import pytz
from ..core import Json from my.core import stat, Stats, Json, Res, datetime_aware
from my.config import vk_messages_backup as config from my.config import vk_messages_backup as config
# Timezone attached to every parsed message datetime.
# I think vk_messages_backup used this tz?
# Not sure whether VK itself actually used to return this tz in the api.
TZ = pytz.timezone('Europe/Moscow')

# user identifiers are integers
Uid = int
@dataclass(frozen=True)
class User:
    """A user record, built from one ``user_*.json`` file of the backup."""

    # value of the 'id' field of the user json; also the key in the Users mapping
    id: Uid
    first_name: str
    last_name: str
@dataclass(frozen=True)
class Chat:
    """A conversation: either a one-to-one dialog or a group chat."""

    # Integer identifier. It is produced by integer arithmetic and compared
    # against an integer threshold when chats are parsed, so it cannot be a str.
    chat_id: int
    # group chats: the 'title' field of a message;
    # dialogs: '<first_name> <last_name>' of a user
    title: str
@dataclass(frozen=True)
class Message:
    """A single message together with the chat it belongs to."""

    # timezone-aware datetime derived from the message's 'date' field
    dt: datetime_aware
    chat: Chat
    # todo not sure it's unique?
    # NOTE(review): filled from msg['id'] without conversion -- confirm it really is a str
    id: str
    # author of the message
    user: User
    body: str
# lookup table from a user's id to the corresponding User record
Users = Dict[Uid, User]
def users() -> Users:
    """Load every ``user_*.json`` file found in ``config.storage_path``.

    Returns:
        A dict mapping each user's id to its ``User`` record. Files are
        processed in sorted path order, so if two files carry the same id
        the later one wins.

    Raises:
        KeyError: a user file lacks ``id``, ``first_name`` or ``last_name``.
        json.JSONDecodeError: a user file is not valid JSON.
    """
    # todo cache?
    res: Users = {}
    for f in sorted(config.storage_path.glob('user_*.json')):
        # decode explicitly as UTF-8 instead of relying on the platform default
        j = json.loads(f.read_text(encoding='utf-8'))
        uid = j['id']
        res[uid] = User(
            id=uid,
            first_name=j['first_name'],
            last_name=j['last_name'],
        )
    return res
# ids at or above this value denote group chats; below it, one-to-one dialogs
GROUP_CHAT_MIN_ID = 2000000000


def _parse_chat(*, msg: Json, udict: Users) -> Chat:
    """Work out which chat a raw message belongs to.

    Args:
        msg: one raw message dict from a ``userchat_*`` / ``groupchat_*`` file.
        udict: known users keyed by id, used to title one-to-one dialogs.

    Returns:
        The ``Chat`` (identifier and title) the message belongs to.

    Raises:
        KeyError: a required field is missing, or the user is not in ``udict``.
    """
    # exported with newer api, peer_id is a proper identifier both for users and chats
    peer_id = msg.get('peer_id')
    if peer_id is not None:
        chat_id = peer_id
    else:
        group_chat_id = msg.get('chat_id')
        if group_chat_id is not None:
            chat_id = GROUP_CHAT_MIN_ID + group_chat_id
        else:
            chat_id = msg['user_id']

    if chat_id >= GROUP_CHAT_MIN_ID:
        title = msg['title']
    else:
        # In a dialog the chat id *is* the other participant's id. Prefer it over
        # from_id, which names the sender and therefore the account owner for
        # outgoing messages.
        user = udict.get(chat_id)
        if user is None:
            # peer not present in the backup: keep the previous lookup so that
            # chats which used to parse still do
            user_id = msg.get('user_id') or msg.get('from_id')
            assert user_id is not None
            user = udict[user_id]
        title = f'{user.first_name} {user.last_name}'
    return Chat(
        chat_id=chat_id,
        title=title,
    )
def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message:
    """Convert one raw message dict into a ``Message`` belonging to ``chat``.

    The text is read from 'body', falling back to 'text'. Outgoing messages
    (``out == 1``) are attributed to ``config.user_id``; all others to the
    message's 'user_id' or, failing that, 'from_id'.

    Raises:
        KeyError: 'id', 'date' or 'out' is missing, or the author is not in ``udict``.
        AssertionError: the message has neither text nor an author id.
    """
    message_id = msg['id']
    timestamp = msg['date']
    when = datetime.fromtimestamp(timestamp, tz=TZ)

    # todo attachments? e.g. url could be an attachment
    # todo might be forwarded?
    text = msg.get('body')
    if text is None:
        text = msg.get('text')
    assert text is not None, msg

    is_outgoing = msg['out'] == 1
    if not is_outgoing:
        sender_id = msg.get('user_id') or msg.get('from_id')
        assert sender_id is not None, msg
        author = udict[sender_id]
    else:
        author = udict[config.user_id]

    return Message(dt=when, chat=chat, id=message_id, user=author, body=text)
def _messages() -> Iterator[Res[Message]]:
    """Yield every message of every chat file, or the error hit while parsing it.

    ``userchat_*.json`` files are processed first, then ``groupchat_*.json``,
    each group in sorted path order. Parsing problems are yielded as exception
    objects rather than raised, so one bad message or chat does not stop the
    iteration. A chat whose identity cannot be determined from any of its
    messages yields a single error and is skipped; a file without messages
    yields nothing.
    """
    udict = users()
    uchats = sorted(config.storage_path.glob('userchat_*.json')) + \
             sorted(config.storage_path.glob('groupchat_*.json'))
    for f in uchats:
        # decode explicitly as UTF-8 instead of relying on the platform default
        j = json.loads(f.read_text(encoding='utf-8'))
        # ugh. very annoying, sometimes not possible to extract title from last message
        # due to newer api...
        # so just do it defensively until we succeed...
        chat = None
        ex = None
        for m in reversed(j):
            try:
                chat = _parse_chat(msg=m, udict=udict)
            except Exception as e:
                ex = e
            else:
                # the first message (counting from the end) that parses decides the chat
                break
        if chat is None:
            # ex stays None only when the file holds no messages at all
            if ex is not None:
                yield ex
            continue
        for msg in j:
            try:
                yield _parse_msg(msg=msg, chat=chat, udict=udict)
            except Exception as e:
                yield e
def messages() -> Iterator[Res[Message]]:
    """Yield all messages (and parse errors), dropping repeated items."""
    # seems that during backup messages were sometimes duplicated..
    deduplicated = unique_everseen(_messages())
    yield from deduplicated
def stats() -> Stats:
return { return {
**stat(users), **stat(users),
**stat(messages), **stat(messages),