181 lines
5.1 KiB
Python
181 lines
5.1 KiB
Python
"""
Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
"""
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from itertools import count
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Sequence, Iterator, Dict, Union
|
|
|
|
from my.core import (
|
|
assert_never,
|
|
datetime_aware,
|
|
get_files,
|
|
stat,
|
|
Json,
|
|
Paths,
|
|
Res,
|
|
Stats,
|
|
)
|
|
from my.core.error import notnone
|
|
import my.config
|
|
|
|
|
|
@dataclass
class organization(my.config.zulip.organization):
    """Configuration for this module, layered on top of my.config.zulip.organization."""

    # path(s)/glob to the exported JSON data
    export_path: Paths
|
|
|
|
|
|
def inputs() -> Sequence[Path]:
    """Return the export files matched by ``organization.export_path``, without sorting them."""
    # TODO: seems like export ids are kinda random..
    # not sure what's the best way to figure out the last without renaming?
    # could use mtime perhaps?
    export_files = get_files(organization.export_path, sort=False)
    return export_files
|
|
|
|
|
|
@dataclass(frozen=True)
class Server:
    """A Zulip realm, as read from the ``zerver_realm`` entry of an export."""

    # numeric id of the realm
    id: int
    # short identifier; used as the subdomain when building message permalinks
    string_id: str
    # human readable name of the realm
    name: str
|
|
|
|
|
|
@dataclass(frozen=True)
class Sender:
    """A message author, as read from the user profile entries of an export."""

    id: int
    # todo make optional?
    full_name: str
    email: str
|
|
|
|
|
|
# from the data, seems that subjects are completely implicit and determined by name?
|
|
# streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams?
|
|
@dataclass(frozen=True)
class _Message:
    """Raw message record: refers to its sender and server by id only."""

    # todo hmm not sure what would be a good field order..
    id: int
    # double checked and they are in utc
    sent: datetime_aware
    subject: str
    sender_id: int
    server_id: int
    # TODO hmm, it keeps markdown, not sure how/whether it's worth to prettify at all?
    content: str
    # TODO recipient??
    # todo keep raw item instead? not sure
|
|
|
|
|
|
@dataclass(frozen=True)
class Message:
    """Message with its sender and server resolved to full objects."""

    id: int
    sent: datetime_aware
    subject: str
    sender: Sender
    server: Server
    content: str

    @property
    def permalink(self) -> str:
        """Link that opens this message in the web client of its server."""
        # seems that these link to the same message
        # https://memex.zulipchat.com/#narrow/stream/284580-python/topic/py-spy.20profiler/near/234798881
        # https://memex.zulipchat.com/#narrow/stream/284580/near/234798881
        # https://memex.zulipchat.com/#narrow/near/234798881
        # however not sure how to correlate stream id and message/topic for now, so preferring the latter version
        host = f'{self.server.string_id}.zulipchat.com'
        return f'https://{host}/#narrow/near/{self.id}'
|
|
|
|
|
|
# todo cache it
|
|
def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
    """
    Stream the raw entities stored in one export archive.

    The archive is picked with ``max(inputs())``. Entities come out in this order:

    - the single Server described by ``realm.json``
    - a Sender for every ``zerver_userprofile`` entry, then for every
      ``zerver_userprofile_crossrealm`` entry
    - a _Message for every ``zerver_message`` entry of ``messages-000001.json``,
      ``messages-000002.json``, ... stopping at the first number that is missing

    A message that fails to parse is yielded as the raised exception instead of
    aborting the iteration.
    """
    last = max(inputs())

    # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
    # I guess makes sense to have a special implementation for .tar.gz considering how common are they
    import tarfile

    # the context manager releases the archive handle once the generator is exhausted or closed
    with tarfile.open(last) as tfile:
        # list the members once: the names are needed for every shard lookup below
        names = tfile.getnames()
        subdir = names[0]  # there is a directory inside tar file, first name should be that
        known_names = frozenset(names)

        with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
            rj = json.load(fo)

        # exactly one realm is expected; the unpacking fails loudly otherwise
        [sj] = rj['zerver_realm']
        server = Server(
            id=sj['id'],
            string_id=sj['string_id'],
            name=sj['name'],
        )
        yield server

        for j in rj['zerver_userprofile']:
            yield Sender(
                id=j['id'],
                full_name=j['full_name'],
                email=j['email'],
            )

        for j in rj['zerver_userprofile_crossrealm']:  # e.g. zulip bot
            yield Sender(
                id=j['id'],
                full_name=j['email'],  # doesn't seem to have anything
                email=j['email'],
            )

        def _parse_message(j: Json) -> _Message:
            ds = j['date_sent']
            return _Message(
                id=j['id'],
                sent=datetime.fromtimestamp(ds, tz=timezone.utc),
                subject=j['subject'],
                sender_id=j['sender'],
                server_id=server.id,
                content=j['content'],
            )

        for idx in count(start=1, step=1):
            fname = f'messages-{idx:06}.json'
            fpath = f'{subdir}/{fname}'
            if fpath not in known_names:
                # tarfile doesn't have .exists?
                break
            with notnone(tfile.extractfile(fpath)) as fo:
                mj = json.load(fo)
            # TODO handle zerver_usermessage
            for j in mj['zerver_message']:
                try:
                    yield _parse_message(j)
                except Exception as e:
                    yield e
|
|
|
|
|
|
def messages() -> Iterator[Res[Message]]:
    """
    Yield fully resolved messages built from the stream of raw entities.

    Servers and senders are collected into lookup tables as they arrive, and each
    raw message is then turned into a Message by looking up its sender and server
    by id. Exceptions coming from the entity stream are passed through unchanged.
    """
    senders: Dict[int, Sender] = {}
    servers: Dict[int, Server] = {}
    for entity in _entities():
        if isinstance(entity, Exception):
            yield entity
        elif isinstance(entity, Server):
            servers[entity.id] = entity
        elif isinstance(entity, Sender):
            senders[entity.id] = entity
        elif isinstance(entity, _Message):
            # TODO a bit copypasty... wonder if possible to mixin or something instead
            sender = senders[entity.sender_id]
            server = servers[entity.server_id]
            yield Message(
                id=entity.id,
                sent=entity.sent,
                subject=entity.subject,
                sender=sender,
                server=server,
                content=entity.content,
            )
        else:
            assert_never(entity)
|
|
|
|
|
|
def stats() -> Stats:
    """Return the statistics computed by ``stat`` for the ``messages`` function."""
    result: Stats = {}
    result.update(stat(messages))
    return result
|