"""
Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
"""

from __future__ import annotations

import json
from abc import abstractmethod
from dataclasses import dataclass
from datetime import datetime, timezone
from itertools import count
from pathlib import Path
from typing import Iterator, Sequence

from my.core import (
    Json,
    Paths,
    Res,
    Stats,
    assert_never,
    datetime_aware,
    get_files,
    make_logger,
    stat,
    warnings,
)

logger = make_logger(__name__)


class config:
    """Settings this module needs; combined with the user's config in make_config()."""

    @property
    @abstractmethod
    def export_path(self) -> Paths:
        """path[s]/glob to the exported JSON data"""
        raise NotImplementedError


def make_config() -> config:
    """Build the effective config by mixing `my.config.zulip.organization` into `config`."""
    # imported lazily so that importing this module doesn't require the user config
    from my.config import zulip as user_config

    class combined_config(user_config.organization, config):
        pass

    return combined_config()


def inputs() -> Sequence[Path]:
    """Return the export paths matched by the configured `export_path` (unsorted)."""
    # TODO: seems like export ids are kinda random..
    # not sure what's the best way to figure out the last without renaming?
    # could use mtime perhaps?
    return get_files(make_config().export_path, sort=False)


@dataclass(frozen=True)
class Server:
    """A Zulip realm, built from the single `zerver_realm` entry of realm.json."""

    id: int
    string_id: str
    name: str


@dataclass(frozen=True)
class Sender:
    """A user (or cross-realm bot) that can appear as a message sender."""

    id: int
    # todo make optional?
    full_name: str
    email: str


# from the data, seems that subjects are completely implicit and determined by name?
# streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams?


@dataclass(frozen=True)
class _Message:
    """Raw message as parsed from the export; refers to sender/server by id only."""

    # todo hmm not sure what would be a good field order..
    id: int
    sent: datetime_aware  # double checked and they are in utc
    subject: str
    sender_id: int
    server_id: int
    content: str  # TODO hmm, it keeps markdown, not sure how/whether it's worth to prettify at all?
    # TODO recipient??
    # todo keep raw item instead? not sure


@dataclass(frozen=True)
class Message:
    """Message with its sender and server resolved to full objects."""

    id: int
    sent: datetime_aware
    subject: str
    sender: Sender
    server: Server
    content: str

    @property
    def permalink(self) -> str:
        """Link to the message, built from the server's `string_id` and the message id."""
        # seems that these link to the same message
        # https://memex.zulipchat.com/#narrow/stream/284580-python/topic/py-spy.20profiler/near/234798881
        # https://memex.zulipchat.com/#narrow/stream/284580/near/234798881
        # https://memex.zulipchat.com/#narrow/near/234798881
        # however not sure how to correlate stream id and message/topic for now, so preferring the latter version
        return f'https://{self.server.string_id}.zulipchat.com/#narrow/near/{self.id}'


# todo cache it
def _entities() -> Iterator[Res[Server | Sender | _Message]]:
    """
    Yield the entities of a single export: the greatest path returned by inputs().

    A directory is read as is. Anything else is first opened via kompress' CPath;
    if that doesn't work, the export is unpacked with `match_structure` instead.

    Raises ValueError (from `max`) if inputs() is empty.
    """
    last = max(inputs())

    logger.info(f'extracting data from {last}')

    root: Path | None = None

    if last.is_dir():  # if it's already CPath, this will match it
        root = last
    else:
        try:
            from kompress import CPath

            candidate = CPath(last)
            # Listing the entries is the probe for whether we have the kompress version with targz support.
            # It's an explicit check rather than an assert so it still runs under `python -O`.
            if len(list(candidate.iterdir())) == 0:
                raise RuntimeError(f'{last}: kompress did not list any entries')
            # Only commit once the probe has succeeded: if root were set earlier,
            # a failed probe would leave it non-None and the fallback below would never run.
            root = candidate
        except Exception as e:
            logger.exception(e)
            warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.")

    if root is None:
        from my.core.structure import match_structure

        with match_structure(last, expected=()) as res:  # expected=() matches it regardless any patterns
            [root] = res
            # must be consumed inside the 'with' block, while the matched structure is still available
            yield from _process_one(root)
    else:
        yield from _process_one(root)


def _process_one(root: Path) -> Iterator[Res[Server | Sender | _Message]]:
    """
    Parse one export rooted at `root`.

    Yields the Server first, then all Senders, then the messages; a message that
    fails to parse is yielded as an exception instead of aborting the iteration.
    """
    [subdir] = root.iterdir()  # there is a directory inside tar file, first name should be that

    # JSON is read explicitly as UTF-8 so the result doesn't depend on the platform's locale encoding
    rj = json.loads((subdir / 'realm.json').read_text(encoding='utf-8'))

    [sj] = rj['zerver_realm']

    server = Server(
        id=sj['id'],
        string_id=sj['string_id'],
        name=sj['name'],
    )
    yield server

    for j in rj['zerver_userprofile']:
        yield Sender(
            id=j['id'],
            full_name=j['full_name'],
            email=j['email'],
        )

    for j in rj['zerver_userprofile_crossrealm']:  # e.g. zulip bot
        yield Sender(
            id=j['id'],
            full_name=j['email'],  # doesn't seem to have anything
            email=j['email'],
        )

    def _parse_message(j: Json) -> _Message:
        ds = j['date_sent']
        # fmt: off
        return _Message(
            id        = j['id'],
            sent      = datetime.fromtimestamp(ds, tz=timezone.utc),
            subject   = j['subject'],
            sender_id = j['sender'],
            server_id = server.id,
            content   = j['content'],
        )
        # fmt: on

    # message files are numbered messages-000001.json, messages-000002.json, ...; stop at the first missing one
    for idx in count(start=1, step=1):
        fname = f'messages-{idx:06}.json'
        fpath = subdir / fname
        if not fpath.exists():
            break
        mj = json.loads(fpath.read_text(encoding='utf-8'))
        # TODO handle zerver_usermessage
        for j in mj['zerver_message']:
            try:
                yield _parse_message(j)
            except Exception as e:
                yield e


def messages() -> Iterator[Res[Message]]:
    """
    Yield messages with sender and server resolved, passing through any errors.

    A message whose sender or server id hasn't been seen is yielded as an error,
    so a single unresolvable message doesn't abort the whole stream.
    """
    id2sender: dict[int, Sender] = {}
    id2server: dict[int, Server] = {}
    for x in _entities():
        if isinstance(x, Exception):
            yield x
            continue
        if isinstance(x, Server):
            id2server[x.id] = x
            continue
        if isinstance(x, Sender):
            id2sender[x.id] = x
            continue
        if isinstance(x, _Message):
            sender = id2sender.get(x.sender_id)
            server = id2server.get(x.server_id)
            if sender is None or server is None:
                # consistent with how parse errors are reported: yield the error and keep going
                yield RuntimeError(f'message {x.id}: unknown sender {x.sender_id} or server {x.server_id}')
                continue
            # TODO a bit copypasty... wonder if possible to mixin or something instead
            yield Message(
                id=x.id,
                sent=x.sent,
                subject=x.subject,
                sender=sender,
                server=server,
                content=x.content,
            )
            continue
        assert_never(x)


def stats() -> Stats:
    """Return summary stats for messages(), as computed by `stat`."""
    return {**stat(messages)}