my.zulip.organization: use UTC timestamps, support custom archive names + some cleanup
This commit is contained in:
parent
a0910e798d
commit
bef0423b4f
1 changed file with 33 additions and 32 deletions
|
@ -2,24 +2,37 @@
|
||||||
Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
|
Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]]
|
||||||
"""
|
"""
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Sequence, Iterator, Dict
|
from datetime import datetime, timezone
|
||||||
|
from itertools import count
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Sequence, Iterator, Dict, Union
|
||||||
|
|
||||||
|
from my.core import (
|
||||||
|
assert_never,
|
||||||
|
datetime_aware,
|
||||||
|
get_files,
|
||||||
|
stat,
|
||||||
|
Json,
|
||||||
|
Paths,
|
||||||
|
Res,
|
||||||
|
Stats,
|
||||||
|
)
|
||||||
|
from my.core.error import notnone
|
||||||
|
import my.config
|
||||||
|
|
||||||
from my.config import zulip as user_config
|
|
||||||
|
|
||||||
from ..core import Paths
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class organization(user_config.organization):
|
class organization(my.config.zulip.organization):
|
||||||
# path[s]/glob to the exported JSON data
|
# path[s]/glob to the exported JSON data
|
||||||
export_path: Paths
|
export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from ..core import get_files, Json
|
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
return get_files(organization.export_path)
|
# TODO: seems like export ids are kinda random..
|
||||||
|
# not sure what's the best way to figure out the last without renaming?
|
||||||
|
# could use mtime perhaps?
|
||||||
from datetime import datetime
|
return get_files(organization.export_path, sort=False)
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
@ -39,16 +52,11 @@ class Sender:
|
||||||
|
|
||||||
# from the data, seems that subjects are completely implicit and determined by name?
|
# from the data, seems that subjects are completely implicit and determined by name?
|
||||||
# streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams?
|
# streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams?
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class _Message:
|
class _Message:
|
||||||
# todo hmm not sure what would be a good field order..
|
# todo hmm not sure what would be a good field order..
|
||||||
id: int
|
id: int
|
||||||
sent: datetime
|
sent: datetime_aware # double checked and they are in utc
|
||||||
# TODO hmm kinda unclear whether it uses UTC or not??
|
|
||||||
# https://github.com/zulip/zulip/blob/0c2e4eec200d986a9a020f3e9a651d27216e0e85/zerver/models.py#L3071-L3076
|
|
||||||
# it keeps it tz aware.. but not sure what happens after?
|
|
||||||
# https://github.com/zulip/zulip/blob/1dfddffc8dac744fd6a6fbfd937018074c8bb166/zproject/computed_settings.py#L151
|
|
||||||
subject: str
|
subject: str
|
||||||
sender_id: int
|
sender_id: int
|
||||||
server_id: int
|
server_id: int
|
||||||
|
@ -60,7 +68,7 @@ class _Message:
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class Message:
|
class Message:
|
||||||
id: int
|
id: int
|
||||||
sent: datetime
|
sent: datetime_aware
|
||||||
subject: str
|
subject: str
|
||||||
sender: Sender
|
sender: Sender
|
||||||
server: Server
|
server: Server
|
||||||
|
@ -76,23 +84,18 @@ class Message:
|
||||||
return f'https://{self.server.string_id}.zulipchat.com/#narrow/near/{self.id}'
|
return f'https://{self.server.string_id}.zulipchat.com/#narrow/near/{self.id}'
|
||||||
|
|
||||||
|
|
||||||
from typing import Union
|
|
||||||
from itertools import count
|
|
||||||
import json
|
|
||||||
from ..core import Res, assert_never
|
|
||||||
# todo cache it
|
# todo cache it
|
||||||
def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
|
def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
|
||||||
# TODO hmm -- not sure if max lexicographically will actually be latest?
|
|
||||||
last = max(inputs())
|
last = max(inputs())
|
||||||
|
|
||||||
subdir = last.with_suffix('').stem # there is a directory inside tar.gz
|
|
||||||
|
|
||||||
# todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
|
# todo would be nice to switch it to unpacked dirs as well, similar to ZipPath
|
||||||
# I guess it makes sense to have a special implementation for .tar.gz considering how common they are
|
# I guess it makes sense to have a special implementation for .tar.gz considering how common they are
|
||||||
import tarfile
|
import tarfile
|
||||||
from ..core.error import notnone
|
|
||||||
|
|
||||||
tfile = tarfile.open(last)
|
tfile = tarfile.open(last)
|
||||||
|
|
||||||
|
subdir = tfile.getnames()[0] # there is a directory inside tar file, first name should be that
|
||||||
|
|
||||||
with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
|
with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo:
|
||||||
rj = json.load(fo)
|
rj = json.load(fo)
|
||||||
|
|
||||||
|
@ -114,20 +117,22 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]:
|
||||||
for j in rj['zerver_userprofile_crossrealm']: # e.g. zulip bot
|
for j in rj['zerver_userprofile_crossrealm']: # e.g. zulip bot
|
||||||
yield Sender(
|
yield Sender(
|
||||||
id=j['id'],
|
id=j['id'],
|
||||||
full_name=j['email'], # doesn't seem to have anything
|
full_name=j['email'], # doesn't seem to have anything
|
||||||
email=j['email'],
|
email=j['email'],
|
||||||
)
|
)
|
||||||
|
|
||||||
def _parse_message(j: Json) -> _Message:
|
def _parse_message(j: Json) -> _Message:
|
||||||
ds = j['date_sent']
|
ds = j['date_sent']
|
||||||
|
# fmt: off
|
||||||
return _Message(
|
return _Message(
|
||||||
id = j['id'],
|
id = j['id'],
|
||||||
sent = datetime.fromtimestamp(ds),
|
sent = datetime.fromtimestamp(ds, tz=timezone.utc),
|
||||||
subject = j['subject'],
|
subject = j['subject'],
|
||||||
sender_id = j['sender'],
|
sender_id = j['sender'],
|
||||||
server_id = server.id,
|
server_id = server.id,
|
||||||
content = j['content'],
|
content = j['content'],
|
||||||
)
|
)
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
for idx in count(start=1, step=1):
|
for idx in count(start=1, step=1):
|
||||||
fname = f'messages-{idx:06}.json'
|
fname = f'messages-{idx:06}.json'
|
||||||
|
@ -172,9 +177,5 @@ def messages() -> Iterator[Res[Message]]:
|
||||||
assert_never(x)
|
assert_never(x)
|
||||||
|
|
||||||
|
|
||||||
from my.core import Stats
|
|
||||||
def stats() -> Stats:
|
def stats() -> Stats:
|
||||||
from my.core import stat
|
return {**stat(messages)}
|
||||||
return {
|
|
||||||
**stat(messages)
|
|
||||||
}
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue