""" Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]]) """ from __future__ import annotations import html import json # hmm interesting enough, orjson didn't give much speedup here? from abc import abstractmethod from dataclasses import dataclass from datetime import datetime from functools import cached_property from itertools import chain from pathlib import Path from typing import ( TYPE_CHECKING, Iterator, Sequence, ) from more_itertools import unique_everseen from my.core import ( Json, Paths, Res, Stats, datetime_aware, get_files, make_logger, stat, warnings, ) from my.core.serialize import dumps as json_dumps from .common import TweetId, permalink logger = make_logger(__name__) class config: @property @abstractmethod def export_path(self) -> Paths: """path[s]/glob to the twitter archive takeout""" raise NotImplementedError def make_config() -> config: # before this config was named 'twitter', doesn't make too much sense for archive # todo unify with other code like this, e.g. time.tz.via_location try: from my.config import twitter_archive as user_config except ImportError as ie: if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)): # must be caused by something else raise ie try: from my.config import twitter as user_config # type: ignore[assignment] except ImportError: raise ie # raise the original exception.. must be something else # noqa: B904 else: warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config') ## class combined_config(user_config, config): pass return combined_config() def inputs() -> Sequence[Path]: return get_files(make_config().export_path) # TODO make sure it's not used anywhere else and simplify interface @dataclass class Tweet: raw: Json screen_name: str @property def id_str(self) -> TweetId: return self.raw['id_str'] @property def created_at(self) -> datetime_aware: dts = self.raw['created_at'] return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y') @property def permalink(self) -> str: return permalink(screen_name=self.screen_name, id=self.id_str) @property def text(self) -> str: res: str = self.raw['full_text'] ## replace shortened URLS repls = [] # from, to, what for ue in self.entities['urls']: [fr, to] = map(int, ue['indices']) repls.append((fr, to, ue['expanded_url'])) # seems that media field isn't always set for me in self.entities.get('media', []): [fr, to] = map(int, me['indices']) repls.append((fr, to, me['display_url'])) # todo not sure, maybe use media_url_https instead? # for now doing this for compatibility with twint repls = sorted(repls) parts = [] idx = 0 for fr, to, what in repls: parts.append(res[idx:fr]) parts.append(what) idx = to parts.append(res[idx:]) res = ''.join(parts) ## # replace stuff like </> res = html.unescape(res) return res @property def urls(self) -> list[str]: ents = self.entities us = ents['urls'] return [u['expanded_url'] for u in us] @property def entities(self) -> Json: # todo hmm what is 'extended_entities' return self.raw['entities'] def __str__(self) -> str: return str(self.raw) def __repr__(self) -> str: return repr(self.raw) # TODO deprecate tid? @property def tid(self) -> TweetId: return self.id_str @property def dt(self) -> datetime_aware: return self.created_at @dataclass class Like: raw: Json screen_name: str @property def permalink(self) -> str: # doesn'tseem like link it export is more specific... 


@dataclass
class Like:
    raw: Json
    screen_name: str

    @property
    def permalink(self) -> str:
        # doesn't seem like the export has anything more specific for likes...
        return permalink(screen_name=self.screen_name, id=self.id_str)

    @property
    def id_str(self) -> TweetId:
        return self.raw['tweetId']

    @property
    def text(self) -> str | None:
        # NOTE: likes basically don't have anything except text and url
        # ugh. I think None means that the tweet was deleted?
        res: str | None = self.raw.get('fullText')
        if res is None:
            return None
        res = html.unescape(res)
        return res

    # TODO deprecate?
    @property
    def tid(self) -> TweetId:
        return self.id_str


class ZipExport:
    def __init__(self, archive_path: Path) -> None:
        self.zpath = archive_path
        if (self.zpath / 'tweets.csv').exists():
            warnings.high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
        self.old_format = False  # changed somewhere around 2020.03
        if not (self.zpath / 'Your archive.html').exists():
            self.old_format = True

    def raw(self, what: str, *, fname: str | None = None) -> Iterator[Json]:
        logger.info(f'{self.zpath} : processing {what}')

        path = fname or what
        if not self.old_format:
            path = 'data/' + path
        path += '.js'

        ddd = (self.zpath / path).read_text()
        start = ddd.index('[')
        ddd = ddd[start:]
        for j in json.loads(ddd):
            if set(j.keys()) == {what}:
                # newer format
                yield j[what]
            else:
                # older format
                yield j

    @cached_property
    def screen_name(self) -> str:
        [acc] = self.raw(what='account')
        return acc['username']

    def tweets(self) -> Iterator[Tweet]:
        fname = 'tweets'  # since somewhere between Mar and Oct 2022
        if not (self.zpath / f'data/{fname}.js').exists():
            fname = 'tweet'  # old name
        # NOTE: for some reason, created_at doesn't seem to be in order
        # it mostly is, but there are a bunch of one-off random tweets where the time decreases (typically at the very end)
        for r in self.raw(what='tweet', fname=fname):
            yield Tweet(r, screen_name=self.screen_name)

    def likes(self) -> Iterator[Like]:
        # TODO ugh. would be nice to unify the Tweet/Like interface
        # however, the takeout only has tweetId, full text and url
        for r in self.raw(what='like'):
            yield Like(r, screen_name=self.screen_name)


def _cleanup_tweet_json(rj: Json) -> None:
    # note: for now this isn't used, was just an attempt to normalise raw data...

    rj.pop('edit_info', None)  # useless for downstream processing, but results in dupes, so let's remove it

    ## could probably just take the last one? dunno
    rj.pop('retweet_count', None)
    rj.pop('favorite_count', None)
    ##

    entities = rj.get('entities', {})
    ext_entities = rj.get('extended_entities', {})

    # TODO shit. unclear how to 'merge' changes to these
    # links sometimes change for no apparent reason -- and sometimes the old one is still valid but not the new one???
    for m in entities.get('media', {}):
        m.pop('media_url', None)
        m.pop('media_url_https', None)
    for m in ext_entities.get('media', {}):
        m.pop('media_url', None)
        m.pop('media_url_https', None)
    ##

    for m in entities.get('user_mentions', {}):
        # changes if user renames themselves...
        m.pop('name', None)
    # hmm, so it can change to -1? maybe if the user was deleted?
    # but it can also change to something else entirely?? saw a second example of that
    entities.pop('user_mentions', None)

    # TODO figure out what else is changing there later...
    rj.pop('entities', None)
    rj.pop('extended_entities', None)

    ## useless attributes which should be fine to exclude
    rj.pop('possibly_sensitive', None)  # not sure what this is.. sometimes appears with a False value??
    rj.pop('withheld_in_countries', None)
    rj.pop('lang', None)
    ##

    # ugh. might change if the Twitter client was deleted or its description renamed??
    rj.pop('source', None)

    ## ugh. sometimes a trailing 0 after the decimal point is present?
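    # (added note, an interpretation of the comment above rather than something verified:
    # the same point may serialise as e.g. 55.75 vs 55.750 across exports, so the values
    # aren't stable enough to dedupe on -- drop both the top-level and the nested copy)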
    rj.pop('coordinates', None)
    rj.get('geo', {}).pop('coordinates', None)
    ##

    # ugh. this changes if the user changed their name...
    # or disappears if the account was deleted?
    rj.pop('in_reply_to_screen_name', None)


# todo not sure about materialising into a list and sorting? although it can't hurt, considering the json isn't read lazily anyway
def tweets() -> Iterator[Res[Tweet]]:
    _all = chain.from_iterable(ZipExport(i).tweets() for i in inputs())

    # NOTE: raw json data in archived tweets changes all the time, even for the same tweets
    # there is an attempt to clean it up... but it's tricky since users rename themselves and twitter stats keep changing,
    # so it's unclear which version to pick
    # we should probably 'merge' tweets into a canonical version, e.g.
    # - pick the latest tweet stats
    # - keep a history of usernames we were replying to that share the same user id
    # - pick the 'best' media url somehow??
    # - normalise coordinates data
    def key(t: Tweet):
        # NOTE: not using t.text, since it actually changes if the entities in the tweet are changing...
        # whereas full_text seems stable
        text = t.raw['full_text']
        return (t.created_at, t.id_str, text)

    res = unique_everseen(_all, key=key)
    yield from sorted(res, key=lambda t: t.created_at)


def likes() -> Iterator[Res[Like]]:
    _all = chain.from_iterable(ZipExport(i).likes() for i in inputs())
    res = unique_everseen(_all, key=json_dumps)
    # ugh. likes don't have datetimes..
    yield from res


def stats() -> Stats:
    return {
        **stat(tweets),
        **stat(likes),
    }


## Deprecated stuff
if not TYPE_CHECKING:
    Tid = TweetId
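

# A minimal usage sketch (not part of the original module), assuming my.config.twitter_archive
# is set up with export_path pointing at the archive export(s), e.g. something like:
#
#     class twitter_archive:
#         export_path: Paths = '~/data/twitter/*.zip'
#
if __name__ == '__main__':
    # print the five most recent tweets along with their permalinks
    for t in list(tweets())[-5:]:
        if isinstance(t, Exception):
            logger.error('error while processing tweet: %s', t)
        else:
            print(t.created_at, t.permalink)
            print(t.text)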