"""
|
|
Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]])
|
|
"""

from __future__ import annotations

import html
import json  # interestingly enough, orjson didn't give much of a speedup here?
from abc import abstractmethod
from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from datetime import datetime
from functools import cached_property
from itertools import chain
from pathlib import Path
from typing import (
    TYPE_CHECKING,
)

from more_itertools import unique_everseen

from my.core import (
    Json,
    Paths,
    Res,
    Stats,
    datetime_aware,
    get_files,
    make_logger,
    stat,
    warnings,
)
from my.core.serialize import dumps as json_dumps

from .common import TweetId, permalink

logger = make_logger(__name__)


class config:
    @property
    @abstractmethod
    def export_path(self) -> Paths:
        """path[s]/glob to the twitter archive takeout"""
        raise NotImplementedError


def make_config() -> config:
    # previously this config was named 'twitter', which doesn't make much sense for the archive module
    # todo unify with other code like this, e.g. time.tz.via_location
    try:
        from my.config import twitter_archive as user_config
    except ImportError as ie:
        if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)):
            # must be caused by something else
            raise ie
        try:
            from my.config import twitter as user_config  # type: ignore[assignment]
        except ImportError:
            raise ie  # raise the original exception.. must be something else  # noqa: B904
        else:
            warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config')
    ##

    class combined_config(user_config, config):
        pass

    return combined_config()


def inputs() -> Sequence[Path]:
    return get_files(make_config().export_path)


# TODO make sure it's not used anywhere else and simplify interface
@dataclass
class Tweet:
    raw: Json
    screen_name: str

    @property
    def id_str(self) -> TweetId:
        return self.raw['id_str']

    @property
    def created_at(self) -> datetime_aware:
        dts = self.raw['created_at']
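        # dts is e.g. 'Tue Mar 01 12:34:56 +0000 2022' (an illustrative value matching the format below)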
        return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')

    @property
    def permalink(self) -> str:
        return permalink(screen_name=self.screen_name, id=self.id_str)

    @property
    def text(self) -> str:
        res: str = self.raw['full_text']

        ## replace shortened URLs
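        # each entity carries 'indices': [start, end) character offsets into full_text;
        # e.g. in 'hi https://t.co/abc' the url entity would have indices [3, 19] (illustrative)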
        repls = []  # from, to, what
        for ue in self.entities['urls']:
            [fr, to] = map(int, ue['indices'])
            repls.append((fr, to, ue['expanded_url']))
        # seems that media field isn't always set
        for me in self.entities.get('media', []):
            [fr, to] = map(int, me['indices'])
            repls.append((fr, to, me['display_url']))
            # todo not sure, maybe use media_url_https instead?
            # for now doing this for compatibility with twint
        repls = sorted(repls)
        parts = []
        idx = 0
        for fr, to, what in repls:
            parts.append(res[idx:fr])
            parts.append(what)
            idx = to
        parts.append(res[idx:])
        res = ''.join(parts)
        ##

        # replace escaped entities like &lt; and &gt;
        res = html.unescape(res)
        return res

    @property
    def urls(self) -> list[str]:
        ents = self.entities
        us = ents['urls']
        return [u['expanded_url'] for u in us]

    @property
    def entities(self) -> Json:
        # todo hmm what is 'extended_entities'
        return self.raw['entities']

    def __str__(self) -> str:
        return str(self.raw)

    def __repr__(self) -> str:
        return repr(self.raw)

    # TODO deprecate tid?
    @property
    def tid(self) -> TweetId:
        return self.id_str

    @property
    def dt(self) -> datetime_aware:
        return self.created_at


@dataclass
class Like:
    raw: Json
    screen_name: str

    @property
    def permalink(self) -> str:
        # doesn't seem like the link in the export is any more specific...
        return permalink(screen_name=self.screen_name, id=self.id_str)

    @property
    def id_str(self) -> TweetId:
        return self.raw['tweetId']

    @property
    def text(self) -> str | None:
        # NOTE: likes basically don't have anything except text and url
        # ugh. I think None means that the tweet was deleted?
        res: str | None = self.raw.get('fullText')
        if res is None:
            return None
        res = html.unescape(res)
        return res

    # TODO deprecate?
    @property
    def tid(self) -> TweetId:
        return self.id_str


class ZipExport:
    def __init__(self, archive_path: Path) -> None:
        self.zpath = archive_path
        if (self.zpath / 'tweets.csv').exists():
            warnings.high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
        self.old_format = False  # changed somewhere around 2020.03
        if not (self.zpath / 'Your archive.html').exists():
            self.old_format = True

    def raw(self, what: str, *, fname: str | None = None) -> Iterator[Json]:
        logger.info(f'{self.zpath} : processing {what}')

        path = fname or what
        if not self.old_format:
            path = 'data/' + path
        path += '.js'

        ddd = (self.zpath / path).read_text()
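        # the .js files are JS assignments, e.g. 'window.YTD.tweet.part0 = [...]',
        # so skip ahead to the start of the JSON array before parsing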
        start = ddd.index('[')
        ddd = ddd[start:]
        for j in json.loads(ddd):
            if set(j.keys()) == {what}:
                # newer format
                yield j[what]
            else:
                # older format
                yield j

    @cached_property
    def screen_name(self) -> str:
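        # account.js contains a single account entry, hence the destructuring assignment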
        [acc] = self.raw(what='account')
        return acc['username']

    def tweets(self) -> Iterator[Tweet]:
        fname = 'tweets'  # since somewhere between mar and oct 2022
        if not (self.zpath / f'data/{fname}.js').exists():
            fname = 'tweet'  # old name
        # NOTE: for some reason, created_at doesn't seem to be in order
        # it mostly is, but there are a bunch of one-off random tweets where the time decreases (typically at the very end)
        for r in self.raw(what='tweet', fname=fname):
            yield Tweet(r, screen_name=self.screen_name)

    def likes(self) -> Iterator[Like]:
        # TODO ugh. would be nice to unify Tweet/Like interface
        # however, the takeout only has tweetId, full text and url
        for r in self.raw(what='like'):
            yield Like(r, screen_name=self.screen_name)


def _cleanup_tweet_json(rj: Json) -> None:
    # note: for now this isn't used, was just an attempt to normalise raw data...

    rj.pop('edit_info', None)  # useless for downstream processing, but results in dupes, so let's remove it

    ## could probably just take the last one? dunno
    rj.pop('retweet_count', None)
    rj.pop('favorite_count', None)
    ##

    entities = rj.get('entities', {})
    ext_entities = rj.get('extended_entities', {})

    # TODO shit. unclear how to 'merge' changes to these
    # links sometimes change for no apparent reason -- and sometimes the old one is still valid but not the new one???
    for m in entities.get('media', {}):
        m.pop('media_url', None)
        m.pop('media_url_https', None)
    for m in ext_entities.get('media', {}):
        m.pop('media_url', None)
        m.pop('media_url_https', None)
    ##

    for m in entities.get('user_mentions', {}):
        # changes if the user renames themselves...
        m.pop('name', None)

    # hmm so it can change to -1? maybe if the user was deleted?
    # but it can also change to something else entirely?? saw a second example of this
    entities.pop('user_mentions', None)

    # TODO figure out what else is changing there later...
    rj.pop('entities', None)
    rj.pop('extended_entities', None)

    ## useless attributes which should be fine to exclude
    rj.pop('possibly_sensitive', None)  # not sure what this is.. sometimes appears with a False value??
    rj.pop('withheld_in_countries', None)
    rj.pop('lang', None)
    ##

    # ugh. might change if the Twitter client was deleted or its description renamed??
    rj.pop('source', None)

    ## ugh. sometimes a trailing 0 after the decimal point is present?
    rj.pop('coordinates', None)
    rj.get('geo', {}).pop('coordinates', None)
    ##

    # ugh. this changes if the user changed their name...
    # or disappears if the account was deleted?
    rj.pop('in_reply_to_screen_name', None)


# todo not sure about materialising into a list and sorting? although it can't hurt, considering json parsing isn't iterative anyway
def tweets() -> Iterator[Res[Tweet]]:
    _all = chain.from_iterable(ZipExport(i).tweets() for i in inputs())

    # NOTE: raw json data in archived tweets changes all the time, even for the same tweets
    # there is an attempt to clean it up... but it's tricky, since users rename themselves and twitter stats keep changing
    # so it's unclear which version to pick
    # we should probably 'merge' tweets into a canonical version, e.g.
    # - pick latest tweet stats
    # - keep history of usernames we were replying to that share the same user id
    # - pick 'best' media url somehow??
    # - normalise coordinates data
    def key(t: Tweet):
        # NOTE: not using t.text, since it actually changes if entities in the tweet are changing...
        # whereas full_text seems stable
        text = t.raw['full_text']
        return (t.created_at, t.id_str, text)

    res = unique_everseen(_all, key=key)
    yield from sorted(res, key=lambda t: t.created_at)


def likes() -> Iterator[Res[Like]]:
    _all = chain.from_iterable(ZipExport(i).likes() for i in inputs())
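    # raw like payloads aren't hashable, so dedupe on their serialized form instead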
    res = unique_everseen(_all, key=json_dumps)
    # ugh. likes don't have datetimes..
    yield from res


def stats() -> Stats:
    return {
        **stat(tweets),
        **stat(likes),
    }
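

# A minimal usage sketch (hypothetical; assumes my.config.twitter_archive is configured):
#
#     from my.twitter.archive import tweets
#     for t in tweets():
#         if isinstance(t, Exception):
#             continue
#         print(t.dt, t.permalink, t.text)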


## Deprecated stuff
if not TYPE_CHECKING:
    Tid = TweetId