HPI/my/tweets.py
2019-11-07 21:08:01 +00:00

147 lines
3.7 KiB
Python
Executable file

"""
Uses the official Twitter archive export.
See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive
Expects the path to the export to be set, either:
- via ~configure~ (before calling anything else)
- or in ~my_configuration.twitter.export_path~
"""
from datetime import date, datetime
from typing import Union, List, Dict, Set, Optional, Iterator, Any
from pathlib import Path
import json
import zipfile
import pytz
from .common import PathIsh
# Export location set through configure(); while it is None, _get_export()
# falls back to the package configuration.
_export_path: Optional[Path] = None


def configure(*, export_path: Optional[PathIsh]=None) -> None:
    """
    Override the location the twitter archive export is read from.

    export_path: file or directory to use; passing None (the default)
    leaves whatever was configured before untouched.
    """
    global _export_path
    if export_path is None:
        # nothing to override
        return
    _export_path = Path(export_path)
def _get_export() -> Path:
    """
    Locate the twitter archive that should be read.

    Uses the path set through configure() when there is one, otherwise
    falls back to ``paths.twitter.export_path`` from the package.

    If the resulting path is a directory, the ``*.zip`` entry in it that
    compares greatest is returned; any other path is returned as is.

    Raises ValueError if the directory contains no ``*.zip`` files.
    """
    export_path = _export_path
    if export_path is None:
        # fallback to my_configuration
        from . import paths
        export_path = paths.twitter.export_path
    p = Path(export_path)
    if not p.is_dir():
        return p
    # max() over paths picks the one that sorts last
    # NOTE(review): presumably archive names embed a date, making this the newest one -- confirm
    latest = max(p.glob('*.zip'), default=None)
    if latest is None:
        # a bare max() would fail here with an unhelpful 'max() arg is an empty sequence'
        raise ValueError(f'no *.zip archives found in {p}')
    return latest
Tid = str


# TODO make sure it's not used anywhere else and simplify interface
class Tweet:
    """
    Thin wrapper around the raw dict of a single tweet.

    Nothing is parsed up front: every property looks its value up in the
    underlying dict on access.
    """

    def __init__(self, tw: Dict[str, Any]) -> None:
        # raw tweet dict, kept as is
        self.tw = tw

    @property
    def tid(self) -> Tid:
        """Tweet id, taken from the ``id_str`` field."""
        raw = self.tw
        return raw['id_str']

    @property
    def permalink(self) -> str:
        """Link to the tweet, built from its id."""
        tid = self.tid
        return f'https://twitter.com/i/web/status/{tid}'

    @property
    def dt(self) -> datetime:
        """Creation time parsed from ``created_at`` (carries the utc offset via %z)."""
        created_at = self.tw['created_at']
        # e.g. 'Thu Aug 30 07:12:48 +0000 2012'
        parsed = datetime.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
        return parsed

    @property
    def text(self) -> str:
        """Contents of the ``full_text`` field."""
        raw = self.tw
        return raw['full_text']

    @property
    def entities(self):
        """The ``entities`` field, returned unmodified."""
        raw = self.tw
        return raw['entities']

    def __str__(self) -> str:
        # delegate to the underlying dict
        raw = self.tw
        return str(raw)

    def __repr__(self) -> str:
        # delegate to the underlying dict
        raw = self.tw
        return repr(raw)
def _from_json_export() -> Iterator[Tweet]:
    """
    Yield a Tweet for every entry of ``tweet.js`` in the export archive.

    The archive is located through _get_export().

    Raises KeyError if the archive has no ``tweet.js`` member and
    ValueError if its contents contain no '['.
    """
    epath = _get_export()
    # context manager so the archive handle is closed as soon as the member is read
    with zipfile.ZipFile(epath) as zf:
        contents = zf.read('tweet.js').decode('utf8')
    # the member is not pure json: drop whatever precedes the opening bracket of the array
    # NOTE(review): presumably a javascript assignment prefix -- confirm against a real export
    start = contents.index('[')
    for j in json.loads(contents[start:]):
        yield Tweet(j)
def tweets_all() -> List[Tweet]:
    """
    Every tweet from the export, ordered by creation time (earliest first).
    """
    tweets = list(_from_json_export())
    # list.sort is stable, same as sorted()
    tweets.sort(key=lambda tweet: tweet.dt)
    return tweets
def predicate(p) -> List[Tweet]:
    """
    Tweets, in tweets_all() order, for which ``p(tweet)`` is truthy.
    """
    matched: List[Tweet] = []
    for tweet in tweets_all():
        if p(tweet):
            matched.append(tweet)
    return matched
def predicate_date(p) -> List[Tweet]:
    """
    Tweets for which ``p`` is truthy when given the calendar date of the
    tweet's creation time.
    """
    def on_tweet(tweet):
        # strip the time of day, p only gets the date part
        day = tweet.dt.date()
        return p(day)

    return predicate(on_tweet)
# TODO move these to private tests?
Datish = Union[date, str]


def tweets_on(*dts: Datish) -> List[Tweet]:
    """
    Tweets created on any of the given days.

    Every argument is passed through kython's parse_date_new before it is
    compared against the date of a tweet.
    """
    from kython import parse_date_new
    # TODO how to make sure we don't miss on 29 feb?
    wanted = set()
    for day in dts:
        wanted.add(parse_date_new(day))

    def is_wanted(d):
        return d in wanted

    return predicate_date(is_wanted)


# short alias
on = tweets_on
def test_tweet():
    """
    Check the Tweet accessors against a hand-written sample payload.
    """
    sample = """
{
"retweeted" : false,
"entities" : {
"hashtags" : [ ],
"symbols" : [ ],
"user_mentions" : [ ],
"urls" : [ {
"url" : "https://t.co/vUg4W6nxwU",
"expanded_url" : "https://intelligence.org/2013/12/13/aaronson/",
"display_url" : "intelligence.org/2013/12/13/aar…",
"indices" : [ "120", "143" ]
}
]
},
"display_text_range" : [ "0", "90" ],
"favorite_count" : "0",
"in_reply_to_status_id_str" : "24123424",
"id_str" : "2328934829084",
"in_reply_to_user_id" : "23423424",
"truncated" : false,
"retweet_count" : "0",
"id" : "23492349032940",
"in_reply_to_status_id" : "23482984932084",
"created_at" : "Thu Aug 30 07:12:48 +0000 2012",
"favorited" : false,
"full_text" : "this is a test tweet",
"lang" : "ru",
"in_reply_to_screen_name" : "whatever",
"in_reply_to_user_id_str" : "3748274"
}
"""
    payload = json.loads(sample)
    tweet = Tweet(payload)
    assert tweet.permalink is not None
    # matches the "created_at" value in the sample above
    expected_dt = datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
    assert tweet.dt == expected_dt
    assert tweet.text == 'this is a test tweet'
    assert tweet.tid == '2328934829084'
    assert tweet.entities is not None