From a6d16b9c55d6e9ff9a670654b29909118b3e263d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 7 Nov 2019 20:20:11 +0000 Subject: [PATCH] adapt twitter provider for public consumption --- my/reddit.py | 1 + my/tweets.py | 160 +++++++++++++++++++++++++++------------------------ 2 files changed, 85 insertions(+), 76 deletions(-) diff --git a/my/reddit.py b/my/reddit.py index 5351cc0..e38a99f 100755 --- a/my/reddit.py +++ b/my/reddit.py @@ -92,6 +92,7 @@ def _get_state(bfile: Path) -> Dict[Sid, SaveWithDt]: bdt = _get_bdate(bfile) saves = [SaveWithDt(save, bdt) for save in rexport.Model([bfile]).saved()] + # TODO FIXME remove kython? from kython import make_dict return make_dict( sorted(saves, key=lambda p: p.save.created), diff --git a/my/tweets.py b/my/tweets.py index 5501b81..6db944c 100755 --- a/my/tweets.py +++ b/my/tweets.py @@ -1,52 +1,60 @@ #!/usr/bin/env python3 from datetime import date, datetime -from typing import Union, List, Dict, Set +from typing import Union, List, Dict, Set, Optional, Iterator, Any from pathlib import Path import json - import zipfile -from kython import make_dict +import pytz -KARLICOSS_ID = '119756204' -DB_PATH = Path('/L/zzz_syncthing/data/tweets') -EXPORTS_PATH = Path('/L/backups/twitter-exports') +from .common import PathIsh + + +_export_path: Optional[Path] = None + +def configure(*, export_path: Optional[PathIsh]=None) -> None: + if export_path is not None: + global _export_path + _export_path = Path(export_path) + + +def _get_export() -> Path: + export_path = _export_path + if export_path is None: + # fallback to my_configuration + from . import paths + export_path = paths.twitter.export_path + return Path(export_path) -import sys -sys.path.append('/L/coding/twidump') -import twidump # type: ignore -sys.path.pop() # TODO not sure if necessary? Tid = str # TODO make sure it's not used anywhere else and simplify interface class Tweet: - def __init__(self, tw): + def __init__(self, tw: Dict[str, Any]) -> None: self.tw = tw - def __getattr__(self, attr): - return getattr(self.tw, attr) - - @property - def url(self) -> str: - return self.tw.permalink(username='karlicoss') - - @property - def time(self) -> str: - return self.tw.created_at - - @property - def dt(self) -> datetime: - return self.tw.get_utc_datetime() - - @property - def text(self) -> str: - return self.tw.text - @property def tid(self) -> Tid: - return self.tw.id_str + return self.tw['id_str'] + + @property + def permalink(self) -> str: + return f'https://twitter.com/i/web/status/{self.tid}' + + @property + def dt(self) -> datetime: + dts = self.tw['created_at'] + return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y') + + @property + def text(self) -> str: + return self.tw['full_text'] + + @property + def entities(self): + return self.tw['entities'] def __str__(self) -> str: return str(self.tw) @@ -55,47 +63,17 @@ class Tweet: return repr(self.tw) -def _twidump() -> List[Tweet]: - import twidump - # add current package to path to discover config?... nah, twidump should be capable of that. - from twidump.data_manipulation.timelines import TimelineLoader # type: ignore - from twidump.component import get_app_injector # type: ignore - tl_loader = get_app_injector(db_path=DB_PATH).get(TimelineLoader) # type: TimelineLoader - tl = tl_loader.load_timeline(KARLICOSS_ID) - return [Tweet(x) for x in tl] - - -def _json() -> List[Tweet]: - from twidump.data.tweet import Tweet as TDTweet # type: ignore - - zips = EXPORTS_PATH.glob('*.zip') - last = list(sorted(zips, key=lambda p: p.stat().st_mtime))[-1] - ddd = zipfile.ZipFile(last).read('tweet.js').decode('utf8') +def _from_json_export() -> Iterator[Tweet]: + epath = _get_export() + ddd = zipfile.ZipFile(epath).read('tweet.js').decode('utf8') start = ddd.index('[') ddd = ddd[start:] - tws = [] for j in json.loads(ddd): - j['user'] = {} # TODO is it ok? - tw = Tweet(TDTweet.from_api_dict(j)) - tws.append(tw) - return tws + yield Tweet(j) def tweets_all() -> List[Tweet]: - tjson: Dict[Tid, Tweet] = make_dict(_json(), key=lambda t: t.tid) - tdump: Dict[Tid, Tweet] = make_dict(_twidump(), key=lambda t: t.tid) - keys: Set[Tid] = set(tdump.keys()).union(set(tjson.keys())) - - # TODO hmm. looks like json generally got longer tweets? - res: List[Tweet] = [] - for tid in keys: - if tid in tjson: - tw = tjson[tid] - else: - tw = tdump[tid] - res.append(tw) - res.sort(key=lambda t: t.dt) - return res + return list(sorted(_from_json_export(), key=lambda t: t.dt)) def predicate(p) -> List[Tweet]: @@ -104,6 +82,7 @@ def predicate(p) -> List[Tweet]: def predicate_date(p) -> List[Tweet]: return predicate(lambda t: p(t.dt.date())) +# TODO move these to private tests? Datish = Union[date, str] def tweets_on(*dts: Datish) -> List[Tweet]: from kython import parse_date_new @@ -113,14 +92,43 @@ def tweets_on(*dts: Datish) -> List[Tweet]: on = tweets_on -def test_on(): - tww = tweets_on('2019-05-11') - assert len(tww) == 2 -def test_all(): - tall = tweets_all() - assert len(tall) > 100 - -if __name__ == '__main__': - for t in tweets_all(): - print(t) +def test_tweet(): + raw = """ + { + "retweeted" : false, + "entities" : { + "hashtags" : [ ], + "symbols" : [ ], + "user_mentions" : [ ], + "urls" : [ { + "url" : "https://t.co/vUg4W6nxwU", + "expanded_url" : "https://intelligence.org/2013/12/13/aaronson/", + "display_url" : "intelligence.org/2013/12/13/aar…", + "indices" : [ "120", "143" ] + } + ] + }, + "display_text_range" : [ "0", "90" ], + "favorite_count" : "0", + "in_reply_to_status_id_str" : "24123424", + "id_str" : "2328934829084", + "in_reply_to_user_id" : "23423424", + "truncated" : false, + "retweet_count" : "0", + "id" : "23492349032940", + "in_reply_to_status_id" : "23482984932084", + "created_at" : "Thu Aug 30 07:12:48 +0000 2012", + "favorited" : false, + "full_text" : "this is a test tweet", + "lang" : "ru", + "in_reply_to_screen_name" : "whatever", + "in_reply_to_user_id_str" : "3748274" +} + """ + t = Tweet(json.loads(raw)) + assert t.permalink is not None + assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc) + assert t.text == 'this is a test tweet' + assert t.tid == '2328934829084' + assert t.entities is not None