adapt twitter provider for public consumption

This commit is contained in:
Dima Gerasimov 2019-11-07 20:20:11 +00:00
parent 1075c9fdae
commit a6d16b9c55
2 changed files with 85 additions and 76 deletions

View file

@ -92,6 +92,7 @@ def _get_state(bfile: Path) -> Dict[Sid, SaveWithDt]:
bdt = _get_bdate(bfile)
saves = [SaveWithDt(save, bdt) for save in rexport.Model([bfile]).saved()]
# TODO FIXME remove kython?
from kython import make_dict
return make_dict(
sorted(saves, key=lambda p: p.save.created),

View file

@ -1,52 +1,60 @@
#!/usr/bin/env python3
from datetime import date, datetime
from typing import Union, List, Dict, Set
from typing import Union, List, Dict, Set, Optional, Iterator, Any
from pathlib import Path
import json
import zipfile
from kython import make_dict
import pytz
KARLICOSS_ID = '119756204'
DB_PATH = Path('/L/zzz_syncthing/data/tweets')
EXPORTS_PATH = Path('/L/backups/twitter-exports')
from .common import PathIsh
_export_path: Optional[Path] = None
def configure(*, export_path: Optional[PathIsh]=None) -> None:
if export_path is not None:
global _export_path
_export_path = Path(export_path)
def _get_export() -> Path:
export_path = _export_path
if export_path is None:
# fallback to my_configuration
from . import paths
export_path = paths.twitter.export_path
return Path(export_path)
import sys
sys.path.append('/L/coding/twidump')
import twidump # type: ignore
sys.path.pop() # TODO not sure if necessary?
Tid = str
# TODO make sure it's not used anywhere else and simplify interface
class Tweet:
def __init__(self, tw):
def __init__(self, tw: Dict[str, Any]) -> None:
self.tw = tw
def __getattr__(self, attr):
return getattr(self.tw, attr)
@property
def url(self) -> str:
return self.tw.permalink(username='karlicoss')
@property
def time(self) -> str:
return self.tw.created_at
@property
def dt(self) -> datetime:
return self.tw.get_utc_datetime()
@property
def text(self) -> str:
return self.tw.text
@property
def tid(self) -> Tid:
return self.tw.id_str
return self.tw['id_str']
@property
def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}'
@property
def dt(self) -> datetime:
dts = self.tw['created_at']
return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
@property
def text(self) -> str:
return self.tw['full_text']
@property
def entities(self):
return self.tw['entities']
def __str__(self) -> str:
return str(self.tw)
@ -55,47 +63,17 @@ class Tweet:
return repr(self.tw)
def _twidump() -> List[Tweet]:
import twidump
# add current package to path to discover config?... nah, twidump should be capable of that.
from twidump.data_manipulation.timelines import TimelineLoader # type: ignore
from twidump.component import get_app_injector # type: ignore
tl_loader = get_app_injector(db_path=DB_PATH).get(TimelineLoader) # type: TimelineLoader
tl = tl_loader.load_timeline(KARLICOSS_ID)
return [Tweet(x) for x in tl]
def _json() -> List[Tweet]:
from twidump.data.tweet import Tweet as TDTweet # type: ignore
zips = EXPORTS_PATH.glob('*.zip')
last = list(sorted(zips, key=lambda p: p.stat().st_mtime))[-1]
ddd = zipfile.ZipFile(last).read('tweet.js').decode('utf8')
def _from_json_export() -> Iterator[Tweet]:
epath = _get_export()
ddd = zipfile.ZipFile(epath).read('tweet.js').decode('utf8')
start = ddd.index('[')
ddd = ddd[start:]
tws = []
for j in json.loads(ddd):
j['user'] = {} # TODO is it ok?
tw = Tweet(TDTweet.from_api_dict(j))
tws.append(tw)
return tws
yield Tweet(j)
def tweets_all() -> List[Tweet]:
tjson: Dict[Tid, Tweet] = make_dict(_json(), key=lambda t: t.tid)
tdump: Dict[Tid, Tweet] = make_dict(_twidump(), key=lambda t: t.tid)
keys: Set[Tid] = set(tdump.keys()).union(set(tjson.keys()))
# TODO hmm. looks like json generally got longer tweets?
res: List[Tweet] = []
for tid in keys:
if tid in tjson:
tw = tjson[tid]
else:
tw = tdump[tid]
res.append(tw)
res.sort(key=lambda t: t.dt)
return res
return list(sorted(_from_json_export(), key=lambda t: t.dt))
def predicate(p) -> List[Tweet]:
@ -104,6 +82,7 @@ def predicate(p) -> List[Tweet]:
def predicate_date(p) -> List[Tweet]:
return predicate(lambda t: p(t.dt.date()))
# TODO move these to private tests?
Datish = Union[date, str]
def tweets_on(*dts: Datish) -> List[Tweet]:
from kython import parse_date_new
@ -113,14 +92,43 @@ def tweets_on(*dts: Datish) -> List[Tweet]:
on = tweets_on
def test_on():
tww = tweets_on('2019-05-11')
assert len(tww) == 2
def test_all():
tall = tweets_all()
assert len(tall) > 100
if __name__ == '__main__':
for t in tweets_all():
print(t)
def test_tweet():
raw = """
{
"retweeted" : false,
"entities" : {
"hashtags" : [ ],
"symbols" : [ ],
"user_mentions" : [ ],
"urls" : [ {
"url" : "https://t.co/vUg4W6nxwU",
"expanded_url" : "https://intelligence.org/2013/12/13/aaronson/",
"display_url" : "intelligence.org/2013/12/13/aar…",
"indices" : [ "120", "143" ]
}
]
},
"display_text_range" : [ "0", "90" ],
"favorite_count" : "0",
"in_reply_to_status_id_str" : "24123424",
"id_str" : "2328934829084",
"in_reply_to_user_id" : "23423424",
"truncated" : false,
"retweet_count" : "0",
"id" : "23492349032940",
"in_reply_to_status_id" : "23482984932084",
"created_at" : "Thu Aug 30 07:12:48 +0000 2012",
"favorited" : false,
"full_text" : "this is a test tweet",
"lang" : "ru",
"in_reply_to_screen_name" : "whatever",
"in_reply_to_user_id_str" : "3748274"
}
"""
t = Tweet(json.loads(raw))
assert t.permalink is not None
assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
assert t.text == 'this is a test tweet'
assert t.tid == '2328934829084'
assert t.entities is not None