adapt twitter provider for public consumption

This commit is contained in:
Dima Gerasimov 2019-11-07 20:20:11 +00:00
parent 1075c9fdae
commit a6d16b9c55
2 changed files with 85 additions and 76 deletions

View file

@ -92,6 +92,7 @@ def _get_state(bfile: Path) -> Dict[Sid, SaveWithDt]:
bdt = _get_bdate(bfile) bdt = _get_bdate(bfile)
saves = [SaveWithDt(save, bdt) for save in rexport.Model([bfile]).saved()] saves = [SaveWithDt(save, bdt) for save in rexport.Model([bfile]).saved()]
# TODO FIXME remove kython?
from kython import make_dict from kython import make_dict
return make_dict( return make_dict(
sorted(saves, key=lambda p: p.save.created), sorted(saves, key=lambda p: p.save.created),

View file

@ -1,52 +1,60 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from datetime import date, datetime from datetime import date, datetime
from typing import Union, List, Dict, Set from typing import Union, List, Dict, Set, Optional, Iterator, Any
from pathlib import Path from pathlib import Path
import json import json
import zipfile import zipfile
from kython import make_dict import pytz
KARLICOSS_ID = '119756204' from .common import PathIsh
DB_PATH = Path('/L/zzz_syncthing/data/tweets')
EXPORTS_PATH = Path('/L/backups/twitter-exports')
_export_path: Optional[Path] = None
def configure(*, export_path: Optional[PathIsh]=None) -> None:
    """
    Override the export location used by this module.

    :param export_path: path to remember in the module-level ``_export_path``;
        when ``None`` (the default) the call changes nothing.
    """
    global _export_path
    if export_path is None:
        return
    _export_path = Path(export_path)
def _get_export() -> Path:
    """
    Resolve the export location as a ``Path``.

    The module-level ``_export_path`` is used when it is set; otherwise the
    value of ``paths.twitter.export_path`` from the sibling ``paths`` module
    is used.
    """
    configured = _export_path
    if configured is not None:
        return Path(configured)
    # fallback to my_configuration
    from . import paths
    return Path(paths.twitter.export_path)
import sys
sys.path.append('/L/coding/twidump')
import twidump # type: ignore
sys.path.pop() # TODO not sure if necessary?
Tid = str Tid = str
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
class Tweet: class Tweet:
def __init__(self, tw): def __init__(self, tw: Dict[str, Any]) -> None:
self.tw = tw self.tw = tw
def __getattr__(self, attr):
return getattr(self.tw, attr)
@property
def url(self) -> str:
return self.tw.permalink(username='karlicoss')
@property
def time(self) -> str:
return self.tw.created_at
@property
def dt(self) -> datetime:
return self.tw.get_utc_datetime()
@property
def text(self) -> str:
return self.tw.text
@property @property
def tid(self) -> Tid: def tid(self) -> Tid:
return self.tw.id_str return self.tw['id_str']
@property
def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}'
@property
def dt(self) -> datetime:
dts = self.tw['created_at']
return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
@property
def text(self) -> str:
return self.tw['full_text']
@property
def entities(self):
return self.tw['entities']
def __str__(self) -> str: def __str__(self) -> str:
return str(self.tw) return str(self.tw)
@ -55,47 +63,17 @@ class Tweet:
return repr(self.tw) return repr(self.tw)
def _from_json_export() -> Iterator[Tweet]:
    """
    Yield a ``Tweet`` for every entry of ``tweet.js`` in the export archive.

    The archive location is obtained from ``_get_export()``. Everything that
    precedes the first ``[`` in ``tweet.js`` is dropped and the remainder is
    parsed as a JSON array.

    :raises ValueError: if ``tweet.js`` contains no ``[`` (from ``str.index``)
        or the remainder is not valid JSON.
    """
    # Use a context manager so the archive handle is closed as soon as the
    # member has been read, instead of lingering until garbage collection.
    with zipfile.ZipFile(_get_export()) as archive:
        payload = archive.read('tweet.js').decode('utf8')
    # NOTE(review): presumably the file starts with a JS assignment prefix
    # before the array -- confirm against a real export.
    payload = payload[payload.index('['):]
    for entry in json.loads(payload):
        yield Tweet(entry)
def tweets_all() -> List[Tweet]:
    """Return all tweets produced by ``_from_json_export``, ordered by ascending ``dt``."""
    ordered = sorted(_from_json_export(), key=lambda tweet: tweet.dt)
    return ordered
def predicate(p) -> List[Tweet]: def predicate(p) -> List[Tweet]:
@ -104,6 +82,7 @@ def predicate(p) -> List[Tweet]:
def predicate_date(p) -> List[Tweet]:
    """
    Delegate to ``predicate`` with a check on each tweet's date.

    :param p: callable applied to ``tweet.dt.date()``; its result is handed
        back to ``predicate`` unchanged.
    """
    def on_date(tweet):
        return p(tweet.dt.date())

    return predicate(on_date)
# TODO move these to private tests?
Datish = Union[date, str] Datish = Union[date, str]
def tweets_on(*dts: Datish) -> List[Tweet]: def tweets_on(*dts: Datish) -> List[Tweet]:
from kython import parse_date_new from kython import parse_date_new
@ -113,14 +92,43 @@ def tweets_on(*dts: Datish) -> List[Tweet]:
on = tweets_on on = tweets_on
def test_on():
tww = tweets_on('2019-05-11')
assert len(tww) == 2
def test_tweet():
    """Check the ``Tweet`` accessors against one hand-written export entry."""
    payload = """
{
"retweeted" : false,
"entities" : {
"hashtags" : [ ],
"symbols" : [ ],
"user_mentions" : [ ],
"urls" : [ {
"url" : "https://t.co/vUg4W6nxwU",
"expanded_url" : "https://intelligence.org/2013/12/13/aaronson/",
"display_url" : "intelligence.org/2013/12/13/aar…",
"indices" : [ "120", "143" ]
}
]
},
"display_text_range" : [ "0", "90" ],
"favorite_count" : "0",
"in_reply_to_status_id_str" : "24123424",
"id_str" : "2328934829084",
"in_reply_to_user_id" : "23423424",
"truncated" : false,
"retweet_count" : "0",
"id" : "23492349032940",
"in_reply_to_status_id" : "23482984932084",
"created_at" : "Thu Aug 30 07:12:48 +0000 2012",
"favorited" : false,
"full_text" : "this is a test tweet",
"lang" : "ru",
"in_reply_to_screen_name" : "whatever",
"in_reply_to_user_id_str" : "3748274"
}
"""
    tweet = Tweet(json.loads(payload))

    assert tweet.permalink is not None

    # 'created_at' above is expressed with a +0000 offset, hence the UTC tzinfo
    expected_dt = datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
    assert tweet.dt == expected_dt

    assert tweet.text == 'this is a test tweet'
    assert tweet.tid == '2328934829084'
    assert tweet.entities is not None