Handle json twitter export for more timeline data

This commit is contained in:
Dima Gerasimov 2019-03-20 00:36:48 +00:00
parent 2d478b767b
commit 99eb79e230

View file

@ -1,9 +1,15 @@
from datetime import date, datetime from datetime import date, datetime
from typing import Union, List from typing import Union, List, Dict, Set
from pathlib import Path from pathlib import Path
import json
import zipfile
from kython import make_dict
KARLICOSS_ID = '119756204' KARLICOSS_ID = '119756204'
DB_PATH = Path('/L/zzz_syncthing/data/tweets') DB_PATH = Path('/L/zzz_syncthing/data/tweets')
EXPORTS_PATH = Path('/L/backups/twitter-exports')
import sys import sys
@ -11,6 +17,9 @@ sys.path.append('/L/Dropbox/coding/twidump')
import twidump # type: ignore import twidump # type: ignore
sys.path.pop() # TODO not sure if necessary? sys.path.pop() # TODO not sure if necessary?
Tid = str
# TODO make sure it's not used anywhere else and simplify interface
class Tweet: class Tweet:
def __init__(self, tw): def __init__(self, tw):
self.tw = tw self.tw = tw
@ -36,7 +45,7 @@ class Tweet:
return self.tw.text return self.tw.text
@property @property
def tid(self) -> str: def tid(self) -> Tid:
return self.tw.id_str return self.tw.id_str
def __str__(self) -> str: def __str__(self) -> str:
@ -45,7 +54,8 @@ class Tweet:
def __repr__(self) -> str: def __repr__(self) -> str:
return repr(self.tw) return repr(self.tw)
def tweets_all():
def _twidump() -> List[Tweet]:
import twidump import twidump
# add current package to path to discover config?... nah, twidump should be capable of that. # add current package to path to discover config?... nah, twidump should be capable of that.
from twidump.data_manipulation.timelines import TimelineLoader # type: ignore from twidump.data_manipulation.timelines import TimelineLoader # type: ignore
@ -55,6 +65,38 @@ def tweets_all():
return [Tweet(x) for x in tl] return [Tweet(x) for x in tl]
def _json() -> List[Tweet]:
    """Load tweets from the most recently modified JSON twitter export.

    Picks the ``*.zip`` file under ``EXPORTS_PATH`` with the latest
    modification time, reads its ``tweet.js`` member and wraps every entry
    in a ``Tweet``.

    Returns:
        One ``Tweet`` per entry of the export, in the export's own order.

    Raises:
        IndexError: if ``EXPORTS_PATH`` contains no ``*.zip`` files.
        KeyError: if the chosen archive has no ``tweet.js`` member.
        ValueError: if ``tweet.js`` contains no ``[`` character.
    """
    from twidump.data.tweet import Tweet as TDTweet # type: ignore

    zips = EXPORTS_PATH.glob('*.zip')
    # Newest export wins; sorted() already returns a list, no extra copy needed.
    last = sorted(zips, key=lambda p: p.stat().st_mtime)[-1]

    # Use a context manager so the archive handle is closed deterministically
    # (it used to be left open until garbage collection).
    with zipfile.ZipFile(last) as zf:
        ddd = zf.read('tweet.js').decode('utf8')

    # Drop everything before the first '[' so that the remainder parses as a
    # JSON array (presumably the file starts with a JS assignment -- confirm).
    start = ddd.index('[')
    ddd = ddd[start:]

    tws = []
    for j in json.loads(ddd):
        # An empty 'user' dict is injected before handing the entry to twidump.
        j['user'] = {} # TODO is it ok?
        tw = Tweet(TDTweet.from_api_dict(j))
        tws.append(tw)
    return tws
def tweets_all() -> List[Tweet]:
    """Return tweets from both sources, one per tweet id, sorted by ``dt``.

    Tweets come from the JSON export (``_json``) and from the twidump
    database (``_twidump``). When the same tweet id is present in both, the
    version from the JSON export is kept.

    Returns:
        The merged tweets in ascending ``dt`` order.
    """
    tjson: Dict[Tid, Tweet] = make_dict(_json(), key=lambda t: t.tid)
    tdump: Dict[Tid, Tweet] = make_dict(_twidump(), key=lambda t: t.tid)
    # TODO hmm. looks like json generally got longer tweets?
    # In a dict merge later entries override earlier ones, so the JSON version
    # wins for ids present in both sources. Unlike iterating a set of string
    # ids, dict insertion order does not depend on hash randomisation, so
    # tweets with equal 'dt' come out of the stable sort in the same order on
    # every run.
    merged: Dict[Tid, Tweet] = {**tdump, **tjson}
    return sorted(merged.values(), key=lambda t: t.dt)
def predicate(p) -> List[Tweet]: def predicate(p) -> List[Tweet]:
return [t for t in tweets_all() if p(t)] return [t for t in tweets_all() if p(t)]
@ -70,6 +112,7 @@ def tweets_on(*dts: Datish) -> List[Tweet]:
on = tweets_on on = tweets_on
if __name__ == '__main__': if __name__ == '__main__':
for t in tweets_all(): for t in tweets_all():
print(t) print(t)