unified view for twitter data

This commit is contained in:
Dima Gerasimov 2020-04-14 22:05:47 +01:00
parent 56b6ab9aaf
commit 30b6918a8d
3 changed files with 63 additions and 24 deletions

17
my/twitter/all.py Normal file
View file

@ -0,0 +1,17 @@
"""
Unified Twitter data (merged from the archive and periodic updates)
"""
from . import twint
from . import archive
def tweets():
    """
    Yield tweets from every source: the archive export first, then twint.

    The two streams are simply concatenated; nothing is deduplicated here.
    """
    for source in (archive, twint):
        yield from source.tweets()
# TODO not sure: likes vs favorites??
def likes():
    """
    Yield likes from the archive export.
    """
    # twint is not merged in here; the original kept `yield from twint` as a disabled placeholder
    for like in archive.likes():
        yield like

View file

@ -3,27 +3,25 @@ Twitter data (uses official twitter archive export)
See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive
""" """
from . import init
from datetime import date, datetime from datetime import date, datetime
from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple
from pathlib import Path from pathlib import Path
from functools import lru_cache
import json import json
import zipfile import zipfile
import pytz import pytz
from .common import PathIsh, get_files, LazyLogger, Json from ..common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress from ..kython import kompress
from my.config import twitter as config
logger = LazyLogger(__name__) logger = LazyLogger(__name__)
def _get_export() -> Path: def _get_export() -> Path:
from my.config import twitter as config
return max(get_files(config.export_path, '*.zip')) return max(get_files(config.export_path, '*.zip'))
@ -33,29 +31,33 @@ Tid = str
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple): class Tweet(NamedTuple):
raw: Json raw: Json
screen_name: str
# TODO deprecate tid?
@property @property
def tid(self) -> Tid: def id_str(self) -> str:
return self.raw['id_str'] return self.raw['id_str']
@property @property
def permalink(self) -> str: def created_at(self) -> datetime:
return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property
def dt(self) -> datetime:
dts = self.raw['created_at'] dts = self.raw['created_at']
return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y') return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
@property
def permalink(self) -> str:
return f'https://twitter.com/{self.screen_name}/status/{self.tid}'
@property @property
def text(self) -> str: def text(self) -> str:
return self.raw['full_text'] return self.raw['full_text']
# TODO not sure if I need them...
@property @property
def entities(self): def urls(self) -> List[str]:
ents = self.entities
us = ents['urls']
return [u['expanded_url'] for u in us]
@property
def entities(self) -> Json:
return self.raw['entities'] return self.raw['entities']
def __str__(self) -> str: def __str__(self) -> str:
@ -64,15 +66,25 @@ class Tweet(NamedTuple):
def __repr__(self) -> str: def __repr__(self) -> str:
return repr(self.raw) return repr(self.raw)
# TODO deprecate tid?
@property
def tid(self) -> Tid:
return self.id_str
@property
def dt(self) -> datetime:
return self.created_at
class Like(NamedTuple): class Like(NamedTuple):
raw: Json raw: Json
screen_name: str
# TODO need to make permalink/link/url consistent across my stuff.. # TODO need to make permalink/link/url consistent across my stuff..
@property @property
def permalink(self) -> str: def permalink(self) -> str:
# doesn't seem like the link in the export is more specific... # doesn't seem like the link in the export is more specific...
return f'https://twitter.com/i/web/status/{self.tid}' return f'https://twitter.com/{self.screen_name}/status/{self.tid}'
@property @property
def tid(self) -> Tid: def tid(self) -> Tid:
@ -113,17 +125,21 @@ class ZipExport:
# older format # older format
yield j yield j
@lru_cache(1)
def screen_name(self) -> str:
    """
    Username recorded in the export's 'account' section.

    The section must hold exactly one record; the unpacking below raises
    ValueError otherwise.
    """
    (account,) = self.raw('account')
    return account['username']
def tweets(self) -> Iterator[Tweet]: def tweets(self) -> Iterator[Tweet]:
for r in self.raw('tweet'): for r in self.raw('tweet'):
yield Tweet(r) yield Tweet(r, screen_name=self.screen_name())
def likes(self) -> Iterator[Like]: def likes(self) -> Iterator[Like]:
# TODO ugh. would be nice to unify Tweet/Like interface # TODO ugh. would be nice to unify Tweet/Like interface
# however, takeout only got tweetId, full text and url # however, takeout only got tweetId, full text and url
for r in self.raw('like'): for r in self.raw('like'):
yield Like(r) yield Like(r, screen_name=self.screen_name())
def tweets() -> List[Tweet]: def tweets() -> List[Tweet]:
@ -185,7 +201,7 @@ def test_tweet():
"in_reply_to_user_id_str" : "3748274" "in_reply_to_user_id_str" : "3748274"
} }
""" """
t = Tweet(json.loads(raw)) t = Tweet(json.loads(raw), screen_name='whatever')
assert t.permalink is not None assert t.permalink is not None
assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc) assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
assert t.text == 'this is a test tweet' assert t.text == 'this is a test tweet'

View file

@ -3,11 +3,11 @@ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twin
""" """
from datetime import datetime from datetime import datetime
from typing import NamedTuple, Iterable from typing import NamedTuple, Iterable, List
from pathlib import Path from pathlib import Path
from .common import PathIsh, get_files, LazyLogger, Json from ..common import PathIsh, get_files, LazyLogger, Json
from .core.time import abbr_to_timezone from ..core.time import abbr_to_timezone
from my.config import twint as config from my.config import twint as config
@ -45,6 +45,12 @@ class Tweet(NamedTuple):
def text(self) -> str: def text(self) -> str:
return self.row['tweet'] return self.row['tweet']
@property
def urls(self) -> List[str]:
    """
    URLs attached to the tweet, split out of the row's comma-separated 'urls' field.

    Returns an empty list when the field is an empty string or None.
    """
    ustr = self.row['urls']
    # ''.split(',') would give [''], so an empty field has to be special-cased;
    # the truthiness check also covers a None value instead of raising TypeError
    return ustr.split(',') if ustr else []
@property @property
def permalink(self) -> str: def permalink(self) -> str: