Merge pull request #22 from karlicoss/twitter

merge in likes from twint
This commit is contained in:
karlicoss 2020-04-14 23:05:57 +01:00 committed by GitHub
commit 614fcce26b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 108 additions and 28 deletions

27
my/twitter/all.py Normal file
View file

@ -0,0 +1,27 @@
"""
Unified Twitter data (merged from the archive and periodic updates)
"""
from itertools import chain
from . import twint
from . import archive
from more_itertools import unique_everseen
def merge_tweets(*sources):
    """
    Lazily yield the items of every iterable in ``sources``, in the order given,
    keeping only the first item seen for each distinct ``id_str``.
    """
    combined = chain.from_iterable(sources)
    yield from unique_everseen(combined, key=lambda tweet: tweet.id_str)
def tweets():
    """
    Yield the user's own tweets, merged from twint and the archive export.

    Items sharing an ``id_str`` are de-duplicated by ``merge_tweets``; the
    first occurrence wins, so the twint version is preferred.
    """
    # NOTE order matters.. twint seems to contain better data
    # todo probably, worthy an investigation..
    # Must be twint.tweets() here: passing twint.likes() would mix liked tweets
    # into the user's own timeline.
    yield from merge_tweets(twint.tweets(), archive.tweets())
def likes():
    """
    Yield liked tweets from twint followed by the archive export, dropping
    items whose ``id_str`` has already been yielded.
    """
    sources = (twint.likes(), archive.likes())
    yield from merge_tweets(*sources)

View file

@ -3,27 +3,25 @@ Twitter data (uses official twitter archive export)
See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive
"""
from . import init
from datetime import date, datetime
from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple
from pathlib import Path
from functools import lru_cache
import json
import zipfile
import pytz
from .common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress
from ..common import PathIsh, get_files, LazyLogger, Json
from ..kython import kompress
from my.config import twitter as config
logger = LazyLogger(__name__)
def _get_export() -> Path:
from my.config import twitter as config
return max(get_files(config.export_path, '*.zip'))
@ -33,29 +31,33 @@ Tid = str
# TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple):
raw: Json
screen_name: str
# TODO deprecate tid?
@property
def tid(self) -> Tid:
def id_str(self) -> str:
return self.raw['id_str']
@property
def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property
def dt(self) -> datetime:
def created_at(self) -> datetime:
    """Parse the raw ``created_at`` string into a timezone-aware ``datetime``."""
    # the format carries an explicit UTC offset (%z), e.g. 'Thu Aug 30 07:12:48 +0000 2012'
    return datetime.strptime(self.raw['created_at'], '%a %b %d %H:%M:%S %z %Y')
@property
def permalink(self) -> str:
return f'https://twitter.com/{self.screen_name}/status/{self.tid}'
@property
def text(self) -> str:
return self.raw['full_text']
# TODO not sure if I need them...
@property
def entities(self):
def urls(self) -> List[str]:
    """Return the ``expanded_url`` of every entry listed under ``entities['urls']``."""
    return [entry['expanded_url'] for entry in self.entities['urls']]
@property
def entities(self) -> Json:
return self.raw['entities']
def __str__(self) -> str:
@ -64,18 +66,28 @@ class Tweet(NamedTuple):
def __repr__(self) -> str:
return repr(self.raw)
# TODO deprecate tid?
@property
def tid(self) -> Tid:
return self.id_str
@property
def dt(self) -> datetime:
return self.created_at
class Like(NamedTuple):
raw: Json
screen_name: str
# TODO need to make permalink/link/url consistent across my stuff..
@property
def permalink(self) -> str:
# doesn't seem like the link in the export is any more specific...
return f'https://twitter.com/i/web/status/{self.tid}'
return f'https://twitter.com/{self.screen_name}/status/{self.tid}'
@property
def tid(self) -> Tid:
def id_str(self) -> Tid:
return self.raw['tweetId']
@property
@ -83,6 +95,11 @@ class Like(NamedTuple):
# ugh. I think none means that tweet was deleted?
return self.raw.get('fullText')
# TODO deprecate?
@property
def tid(self) -> Tid:
return self.id_str
class ZipExport:
def __init__(self) -> None:
@ -113,17 +130,21 @@ class ZipExport:
# older format
yield j
@lru_cache(1)
def screen_name(self) -> str:
    """Return the ``username`` from the export's single ``account`` record (cached)."""
    accounts = self.raw('account')
    # unpacking fails unless there is exactly one account record
    (account,) = accounts
    return account['username']
def tweets(self) -> Iterator[Tweet]:
for r in self.raw('tweet'):
yield Tweet(r)
yield Tweet(r, screen_name=self.screen_name())
def likes(self) -> Iterator[Like]:
# TODO ugh. would be nice to unify Tweet/Like interface
# however, takeout only got tweetId, full text and url
for r in self.raw('like'):
yield Like(r)
yield Like(r, screen_name=self.screen_name())
def tweets() -> List[Tweet]:
@ -185,7 +206,7 @@ def test_tweet():
"in_reply_to_user_id_str" : "3748274"
}
"""
t = Tweet(json.loads(raw))
t = Tweet(json.loads(raw), screen_name='whatever')
assert t.permalink is not None
assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
assert t.text == 'this is a test tweet'

View file

@ -3,11 +3,11 @@ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twin
"""
from datetime import datetime
from typing import NamedTuple, Iterable
from typing import NamedTuple, Iterable, List
from pathlib import Path
from .common import PathIsh, get_files, LazyLogger, Json
from .core.time import abbr_to_timezone
from ..common import PathIsh, get_files, LazyLogger, Json
from ..core.time import abbr_to_timezone
from my.config import twint as config
@ -45,6 +45,12 @@ class Tweet(NamedTuple):
def text(self) -> str:
return self.row['tweet']
@property
def urls(self) -> List[str]:
    """Split the comma-separated ``urls`` column into a list; an empty value gives ``[]``."""
    joined = self.row['urls']
    return joined.split(',') if len(joined) > 0 else []
@property
def permalink(self) -> str:
@ -55,11 +61,37 @@ class Tweet(NamedTuple):
def __repr__(self):
return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
# https://github.com/twintproject/twint/issues/196
# ugh. so it dumps everything in tweet table, and there is no good way to tell between fav/original tweet.
# it might result in some tweets missing from the timeline if you happened to like them...
# not sure what to do with it
# alternatively, could ask the user to run separate databases for tweets and favs?
# TODO think about it
def tweets() -> Iterable[Tweet]:
# SQL template: selects every column of `tweets`, LEFT JOINed against
# `favorites` on the tweet id and ordered by creation time. Callers fill in
# {where} to keep either the rows that have a matching favorite or the rows
# that do not.
_QUERY = '''
SELECT T.*
FROM tweets as T
LEFT JOIN favorites as F
ON T.id_str = F.tweet_id
WHERE {where}
ORDER BY T.created_at
'''
def _get_db():
import dataset # type: ignore
db_path = get_db_path()
# TODO check that exists?
db = dataset.connect(f'sqlite:///{db_path}')
tdb = db.load_table('tweets')
yield from map(Tweet, tdb.all(order_by='created_at'))
return db
def tweets() -> Iterable[Tweet]:
    """Yield a ``Tweet`` for each row of ``tweets`` that has no matching ``favorites`` row."""
    database = _get_db()
    rows = database.query(_QUERY.format(where='F.tweet_id IS NULL'))
    for row in rows:
        yield Tweet(row)
def likes() -> Iterable[Tweet]:
    """Yield a ``Tweet`` for each row of ``tweets`` that has a matching ``favorites`` row."""
    database = _get_db()
    rows = database.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
    for row in rows:
        yield Tweet(row)