Merge pull request #22 from karlicoss/twitter

merge in likes from twint
This commit is contained in:
karlicoss 2020-04-14 23:05:57 +01:00 committed by GitHub
commit 614fcce26b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 108 additions and 28 deletions

27
my/twitter/all.py Normal file
View file

@ -0,0 +1,27 @@
"""
Unified Twitter data (merged from the archive and periodic updates)
"""
from itertools import chain
from . import twint
from . import archive
from more_itertools import unique_everseen
def merge_tweets(*sources):
    """
    Chain all ``sources`` together and yield every item once, keyed by ``id_str``.

    Sources are consumed in the order they are passed, so when the same
    ``id_str`` shows up in several sources, the item from the earliest one
    is the one that gets yielded.
    """
    def _tweet_id(tweet):
        return tweet.id_str

    combined = chain.from_iterable(sources)
    yield from unique_everseen(combined, key=_tweet_id)
def tweets():
    """
    Yield tweets from twint and from the archive export, deduplicated by ``id_str``.
    """
    # NOTE: order matters -- twint seems to contain better data, so it is
    # passed first and its copy is kept when a tweet is present in both.
    # TODO: probably worth an investigation.
    # Fixed: this used to merge ``twint.likes()`` here, which mixed liked
    # tweets into the tweets stream and dropped twint's own tweets.
    yield from merge_tweets(twint.tweets(), archive.tweets())
def likes():
    """
    Yield liked tweets from twint and from the archive export, deduplicated by ``id_str``.
    """
    # twint goes first, mirroring the source order used for tweets
    sources = (twint.likes(), archive.likes())
    yield from merge_tweets(*sources)

View file

@ -3,27 +3,25 @@ Twitter data (uses official twitter archive export)
See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive
""" """
from . import init
from datetime import date, datetime from datetime import date, datetime
from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple
from pathlib import Path from pathlib import Path
from functools import lru_cache
import json import json
import zipfile import zipfile
import pytz import pytz
from .common import PathIsh, get_files, LazyLogger, Json from ..common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress from ..kython import kompress
from my.config import twitter as config
logger = LazyLogger(__name__) logger = LazyLogger(__name__)
def _get_export() -> Path: def _get_export() -> Path:
from my.config import twitter as config
return max(get_files(config.export_path, '*.zip')) return max(get_files(config.export_path, '*.zip'))
@ -33,29 +31,33 @@ Tid = str
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple): class Tweet(NamedTuple):
raw: Json raw: Json
screen_name: str
# TODO deprecate tid?
@property @property
def tid(self) -> Tid: def id_str(self) -> str:
return self.raw['id_str'] return self.raw['id_str']
@property @property
def permalink(self) -> str: def created_at(self) -> datetime:
return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property
def dt(self) -> datetime:
dts = self.raw['created_at'] dts = self.raw['created_at']
return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y') return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
@property
def permalink(self) -> str:
    """Link to the tweet on twitter.com, built from ``screen_name`` and ``tid``."""
    link = f'https://twitter.com/{self.screen_name}/status/{self.tid}'
    return link
@property @property
def text(self) -> str: def text(self) -> str:
return self.raw['full_text'] return self.raw['full_text']
# TODO not sure if I need them...
@property @property
def entities(self): def urls(self) -> List[str]:
ents = self.entities
us = ents['urls']
return [u['expanded_url'] for u in us]
@property
def entities(self) -> Json:
return self.raw['entities'] return self.raw['entities']
def __str__(self) -> str: def __str__(self) -> str:
@ -64,18 +66,28 @@ class Tweet(NamedTuple):
def __repr__(self) -> str: def __repr__(self) -> str:
return repr(self.raw) return repr(self.raw)
# TODO deprecate tid?
@property
def tid(self) -> Tid:
    """Alias for ``id_str``."""
    return self.id_str
@property
def dt(self) -> datetime:
    """Alias for ``created_at``."""
    return self.created_at
class Like(NamedTuple): class Like(NamedTuple):
raw: Json raw: Json
screen_name: str
# TODO need to make permalink/link/url consistent across my stuff.. # TODO need to make permalink/link/url consistent across my stuff..
@property @property
def permalink(self) -> str: def permalink(self) -> str:
# doesn't seem like link in export is more specific... # doesn't seem like link in export is more specific...
return f'https://twitter.com/i/web/status/{self.tid}' return f'https://twitter.com/{self.screen_name}/status/{self.tid}'
@property @property
def tid(self) -> Tid: def id_str(self) -> Tid:
return self.raw['tweetId'] return self.raw['tweetId']
@property @property
@ -83,6 +95,11 @@ class Like(NamedTuple):
# ugh. I think none means that tweet was deleted? # ugh. I think none means that tweet was deleted?
return self.raw.get('fullText') return self.raw.get('fullText')
# TODO deprecate?
@property
def tid(self) -> Tid:
    """Alias for ``id_str``."""
    return self.id_str
class ZipExport: class ZipExport:
def __init__(self) -> None: def __init__(self) -> None:
@ -113,17 +130,21 @@ class ZipExport:
# older format # older format
yield j yield j
@lru_cache(1)
def screen_name(self) -> str:
    """
    Return the ``username`` field of the export's account record.

    Raises ``ValueError`` unless ``self.raw('account')`` yields exactly one item.
    """
    # NOTE(review): lru_cache on a method keys on ``self`` and keeps a
    # reference to it for as long as the entry stays cached.
    (account,) = self.raw('account')
    return account['username']
def tweets(self) -> Iterator[Tweet]: def tweets(self) -> Iterator[Tweet]:
for r in self.raw('tweet'): for r in self.raw('tweet'):
yield Tweet(r) yield Tweet(r, screen_name=self.screen_name())
def likes(self) -> Iterator[Like]: def likes(self) -> Iterator[Like]:
# TODO ugh. would be nice to unify Tweet/Like interface # TODO ugh. would be nice to unify Tweet/Like interface
# however, takeout only got tweetId, full text and url # however, takeout only got tweetId, full text and url
for r in self.raw('like'): for r in self.raw('like'):
yield Like(r) yield Like(r, screen_name=self.screen_name())
def tweets() -> List[Tweet]: def tweets() -> List[Tweet]:
@ -185,7 +206,7 @@ def test_tweet():
"in_reply_to_user_id_str" : "3748274" "in_reply_to_user_id_str" : "3748274"
} }
""" """
t = Tweet(json.loads(raw)) t = Tweet(json.loads(raw), screen_name='whatever')
assert t.permalink is not None assert t.permalink is not None
assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc) assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
assert t.text == 'this is a test tweet' assert t.text == 'this is a test tweet'

View file

@ -3,11 +3,11 @@ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twin
""" """
from datetime import datetime from datetime import datetime
from typing import NamedTuple, Iterable from typing import NamedTuple, Iterable, List
from pathlib import Path from pathlib import Path
from .common import PathIsh, get_files, LazyLogger, Json from ..common import PathIsh, get_files, LazyLogger, Json
from .core.time import abbr_to_timezone from ..core.time import abbr_to_timezone
from my.config import twint as config from my.config import twint as config
@ -45,6 +45,12 @@ class Tweet(NamedTuple):
def text(self) -> str: def text(self) -> str:
return self.row['tweet'] return self.row['tweet']
@property
def urls(self) -> List[str]:
    """
    URLs attached to the tweet, split out of the comma-separated ``urls`` field of the row.

    An empty field gives an empty list (rather than ``['']``).
    """
    joined = self.row['urls']
    return joined.split(',') if len(joined) > 0 else []
@property @property
def permalink(self) -> str: def permalink(self) -> str:
@ -55,11 +61,37 @@ class Tweet(NamedTuple):
def __repr__(self): def __repr__(self):
return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})' return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
# https://github.com/twintproject/twint/issues/196
# ugh. so it dumps everything in tweet table, and there is no good way to tell between fav/original tweet.
# it might result in some tweets missing from the timeline if you happened to like them...
# not sure what to do with it
# alternatively, could ask the user to run separate databases for tweets and favs?
# TODO think about it
def tweets() -> Iterable[Tweet]: _QUERY = '''
SELECT T.*
FROM tweets as T
LEFT JOIN favorites as F
ON T.id_str = F.tweet_id
WHERE {where}
ORDER BY T.created_at
'''
def _get_db():
import dataset # type: ignore import dataset # type: ignore
db_path = get_db_path() db_path = get_db_path()
# TODO check that exists? # TODO check that exists?
db = dataset.connect(f'sqlite:///{db_path}') db = dataset.connect(f'sqlite:///{db_path}')
tdb = db.load_table('tweets') return db
yield from map(Tweet, tdb.all(order_by='created_at'))
def tweets() -> Iterable[Tweet]:
    """
    Yield rows of the ``tweets`` table that have no matching ``favorites`` entry.
    """
    not_favorited = _QUERY.format(where='F.tweet_id IS NULL')
    for row in _get_db().query(not_favorited):
        yield Tweet(row)
def likes() -> Iterable[Tweet]:
    """
    Yield rows of the ``tweets`` table that do have a matching ``favorites`` entry.
    """
    favorited = _QUERY.format(where='F.tweet_id IS NOT NULL')
    for row in _get_db().query(favorited):
        yield Tweet(row)