diff --git a/my/twitter/all.py b/my/twitter/all.py
new file mode 100644
index 0000000..4a1d1be
--- /dev/null
+++ b/my/twitter/all.py
@@ -0,0 +1,27 @@
+"""
+Unified Twitter data (merged from the archive and periodic updates)
+"""
+from itertools import chain
+
+from . import twint
+from . import archive
+
+
+from more_itertools import unique_everseen
+
+
+def merge_tweets(*sources):
+    yield from unique_everseen(
+        chain(*sources),
+        key=lambda t: t.id_str,
+    )
+
+
+def tweets():
+    # NOTE: order matters, twint seems to contain better data
+    # todo: probably worth investigating..
+    yield from merge_tweets(twint.tweets(), archive.tweets())
+
+
+def likes():
+    yield from merge_tweets(twint.likes(), archive.likes())
diff --git a/my/twitter.py b/my/twitter/archive.py
similarity index 80%
rename from my/twitter.py
rename to my/twitter/archive.py
index 37d08c4..24f9a14 100755
--- a/my/twitter.py
+++ b/my/twitter/archive.py
@@ -3,27 +3,25 @@ Twitter data (uses official twitter archive export)

 See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive
 """
-
-from . import init
-
-
 from datetime import date, datetime
 from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple
 from pathlib import Path
+from functools import lru_cache

 import json
 import zipfile

 import pytz

-from .common import PathIsh, get_files, LazyLogger, Json
-from .kython import kompress
+from ..common import PathIsh, get_files, LazyLogger, Json
+from ..kython import kompress
+
+from my.config import twitter as config


 logger = LazyLogger(__name__)


 def _get_export() -> Path:
-    from my.config import twitter as config
     return max(get_files(config.export_path, '*.zip'))

@@ -33,29 +31,33 @@ Tid = str
 # TODO make sure it's not used anywhere else and simplify interface
 class Tweet(NamedTuple):
     raw: Json
+    screen_name: str

-    # TODO deprecate tid?
     @property
-    def tid(self) -> Tid:
+    def id_str(self) -> str:
         return self.raw['id_str']

     @property
-    def permalink(self) -> str:
-        return f'https://twitter.com/i/web/status/{self.tid}'
-
-    # TODO deprecate dt?
-    @property
-    def dt(self) -> datetime:
+    def created_at(self) -> datetime:
         dts = self.raw['created_at']
         return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')

+    @property
+    def permalink(self) -> str:
+        return f'https://twitter.com/{self.screen_name}/status/{self.tid}'
+
     @property
     def text(self) -> str:
         return self.raw['full_text']

-    # TODO not sure if I need them...
     @property
-    def entities(self):
+    def urls(self) -> List[str]:
+        ents = self.entities
+        us = ents['urls']
+        return [u['expanded_url'] for u in us]
+
+    @property
+    def entities(self) -> Json:
         return self.raw['entities']

     def __str__(self) -> str:
@@ -64,18 +66,28 @@ class Tweet(NamedTuple):
     def __repr__(self) -> str:
         return repr(self.raw)

+    # TODO deprecate tid?
+    @property
+    def tid(self) -> Tid:
+        return self.id_str
+
+    @property
+    def dt(self) -> datetime:
+        return self.created_at
+

 class Like(NamedTuple):
     raw: Json
+    screen_name: str

     # TODO need to make permalink/link/url consistent across my stuff..
     @property
     def permalink(self) -> str:
         # doesn't seem like the link in the export is any more specific...
-        return f'https://twitter.com/i/web/status/{self.tid}'
+        return f'https://twitter.com/{self.screen_name}/status/{self.tid}'

     @property
-    def tid(self) -> Tid:
+    def id_str(self) -> Tid:
         return self.raw['tweetId']

     @property
@@ -83,6 +95,11 @@ class Like(NamedTuple):
         # ugh. I think none means that tweet was deleted?
         return self.raw.get('fullText')

+    # TODO deprecate?
+    @property
+    def tid(self) -> Tid:
+        return self.id_str
+

 class ZipExport:
     def __init__(self) -> None:
@@ -113,17 +130,21 @@ class ZipExport:

                 # older format
                 yield j

+    @lru_cache(1)
+    def screen_name(self) -> str:
+        [acc] = self.raw('account')
+        return acc['username']

     def tweets(self) -> Iterator[Tweet]:
         for r in self.raw('tweet'):
-            yield Tweet(r)
+            yield Tweet(r, screen_name=self.screen_name())

     def likes(self) -> Iterator[Like]:
         # TODO ugh. would be nice to unify Tweet/Like interface
         # however, takeout only got tweetId, full text and url
         for r in self.raw('like'):
-            yield Like(r)
+            yield Like(r, screen_name=self.screen_name())


 def tweets() -> List[Tweet]:
@@ -185,7 +206,7 @@ def test_tweet():
 "in_reply_to_user_id_str" : "3748274"
 }
 """
-    t = Tweet(json.loads(raw))
+    t = Tweet(json.loads(raw), screen_name='whatever')
     assert t.permalink is not None
     assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
     assert t.text == 'this is a test tweet'
diff --git a/my/twitter_twint.py b/my/twitter/twint.py
similarity index 56%
rename from my/twitter_twint.py
rename to my/twitter/twint.py
index 22b2c23..45f58fd 100644
--- a/my/twitter_twint.py
+++ b/my/twitter/twint.py
@@ -3,11 +3,11 @@ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twin
 """

 from datetime import datetime
-from typing import NamedTuple, Iterable
+from typing import NamedTuple, Iterable, List
 from pathlib import Path

-from .common import PathIsh, get_files, LazyLogger, Json
-from .core.time import abbr_to_timezone
+from ..common import PathIsh, get_files, LazyLogger, Json
+from ..core.time import abbr_to_timezone

 from my.config import twint as config

@@ -45,6 +45,12 @@ class Tweet(NamedTuple):
     def text(self) -> str:
         return self.row['tweet']

+    @property
+    def urls(self) -> List[str]:
+        ustr = self.row['urls']
+        if len(ustr) == 0:
+            return []
+        return ustr.split(',')

     @property
     def permalink(self) -> str:
@@ -55,11 +61,37 @@ class Tweet(NamedTuple):
     def __repr__(self):
         return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'

+# https://github.com/twintproject/twint/issues/196
+# ugh. so twint dumps everything into one tweets table, and there is no good way to tell a fav from an original tweet.
+# it might result in some tweets missing from the timeline if you happened to like them...
+# not sure what to do with it
+# alternatively, could ask the user to run separate databases for tweets and favs?
+# TODO think about it

-def tweets() -> Iterable[Tweet]:
+_QUERY = '''
+SELECT T.*
+FROM tweets as T
+LEFT JOIN favorites as F
+ON T.id_str = F.tweet_id
+WHERE {where}
+ORDER BY T.created_at
+'''
+
+def _get_db():
     import dataset # type: ignore
     db_path = get_db_path()
     # TODO check that exists?
     db = dataset.connect(f'sqlite:///{db_path}')
-    tdb = db.load_table('tweets')
-    yield from map(Tweet, tdb.all(order_by='created_at'))
+    return db
+
+
+def tweets() -> Iterable[Tweet]:
+    db = _get_db()
+    res = db.query(_QUERY.format(where='F.tweet_id IS NULL'))
+    yield from map(Tweet, res)
+
+
+def likes() -> Iterable[Tweet]:
+    db = _get_db()
+    res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
+    yield from map(Tweet, res)
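

# ---------------------------------------------------------------------------
# A minimal sketch (not part of the patch above) of the dedup behaviour that
# my/twitter/all.py relies on: more_itertools.unique_everseen keeps the *first*
# item seen for each key, so whichever source is passed to merge_tweets first
# wins on conflicting ids; hence twint, with the better data, goes before the
# archive. FakeTweet is a hypothetical stand-in for the real Tweet classes,
# only there to keep the example self-contained.
from itertools import chain
from typing import NamedTuple

from more_itertools import unique_everseen


class FakeTweet(NamedTuple):  # hypothetical; only needs an id_str like the real classes
    id_str: str
    text: str


def merge_tweets(*sources):
    # same one-liner as in my/twitter/all.py: chain the sources, drop later duplicates by id_str
    yield from unique_everseen(chain(*sources), key=lambda t: t.id_str)


twint_side   = [FakeTweet('1', 'richer copy from twint')]
archive_side = [FakeTweet('1', 'poorer copy from archive'), FakeTweet('2', 'archive only')]

merged = list(merge_tweets(twint_side, archive_side))
# the twint copy of tweet '1' survives, and the archive-only tweet '2' is still included
assert [t.text for t in merged] == ['richer copy from twint', 'archive only']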
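

# ---------------------------------------------------------------------------
# Quick check of the timestamp format parsed by archive.Tweet.created_at
# ('%a %b %d %H:%M:%S %z %Y'). The sample string is inferred from the expected
# value in test_tweet above, not copied from a real export.
from datetime import datetime, timezone

dt = datetime.strptime('Thu Aug 30 07:12:48 +0000 2012', '%a %b %d %H:%M:%S %z %Y')
assert dt == datetime(2012, 8, 30, 7, 12, 48, tzinfo=timezone.utc)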
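

# ---------------------------------------------------------------------------
# A sketch of what the LEFT JOIN query in my/twitter/twint.py achieves: twint
# stores both your own tweets and the tweets you liked in a single `tweets`
# table, while `favorites` only records the liked ids, so filtering on whether
# the join found a match splits the two sets. This uses stdlib sqlite3 on an
# in-memory database with a guessed minimal schema (a real twint database has
# more columns; the `dataset` library in the patch runs the same SQL).
import sqlite3

conn = sqlite3.connect(':memory:')
conn.executescript('''
CREATE TABLE tweets    (id_str TEXT, created_at INTEGER, tweet TEXT, urls TEXT);
CREATE TABLE favorites (tweet_id TEXT);
INSERT INTO tweets    VALUES ('1', 100, 'my own tweet', ''), ('2', 200, 'a tweet someone else wrote', '');
INSERT INTO favorites VALUES ('2');
''')

QUERY = '''
SELECT T.*
FROM tweets as T
LEFT JOIN favorites as F
ON T.id_str = F.tweet_id
WHERE {where}
ORDER BY T.created_at
'''

mine  = conn.execute(QUERY.format(where='F.tweet_id IS NULL')).fetchall()
liked = conn.execute(QUERY.format(where='F.tweet_id IS NOT NULL')).fetchall()
assert [row[2] for row in mine]  == ['my own tweet']
assert [row[2] for row in liked] == ['a tweet someone else wrote']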