From 56b6ab9aaf1b980ff2f65f51d1ceebad573fb0e9 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 14 Apr 2020 21:31:40 +0100 Subject: [PATCH 1/4] move twitter stuff to twitter subdir --- my/{twitter.py => twitter/archive.py} | 0 my/{twitter_twint.py => twitter/twint.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename my/{twitter.py => twitter/archive.py} (100%) rename my/{twitter_twint.py => twitter/twint.py} (100%) diff --git a/my/twitter.py b/my/twitter/archive.py similarity index 100% rename from my/twitter.py rename to my/twitter/archive.py diff --git a/my/twitter_twint.py b/my/twitter/twint.py similarity index 100% rename from my/twitter_twint.py rename to my/twitter/twint.py From 30b6918a8de626b85ca89444c02714e438193693 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 14 Apr 2020 22:05:47 +0100 Subject: [PATCH 2/4] unified view for twitter data --- my/twitter/all.py | 17 +++++++++++++ my/twitter/archive.py | 58 +++++++++++++++++++++++++++---------------- my/twitter/twint.py | 12 ++++++--- 3 files changed, 63 insertions(+), 24 deletions(-) create mode 100644 my/twitter/all.py diff --git a/my/twitter/all.py b/my/twitter/all.py new file mode 100644 index 0000000..29196e4 --- /dev/null +++ b/my/twitter/all.py @@ -0,0 +1,17 @@ +""" +Unified Twitter data (merged from the archive and periodic updates) +""" + +from . import twint +from . import archive + + +def tweets(): + yield from archive.tweets() + yield from twint.tweets() + + +# TODO not sure, likes vs favoites?? +def likes(): + yield from archive.likes() + # yield from twint diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 37d08c4..9f4c7ee 100755 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -3,27 +3,25 @@ Twitter data (uses official twitter archive export) See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive """ - -from . 
import init - - from datetime import date, datetime from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple from pathlib import Path +from functools import lru_cache import json import zipfile import pytz -from .common import PathIsh, get_files, LazyLogger, Json -from .kython import kompress +from ..common import PathIsh, get_files, LazyLogger, Json +from ..kython import kompress + +from my.config import twitter as config logger = LazyLogger(__name__) def _get_export() -> Path: - from my.config import twitter as config return max(get_files(config.export_path, '*.zip')) @@ -33,29 +31,33 @@ Tid = str # TODO make sure it's not used anywhere else and simplify interface class Tweet(NamedTuple): raw: Json + screen_name: str - # TODO deprecate tid? @property - def tid(self) -> Tid: + def id_str(self) -> str: return self.raw['id_str'] @property - def permalink(self) -> str: - return f'https://twitter.com/i/web/status/{self.tid}' - - # TODO deprecate dt? - @property - def dt(self) -> datetime: + def created_at(self) -> datetime: dts = self.raw['created_at'] return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y') + @property + def permalink(self) -> str: + return f'https://twitter.com/{self.screen_name}/status/{self.tid}' + @property def text(self) -> str: return self.raw['full_text'] - # TODO not sure if I need them... @property - def entities(self): + def urls(self) -> List[str]: + ents = self.entities + us = ents['urls'] + return [u['expanded_url'] for u in us] + + @property + def entities(self) -> Json: return self.raw['entities'] def __str__(self) -> str: @@ -64,15 +66,25 @@ class Tweet(NamedTuple): def __repr__(self) -> str: return repr(self.raw) + # TODO deprecate tid? + @property + def tid(self) -> Tid: + return self.id_str + + @property + def dt(self) -> datetime: + return self.created_at + class Like(NamedTuple): raw: Json + screen_name: str # TODO need to make permalink/link/url consistent across my stuff.. 
@property def permalink(self) -> str: # doesn'tseem like link it export is more specific... - return f'https://twitter.com/i/web/status/{self.tid}' + return f'https://twitter.com/{self.screen_name}/status/{self.tid}' @property def tid(self) -> Tid: @@ -113,17 +125,21 @@ class ZipExport: # older format yield j + @lru_cache(1) + def screen_name(self) -> str: + [acc] = self.raw('account') + return acc['username'] def tweets(self) -> Iterator[Tweet]: for r in self.raw('tweet'): - yield Tweet(r) + yield Tweet(r, screen_name=self.screen_name()) def likes(self) -> Iterator[Like]: # TODO ugh. would be nice to unify Tweet/Like interface # however, akeout only got tweetId, full text and url for r in self.raw('like'): - yield Like(r) + yield Like(r, screen_name=self.screen_name()) def tweets() -> List[Tweet]: @@ -185,7 +201,7 @@ def test_tweet(): "in_reply_to_user_id_str" : "3748274" } """ - t = Tweet(json.loads(raw)) + t = Tweet(json.loads(raw), screen_name='whatever') assert t.permalink is not None assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc) assert t.text == 'this is a test tweet' diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 22b2c23..2180a8a 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -3,11 +3,11 @@ Twitter data (tweets and favorites). 
Uses [[https://github.com/twintproject/twin """ from datetime import datetime -from typing import NamedTuple, Iterable +from typing import NamedTuple, Iterable, List from pathlib import Path -from .common import PathIsh, get_files, LazyLogger, Json -from .core.time import abbr_to_timezone +from ..common import PathIsh, get_files, LazyLogger, Json +from ..core.time import abbr_to_timezone from my.config import twint as config @@ -45,6 +45,12 @@ class Tweet(NamedTuple): def text(self) -> str: return self.row['tweet'] + @property + def urls(self) -> List[str]: + ustr = self.row['urls'] + if len(ustr) == 0: + return [] + return ustr.split(',') @property def permalink(self) -> str: From 69a1624f8f407670039f01bf9ba1ebe608a77fd2 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 14 Apr 2020 22:15:35 +0100 Subject: [PATCH 3/4] use more-itertools; merge tweets --- my/twitter/all.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/my/twitter/all.py b/my/twitter/all.py index 29196e4..f1e39a7 100644 --- a/my/twitter/all.py +++ b/my/twitter/all.py @@ -1,17 +1,29 @@ """ Unified Twitter data (merged from the archive and periodic updates) """ +from itertools import chain from . import twint from . import archive +from more_itertools import unique_everseen + + +def merge_tweets(*sources): + yield from unique_everseen( + chain(*sources), + key=lambda t: t.id_str, + ) + + def tweets(): - yield from archive.tweets() - yield from twint.tweets() + # NOTE order matters.. twint seems to contain better data + # todo probably, worthy an investigation.. + yield from merge_tweets(twint.tweets(), archive.tweets()) # TODO not sure, likes vs favoites?? 
def likes(): - yield from archive.likes() + yield from merge_tweets(archive.likes()) # yield from twint From 81986b06249e5c162dee56b9adbe42cb18ef1135 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 14 Apr 2020 23:01:44 +0100 Subject: [PATCH 4/4] support likes from twint --- my/twitter/all.py | 6 ++---- my/twitter/archive.py | 7 ++++++- my/twitter/twint.py | 32 +++++++++++++++++++++++++++++--- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/my/twitter/all.py b/my/twitter/all.py index f1e39a7..4a1d1be 100644 --- a/my/twitter/all.py +++ b/my/twitter/all.py @@ -20,10 +20,8 @@ def merge_tweets(*sources): def tweets(): # NOTE order matters.. twint seems to contain better data # todo probably, worthy an investigation.. yield from merge_tweets(twint.tweets(), archive.tweets()) -# TODO not sure, likes vs favoites?? def likes(): - yield from merge_tweets(archive.likes()) - # yield from twint + yield from merge_tweets(twint.likes(), archive.likes()) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 9f4c7ee..24f9a14 100755 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -87,7 +87,7 @@ class Like(NamedTuple): return f'https://twitter.com/{self.screen_name}/status/{self.tid}' @property - def tid(self) -> Tid: + def id_str(self) -> Tid: return self.raw['tweetId'] @property @@ -95,6 +95,11 @@ class Like(NamedTuple): # ugh. I think none means that tweet was deleted? return self.raw.get('fullText') + # TODO deprecate? + @property + def tid(self) -> Tid: + return self.id_str + class ZipExport: def __init__(self) -> None: diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 2180a8a..45f58fd 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -61,11 +61,37 @@ class Tweet(NamedTuple): def __repr__(self): return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})' +# https://github.com/twintproject/twint/issues/196 +# ugh. 
so it dumps everything in tweet table, and there is no good way to tell between fav/original tweet. +# it might result in some tweets missing from the timeline if you happened to like them... +# not sure what to do with it +# alternatively, could ask the user to run separate databases for tweets and favs? +# TODO think about it -def tweets() -> Iterable[Tweet]: +_QUERY = ''' +SELECT T.* +FROM tweets as T +LEFT JOIN favorites as F +ON T.id_str = F.tweet_id +WHERE {where} +ORDER BY T.created_at +''' + +def _get_db(): import dataset # type: ignore db_path = get_db_path() # TODO check that exists? db = dataset.connect(f'sqlite:///{db_path}') - tdb = db.load_table('tweets') - yield from map(Tweet, tdb.all(order_by='created_at')) + return db + + +def tweets() -> Iterable[Tweet]: + db = _get_db() + res = db.query(_QUERY.format(where='F.tweet_id IS NULL')) + yield from map(Tweet, res) + + +def likes() -> Iterable[Tweet]: + db = _get_db() + res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL')) + yield from map(Tweet, res)