From 30b6918a8de626b85ca89444c02714e438193693 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 14 Apr 2020 22:05:47 +0100 Subject: [PATCH] unified view for twitter data --- my/twitter/all.py | 17 +++++++++++++ my/twitter/archive.py | 58 +++++++++++++++++++++++++++---------------- my/twitter/twint.py | 12 ++++++--- 3 files changed, 63 insertions(+), 24 deletions(-) create mode 100644 my/twitter/all.py diff --git a/my/twitter/all.py b/my/twitter/all.py new file mode 100644 index 0000000..29196e4 --- /dev/null +++ b/my/twitter/all.py @@ -0,0 +1,17 @@ +""" +Unified Twitter data (merged from the archive and periodic updates) +""" + +from . import twint +from . import archive + + +def tweets(): + yield from archive.tweets() + yield from twint.tweets() + + +# TODO not sure, likes vs favorites?? +def likes(): + yield from archive.likes() + # yield from twint diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 37d08c4..9f4c7ee 100755 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -3,27 +3,25 @@ Twitter data (uses official twitter archive export) See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive """ - -from . import init - - from datetime import date, datetime from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple from pathlib import Path +from functools import lru_cache import json import zipfile import pytz -from .common import PathIsh, get_files, LazyLogger, Json -from .kython import kompress +from ..common import PathIsh, get_files, LazyLogger, Json +from ..kython import kompress + +from my.config import twitter as config logger = LazyLogger(__name__) def _get_export() -> Path: - from my.config import twitter as config return max(get_files(config.export_path, '*.zip')) @@ -33,29 +31,33 @@ Tid = str # TODO make sure it's not used anywhere else and simplify interface class Tweet(NamedTuple): raw: Json + screen_name: str - # TODO deprecate tid? 
@property - def tid(self) -> Tid: + def id_str(self) -> str: return self.raw['id_str'] @property - def permalink(self) -> str: - return f'https://twitter.com/i/web/status/{self.tid}' - - # TODO deprecate dt? - @property - def dt(self) -> datetime: + def created_at(self) -> datetime: dts = self.raw['created_at'] return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y') + @property + def permalink(self) -> str: + return f'https://twitter.com/{self.screen_name}/status/{self.tid}' + @property def text(self) -> str: return self.raw['full_text'] - # TODO not sure if I need them... @property - def entities(self): + def urls(self) -> List[str]: + ents = self.entities + us = ents['urls'] + return [u['expanded_url'] for u in us] + + @property + def entities(self) -> Json: return self.raw['entities'] def __str__(self) -> str: @@ -64,15 +66,25 @@ class Tweet(NamedTuple): def __repr__(self) -> str: return repr(self.raw) + # TODO deprecate tid? + @property + def tid(self) -> Tid: + return self.id_str + + @property + def dt(self) -> datetime: + return self.created_at + class Like(NamedTuple): raw: Json + screen_name: str # TODO need to make permalink/link/url consistent across my stuff.. @property def permalink(self) -> str: # doesn'tseem like link it export is more specific... - return f'https://twitter.com/i/web/status/{self.tid}' + return f'https://twitter.com/{self.screen_name}/status/{self.tid}' @property def tid(self) -> Tid: @@ -113,17 +125,21 @@ class ZipExport: # older format yield j + @lru_cache(1) + def screen_name(self) -> str: + [acc] = self.raw('account') + return acc['username'] def tweets(self) -> Iterator[Tweet]: for r in self.raw('tweet'): - yield Tweet(r) + yield Tweet(r, screen_name=self.screen_name()) def likes(self) -> Iterator[Like]: # TODO ugh. 
would be nice to unify Tweet/Like interface # however, akeout only got tweetId, full text and url for r in self.raw('like'): - yield Like(r) + yield Like(r, screen_name=self.screen_name()) def tweets() -> List[Tweet]: @@ -185,7 +201,7 @@ def test_tweet(): "in_reply_to_user_id_str" : "3748274" } """ - t = Tweet(json.loads(raw)) + t = Tweet(json.loads(raw), screen_name='whatever') assert t.permalink is not None assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc) assert t.text == 'this is a test tweet' diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 22b2c23..2180a8a 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -3,11 +3,11 @@ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twin """ from datetime import datetime -from typing import NamedTuple, Iterable +from typing import NamedTuple, Iterable, List from pathlib import Path -from .common import PathIsh, get_files, LazyLogger, Json -from .core.time import abbr_to_timezone +from ..common import PathIsh, get_files, LazyLogger, Json +from ..core.time import abbr_to_timezone from my.config import twint as config @@ -45,6 +45,12 @@ class Tweet(NamedTuple): def text(self) -> str: return self.row['tweet'] + @property + def urls(self) -> List[str]: + ustr = self.row['urls'] + if len(ustr) == 0: + return [] + return ustr.split(',') @property def permalink(self) -> str: