move twitter stuff to twitter subdir

Dima Gerasimov 2020-04-14 21:31:40 +01:00
parent 6f8c2e2f24
commit 56b6ab9aaf
2 changed files with 0 additions and 0 deletions

193
my/twitter/archive.py Executable file

@@ -0,0 +1,193 @@
"""
Twitter data (uses official twitter archive export)
See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive
"""
from . import init
from datetime import date, datetime
from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple
from pathlib import Path
import json
import zipfile
import pytz
from .common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress
logger = LazyLogger(__name__)
def _get_export() -> Path:
from my.config import twitter as config
return max(get_files(config.export_path, '*.zip'))
Tid = str
# TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple):
raw: Json
# TODO deprecate tid?
@property
def tid(self) -> Tid:
return self.raw['id_str']
@property
def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property
def dt(self) -> datetime:
dts = self.raw['created_at']
return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
@property
def text(self) -> str:
return self.raw['full_text']
# TODO not sure if I need them...
@property
def entities(self):
return self.raw['entities']
def __str__(self) -> str:
return str(self.raw)
def __repr__(self) -> str:
return repr(self.raw)
class Like(NamedTuple):
raw: Json
# TODO need to make permalink/link/url consistent across my stuff..
@property
def permalink(self) -> str:
# doesn'tseem like link it export is more specific...
return f'https://twitter.com/i/web/status/{self.tid}'
@property
def tid(self) -> Tid:
return self.raw['tweetId']
@property
def text(self) -> Optional[str]:
# ugh. I think none means that tweet was deleted?
return self.raw.get('fullText')
class ZipExport:
def __init__(self) -> None:
self.epath = _get_export()
self.old_format = False # changed somewhere around 2020.03
if not kompress.kexists(self.epath, 'Your archive.html'):
self.old_format = True
def raw(self, what: str): # TODO Json in common?
logger.info('processing: %s %s', self.epath, what)
path = what
if not self.old_format:
path = 'data/' + path
path += '.js'
with kompress.kopen(self.epath, path) as fo:
ddd = fo.read().decode('utf8')
start = ddd.index('[')
ddd = ddd[start:]
for j in json.loads(ddd):
if set(j.keys()) == {what}:
# newer format
yield j[what]
else:
# older format
yield j
def tweets(self) -> Iterator[Tweet]:
for r in self.raw('tweet'):
yield Tweet(r)
def likes(self) -> Iterator[Like]:
# TODO ugh. would be nice to unify Tweet/Like interface
# however, akeout only got tweetId, full text and url
for r in self.raw('like'):
yield Like(r)
def tweets() -> List[Tweet]:
return list(sorted(ZipExport().tweets(), key=lambda t: t.dt))
def likes() -> List[Like]:
return list(ZipExport().likes())
def predicate(p) -> List[Tweet]:
return [t for t in tweets() if p(t)]
def predicate_date(p) -> List[Tweet]: # TODO rename to by_date?
return predicate(lambda t: p(t.dt.date()))
# TODO move these to private tests?
Datish = Union[date, str]
def tweets_on(*dts: Datish) -> List[Tweet]:
from kython import parse_date_new
# TODO how to make sure we don't miss on 29 feb?
dates = {parse_date_new(d) for d in dts}
return predicate_date(lambda d: d in dates)
on = tweets_on
def test_tweet():
raw = """
{
"retweeted" : false,
"entities" : {
"hashtags" : [ ],
"symbols" : [ ],
"user_mentions" : [ ],
"urls" : [ {
"url" : "https://t.co/vUg4W6nxwU",
"expanded_url" : "https://intelligence.org/2013/12/13/aaronson/",
"display_url" : "intelligence.org/2013/12/13/aar…",
"indices" : [ "120", "143" ]
}
]
},
"display_text_range" : [ "0", "90" ],
"favorite_count" : "0",
"in_reply_to_status_id_str" : "24123424",
"id_str" : "2328934829084",
"in_reply_to_user_id" : "23423424",
"truncated" : false,
"retweet_count" : "0",
"id" : "23492349032940",
"in_reply_to_status_id" : "23482984932084",
"created_at" : "Thu Aug 30 07:12:48 +0000 2012",
"favorited" : false,
"full_text" : "this is a test tweet",
"lang" : "ru",
"in_reply_to_screen_name" : "whatever",
"in_reply_to_user_id_str" : "3748274"
}
"""
t = Tweet(json.loads(raw))
assert t.permalink is not None
assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
assert t.text == 'this is a test tweet'
assert t.tid == '2328934829084'
assert t.entities is not None
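
A minimal usage sketch for this module (not part of the commit), assuming my.config.twitter.export_path points at the downloaded *.zip archive and that the module is importable from its new my.twitter.archive location:

# usage sketch, assumes my.config.twitter.export_path is configured
from my.twitter.archive import tweets, likes, tweets_on

ts = tweets()                     # all tweets, sorted by date
print(len(ts), ts[-1].permalink, ts[-1].text)

for like in likes():              # liked tweets from the export
    print(like.permalink, like.text)

# tweets posted on a given date; the string is parsed by kython.parse_date_new
print(tweets_on('2012-08-30'))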

65
my/twitter/twint.py Normal file

@@ -0,0 +1,65 @@
"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
from datetime import datetime
from typing import NamedTuple, Iterable
from pathlib import Path
from .common import PathIsh, get_files, LazyLogger, Json
from .core.time import abbr_to_timezone
from my.config import twint as config
log = LazyLogger(__name__)
def get_db_path() -> Path:
# TODO don't like the hardcoded extension. maybe, config should decide?
# or, glob only applies to directories?
return max(get_files(config.export_path, glob='*.db'))
class Tweet(NamedTuple):
row: Json
@property
def id_str(self) -> str:
return self.row['id_str']
@property
def created_at(self) -> datetime:
seconds = self.row['created_at'] / 1000
tz_abbr = self.row['timezone']
tz = abbr_to_timezone(tz_abbr)
dt = datetime.fromtimestamp(seconds, tz=tz)
return dt
# TODO permalink -- take user into account?
@property
def screen_name(self) -> str:
return self.row['screen_name']
@property
def text(self) -> str:
return self.row['tweet']
@property
def permalink(self) -> str:
return f'https://twitter.com/{self.screen_name}/status/{self.id_str}'
# TODO urls
def __repr__(self):
return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
def tweets() -> Iterable[Tweet]:
import dataset # type: ignore
db_path = get_db_path()
# TODO check that exists?
db = dataset.connect(f'sqlite:///{db_path}')
tdb = db.load_table('tweets')
yield from map(Tweet, tdb.all(order_by='created_at'))
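
Similarly, a minimal usage sketch for the twint module (not part of the commit), assuming my.config.twint.export_path points at the sqlite *.db file produced by twint and that the relative imports resolve from the new my.twitter.twint location:

# usage sketch, assumes my.config.twint.export_path is configured
from my.twitter.twint import tweets

for t in tweets():                # yielded in created_at order
    print(t.created_at, t.permalink)
    print(t.text)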