Use twint data export for periodic twitter data

Dima Gerasimov 2020-04-14 21:23:35 +01:00
parent 811303fcb1
commit 6f8c2e2f24
5 changed files with 97 additions and 30 deletions


@@ -3,6 +3,8 @@ import functools
import types
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast
from . import init
# some helper functions
PathIsh = Union[Path, str]
@@ -163,3 +165,6 @@ def fastermime(path: str) -> str:
# magic is slower but returns more stuff
# TODO FIXME Result type; it's inherently racey
return _magic().from_file(path)
Json = Dict[str, Any]

16  my/core/time.py  Normal file

@@ -0,0 +1,16 @@
from functools import lru_cache
from datetime import datetime
import pytz # type: ignore
# https://gist.github.com/edwardabraham/8680198
tz_lookup = {
pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
for x in pytz.all_timezones
}
tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
@lru_cache(-1)
def abbr_to_timezone(abbr: str):
return tz_lookup[abbr]
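
A minimal usage sketch of the new helper; the abbreviation and datetime below are made-up values for illustration:

    from datetime import datetime
    from my.core.time import abbr_to_timezone

    tz = abbr_to_timezone('UTC')   # pytz timezone resolved from its abbreviation
    dt = tz.localize(datetime(2020, 4, 14, 21, 23, 35))
    print(dt.isoformat())          # 2020-04-14T21:23:35+00:00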


@@ -8,18 +8,12 @@ from collections import OrderedDict
from urllib.parse import unquote
import pytz
from ..core.time import abbr_to_timezone
# Mar 8, 2018, 5:14:40 PM
_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
# https://gist.github.com/edwardabraham/8680198
tz_lookup = {
pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
for x in pytz.all_timezones
}
tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
# ugh. something is seriously wrong with datetime, it wouldn't parse a timezone-aware UTC timestamp :(
def parse_dt(s: str) -> datetime:
fmt = _TIME_FORMAT
@@ -33,8 +27,8 @@ def parse_dt(s: str) -> datetime:
# hopefully it was utc? Legacy, so not that much of an issue anymore..
tz = pytz.utc
else:
s, tzname = s.rsplit(maxsplit=1)
tz = tz_lookup[tzname]
s, tzabbr = s.rsplit(maxsplit=1)
tz = abbr_to_timezone(tzabbr)
dt = datetime.strptime(s, fmt)
dt = tz.localize(dt)
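
A sketch of what the new branch does for a timestamp carrying a trailing timezone abbreviation; the sample string below is constructed from the _TIME_FORMAT above and is illustrative only:

    from datetime import datetime
    from my.core.time import abbr_to_timezone

    s = "Mar 8, 2018, 5:14:40 PM UTC"    # sample value in the format above, plus a tz abbreviation
    s, tzabbr = s.rsplit(maxsplit=1)     # -> ("Mar 8, 2018, 5:14:40 PM", "UTC")
    tz = abbr_to_timezone(tzabbr)
    dt = tz.localize(datetime.strptime(s, "%b %d, %Y, %I:%M:%S %p"))
    print(dt)                            # 2018-03-08 17:14:40+00:00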


@@ -15,41 +15,26 @@ import zipfile
import pytz
from .common import PathIsh, get_files, LazyLogger
from .common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress
logger = LazyLogger(__package__)
# TODO get rid of this?
_export_path: Optional[Path] = None
def configure(*, export_path: Optional[PathIsh]=None) -> None:
if export_path is not None:
global _export_path
_export_path = Path(export_path)
logger = LazyLogger(__name__)
def _get_export() -> Path:
export_path = _export_path
if export_path is None:
# fallback
from my.config import twitter as config
export_path = config.export_path
return max(get_files(export_path, '*.zip'))
return max(get_files(config.export_path, '*.zip'))
Tid = str
# TODO a bit messy... perhaps we do need DAL for twitter exports
Json = Dict[str, Any]
# TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple):
raw: Json
# TODO deprecate tid?
@property
def tid(self) -> Tid:
return self.raw['id_str']
@@ -58,6 +43,7 @@ class Tweet(NamedTuple):
def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property
def dt(self) -> datetime:
dts = self.raw['created_at']
@@ -67,6 +53,7 @@ class Tweet(NamedTuple):
def text(self) -> str:
return self.raw['full_text']
# TODO not sure if I need them...
@property
def entities(self):
return self.raw['entities']
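
The archive-side Tweet wrapper keeps the raw export JSON and derives its fields on access; a small sketch with a made-up payload (the module's import path isn't shown in this diff, so Tweet is assumed to already be in scope):

    # hypothetical raw entry; field values are made up for illustration
    raw = {
        'id_str': '1250000000000000000',
        'full_text': 'hello world',
        'entities': {},
    }
    t = Tweet(raw=raw)
    print(t.permalink)   # https://twitter.com/i/web/status/1250000000000000000
    print(t.text)        # hello world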

65  my/twitter_twint.py  Normal file

@@ -0,0 +1,65 @@
"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
from datetime import datetime
from typing import NamedTuple, Iterable
from pathlib import Path
from .common import PathIsh, get_files, LazyLogger, Json
from .core.time import abbr_to_timezone
from my.config import twint as config
log = LazyLogger(__name__)
def get_db_path() -> Path:
# TODO don't like the hardcoded extension. maybe, config should decide?
# or, glob only applies to directories?
return max(get_files(config.export_path, glob='*.db'))
class Tweet(NamedTuple):
row: Json
@property
def id_str(self) -> str:
return self.row['id_str']
@property
def created_at(self) -> datetime:
seconds = self.row['created_at'] / 1000
tz_abbr = self.row['timezone']
tz = abbr_to_timezone(tz_abbr)
dt = datetime.fromtimestamp(seconds, tz=tz)
return dt
# TODO permalink -- take user into account?
@property
def screen_name(self) -> str:
return self.row['screen_name']
@property
def text(self) -> str:
return self.row['tweet']
@property
def permalink(self) -> str:
return f'https://twitter.com/{self.screen_name}/status/{self.id_str}'
# TODO urls
def __repr__(self):
return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
def tweets() -> Iterable[Tweet]:
import dataset # type: ignore
db_path = get_db_path()
# TODO check that exists?
db = dataset.connect(f'sqlite:///{db_path}')
tdb = db.load_table('tweets')
yield from map(Tweet, tdb.all(order_by='created_at'))
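
A usage sketch for the new module, assuming the dataset package is installed and my.config.twint.export_path points at the Twint *.db export (or a directory containing it):

    from my import twitter_twint

    # tweets() yields Tweet wrappers ordered by created_at
    for tweet in twitter_twint.tweets():
        print(tweet.created_at.isoformat(), tweet.permalink)
        print(tweet.text)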