diff --git a/my/common.py b/my/common.py
index e80e736..2c241cd 100644
--- a/my/common.py
+++ b/my/common.py
@@ -3,6 +3,8 @@ import functools
 import types
 from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast
 
+from . import init
+
 # some helper functions
 PathIsh = Union[Path, str]
 
@@ -163,3 +165,6 @@ def fastermime(path: str) -> str:
     # magic is slower but returns more stuff
     # TODO FIXME Result type; it's inherently racey
     return _magic().from_file(path)
+
+
+Json = Dict[str, Any]
diff --git a/my/core/time.py b/my/core/time.py
new file mode 100644
index 0000000..d34ebf8
--- /dev/null
+++ b/my/core/time.py
@@ -0,0 +1,24 @@
+from functools import lru_cache
+from datetime import datetime
+
+import pytz # type: ignore
+
+# https://gist.github.com/edwardabraham/8680198
+# NOTE: abbreviations are taken from tzname() of datetime.now() at import time;
+# zones sharing an abbreviation overwrite each other (last in pytz.all_timezones wins)
+tz_lookup = {
+    pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
+    for x in pytz.all_timezones
+}
+tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
+
+
+# maxsize=None makes the cache unbounded; a negative maxsize is treated as 0
+# (i.e. nothing is cached) on Python 3.8+
+@lru_cache(None)
+def abbr_to_timezone(abbr: str):
+    """Return the timezone stored in tz_lookup under the abbreviation abbr.
+
+    Raises KeyError if abbr is not a key of tz_lookup.
+    """
+    return tz_lookup[abbr]
diff --git a/my/kython/ktakeout.py b/my/kython/ktakeout.py
index 513d4d6..96a3f58 100644
--- a/my/kython/ktakeout.py
+++ b/my/kython/ktakeout.py
@@ -8,18 +8,12 @@ from collections import OrderedDict
 from urllib.parse import unquote
 import pytz
 
+from ..core.time import abbr_to_timezone
+
 # Mar 8, 2018, 5:14:40 PM
 _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
 
 
-# https://gist.github.com/edwardabraham/8680198
-tz_lookup = {
-    pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
-    for x in pytz.all_timezones
-}
-tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
-
-
 # ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :(
 def parse_dt(s: str) -> datetime:
     fmt = _TIME_FORMAT
@@ -33,8 +27,8 @@ def parse_dt(s: str) -> datetime:
         # hopefully it was utc? Legacy, so no that much of an issue anymore..
         tz = pytz.utc
     else:
-        s, tzname = s.rsplit(maxsplit=1)
-        tz = tz_lookup[tzname]
+        s, tzabbr = s.rsplit(maxsplit=1)
+        tz = abbr_to_timezone(tzabbr)
 
     dt = datetime.strptime(s, fmt)
     dt = tz.localize(dt)
diff --git a/my/twitter.py b/my/twitter.py
index fb3a36f..37d08c4 100755
--- a/my/twitter.py
+++ b/my/twitter.py
@@ -15,41 +15,26 @@ import zipfile
 
 import pytz
 
-from .common import PathIsh, get_files, LazyLogger
+from .common import PathIsh, get_files, LazyLogger, Json
 from .kython import kompress
 
 
-logger = LazyLogger(__package__)
-
-
-# TODO get rid of this?
-_export_path: Optional[Path] = None
-def configure(*, export_path: Optional[PathIsh]=None) -> None:
-    if export_path is not None:
-        global _export_path
-        _export_path = Path(export_path)
+logger = LazyLogger(__name__)
 
 
 def _get_export() -> Path:
-    export_path = _export_path
-    if export_path is None:
-        # fallback
-        from my.config import twitter as config
-        export_path = config.export_path
-    return max(get_files(export_path, '*.zip'))
+    from my.config import twitter as config
+    return max(get_files(config.export_path, '*.zip'))
 
 
 Tid = str
 
 
-# TODO a bit messy... perhaps we do need DAL for twitter exports
-Json = Dict[str, Any]
-
-
 # TODO make sure it's not used anywhere else and simplify interface
 class Tweet(NamedTuple):
     raw: Json
 
+    # TODO deprecate tid?
     @property
     def tid(self) -> Tid:
         return self.raw['id_str']
@@ -58,6 +43,7 @@ class Tweet(NamedTuple):
     def permalink(self) -> str:
         return f'https://twitter.com/i/web/status/{self.tid}'
 
+    # TODO deprecate dt?
     @property
     def dt(self) -> datetime:
         dts = self.raw['created_at']
@@ -67,6 +53,7 @@ class Tweet(NamedTuple):
     def text(self) -> str:
         return self.raw['full_text']
 
+    # TODO not sure if I need them...
     @property
     def entities(self):
         return self.raw['entities']
diff --git a/my/twitter_twint.py b/my/twitter_twint.py
new file mode 100644
index 0000000..22b2c23
--- /dev/null
+++ b/my/twitter_twint.py
@@ -0,0 +1,69 @@
+"""
+Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
+"""
+
+from datetime import datetime
+from typing import NamedTuple, Iterable
+from pathlib import Path
+
+from .common import PathIsh, get_files, LazyLogger, Json
+from .core.time import abbr_to_timezone
+
+from my.config import twint as config
+
+
+log = LazyLogger(__name__)
+
+
+def get_db_path() -> Path:
+    """Pick the max() of the '*.db' paths that get_files yields for config.export_path."""
+    # TODO don't like the hardcoded extension. maybe, config should decide?
+    # or, glob only applies to directories?
+    return max(get_files(config.export_path, glob='*.db'))
+
+
+class Tweet(NamedTuple):
+    """Accessors over a single row of the Twint 'tweets' table."""
+    row: Json
+
+    @property
+    def id_str(self) -> str:
+        return self.row['id_str']
+
+    @property
+    def created_at(self) -> datetime:
+        # assumes the stored value is in milliseconds -- TODO confirm
+        seconds = self.row['created_at'] / 1000
+        tz_abbr = self.row['timezone']
+        tz = abbr_to_timezone(tz_abbr)
+        dt = datetime.fromtimestamp(seconds, tz=tz)
+        return dt
+
+    # TODO permalink -- take user into account?
+    @property
+    def screen_name(self) -> str:
+        return self.row['screen_name']
+
+    @property
+    def text(self) -> str:
+        return self.row['tweet']
+
+
+    @property
+    def permalink(self) -> str:
+        return f'https://twitter.com/{self.screen_name}/status/{self.id_str}'
+
+
+    # TODO urls
+    def __repr__(self):
+        return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
+
+
+def tweets() -> Iterable[Tweet]:
+    """Yield a Tweet for each row of the 'tweets' table, ordered by created_at."""
+    import dataset # type: ignore
+    db_path = get_db_path()
+    # TODO check that exists?
+    db = dataset.connect(f'sqlite:///{db_path}')
+    tdb = db.load_table('tweets')
+    yield from map(Tweet, tdb.all(order_by='created_at'))