Use twint data export for periodic twitter data

This commit is contained in:
Dima Gerasimov 2020-04-14 21:23:35 +01:00
parent 811303fcb1
commit 6f8c2e2f24
5 changed files with 97 additions and 30 deletions

View file

@ -3,6 +3,8 @@ import functools
import types import types
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast
from . import init
# some helper functions # some helper functions
PathIsh = Union[Path, str] PathIsh = Union[Path, str]
@ -163,3 +165,6 @@ def fastermime(path: str) -> str:
# magic is slower but returns more stuff # magic is slower but returns more stuff
# TODO FIXME Result type; it's inherently racey # TODO FIXME Result type; it's inherently racey
return _magic().from_file(path) return _magic().from_file(path)
Json = Dict[str, Any]

16
my/core/time.py Normal file
View file

@ -0,0 +1,16 @@
from functools import lru_cache
from datetime import datetime
import pytz # type: ignore
# Maps a timezone abbreviation (as reported by ``tzname()``) to a pytz timezone.
# Approach taken from https://gist.github.com/edwardabraham/8680198
#
# The abbreviation of each zone is computed for the moment the module is
# imported. When several zones report the same abbreviation, the one that
# comes last in ``pytz.all_timezones`` is the one that is kept.
tz_lookup = dict(
    (pytz.timezone(zone_name).localize(datetime.now()).tzname(), pytz.timezone(zone_name))
    for zone_name in pytz.all_timezones
)
# ugh. without this override 'UTC' ends up pointing at Zulu...
tz_lookup['UTC'] = pytz.utc
@lru_cache(maxsize=None)
def abbr_to_timezone(abbr: str):
    """
    Resolve a timezone abbreviation (e.g. ``'UTC'``) to a timezone object.

    The abbreviation is looked up in the module-level ``tz_lookup`` table and
    successful lookups are memoized.

    Raises:
        KeyError: if ``abbr`` is not a key of ``tz_lookup``.
    """
    # maxsize=None asks for an unbounded cache. The previous value of -1 is
    # treated as 0 by functools on Python 3.8+, which turns caching off entirely.
    return tz_lookup[abbr]

View file

@ -8,18 +8,12 @@ from collections import OrderedDict
from urllib.parse import unquote from urllib.parse import unquote
import pytz import pytz
from ..core.time import abbr_to_timezone
# Mar 8, 2018, 5:14:40 PM # Mar 8, 2018, 5:14:40 PM
_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p" _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
# https://gist.github.com/edwardabraham/8680198
tz_lookup = {
pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
for x in pytz.all_timezones
}
tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
# ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :( # ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :(
def parse_dt(s: str) -> datetime: def parse_dt(s: str) -> datetime:
fmt = _TIME_FORMAT fmt = _TIME_FORMAT
@ -33,8 +27,8 @@ def parse_dt(s: str) -> datetime:
# hopefully it was utc? Legacy, so no that much of an issue anymore.. # hopefully it was utc? Legacy, so no that much of an issue anymore..
tz = pytz.utc tz = pytz.utc
else: else:
s, tzname = s.rsplit(maxsplit=1) s, tzabbr = s.rsplit(maxsplit=1)
tz = tz_lookup[tzname] tz = abbr_to_timezone(tzabbr)
dt = datetime.strptime(s, fmt) dt = datetime.strptime(s, fmt)
dt = tz.localize(dt) dt = tz.localize(dt)

View file

@ -15,41 +15,26 @@ import zipfile
import pytz import pytz
from .common import PathIsh, get_files, LazyLogger from .common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress from .kython import kompress
logger = LazyLogger(__package__) logger = LazyLogger(__name__)
# TODO get rid of this?
_export_path: Optional[Path] = None
def configure(*, export_path: Optional[PathIsh]=None) -> None:
if export_path is not None:
global _export_path
_export_path = Path(export_path)
def _get_export() -> Path: def _get_export() -> Path:
export_path = _export_path from my.config import twitter as config
if export_path is None: return max(get_files(config.export_path, '*.zip'))
# fallback
from my.config import twitter as config
export_path = config.export_path
return max(get_files(export_path, '*.zip'))
Tid = str Tid = str
# TODO a bit messy... perhaps we do need DAL for twitter exports
Json = Dict[str, Any]
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple): class Tweet(NamedTuple):
raw: Json raw: Json
# TODO deprecate tid?
@property @property
def tid(self) -> Tid: def tid(self) -> Tid:
return self.raw['id_str'] return self.raw['id_str']
@ -58,6 +43,7 @@ class Tweet(NamedTuple):
def permalink(self) -> str: def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}' return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property @property
def dt(self) -> datetime: def dt(self) -> datetime:
dts = self.raw['created_at'] dts = self.raw['created_at']
@ -67,6 +53,7 @@ class Tweet(NamedTuple):
def text(self) -> str: def text(self) -> str:
return self.raw['full_text'] return self.raw['full_text']
# TODO not sure if I need them...
@property @property
def entities(self): def entities(self):
return self.raw['entities'] return self.raw['entities']

65
my/twitter_twint.py Normal file
View file

@ -0,0 +1,65 @@
"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
from datetime import datetime
from typing import NamedTuple, Iterable
from pathlib import Path
from .common import PathIsh, get_files, LazyLogger, Json
from .core.time import abbr_to_timezone
from my.config import twint as config
log = LazyLogger(__name__)
def get_db_path() -> Path:
    """
    Return the Twint database file to read from.

    Asks ``get_files`` for the ``*.db`` matches of ``config.export_path`` and
    returns the greatest of them as ordered by ``max``.
    """
    # TODO don't like the hardcoded extension. maybe, config should decide?
    # or, glob only applies to directories?
    candidates = get_files(config.export_path, glob='*.db')
    return max(candidates)
class Tweet(NamedTuple):
    """
    A single tweet backed by one raw row.

    Every property reads its value from ``row`` at access time.
    """

    # raw mapping that the properties below read from
    row: Json

    @property
    def id_str(self) -> str:
        """Tweet id, taken from the ``id_str`` field of the row."""
        return self.row['id_str']

    @property
    def created_at(self) -> datetime:
        """
        Timezone-aware creation time of the tweet.

        The row's ``created_at`` value is divided by 1000 before being used as
        a POSIX timestamp (i.e. it is treated as milliseconds); the row's
        ``timezone`` abbreviation selects the timezone of the result.
        """
        epoch_seconds = self.row['created_at'] / 1000
        zone = abbr_to_timezone(self.row['timezone'])
        return datetime.fromtimestamp(epoch_seconds, tz=zone)

    # TODO permalink -- take user into account?
    @property
    def screen_name(self) -> str:
        """Screen name stored in the row."""
        return self.row['screen_name']

    @property
    def text(self) -> str:
        """Text of the tweet, taken from the ``tweet`` field of the row."""
        return self.row['tweet']

    @property
    def permalink(self) -> str:
        """URL of the tweet, built from the screen name and the tweet id."""
        return f'https://twitter.com/{self.screen_name}/status/{self.id_str}'

    # TODO urls

    def __repr__(self):
        # show the derived fields rather than dumping the whole raw row
        return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
def tweets() -> Iterable[Tweet]:
    """
    Yield a ``Tweet`` for every row of the ``tweets`` table.

    The table is read from the sqlite database returned by ``get_db_path()``,
    with rows ordered by their ``created_at`` column.
    """
    # imported lazily, only when tweets are actually requested
    import dataset  # type: ignore

    database_file = get_db_path()
    # TODO check that exists?
    connection = dataset.connect(f'sqlite:///{database_file}')
    table = connection.load_table('tweets')
    for record in table.all(order_by='created_at'):
        yield Tweet(record)