Use twint data export for periodic twitter data
commit 6f8c2e2f24
parent 811303fcb1

5 changed files with 97 additions and 30 deletions
my/common.py

@@ -3,6 +3,8 @@ import functools
 import types
 from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast
 
+from . import init
+
 # some helper functions
 PathIsh = Union[Path, str]
 
@@ -163,3 +165,6 @@ def fastermime(path: str) -> str:
     # magic is slower but returns more stuff
     # TODO FIXME Result type; it's inherently racey
     return _magic().from_file(path)
+
+
+Json = Dict[str, Any]
my/core/time.py (new file, 16 lines)

@@ -0,0 +1,16 @@
+from functools import lru_cache
+from datetime import datetime
+
+import pytz # type: ignore
+
+# https://gist.github.com/edwardabraham/8680198
+tz_lookup = {
+    pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
+    for x in pytz.all_timezones
+}
+tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
+
+
+@lru_cache(-1)
+def abbr_to_timezone(abbr: str):
+    return tz_lookup[abbr]
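Worth noting: tz_lookup maps each abbreviation to whichever pytz zone happened to claim it last while the dict comprehension ran, so ambiguous abbreviations (e.g. 'CST') resolve to an arbitrary matching zone. A minimal usage sketch, standalone and not part of the commit ('EST' is safe year-round since pytz ships a fixed zone with that exact name):

    from datetime import datetime
    from my.core.time import abbr_to_timezone

    tz = abbr_to_timezone('EST')   # some pytz zone currently abbreviated 'EST'
    aware = tz.localize(datetime(2018, 3, 8, 17, 14, 40))
    print(aware.isoformat())       # e.g. 2018-03-08T17:14:40-05:00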
@@ -8,18 +8,12 @@ from collections import OrderedDict
 from urllib.parse import unquote
 import pytz
 
+from ..core.time import abbr_to_timezone
+
 # Mar 8, 2018, 5:14:40 PM
 _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
 
-
-# https://gist.github.com/edwardabraham/8680198
-tz_lookup = {
-    pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
-    for x in pytz.all_timezones
-}
-tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
-
 
 # ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :(
 def parse_dt(s: str) -> datetime:
     fmt = _TIME_FORMAT
@@ -33,8 +27,8 @@ def parse_dt(s: str) -> datetime:
         # hopefully it was utc? Legacy, so no that much of an issue anymore..
         tz = pytz.utc
     else:
-        s, tzname = s.rsplit(maxsplit=1)
-        tz = tz_lookup[tzname]
+        s, tzabbr = s.rsplit(maxsplit=1)
+        tz = abbr_to_timezone(tzabbr)
 
     dt = datetime.strptime(s, fmt)
     dt = tz.localize(dt)
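For reference on the parse_dt change: strptime's %Z only matches a handful of hard-coded names and doesn't attach an offset, hence the manual rsplit plus lookup. A standalone sketch of the same steps, with pytz.timezone('US/Eastern') as an illustrative stand-in for whatever abbr_to_timezone actually returns:

    from datetime import datetime
    import pytz

    fmt = "%b %d, %Y, %I:%M:%S %p"
    s = "Mar 8, 2018, 5:14:40 PM EST"
    s, tzabbr = s.rsplit(maxsplit=1)             # -> 'Mar 8, 2018, 5:14:40 PM', 'EST'
    tz = pytz.timezone('US/Eastern')             # stand-in for abbr_to_timezone(tzabbr)
    dt = tz.localize(datetime.strptime(s, fmt))  # aware: 2018-03-08 17:14:40-05:00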
my/twitter.py

@@ -15,41 +15,26 @@ import zipfile
 
 import pytz
 
-from .common import PathIsh, get_files, LazyLogger
+from .common import PathIsh, get_files, LazyLogger, Json
 from .kython import kompress
 
 
-logger = LazyLogger(__package__)
+logger = LazyLogger(__name__)
 
 
-# TODO get rid of this?
-_export_path: Optional[Path] = None
-def configure(*, export_path: Optional[PathIsh]=None) -> None:
-    if export_path is not None:
-        global _export_path
-        _export_path = Path(export_path)
-
-
 def _get_export() -> Path:
-    export_path = _export_path
-    if export_path is None:
-        # fallback
-        from my.config import twitter as config
-        export_path = config.export_path
-    return max(get_files(export_path, '*.zip'))
+    from my.config import twitter as config
+    return max(get_files(config.export_path, '*.zip'))
 
 
 Tid = str
 
 
-# TODO a bit messy... perhaps we do need DAL for twitter exports
-Json = Dict[str, Any]
-
-
 # TODO make sure it's not used anywhere else and simplify interface
 class Tweet(NamedTuple):
     raw: Json
 
+    # TODO deprecate tid?
     @property
     def tid(self) -> Tid:
         return self.raw['id_str']
@@ -58,6 +43,7 @@ class Tweet(NamedTuple):
     def permalink(self) -> str:
         return f'https://twitter.com/i/web/status/{self.tid}'
 
+    # TODO deprecate dt?
     @property
     def dt(self) -> datetime:
         dts = self.raw['created_at']
@@ -67,6 +53,7 @@ class Tweet(NamedTuple):
     def text(self) -> str:
         return self.raw['full_text']
 
+    # TODO not sure if I need them...
     @property
     def entities(self):
         return self.raw['entities']
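The simplified _get_export picks the newest archive via max() over get_files: Path objects compare by their string form, and date-stamped export filenames sort chronologically. A rough pathlib-only equivalent (latest_export is a hypothetical helper, not part of the commit):

    from pathlib import Path

    def latest_export(export_dir: str) -> Path:
        # newest by filename; assumes names embed a date so lexicographic order == chronological
        return max(Path(export_dir).glob('*.zip'))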
my/twitter_twint.py (new file, 65 lines)

@@ -0,0 +1,65 @@
+"""
+Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
+"""
+
+from datetime import datetime
+from typing import NamedTuple, Iterable
+from pathlib import Path
+
+from .common import PathIsh, get_files, LazyLogger, Json
+from .core.time import abbr_to_timezone
+
+from my.config import twint as config
+
+
+log = LazyLogger(__name__)
+
+
+def get_db_path() -> Path:
+    # TODO don't like the hardcoded extension. maybe, config should decide?
+    # or, glob only applies to directories?
+    return max(get_files(config.export_path, glob='*.db'))
+
+
+class Tweet(NamedTuple):
+    row: Json
+
+    @property
+    def id_str(self) -> str:
+        return self.row['id_str']
+
+    @property
+    def created_at(self) -> datetime:
+        seconds = self.row['created_at'] / 1000
+        tz_abbr = self.row['timezone']
+        tz = abbr_to_timezone(tz_abbr)
+        dt = datetime.fromtimestamp(seconds, tz=tz)
+        return dt
+
+    # TODO permalink -- take user into account?
+    @property
+    def screen_name(self) -> str:
+        return self.row['screen_name']
+
+    @property
+    def text(self) -> str:
+        return self.row['tweet']
+
+    @property
+    def permalink(self) -> str:
+        return f'https://twitter.com/{self.screen_name}/status/{self.id_str}'
+
+    # TODO urls
+    def __repr__(self):
+        return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
+
+
+def tweets() -> Iterable[Tweet]:
+    import dataset # type: ignore
+    db_path = get_db_path()
+    # TODO check that exists?
+    db = dataset.connect(f'sqlite:///{db_path}')
+    tdb = db.load_table('tweets')
+    yield from map(Tweet, tdb.all(order_by='created_at'))
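Note that created_at in twint's sqlite table is epoch milliseconds (hence the /1000 before fromtimestamp), and the timezone column holds an abbreviation that gets resolved through abbr_to_timezone. A quick smoke test for the new module, assuming my.config.twint.export_path points at the directory containing the twint *.db and that the dataset package is installed:

    import my.twitter_twint as twint

    for t in twint.tweets():
        print(t.created_at, t.permalink)
        print(t.text)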