Use twint data export for periodic twitter data

This commit is contained in:
Dima Gerasimov 2020-04-14 21:23:35 +01:00
parent 811303fcb1
commit 95f5750d7b
4 changed files with 32 additions and 30 deletions

View file

@ -3,6 +3,8 @@ import functools
import types import types
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast
from . import init
# some helper functions # some helper functions
PathIsh = Union[Path, str] PathIsh = Union[Path, str]
@ -163,3 +165,6 @@ def fastermime(path: str) -> str:
# magic is slower but returns more stuff # magic is slower but returns more stuff
# TODO FIXME Result type; it's inherently racey # TODO FIXME Result type; it's inherently racey
return _magic().from_file(path) return _magic().from_file(path)
Json = Dict[str, Any]

16
my/core/time.py Normal file
View file

@ -0,0 +1,16 @@
from functools import lru_cache
from datetime import datetime
import pytz # type: ignore
# Lookup table from timezone abbreviation (e.g. the string returned by
# tzname()) to a pytz timezone object.
#
# For every zone name in pytz.all_timezones, the current local time
# (datetime.now()) is localized into that zone and the resulting tzname()
# is used as the key. Consequences visible from the construction:
#   - several zones can yield the same abbreviation; since this is a dict
#     comprehension, the zone that comes last in pytz.all_timezones wins;
#   - keys are computed from "now" at import time, so the set of
#     abbreviations presumably varies with the date the module is imported
#     (e.g. summer vs winter time) -- NOTE(review): confirm this is acceptable.
# https://gist.github.com/edwardabraham/8680198
tz_lookup = {
pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
for x in pytz.all_timezones
}
# Force 'UTC' to map to pytz.utc regardless of what the comprehension chose.
tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
# NOTE: maxsize=None requests an unbounded cache. The previous value of -1
# is a negative maxsize, which functools treats as 0 (caching disabled), so
# the decorator was not memoizing anything.
@lru_cache(maxsize=None)
def abbr_to_timezone(abbr: str):
    """Return the timezone object registered in ``tz_lookup`` for *abbr*.

    Args:
        abbr: timezone abbreviation used as a key of ``tz_lookup``
            (for example ``'UTC'``).

    Returns:
        The value stored in ``tz_lookup`` under *abbr*.

    Raises:
        KeyError: if *abbr* is not a key of ``tz_lookup``.
    """
    return tz_lookup[abbr]

View file

@ -8,18 +8,12 @@ from collections import OrderedDict
from urllib.parse import unquote from urllib.parse import unquote
import pytz import pytz
from ..core.time import abbr_to_timezone
# Mar 8, 2018, 5:14:40 PM # Mar 8, 2018, 5:14:40 PM
_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p" _TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
# https://gist.github.com/edwardabraham/8680198
tz_lookup = {
pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
for x in pytz.all_timezones
}
tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
# ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :( # ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :(
def parse_dt(s: str) -> datetime: def parse_dt(s: str) -> datetime:
fmt = _TIME_FORMAT fmt = _TIME_FORMAT
@ -33,8 +27,8 @@ def parse_dt(s: str) -> datetime:
# hopefully it was utc? Legacy, so not that much of an issue anymore.. # hopefully it was utc? Legacy, so not that much of an issue anymore..
tz = pytz.utc tz = pytz.utc
else: else:
s, tzname = s.rsplit(maxsplit=1) s, tzabbr = s.rsplit(maxsplit=1)
tz = tz_lookup[tzname] tz = abbr_to_timezone(tzabbr)
dt = datetime.strptime(s, fmt) dt = datetime.strptime(s, fmt)
dt = tz.localize(dt) dt = tz.localize(dt)

View file

@ -15,41 +15,26 @@ import zipfile
import pytz import pytz
from .common import PathIsh, get_files, LazyLogger from .common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress from .kython import kompress
logger = LazyLogger(__package__) logger = LazyLogger(__name__)
# TODO get rid of this?
_export_path: Optional[Path] = None
def configure(*, export_path: Optional[PathIsh]=None) -> None:
if export_path is not None:
global _export_path
_export_path = Path(export_path)
def _get_export() -> Path: def _get_export() -> Path:
export_path = _export_path from my.config import twitter as config
if export_path is None: return max(get_files(config.export_path, '*.zip'))
# fallback
from my.config import twitter as config
export_path = config.export_path
return max(get_files(export_path, '*.zip'))
Tid = str Tid = str
# TODO a bit messy... perhaps we do need DAL for twitter exports
Json = Dict[str, Any]
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple): class Tweet(NamedTuple):
raw: Json raw: Json
# TODO deprecate tid?
@property @property
def tid(self) -> Tid: def tid(self) -> Tid:
return self.raw['id_str'] return self.raw['id_str']
@ -58,6 +43,7 @@ class Tweet(NamedTuple):
def permalink(self) -> str: def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}' return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property @property
def dt(self) -> datetime: def dt(self) -> datetime:
dts = self.raw['created_at'] dts = self.raw['created_at']
@ -67,6 +53,7 @@ class Tweet(NamedTuple):
def text(self) -> str: def text(self) -> str:
return self.raw['full_text'] return self.raw['full_text']
# TODO not sure if I need them...
@property @property
def entities(self): def entities(self):
return self.raw['entities'] return self.raw['entities']