Use twint data export for periodic twitter data

This commit is contained in:
Dima Gerasimov 2020-04-14 21:23:35 +01:00
parent 811303fcb1
commit 95f5750d7b
4 changed files with 32 additions and 30 deletions

View file

@ -3,6 +3,8 @@ import functools
import types
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast
from . import init
# some helper functions
PathIsh = Union[Path, str]
@ -163,3 +165,6 @@ def fastermime(path: str) -> str:
# magic is slower but returns more stuff
# TODO FIXME Result type; it's inherently racey
return _magic().from_file(path)
Json = Dict[str, Any]

16
my/core/time.py Normal file
View file

@ -0,0 +1,16 @@
from functools import lru_cache
from datetime import datetime
import pytz # type: ignore
# https://gist.github.com/edwardabraham/8680198
# Lookup table: timezone abbreviation -> pytz timezone object.
# Each key is the ``tzname()`` of the current local time after localizing it
# in that zone.
# NOTE: the table is built once, at import time, from ``datetime.now()``; the
# abbreviation a zone reports presumably differs across DST changes, so the
# key set depends on when the module is imported -- TODO confirm.
# NOTE: different zones can report the same abbreviation; with a dict
# comprehension the last matching entry of ``pytz.all_timezones`` wins.
tz_lookup = {
pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
for x in pytz.all_timezones
}
# Pin 'UTC' to pytz.utc explicitly; per the original note below, the
# comprehension would otherwise leave this key pointing at the Zulu zone.
tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
# maxsize=None gives an unbounded cache. The previous ``lru_cache(-1)`` did
# not: CPython 3.8+ treats a negative maxsize as 0, which disables caching.
@lru_cache(maxsize=None)
def abbr_to_timezone(abbr: str):
    """Return the timezone registered in ``tz_lookup`` for *abbr*.

    Args:
        abbr: Timezone abbreviation used as a key of ``tz_lookup``
            (for example ``'UTC'``).

    Returns:
        The value stored in ``tz_lookup`` under *abbr*.

    Raises:
        KeyError: If *abbr* is not a key of ``tz_lookup``. Failed lookups
            are not cached, so the cache never grows past the table size.
    """
    return tz_lookup[abbr]

View file

@ -8,18 +8,12 @@ from collections import OrderedDict
from urllib.parse import unquote
import pytz
from ..core.time import abbr_to_timezone
# Mar 8, 2018, 5:14:40 PM
_TIME_FORMAT = "%b %d, %Y, %I:%M:%S %p"
# https://gist.github.com/edwardabraham/8680198
tz_lookup = {
pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
for x in pytz.all_timezones
}
tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
# ugh. something is seriously wrong with datetime, it wouldn't parse timezone aware UTC timestamp :(
def parse_dt(s: str) -> datetime:
fmt = _TIME_FORMAT
@ -33,8 +27,8 @@ def parse_dt(s: str) -> datetime:
# hopefully it was utc? Legacy, so not that much of an issue anymore..
tz = pytz.utc
else:
s, tzname = s.rsplit(maxsplit=1)
tz = tz_lookup[tzname]
s, tzabbr = s.rsplit(maxsplit=1)
tz = abbr_to_timezone(tzabbr)
dt = datetime.strptime(s, fmt)
dt = tz.localize(dt)

View file

@ -15,41 +15,26 @@ import zipfile
import pytz
from .common import PathIsh, get_files, LazyLogger
from .common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress
logger = LazyLogger(__package__)
# TODO get rid of this?
_export_path: Optional[Path] = None
def configure(*, export_path: Optional[PathIsh]=None) -> None:
if export_path is not None:
global _export_path
_export_path = Path(export_path)
logger = LazyLogger(__name__)
def _get_export() -> Path:
export_path = _export_path
if export_path is None:
# fallback
from my.config import twitter as config
export_path = config.export_path
return max(get_files(export_path, '*.zip'))
return max(get_files(config.export_path, '*.zip'))
Tid = str
# TODO a bit messy... perhaps we do need DAL for twitter exports
Json = Dict[str, Any]
# TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple):
raw: Json
# TODO deprecate tid?
@property
def tid(self) -> Tid:
return self.raw['id_str']
@ -58,6 +43,7 @@ class Tweet(NamedTuple):
def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property
def dt(self) -> datetime:
dts = self.raw['created_at']
@ -67,6 +53,7 @@ class Tweet(NamedTuple):
def text(self) -> str:
return self.raw['full_text']
# TODO not sure if I need them...
@property
def entities(self):
return self.raw['entities']