twitter.twint: localize timestamps correctly
Same issue as discussed here: https://memex.zulipchat.com/#narrow/stream/279610-data/topic/google.20takeout.20timestamps -- also see the corresponding changes for google_takeout_parser: - https://github.com/seanbreckenridge/google_takeout_parser/pull/28/files - https://github.com/seanbreckenridge/google_takeout_parser/pull/30/files
This commit is contained in:
parent
de7972be05
commit
d65e1b5245
2 changed files with 23 additions and 12 deletions
|
@ -1,8 +1,9 @@
|
|||
from functools import lru_cache
|
||||
from datetime import tzinfo
|
||||
from typing import Sequence
|
||||
from typing import Sequence, Dict
|
||||
|
||||
import pytz # type: ignore
|
||||
import pytz
|
||||
|
||||
from .common import datetime_aware, datetime_naive
|
||||
|
||||
|
||||
def user_forced() -> Sequence[str]:
|
||||
|
@ -17,12 +18,12 @@ def user_forced() -> Sequence[str]:
|
|||
|
||||
|
||||
@lru_cache(1)
|
||||
def _abbr_to_timezone_map():
|
||||
def _abbr_to_timezone_map() -> Dict[str, pytz.BaseTzInfo]:
|
||||
# also force UTC to always correspond to utc
|
||||
# this makes more sense than Zulu it ends up by default
|
||||
timezones = pytz.all_timezones + ['UTC'] + list(user_forced())
|
||||
|
||||
res = {}
|
||||
res: Dict[str, pytz.BaseTzInfo] = {}
|
||||
for tzname in timezones:
|
||||
tz = pytz.timezone(tzname)
|
||||
infos = getattr(tz, '_tzinfos', []) # not sure if can rely on attr always present?
|
||||
|
@ -41,12 +42,23 @@ def _abbr_to_timezone_map():
|
|||
return res
|
||||
|
||||
|
||||
# todo dammit, lru_cache interferes with mypy?
|
||||
@lru_cache(None)
|
||||
def abbr_to_timezone(abbr: str) -> tzinfo:
|
||||
@lru_cache(maxsize=None)
def abbr_to_timezone(abbr: str) -> pytz.BaseTzInfo:
    """
    Return the timezone registered for the abbreviation ``abbr``.

    The lookup goes through the mapping built by ``_abbr_to_timezone_map()``;
    ``abbr`` is used as the key exactly as given, so an abbreviation that is
    not in the mapping makes the lookup fail (no fallback is attempted here).
    Successful results are memoized by ``lru_cache`` with no size bound.
    """
    mapping = _abbr_to_timezone_map()
    return mapping[abbr]
|
||||
|
||||
|
||||
def localize_with_abbr(dt: datetime_naive, *, abbr: str) -> datetime_aware:
    """
    Convert the naive datetime ``dt`` into an aware one, using the timezone
    identified by the abbreviation ``abbr``.

    ``abbr`` equal to 'utc' (compared case-insensitively) is handled directly
    via ``pytz.utc``. Any other value is resolved with ``abbr_to_timezone``
    (passed through unchanged), and the resulting timezone is asked to
    localize ``dt`` so that the tzinfo attached to the result reflects the
    UTC offset applicable to that particular moment.
    """
    is_utc = abbr.lower() == 'utc'
    if is_utc:
        # best to shortcut here to avoid complications
        return pytz.utc.localize(dt)

    zone = abbr_to_timezone(abbr)
    # localize() is only used to pick the tzinfo carrying the correct UTC offset for dt
    localized = zone.localize(dt)
    resolved = localized.tzinfo
    assert resolved is not None  # make mypy happy
    aware = dt.replace(tzinfo=resolved)
    return zone.normalize(aware)
|
||||
|
||||
|
||||
def zone_to_countrycode(zone: str) -> str:
    """
    Return the country code that ``_zones_to_countrycode()`` associates with ``zone``.

    The value is obtained by plain indexing, so a ``zone`` missing from that
    mapping is not handled here and the lookup error propagates to the caller.
    """
    # todo make optional?
    by_zone = _zones_to_countrycode()
    return by_zone[zone]
|
||||
|
|
|
@ -26,7 +26,7 @@ from typing import NamedTuple, Iterator, List
|
|||
from pathlib import Path
|
||||
|
||||
from ..core.common import get_files, LazyLogger, Json, datetime_aware
|
||||
from ..core.time import abbr_to_timezone
|
||||
from ..core.time import localize_with_abbr
|
||||
|
||||
log = LazyLogger(__name__)
|
||||
|
||||
|
@ -49,9 +49,8 @@ class Tweet(NamedTuple):
|
|||
def created_at(self) -> datetime_aware:
|
||||
seconds = self.row['created_at'] / 1000
|
||||
tz_abbr = self.row['timezone']
|
||||
tz = abbr_to_timezone(tz_abbr)
|
||||
dt = datetime.fromtimestamp(seconds, tz=tz)
|
||||
return dt
|
||||
naive = datetime.fromtimestamp(seconds)
|
||||
return localize_with_abbr(naive, abbr=tz_abbr)
|
||||
|
||||
@property
|
||||
def screen_name(self) -> str:
|
||||
|
|
Loading…
Add table
Reference in a new issue