From d65e1b5245ca04049c42ced4153b6deef8cb061c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 17:24:09 +0100 Subject: [PATCH] twitter.twint: localize timestamps correctly same issue as discussed here https://memex.zulipchat.com/#narrow/stream/279610-data/topic/google.20takeout.20timestamps also see corresponding changes for google_takeout_parser - https://github.com/seanbreckenridge/google_takeout_parser/pull/28/files - https://github.com/seanbreckenridge/google_takeout_parser/pull/30/files --- my/core/time.py | 28 ++++++++++++++++++++-------- my/twitter/twint.py | 7 +++---- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/my/core/time.py b/my/core/time.py index b55fae3..7698332 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -1,8 +1,9 @@ from functools import lru_cache -from datetime import tzinfo -from typing import Sequence +from typing import Sequence, Dict -import pytz # type: ignore +import pytz + +from .common import datetime_aware, datetime_naive def user_forced() -> Sequence[str]: @@ -17,12 +18,12 @@ def user_forced() -> Sequence[str]: @lru_cache(1) -def _abbr_to_timezone_map(): +def _abbr_to_timezone_map() -> Dict[str, pytz.BaseTzInfo]: # also force UTC to always correspond to utc # this makes more sense than Zulu it ends up by default timezones = pytz.all_timezones + ['UTC'] + list(user_forced()) - res = {} + res: Dict[str, pytz.BaseTzInfo] = {} for tzname in timezones: tz = pytz.timezone(tzname) infos = getattr(tz, '_tzinfos', []) # not sure if can rely on attr always present? @@ -41,12 +42,23 @@ def _abbr_to_timezone_map(): return res -# todo dammit, lru_cache interferes with mypy? -@lru_cache(None) -def abbr_to_timezone(abbr: str) -> tzinfo: +@lru_cache(maxsize=None) +def abbr_to_timezone(abbr: str) -> pytz.BaseTzInfo: return _abbr_to_timezone_map()[abbr] +def localize_with_abbr(dt: datetime_naive, *, abbr: str) -> datetime_aware: + if abbr.lower() == 'utc': + # best to shortcut here to avoid complications + return pytz.utc.localize(dt) + + tz = abbr_to_timezone(abbr) + # this will compute the correct UTC offset + tzinfo = tz.localize(dt).tzinfo + assert tzinfo is not None # make mypy happy + return tz.normalize(dt.replace(tzinfo=tzinfo)) + + def zone_to_countrycode(zone: str) -> str: # todo make optional? return _zones_to_countrycode()[zone] diff --git a/my/twitter/twint.py b/my/twitter/twint.py index f20be42..a40c5bb 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -26,7 +26,7 @@ from typing import NamedTuple, Iterator, List from pathlib import Path from ..core.common import get_files, LazyLogger, Json, datetime_aware -from ..core.time import abbr_to_timezone +from ..core.time import localize_with_abbr log = LazyLogger(__name__) @@ -49,9 +49,8 @@ class Tweet(NamedTuple): def created_at(self) -> datetime_aware: seconds = self.row['created_at'] / 1000 tz_abbr = self.row['timezone'] - tz = abbr_to_timezone(tz_abbr) - dt = datetime.fromtimestamp(seconds, tz=tz) - return dt + naive = datetime.fromtimestamp(seconds) + return localize_with_abbr(naive, abbr=tz_abbr) @property def screen_name(self) -> str: