twitter.twint: localize timestamps correctly

same issue as discussed here https://memex.zulipchat.com/#narrow/stream/279610-data/topic/google.20takeout.20timestamps

also see the corresponding changes for google_takeout_parser:

- https://github.com/seanbreckenridge/google_takeout_parser/pull/28/files
- https://github.com/seanbreckenridge/google_takeout_parser/pull/30/files
This commit is contained in:
Dima Gerasimov 2022-05-30 17:24:09 +01:00 committed by karlicoss
parent de7972be05
commit d65e1b5245
2 changed files with 23 additions and 12 deletions

View file

@ -1,8 +1,9 @@
from functools import lru_cache
from datetime import tzinfo
from typing import Sequence
from typing import Sequence, Dict
import pytz # type: ignore
import pytz
from .common import datetime_aware, datetime_naive
def user_forced() -> Sequence[str]:
@ -17,12 +18,12 @@ def user_forced() -> Sequence[str]:
@lru_cache(1)
def _abbr_to_timezone_map():
def _abbr_to_timezone_map() -> Dict[str, pytz.BaseTzInfo]:
# also force UTC to always correspond to utc
# this makes more sense than Zulu, which is what it ends up as by default
timezones = pytz.all_timezones + ['UTC'] + list(user_forced())
res = {}
res: Dict[str, pytz.BaseTzInfo] = {}
for tzname in timezones:
tz = pytz.timezone(tzname)
infos = getattr(tz, '_tzinfos', []) # not sure if can rely on attr always present?
@ -41,12 +42,23 @@ def _abbr_to_timezone_map():
return res
# todo dammit, lru_cache interferes with mypy?
@lru_cache(None)
def abbr_to_timezone(abbr: str) -> tzinfo:
@lru_cache(maxsize=None)
def abbr_to_timezone(abbr: str) -> pytz.BaseTzInfo:
return _abbr_to_timezone_map()[abbr]
def localize_with_abbr(dt: datetime_naive, *, abbr: str) -> datetime_aware:
    """
    Turn the naive datetime ``dt`` into an aware one, using the timezone
    that the abbreviation ``abbr`` resolves to via ``abbr_to_timezone``.

    ``abbr`` is keyword-only. Any casing of 'utc' is handled directly with
    ``pytz.utc`` and never goes through the abbreviation lookup.
    """
    if abbr.lower() != 'utc':
        zone = abbr_to_timezone(abbr)
        # this will compute the correct UTC offset
        resolved = zone.localize(dt).tzinfo
        assert resolved is not None  # make mypy happy
        stamped = dt.replace(tzinfo=resolved)
        return zone.normalize(stamped)

    # best to shortcut here to avoid complications
    return pytz.utc.localize(dt)
def zone_to_countrycode(zone: str) -> str:
    """
    Return the entry stored under ``zone`` in the mapping produced by
    ``_zones_to_countrycode()``.

    The lookup is a plain subscript, so a zone missing from that mapping
    is not handled here and the resulting error propagates to the caller.
    """
    # todo make optional?
    mapping = _zones_to_countrycode()
    return mapping[zone]

View file

@ -26,7 +26,7 @@ from typing import NamedTuple, Iterator, List
from pathlib import Path
from ..core.common import get_files, LazyLogger, Json, datetime_aware
from ..core.time import abbr_to_timezone
from ..core.time import localize_with_abbr
log = LazyLogger(__name__)
@ -49,9 +49,8 @@ class Tweet(NamedTuple):
def created_at(self) -> datetime_aware:
seconds = self.row['created_at'] / 1000
tz_abbr = self.row['timezone']
tz = abbr_to_timezone(tz_abbr)
dt = datetime.fromtimestamp(seconds, tz=tz)
return dt
naive = datetime.fromtimestamp(seconds)
return localize_with_abbr(naive, abbr=tz_abbr)
@property
def screen_name(self) -> str: