twitter.twint: actually need to treat created_at as UTC

This commit is contained in:
Dima Gerasimov 2022-05-30 19:47:42 +01:00 committed by karlicoss
parent d65e1b5245
commit 4104f821fa

View file

@ -21,12 +21,11 @@ from ..core.cfg import make_config
config = make_config(twint) config = make_config(twint)
from datetime import datetime from datetime import datetime, timezone
from typing import NamedTuple, Iterator, List from typing import NamedTuple, Iterator, List
from pathlib import Path from pathlib import Path
from ..core.common import get_files, LazyLogger, Json, datetime_aware from ..core.common import get_files, LazyLogger, Json, datetime_aware
from ..core.time import localize_with_abbr
log = LazyLogger(__name__) log = LazyLogger(__name__)
@ -48,9 +47,18 @@ class Tweet(NamedTuple):
@property @property
def created_at(self) -> datetime_aware: def created_at(self) -> datetime_aware:
seconds = self.row['created_at'] / 1000 seconds = self.row['created_at'] / 1000
tz_abbr = self.row['timezone'] tz = timezone.utc
naive = datetime.fromtimestamp(seconds) # NOTE: UTC seems to be the case at least for the older version of schema I was using
return localize_with_abbr(naive, abbr=tz_abbr) # in twint, it was extracted from "data-time-ms" field in the scraped HTML
# https://github.com/twintproject/twint/blob/e3345426eb24154ff084be22e4fed5cfa4631930/twint/tweet.py#L85
#
# I checked against twitter archive which is definitely UTC, and it seems to match
# also seems that other people are treating it as utc, e.g.
# https://github.com/thomasancheriyil/Red-Tide-Detection-based-on-Twitter/blob/beb200be60cc66dcbc394e670513715509837812/python/twitterGapParse.py#L61-L62
#
# twint is also saving 'timezone', but this is local machine timezone at the time of scraping?
# perhaps they thought data-time-ms was local time... or just kept it just in case (they are keeping lots of unnecessary stuff in the db)
return datetime.fromtimestamp(seconds, tz=tz)
@property @property
def screen_name(self) -> str: def screen_name(self) -> str: