twitter.twint: actually need to treat created_at as UTC
This commit is contained in:
parent
d65e1b5245
commit
4104f821fa
1 changed files with 13 additions and 5 deletions
|
@ -21,12 +21,11 @@ from ..core.cfg import make_config
|
||||||
config = make_config(twint)
|
config = make_config(twint)
|
||||||
|
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
from typing import NamedTuple, Iterator, List
|
from typing import NamedTuple, Iterator, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from ..core.common import get_files, LazyLogger, Json, datetime_aware
|
from ..core.common import get_files, LazyLogger, Json, datetime_aware
|
||||||
from ..core.time import localize_with_abbr
|
|
||||||
|
|
||||||
log = LazyLogger(__name__)
|
log = LazyLogger(__name__)
|
||||||
|
|
||||||
|
@ -48,9 +47,18 @@ class Tweet(NamedTuple):
|
||||||
@property
|
@property
|
||||||
def created_at(self) -> datetime_aware:
|
def created_at(self) -> datetime_aware:
|
||||||
seconds = self.row['created_at'] / 1000
|
seconds = self.row['created_at'] / 1000
|
||||||
tz_abbr = self.row['timezone']
|
tz = timezone.utc
|
||||||
naive = datetime.fromtimestamp(seconds)
|
# NOTE: UTC seems to be the case at least for the older version of the schema I was using
|
||||||
return localize_with_abbr(naive, abbr=tz_abbr)
|
# in twint, it was extracted from the "data-time-ms" field in the scraped HTML
|
||||||
|
# https://github.com/twintproject/twint/blob/e3345426eb24154ff084be22e4fed5cfa4631930/twint/tweet.py#L85
|
||||||
|
#
|
||||||
|
# I checked against twitter archive which is definitely UTC, and it seems to match
|
||||||
|
# also seems that other people are treating it as utc, e.g.
|
||||||
|
# https://github.com/thomasancheriyil/Red-Tide-Detection-based-on-Twitter/blob/beb200be60cc66dcbc394e670513715509837812/python/twitterGapParse.py#L61-L62
|
||||||
|
#
|
||||||
|
# twint is also saving 'timezone', but is this the local machine's timezone at the time of scraping?
|
||||||
|
# perhaps they thought data-time-ms was local time... or kept it just in case (they are keeping lots of unnecessary stuff in the db)
|
||||||
|
return datetime.fromtimestamp(seconds, tz=tz)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def screen_name(self) -> str:
|
def screen_name(self) -> str:
|
||||||
|
|
Loading…
Add table
Reference in a new issue