HPI/my/twitter/twint.py
Dima Gerasimov 5c82d0faa9 switch from using dataset to raw sqlite3 module
dataset is kinda unmaintained and currently broken due to sqlalchemy 2.0 changes

resolves https://github.com/karlicoss/HPI/issues/264
2023-02-07 01:57:00 +00:00

127 lines
4.1 KiB
Python

"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import NamedTuple, Iterator, List
from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats
from my.core.cfg import make_config
from my.core.sqlite import sqlite_connection
from my.config import twint as user_config
# TODO move to twitter.twint config structure
@dataclass
class twint(user_config):
    """
    Configuration for this module.

    Extends the user-supplied ``my.config.twint`` class; the result is passed to
    ``make_config`` to build the module-level ``config`` object.
    """
    # path[s]/glob to the twint Sqlite database; handed to get_files() when locating the db
    export_path: Paths # path[s]/glob to the twint Sqlite database
####
# Module configuration object, built by make_config from the `twint` dataclass declared above.
config = make_config(twint)
# Module-level logger, named after this module.
log = LazyLogger(__name__)
def get_db_path() -> Path:
    """
    Pick the database file to read from.

    Resolves ``config.export_path`` through ``get_files`` and returns the
    greatest of the resulting paths (by ``max`` ordering).
    NOTE(review): presumably this selects the most recent export, assuming
    file names sort chronologically -- confirm against the export naming scheme.
    """
    candidates = get_files(config.export_path)
    newest = max(candidates)
    return newest
from .common import TweetId, permalink
class Tweet(NamedTuple):
    """Thin wrapper around a single raw database row, exposing typed accessors."""

    row: Json

    @property
    def id_str(self) -> TweetId:
        """Tweet identifier, read from the ``id_str`` field of the row."""
        return self.row['id_str']

    @property
    def created_at(self) -> datetime_aware:
        """Creation time as a timezone-aware datetime in UTC."""
        # The stored value is divided by 1000 before being used as a POSIX timestamp
        # (i.e. it is treated as milliseconds).
        millis = self.row['created_at']
        # NOTE: UTC seems to be the case at least for the older version of the schema I was using.
        # In twint, the value was extracted from the "data-time-ms" field in the scraped HTML:
        # https://github.com/twintproject/twint/blob/e3345426eb24154ff084be22e4fed5cfa4631930/twint/tweet.py#L85
        #
        # I checked against the twitter archive, which is definitely UTC, and it seems to match.
        # Other people also seem to be treating it as UTC, e.g.
        # https://github.com/thomasancheriyil/Red-Tide-Detection-based-on-Twitter/blob/beb200be60cc66dcbc394e670513715509837812/python/twitterGapParse.py#L61-L62
        #
        # twint is also saving 'timezone', but this is the local machine timezone at the time of scraping?
        # Perhaps they thought data-time-ms was local time... or just kept it just in case
        # (they are keeping lots of unnecessary stuff in the db).
        return datetime.fromtimestamp(millis / 1000, tz=timezone.utc)

    @property
    def screen_name(self) -> str:
        """Author handle, read from the ``screen_name`` field of the row."""
        return self.row['screen_name']

    @property
    def text(self) -> str:
        """
        Tweet text, with any mentions that are absent from the text prepended as ``@handle``.

        Mentions come from the comma-separated ``mentions`` field of the row.
        """
        body = self.row['tweet']
        raw_mentions = self.row['mentions']
        if len(raw_mentions) == 0:
            return body
        # At some point, for no apparent reason, mentions stopped appearing in the tweet text in twint.
        # Note that the order is still inconsistent against the twitter archive, but not much we can do.
        for handle in raw_mentions.split(','):
            # ugh. sometimes they appear as lowercase in the text, sometimes not,
            # hence the case-insensitive containment check
            if handle.lower() in body.lower():
                continue
            body = f'@{handle} {body}'
        return body

    @property
    def urls(self) -> List[str]:
        """URLs from the comma-separated ``urls`` field; an empty list when the field is empty."""
        raw_urls = self.row['urls']
        return raw_urls.split(',') if len(raw_urls) > 0 else []

    @property
    def permalink(self) -> str:
        """Link to the tweet, built by ``permalink`` from the screen name and the tweet id."""
        return permalink(screen_name=self.screen_name, id=self.id_str)

    # TODO urls
    def __repr__(self) -> str:
        """Compact representation showing the id, the creation time and the text."""
        return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
# https://github.com/twintproject/twint/issues/196
# ugh. so it dumps everything in tweet table, and there is no good way to tell between fav/original tweet.
# it might result in some tweets missing from the timeline if you happened to like them...
# not sure what to do with it
# alternatively, could ask the user to run separate databases for tweets and favs?
# TODO think about it
# Query template shared by tweets() and likes(): selects every column of `tweets`,
# left-joined against `favorites` on the tweet id and ordered by creation time.
# The `{where}` placeholder is filled in via str.format with a condition on F.tweet_id
# (NULL means there is no matching row in `favorites`).
_QUERY = '''
SELECT T.*
FROM tweets as T
LEFT JOIN favorites as F
ON T.id_str = F.tweet_id
WHERE {where}
ORDER BY T.created_at
'''
def tweets() -> Iterator[Res[Tweet]]:
    """
    Yield rows of the ``tweets`` table that have no matching row in ``favorites``.

    Rows are wrapped in ``Tweet`` and come back ordered by ``created_at``.
    The database is opened with ``immutable=True`` and stays open while the
    generator is being consumed.
    """
    query = _QUERY.format(where='F.tweet_id IS NULL')
    with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
        for row in db.execute(query):
            yield Tweet(row)
def likes() -> Iterator[Res[Tweet]]:
    """
    Yield rows of the ``tweets`` table that do have a matching row in ``favorites``.

    Rows are wrapped in ``Tweet`` and come back ordered by ``created_at``.
    The database is opened with ``immutable=True`` and stays open while the
    generator is being consumed.
    """
    query = _QUERY.format(where='F.tweet_id IS NOT NULL')
    with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
        for row in db.execute(query):
            yield Tweet(row)
def stats() -> Stats:
    """Merge the ``stat`` results for ``tweets`` and ``likes`` into a single mapping."""
    combined = {}
    # Same order as before: tweets first, then likes (later keys win on collision).
    for source in (tweets, likes):
        combined.update(stat(source))
    return combined