HPI/my/twitter/twint.py

"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import NamedTuple, Iterator, List


from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats
from my.core.cfg import make_config
from my.core.sqlite import sqlite_connection

from my.config import twint as user_config

# TODO move to twitter.twint config structure

@dataclass
class twint(user_config):
    # Configuration for this module; extends the user-supplied `my.config.twint` class
    # (imported here as `user_config`) with the attributes this module reads.
    export_path: Paths # path[s]/glob to the twint Sqlite database

####

# configuration object built by make_config from the `twint` config class;
# the rest of this module reads its settings through it (e.g. config.export_path)
config = make_config(twint)


# logger for this module, named after the module itself
log = LazyLogger(__name__)


def get_db_path() -> Path:
    """
    Pick the database file to read.

    Resolves ``config.export_path`` via ``get_files`` and returns the greatest of the resulting paths
    (presumably the most recent export, if file names sort chronologically -- TODO confirm).
    """
    candidates = get_files(config.export_path)
    return max(candidates)


from .common import TweetId, permalink


class Tweet(NamedTuple):
    """
    Read-only wrapper around a single row of the twint ``tweets`` table.

    All accessors look values up in ``row`` by column name.
    """
    row: Json

    @property
    def id_str(self) -> TweetId:
        """Tweet id, taken from the ``id_str`` column."""
        return self.row['id_str']

    @property
    def created_at(self) -> datetime_aware:
        """Creation time of the tweet as a timezone-aware datetime in UTC."""
        # the stored value is divided by 1000 to get seconds, i.e. it is treated as milliseconds
        millis = self.row['created_at']
        # NOTE: UTC seems to be correct, at least for the older version of the schema I was using.
        # In twint, the value was extracted from the "data-time-ms" field in the scraped HTML:
        # https://github.com/twintproject/twint/blob/e3345426eb24154ff084be22e4fed5cfa4631930/twint/tweet.py#L85
        #
        # I checked against the twitter archive, which is definitely UTC, and it seems to match.
        # Other people also seem to be treating it as UTC, e.g.
        # https://github.com/thomasancheriyil/Red-Tide-Detection-based-on-Twitter/blob/beb200be60cc66dcbc394e670513715509837812/python/twitterGapParse.py#L61-L62
        #
        # twint also saves a 'timezone' column, but that is the local machine timezone at the time of scraping?
        # Perhaps they thought data-time-ms was local time... or just kept it in case
        # (they are keeping lots of unnecessary stuff in the db)
        return datetime.fromtimestamp(millis / 1000, tz=timezone.utc)

    @property
    def screen_name(self) -> str:
        """Author's screen name, taken from the ``screen_name`` column."""
        return self.row['screen_name']

    @property
    def text(self) -> str:
        """
        Tweet text, with any mentions missing from the body prepended as ``@name``.

        Mentions come from the comma-separated ``mentions`` column.
        """
        body = self.row['tweet']
        raw_mentions = self.row['mentions']
        if len(raw_mentions) == 0:
            return body
        # at some point, for no apparent reason, mentions stopped appearing in the tweet text in twint
        # note that the order is still inconsistent with the twitter archive, but there's not much we can do
        for handle in raw_mentions.split(','):
            # ugh. sometimes they appear as lowercase in the text, sometimes not.. so compare case-insensitively
            if handle.lower() in body.lower():
                continue
            body = f'@{handle} ' + body
        return body

    @property
    def urls(self) -> List[str]:
        """URLs from the comma-separated ``urls`` column; an empty list if the column is empty."""
        raw_urls = self.row['urls']
        return raw_urls.split(',') if len(raw_urls) != 0 else []

    @property
    def permalink(self) -> str:
        """Link to the tweet, built by ``permalink`` from the screen name and tweet id."""
        return permalink(screen_name=self.screen_name, id=self.id_str)


    # TODO urls
    def __repr__(self):
        """Compact representation: id, creation time and text only."""
        return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'

# https://github.com/twintproject/twint/issues/196
# ugh. twint dumps everything into the tweets table, and there is no good way to tell a fav from an original tweet.
# this might result in some tweets missing from the timeline if you happened to like them...
# not sure what to do about it
# alternatively, could ask the user to keep separate databases for tweets and favs?
# TODO think about it

# Query template shared by tweets() and likes(): selects every column of the `tweets` rows,
# left-joined to `favorites` on the tweet id and ordered by creation time.
# The {where} placeholder is filled in via str.format; callers use it to filter on whether
# a matching `favorites` row exists (F.tweet_id IS NULL / IS NOT NULL).
_QUERY = '''
SELECT T.*
FROM      tweets    as T
LEFT JOIN favorites as F
ON    T.id_str = F.tweet_id
WHERE {where}
ORDER BY T.created_at
'''


def tweets() -> Iterator[Res[Tweet]]:
    """
    Yield tweets that have no matching row in the ``favorites`` table, ordered by creation time.

    The database is opened in immutable mode for the duration of the iteration.
    """
    query = _QUERY.format(where='F.tweet_id IS NULL')
    with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
        for row in db.execute(query):
            yield Tweet(row)


def likes() -> Iterator[Res[Tweet]]:
    """
    Yield tweets that do have a matching row in the ``favorites`` table, ordered by creation time.

    The database is opened in immutable mode for the duration of the iteration.
    """
    query = _QUERY.format(where='F.tweet_id IS NOT NULL')
    with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
        for row in db.execute(query):
            yield Tweet(row)


def stats() -> Stats:
    """Combine the ``stat`` results for ``tweets`` and ``likes`` into a single mapping."""
    combined: Stats = {}
    # same order as before: entries from likes override same-named entries from tweets
    for source in (tweets, likes):
        combined.update(stat(source))
    return combined