dataset is kinda unmaintained and currently broken due to sqlalchemy 2.0 changes; resolves https://github.com/karlicoss/HPI/issues/264
151 lines
4.8 KiB
Python
151 lines
4.8 KiB
Python
"""
|
|
Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter_l/databases/=)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
import re
|
|
import sqlite3
|
|
from typing import Iterator, Sequence, Union
|
|
|
|
from more_itertools import unique_everseen
|
|
|
|
from my.core import Paths, Res, datetime_aware, get_files
|
|
from my.core.sqlite import sqlite_connection
|
|
|
|
from .common import TweetId, permalink
|
|
|
|
from my.config import twitter as user_config
|
|
|
|
|
|
@dataclass
class config(user_config.talon):
    # path(s) or glob pointing at the exported Talon sqlite databases
    export_path: Paths
|
|
|
|
|
|
from pathlib import Path
|
|
def inputs() -> Sequence[Path]:
    """Return the files matched by the configured ``export_path``."""
    export_path = config.export_path
    return get_files(export_path)
|
|
|
|
|
|
@dataclass(unsafe_hash=True)
class Tweet:
    """
    A single tweet as stored in a Talon database.

    Hashable (``unsafe_hash=True``), so instances can be deduplicated.
    """

    id_str: TweetId
    created_at: datetime_aware
    screen_name: str
    text: str
    # full urls listed for the tweet in the database
    urls: Sequence[str]

    @property
    def permalink(self) -> str:
        """Link to this tweet, built from the screen name and the tweet id."""
        return permalink(id=self.id_str, screen_name=self.screen_name)
|
|
|
|
|
|
# meh... thin wrappers, only here so tweets can be told apart from favorites
@dataclass(unsafe_hash=True)
class _IsTweet:
    """Marks the wrapped tweet as coming from the user's own tweets."""

    tweet: Tweet
|
|
@dataclass(unsafe_hash=True)
class _IsFavorire:
    """Marks the wrapped tweet as coming from the favorites table."""

    tweet: Tweet
|
|
|
|
|
|
# an item read from a Talon database: either one of the user's tweets or a favorite
Entity = Union[_IsTweet, _IsFavorire]
|
|
def _entities() -> Iterator[Res[Entity]]:
    """Yield entities (or errors) from every input database, one file after another."""
    for db_path in inputs():
        yield from _process_one(db_path)
|
|
|
|
|
|
def _process_one(f: Path) -> Iterator[Res[Entity]]:
    """
    Yield entities from a single exported database file.

    The handler is chosen by the file name. For an unrecognised name a
    RuntimeError is yielded (not raised) and nothing else is produced.
    """
    by_name = {
        'user_tweets.db': _process_user_tweets,
        'favorite_tweets.db': _process_favorite_tweets,
    }
    name = f.name
    if name not in by_name:
        yield RuntimeError(f"Could not find handler for {name}")
        return

    process = by_name[name]
    with sqlite_connection(f, immutable=True, row_factory='row') as db:
        yield from process(db)
|
|
|
|
|
|
def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
    """
    Yield each row of the 'lists' table, ordered by time, wrapped as _IsTweet.

    An exception raised while handling a row is yielded in place of that row.
    """
    # dunno why it's called 'lists'
    query = 'SELECT * FROM lists ORDER BY time'
    for row in db.execute(query):
        try:
            yield _IsTweet(_parse_tweet(row))
        except Exception as err:
            yield err
|
|
|
|
|
|
def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
    """
    Yield each row of the 'favorite_tweets' table, ordered by time, wrapped as _IsFavorire.

    An exception raised while handling a row is yielded in place of that row.
    """
    query = 'SELECT * FROM favorite_tweets ORDER BY time'
    for row in db.execute(query):
        try:
            yield _IsFavorire(_parse_tweet(row))
        except Exception as err:
            yield err
|
|
|
|
|
|
def _parse_tweet(row: sqlite3.Row) -> Tweet:
    """
    Convert one row of a Talon tweets table into a Tweet.

    Reads the 'time', 'text', 'other_url', 'screen_name', 'retweeter' and
    'tweet_id' columns. A NULL or empty 'other_url' means "no urls"; a NULL or
    empty 'retweeter' means "not a retweet".

    Shortened urls in the text (ending with '...') are replaced with the first
    full url from 'other_url' that contains them.
    """
    # ok so looks like it's tz aware..
    # https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
    # uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
    # and it's created here, so looks like it's properly parsed from the api
    # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
    created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc)
    text = row['text']

    # split once on any whitespace, so the urls used for expansion and the urls
    # stored on the Tweet are always the same; str.split() never returns empty items
    urls = tuple((row['other_url'] or '').split())

    # try expanding URLs.. sadly there are no positions in the db
    if urls:
        ellipsis = '...'
        # might have something collapsed
        # e.g. deepmind.com/blog/article/Comp...
        # NOTE: need a one character of lookahead to split on ellipsis.. hence ?=
        for short in re.findall(r'(?:^|\s)([\S]+)' + re.escape(ellipsis) + r'(?=\s|$)', text):
            for full in urls:
                if short in full:
                    text = text.replace(short + ellipsis, full)
                    break

    screen_name = row['screen_name']
    # considering id_str is referring to the retweeter's tweet (rather than the original tweet)
    # makes sense for the permalink to contain the retweeter as well
    # also makes it more compatible to twitter archive
    # a bit sad to lose structured information about RT, but then again we could always just parse it..
    retweeter = row['retweeter']
    if retweeter:  # None and '' both mean it's not a retweet
        text = f'RT @{screen_name}: {text}'
        screen_name = retweeter

    return Tweet(
        id_str=str(row['tweet_id']),
        created_at=created_at,
        screen_name=screen_name,
        text=text,
        # todo hmm text sometimes is trimmed with ellipsis? at least urls
        urls=urls,
    )
|
|
|
|
|
|
def tweets() -> Iterator[Res[Tweet]]:
    """Yield the user's own tweets, deduplicated; errors are passed through as they occur."""
    for entity in unique_everseen(_entities()):
        if isinstance(entity, _IsTweet):
            yield entity.tweet
        elif isinstance(entity, Exception):
            yield entity
|
|
|
|
|
|
def likes() -> Iterator[Res[Tweet]]:
    """Yield favorited tweets, deduplicated; errors are passed through as they occur."""
    for entity in unique_everseen(_entities()):
        if isinstance(entity, _IsFavorire):
            yield entity.tweet
        elif isinstance(entity, Exception):
            yield entity
|
|
|
|
|
|
# TODO maybe should combine all public iterators into a stats()
|