switch from using dataset to raw sqlite3 module

dataset is kinda unmaintaned and currently broken due to sqlalchemy 2.0 changes

resolves https://github.com/karlicoss/HPI/issues/264
This commit is contained in:
Dima Gerasimov 2023-02-07 01:28:45 +00:00
parent 1dcb40f840
commit 7c7457ab08
8 changed files with 123 additions and 103 deletions

View file

@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timezone
import re
from typing import Iterator, Sequence, Optional, Dict
import sqlite3
from typing import Iterator, Sequence, Union
import pytz
from more_itertools import unique_everseen
from my.core import Paths, Res, datetime_aware, get_files
from my.core.sqlite import sqlite_connection
from .common import TweetId, permalink
from my.config import twitter as user_config
from ..core import Paths, Res, datetime_aware
@dataclass
class config(user_config.talon):
# paths[s]/glob to the exported sqlite databases
export_path: Paths
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
from .common import TweetId, permalink
@dataclass(unsafe_hash=True)
class Tweet:
id_str: TweetId
@ -51,8 +52,6 @@ class _IsFavorire:
tweet: Tweet
from typing import Union
from ..core.dataset import connect_readonly
Entity = Union[_IsTweet, _IsFavorire]
def _entities() -> Iterator[Res[Entity]]:
for f in inputs():
@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]:
fname = f.name
handler = handlers.get(fname)
if handler is None:
yield RuntimeError(f"Coulnd't find handler for {fname}")
yield RuntimeError(f"Could not find handler for {fname}")
return
with connect_readonly(f) as db:
with sqlite_connection(f, immutable=True, row_factory='row') as db:
yield from handler(db)
def _process_user_tweets(db) -> Iterator[Res[Entity]]:
def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
# dunno why it's called 'lists'
for r in db['lists'].all(order_by='time'):
for r in db.execute('SELECT * FROM lists ORDER BY time'):
try:
yield _IsTweet(_parse_tweet(r))
except Exception as e:
yield e
def _process_favorite_tweets(db) -> Iterator[Res[Entity]]:
for r in db['favorite_tweets'].all(order_by='time'):
def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'):
try:
yield _IsFavorire(_parse_tweet(r))
except Exception as e:
yield e
def _parse_tweet(row) -> Tweet:
def _parse_tweet(row: sqlite3.Row) -> Tweet:
# ok so looks like it's tz aware..
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
# uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
# and it's created here, so looks like it's properly parsed from the api
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc)
text = row['text']
# try explanding URLs.. sadly there are no positions in the db
@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet:
)
from more_itertools import unique_everseen
def tweets() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
if isinstance(x, Exception):
@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]:
elif isinstance(x, _IsTweet):
yield x.tweet
def likes() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
if isinstance(x, Exception):