switch from using dataset to raw sqlite3 module

dataset is kinda unmaintaned and currently broken due to sqlalchemy 2.0 changes

resolves https://github.com/karlicoss/HPI/issues/264
This commit is contained in:
Dima Gerasimov 2023-02-07 01:28:45 +00:00 committed by karlicoss
parent 9c432027b5
commit 5c82d0faa9
8 changed files with 123 additions and 103 deletions

View file

@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timezone
import re
from typing import Iterator, Sequence, Optional, Dict
import sqlite3
from typing import Iterator, Sequence, Union
import pytz
from more_itertools import unique_everseen
from my.core import Paths, Res, datetime_aware, get_files
from my.core.sqlite import sqlite_connection
from .common import TweetId, permalink
from my.config import twitter as user_config
from ..core import Paths, Res, datetime_aware
@dataclass
class config(user_config.talon):
# paths[s]/glob to the exported sqlite databases
export_path: Paths
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
from .common import TweetId, permalink
@dataclass(unsafe_hash=True)
class Tweet:
id_str: TweetId
@ -51,8 +52,6 @@ class _IsFavorire:
tweet: Tweet
from typing import Union
from ..core.dataset import connect_readonly
Entity = Union[_IsTweet, _IsFavorire]
def _entities() -> Iterator[Res[Entity]]:
for f in inputs():
@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]:
fname = f.name
handler = handlers.get(fname)
if handler is None:
yield RuntimeError(f"Coulnd't find handler for {fname}")
yield RuntimeError(f"Could not find handler for {fname}")
return
with connect_readonly(f) as db:
with sqlite_connection(f, immutable=True, row_factory='row') as db:
yield from handler(db)
def _process_user_tweets(db) -> Iterator[Res[Entity]]:
def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
# dunno why it's called 'lists'
for r in db['lists'].all(order_by='time'):
for r in db.execute('SELECT * FROM lists ORDER BY time'):
try:
yield _IsTweet(_parse_tweet(r))
except Exception as e:
yield e
def _process_favorite_tweets(db) -> Iterator[Res[Entity]]:
for r in db['favorite_tweets'].all(order_by='time'):
def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'):
try:
yield _IsFavorire(_parse_tweet(r))
except Exception as e:
yield e
def _parse_tweet(row) -> Tweet:
def _parse_tweet(row: sqlite3.Row) -> Tweet:
# ok so looks like it's tz aware..
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
# uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
# and it's created here, so looks like it's properly parsed from the api
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc)
text = row['text']
# try explanding URLs.. sadly there are no positions in the db
@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet:
)
from more_itertools import unique_everseen
def tweets() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
if isinstance(x, Exception):
@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]:
elif isinstance(x, _IsTweet):
yield x.tweet
def likes() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
if isinstance(x, Exception):

View file

@ -1,12 +1,16 @@
"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
REQUIRES = ['dataset']
from ..core.common import Paths
from ..core.error import Res
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import NamedTuple, Iterator, List
from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats
from my.core.cfg import make_config
from my.core.sqlite import sqlite_connection
from my.config import twint as user_config
# TODO move to twitter.twint config structure
@ -17,16 +21,9 @@ class twint(user_config):
####
from ..core.cfg import make_config
config = make_config(twint)
from datetime import datetime, timezone
from typing import NamedTuple, Iterator, List
from pathlib import Path
from ..core.common import get_files, LazyLogger, Json, datetime_aware
log = LazyLogger(__name__)
@ -110,25 +107,19 @@ WHERE {where}
ORDER BY T.created_at
'''
def _get_db():
from ..core.dataset import connect_readonly
db_path = get_db_path()
return connect_readonly(db_path)
def tweets() -> Iterator[Res[Tweet]]:
db = _get_db()
res = db.query(_QUERY.format(where='F.tweet_id IS NULL'))
yield from map(Tweet, res)
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
res = db.execute(_QUERY.format(where='F.tweet_id IS NULL'))
yield from map(Tweet, res)
def likes() -> Iterator[Res[Tweet]]:
db = _get_db()
res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
yield from map(Tweet, res)
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL'))
yield from map(Tweet, res)
from ..core import stat, Stats
def stats() -> Stats:
return {
**stat(tweets),