switch from using dataset to raw sqlite3 module

dataset is kinda unmaintained and currently broken due to sqlalchemy 2.0 changes

resolves https://github.com/karlicoss/HPI/issues/264
This commit is contained in:
Dima Gerasimov 2023-02-07 01:28:45 +00:00 committed by karlicoss
parent 9c432027b5
commit 5c82d0faa9
8 changed files with 123 additions and 103 deletions

View file

@ -1,17 +1,19 @@
from .common import assert_subpackage; assert_subpackage(__name__)
from contextlib import contextmanager
from pathlib import Path
import shutil
import sqlite3
from tempfile import TemporaryDirectory
from typing import Tuple, Any, Iterator, Callable, Optional, Union
from .common import PathIsh
from .common import PathIsh, assert_never
from .compat import Literal
def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection:
    """Open *db* in sqlite's 'immutable' mode: read-only and assuming no
    other process modifies the file, so no locking/journal access is done."""
    # https://www.sqlite.org/draft/uri.html#uriimmutable
    return sqlite3.connect(f'file:{db}?immutable=1', uri=True)
@ -30,6 +32,42 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None:
conn.execute('DROP TABLE testtable')
SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]
def dict_factory(cursor, row):
    """sqlite3 row factory that maps each row to a plain dict keyed by column name."""
    columns = (description[0] for description in cursor.description)
    return dict(zip(columns, row))
Factory = Union[SqliteRowFactory, Literal['row', 'dict']]
@contextmanager
def sqlite_connection(db: PathIsh, *, immutable: bool = False, row_factory: Optional[Factory] = None) -> Iterator[sqlite3.Connection]:
    """
    Context manager yielding an sqlite3.Connection to *db*, closed on exit.

    :param db: path to the database file
    :param immutable: open via sqlite's immutable URI mode (read-only, no
        locking -- suitable for databases owned by other applications)
    :param row_factory: 'row' for sqlite3.Row, 'dict' for plain dicts,
        or any custom callable(cursor, row); None keeps default tuples
    """
    dbp = f'file:{db}'
    # https://www.sqlite.org/draft/uri.html#uriimmutable
    if immutable:
        dbp = f'{dbp}?immutable=1'
    row_factory_: Any = None
    if row_factory is not None:
        if callable(row_factory):
            row_factory_ = row_factory
        elif row_factory == 'row':
            row_factory_ = sqlite3.Row
        elif row_factory == 'dict':
            row_factory_ = dict_factory
        else:
            # exhaustiveness check: assert_never must be passed the value --
            # the previous zero-argument call would raise TypeError instead
            # of the intended "unreachable" assertion
            assert_never(row_factory)
    conn = sqlite3.connect(dbp, uri=True)
    try:
        conn.row_factory = row_factory_
        with conn:
            yield conn
    finally:
        # Connection context manager isn't actually closing the connection, only keeps transaction
        conn.close()
# TODO come up with a better name?
# NOTE: this is tested by tests/sqlite.py::test_sqlite_read_with_wal
def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
@ -52,8 +90,6 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
return dest
from typing import Tuple, Any, Iterator
# NOTE hmm, so this kinda works
# V = TypeVar('V', bound=Tuple[Any, ...])
# def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]:

View file

@ -3,25 +3,27 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab
"""
from __future__ import annotations
REQUIRES = ['dataset']
from dataclasses import dataclass
from datetime import datetime
from typing import Iterator, Sequence, Optional, Dict
import json
from pathlib import Path
import sqlite3
from typing import Iterator, Sequence, Optional, Dict, Union
from more_itertools import unique_everseen
from my.core import get_files, Paths, datetime_naive, Res, assert_never
from my.core.sqlite import sqlite_connection
from my.config import fbmessenger as user_config
from ..core import Paths
@dataclass
class config(user_config.android):
# paths[s]/glob to the exported sqlite databases
export_path: Paths
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
    """Exported sqlite database files, resolved from the configured path/glob."""
    return get_files(config.export_path)
@ -38,7 +40,6 @@ class Thread:
name: Optional[str]
# todo not sure about order of fields...
from ..core import datetime_naive
@dataclass
class _BaseMessage:
id: str
@ -63,22 +64,18 @@ class Message(_BaseMessage):
reply_to: Optional[Message]
import json
from typing import Union
from ..core import Res, assert_never
from ..core.dataset import connect_readonly, DatabaseT
Entity = Union[Sender, Thread, _Message]
def _entities() -> Iterator[Res[Entity]]:
for f in inputs():
with connect_readonly(f) as db:
with sqlite_connection(f, immutable=True, row_factory='row') as db:
yield from _process_db(db)
def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
# works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user
threadkey2id = lambda key: key.split(':')[1]
for r in db['threads'].find():
for r in db.execute('SELECT * FROM threads'):
try:
yield Thread(
id=threadkey2id(r['thread_key']),
@ -88,7 +85,7 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
yield e
continue
for r in db['messages'].find(order_by='timestamp_ms'):
for r in db.execute('SELECT * FROM messages ORDER BY timestamp_ms'):
mtype: int = r['msg_type']
if mtype == -1:
# likely immediately deleted or something? doesn't have any data at all
@ -133,7 +130,6 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
yield e
from more_itertools import unique_everseen
def messages() -> Iterator[Res[Message]]:
senders: Dict[str, Sender] = {}
msgs: Dict[str, Message] = {}

View file

@ -5,13 +5,15 @@ from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from typing import Iterator, Sequence, Optional, Dict
from pathlib import Path
from typing import Iterator, Sequence, Optional
from my.core import get_files, Paths, Res
from my.core.sqlite import sqlite_connection
from my.config import hackernews as user_config
from ..core import Paths
@dataclass
class config(user_config.dogsheep):
# paths[s]/glob to the dogsheep database
@ -20,8 +22,6 @@ class config(user_config.dogsheep):
# todo so much boilerplate... really need some common wildcard imports?...
# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
@ -44,15 +44,15 @@ class Item:
@property
def permalink(self) -> str:
return hackernews_link(self.id)
# TODO hmm kinda annoying that permalink isn't getting serialized
# maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
# so we could just take .permalink thing
from ..core.error import Res
from ..core.dataset import connect_readonly
def items() -> Iterator[Res[Item]]:
f = max(inputs())
with connect_readonly(f) as db:
items = db['items']
for r in items.all(order_by='time'):
with sqlite_connection(f, immutable=True, row_factory='row') as conn:
for r in conn.execute('SELECT * FROM items ORDER BY time'):
yield Item(
id=r['id'],
type=r['type'],

View file

@ -1,20 +1,17 @@
"""
[[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews
"""
REQUIRES = ['dataset']
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterator, NamedTuple, Sequence
import pytz
from my.core import get_files
from my.core.sqlite import sqlite_connection
from my.config import materialistic as config
# todo migrate config to my.hackernews.materialistic
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
@ -28,7 +25,7 @@ class Saved(NamedTuple):
@property
def when(self) -> datetime:
ts = int(self.row['time']) / 1000
return datetime.fromtimestamp(ts, tz=pytz.utc)
return datetime.fromtimestamp(ts, tz=timezone.utc)
@property
def uid(self) -> str:
@ -47,13 +44,11 @@ class Saved(NamedTuple):
return hackernews_link(self.uid)
from ..core.dataset import connect_readonly
def raw() -> Iterator[Row]:
last = max(inputs())
with connect_readonly(last) as db:
saved = db['saved']
with sqlite_connection(last, immutable=True, row_factory='dict') as conn:
yield from conn.execute('SELECT * FROM saved ORDER BY time')
# TODO wonder if it's 'save time' or creation time?
yield from saved.all(order_by='time')
def saves() -> Iterator[Saved]:

View file

@ -1,11 +1,11 @@
'''
[[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data
'''
from datetime import datetime
from typing import NamedTuple, Dict, Optional, Iterable
from .core import get_files
from my.core import get_files, stat, Stats
from my.core.sqlite import sqlite_connection
from my.config import taplog as user_config
@ -46,10 +46,9 @@ class Entry(NamedTuple):
def entries() -> Iterable[Entry]:
last = max(get_files(user_config.export_path))
from .core.dataset import connect_readonly
db = connect_readonly(last)
with sqlite_connection(last, immutable=True, row_factory='dict') as db:
# todo is it sorted by timestamp?
for row in db['Log'].all():
for row in db.execute('SELECT * FROM Log'):
yield Entry(row)
@ -60,6 +59,5 @@ def by_button(button: str) -> Iterable[Entry]:
yield e
from .core import stat, Stats
def stats() -> Stats:
    # summary statistics over entries(), via the core stat helper
    return stat(entries)

View file

@ -3,19 +3,18 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde
"""
from __future__ import annotations
REQUIRES = ['dataset']
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from itertools import chain
from pathlib import Path
import sqlite3
from typing import Sequence, Iterator, Union, Dict, List, Mapping
from more_itertools import unique_everseen
from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware
from my.core.dataset import connect_readonly, DatabaseT
from my.core.sqlite import sqlite_connection
from my.config import tinder as user_config
@ -73,6 +72,8 @@ class Message(_BaseMessage):
to: Person
# todo hmm I have a suspicion it might be cumulative?
# although still possible that the user might remove/install app back, so need to keep that in mind
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
@ -83,40 +84,43 @@ Entity = Union[Person, Match, Message]
def _entities() -> Iterator[Res[_Entity]]:
for db_file in inputs():
with connect_readonly(db_file) as db:
with sqlite_connection(db_file, immutable=True, row_factory='row') as db:
yield from _handle_db(db)
def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]:
def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]:
# profile_user_view contains our own user id
for row in chain(db['profile_user_view'], db['match_person']):
for row in chain(
db.execute('SELECT * FROM profile_user_view'),
db.execute('SELECT * FROM match_person'),
):
try:
yield _parse_person(row)
except Exception as e:
# todo attach error contex?
yield e
for row in db['match']:
for row in db.execute('SELECT * FROM match'):
try:
yield _parse_match(row)
except Exception as e:
yield e
for row in db['message']:
for row in db.execute('SELECT * FROM message'):
try:
yield _parse_msg(row)
except Exception as e:
yield e
def _parse_person(row) -> Person:
def _parse_person(row: sqlite3.Row) -> Person:
    # only these two columns are consumed from profile_user_view/match_person rows
    person_id = row['id']
    person_name = row['name']
    return Person(id=person_id, name=person_name)
def _parse_match(row) -> _Match:
def _parse_match(row: sqlite3.Row) -> _Match:
return _Match(
id=row['id'],
person_id=row['person_id'],
@ -124,7 +128,7 @@ def _parse_match(row) -> _Match:
)
def _parse_msg(row) -> _Message:
def _parse_msg(row: sqlite3.Row) -> _Message:
# note it also has raw_message_data -- not sure which is best to use..
sent = row['sent_date']
return _Message(

View file

@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timezone
import re
from typing import Iterator, Sequence, Optional, Dict
import sqlite3
from typing import Iterator, Sequence, Union
import pytz
from more_itertools import unique_everseen
from my.core import Paths, Res, datetime_aware, get_files
from my.core.sqlite import sqlite_connection
from .common import TweetId, permalink
from my.config import twitter as user_config
from ..core import Paths, Res, datetime_aware
@dataclass
class config(user_config.talon):
# paths[s]/glob to the exported sqlite databases
export_path: Paths
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
from .common import TweetId, permalink
@dataclass(unsafe_hash=True)
class Tweet:
id_str: TweetId
@ -51,8 +52,6 @@ class _IsFavorire:
tweet: Tweet
from typing import Union
from ..core.dataset import connect_readonly
Entity = Union[_IsTweet, _IsFavorire]
def _entities() -> Iterator[Res[Entity]]:
for f in inputs():
@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]:
fname = f.name
handler = handlers.get(fname)
if handler is None:
yield RuntimeError(f"Coulnd't find handler for {fname}")
yield RuntimeError(f"Could not find handler for {fname}")
return
with connect_readonly(f) as db:
with sqlite_connection(f, immutable=True, row_factory='row') as db:
yield from handler(db)
def _process_user_tweets(db) -> Iterator[Res[Entity]]:
def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
# dunno why it's called 'lists'
for r in db['lists'].all(order_by='time'):
for r in db.execute('SELECT * FROM lists ORDER BY time'):
try:
yield _IsTweet(_parse_tweet(r))
except Exception as e:
yield e
def _process_favorite_tweets(db) -> Iterator[Res[Entity]]:
for r in db['favorite_tweets'].all(order_by='time'):
def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'):
try:
yield _IsFavorire(_parse_tweet(r))
except Exception as e:
yield e
def _parse_tweet(row) -> Tweet:
def _parse_tweet(row: sqlite3.Row) -> Tweet:
# ok so looks like it's tz aware..
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
# uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
# and it's created here, so looks like it's properly parsed from the api
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc)
text = row['text']
# try explanding URLs.. sadly there are no positions in the db
@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet:
)
from more_itertools import unique_everseen
def tweets() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
if isinstance(x, Exception):
@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]:
elif isinstance(x, _IsTweet):
yield x.tweet
def likes() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
if isinstance(x, Exception):

View file

@ -1,12 +1,16 @@
"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
REQUIRES = ['dataset']
from ..core.common import Paths
from ..core.error import Res
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import NamedTuple, Iterator, List
from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats
from my.core.cfg import make_config
from my.core.sqlite import sqlite_connection
from my.config import twint as user_config
# TODO move to twitter.twint config structure
@ -17,16 +21,9 @@ class twint(user_config):
####
from ..core.cfg import make_config
config = make_config(twint)
from datetime import datetime, timezone
from typing import NamedTuple, Iterator, List
from pathlib import Path
from ..core.common import get_files, LazyLogger, Json, datetime_aware
log = LazyLogger(__name__)
@ -110,25 +107,19 @@ WHERE {where}
ORDER BY T.created_at
'''
def _get_db():
from ..core.dataset import connect_readonly
db_path = get_db_path()
return connect_readonly(db_path)
def tweets() -> Iterator[Res[Tweet]]:
db = _get_db()
res = db.query(_QUERY.format(where='F.tweet_id IS NULL'))
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
res = db.execute(_QUERY.format(where='F.tweet_id IS NULL'))
yield from map(Tweet, res)
def likes() -> Iterator[Res[Tweet]]:
db = _get_db()
res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL'))
yield from map(Tweet, res)
from ..core import stat, Stats
def stats() -> Stats:
return {
**stat(tweets),