switch from using dataset to raw sqlite3 module

dataset is kinda unmaintained and currently broken due to sqlalchemy 2.0 changes

resolves https://github.com/karlicoss/HPI/issues/264
This commit is contained in:
Dima Gerasimov 2023-02-07 01:28:45 +00:00 committed by karlicoss
parent 9c432027b5
commit 5c82d0faa9
8 changed files with 123 additions and 103 deletions

View file

@ -1,17 +1,19 @@
from .common import assert_subpackage; assert_subpackage(__name__)
from contextlib import contextmanager
from pathlib import Path
import shutil
import sqlite3
from tempfile import TemporaryDirectory
from typing import Tuple, Any, Iterator, Callable, Optional, Union
from .common import PathIsh
from .common import PathIsh, assert_never
from .compat import Literal
def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection:
    """Open *db* in sqlite's 'immutable' mode: read-only and assuming no
    other process modifies the file, so no locking/journal access is done."""
    # https://www.sqlite.org/draft/uri.html#uriimmutable
    return sqlite3.connect(f'file:{db}?immutable=1', uri=True)
@ -30,6 +32,42 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None:
conn.execute('DROP TABLE testtable')
SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]
def dict_factory(cursor, row):
    """sqlite3 row factory that maps each row to a plain dict keyed by column name."""
    columns = (description[0] for description in cursor.description)
    return dict(zip(columns, row))
Factory = Union[SqliteRowFactory, Literal['row', 'dict']]
@contextmanager
def sqlite_connection(db: PathIsh, *, immutable: bool = False, row_factory: Optional[Factory] = None) -> Iterator[sqlite3.Connection]:
    """
    Context manager yielding an sqlite3.Connection to *db*, closed on exit.

    :param db: path to the database file
    :param immutable: open via sqlite's immutable URI mode (read-only, no
        locking -- suitable for databases owned by other applications)
    :param row_factory: 'row' for sqlite3.Row, 'dict' for plain dicts,
        or any custom callable(cursor, row); None keeps default tuples
    """
    dbp = f'file:{db}'
    # https://www.sqlite.org/draft/uri.html#uriimmutable
    if immutable:
        dbp = f'{dbp}?immutable=1'
    row_factory_: Any = None
    if row_factory is not None:
        if callable(row_factory):
            row_factory_ = row_factory
        elif row_factory == 'row':
            row_factory_ = sqlite3.Row
        elif row_factory == 'dict':
            row_factory_ = dict_factory
        else:
            # exhaustiveness check: assert_never must be passed the value --
            # the previous zero-argument call would raise TypeError instead
            # of the intended "unreachable" assertion
            assert_never(row_factory)
    conn = sqlite3.connect(dbp, uri=True)
    try:
        conn.row_factory = row_factory_
        with conn:
            yield conn
    finally:
        # Connection context manager isn't actually closing the connection, only keeps transaction
        conn.close()
# TODO come up with a better name?
# NOTE: this is tested by tests/sqlite.py::test_sqlite_read_with_wal
def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
@ -52,8 +90,6 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
return dest
from typing import Tuple, Any, Iterator
# NOTE hmm, so this kinda works
# V = TypeVar('V', bound=Tuple[Any, ...])
# def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]:

View file

@ -3,25 +3,27 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab
"""
from __future__ import annotations
REQUIRES = ['dataset']
from dataclasses import dataclass
from datetime import datetime
from typing import Iterator, Sequence, Optional, Dict
import json
from pathlib import Path
import sqlite3
from typing import Iterator, Sequence, Optional, Dict, Union
from more_itertools import unique_everseen
from my.core import get_files, Paths, datetime_naive, Res, assert_never
from my.core.sqlite import sqlite_connection
from my.config import fbmessenger as user_config
from ..core import Paths
@dataclass
class config(user_config.android):
# paths[s]/glob to the exported sqlite databases
export_path: Paths
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
    """Exported sqlite database files, resolved from the configured path/glob."""
    return get_files(config.export_path)
@ -38,7 +40,6 @@ class Thread:
name: Optional[str]
# todo not sure about order of fields...
from ..core import datetime_naive
@dataclass
class _BaseMessage:
id: str
@ -63,22 +64,18 @@ class Message(_BaseMessage):
reply_to: Optional[Message]
import json
from typing import Union
from ..core import Res, assert_never
from ..core.dataset import connect_readonly, DatabaseT
Entity = Union[Sender, Thread, _Message]
def _entities() -> Iterator[Res[Entity]]:
for f in inputs():
with connect_readonly(f) as db:
with sqlite_connection(f, immutable=True, row_factory='row') as db:
yield from _process_db(db)
def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
# works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user
threadkey2id = lambda key: key.split(':')[1]
for r in db['threads'].find():
for r in db.execute('SELECT * FROM threads'):
try:
yield Thread(
id=threadkey2id(r['thread_key']),
@ -88,7 +85,7 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
yield e
continue
for r in db['messages'].find(order_by='timestamp_ms'):
for r in db.execute('SELECT * FROM messages ORDER BY timestamp_ms'):
mtype: int = r['msg_type']
if mtype == -1:
# likely immediately deleted or something? doesn't have any data at all
@ -133,7 +130,6 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
yield e
from more_itertools import unique_everseen
def messages() -> Iterator[Res[Message]]:
senders: Dict[str, Sender] = {}
msgs: Dict[str, Message] = {}

View file

@ -5,13 +5,15 @@ from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from typing import Iterator, Sequence, Optional, Dict
from pathlib import Path
from typing import Iterator, Sequence, Optional
from my.core import get_files, Paths, Res
from my.core.sqlite import sqlite_connection
from my.config import hackernews as user_config
from ..core import Paths
@dataclass
class config(user_config.dogsheep):
# paths[s]/glob to the dogsheep database
@ -20,8 +22,6 @@ class config(user_config.dogsheep):
# todo so much boilerplate... really need some common wildcard imports?...
# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
@ -44,15 +44,15 @@ class Item:
@property
def permalink(self) -> str:
return hackernews_link(self.id)
# TODO hmm kinda annoying that permalink isn't getting serialized
# maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
# so we could just take .permalink thing
from ..core.error import Res
from ..core.dataset import connect_readonly
def items() -> Iterator[Res[Item]]:
f = max(inputs())
with connect_readonly(f) as db:
items = db['items']
for r in items.all(order_by='time'):
with sqlite_connection(f, immutable=True, row_factory='row') as conn:
for r in conn.execute('SELECT * FROM items ORDER BY time'):
yield Item(
id=r['id'],
type=r['type'],

View file

@ -1,20 +1,17 @@
"""
[[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews
"""
REQUIRES = ['dataset']
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterator, NamedTuple, Sequence
import pytz
from my.core import get_files
from my.core.sqlite import sqlite_connection
from my.config import materialistic as config
# todo migrate config to my.hackernews.materialistic
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
@ -28,7 +25,7 @@ class Saved(NamedTuple):
@property
def when(self) -> datetime:
ts = int(self.row['time']) / 1000
return datetime.fromtimestamp(ts, tz=pytz.utc)
return datetime.fromtimestamp(ts, tz=timezone.utc)
@property
def uid(self) -> str:
@ -47,13 +44,11 @@ class Saved(NamedTuple):
return hackernews_link(self.uid)
from ..core.dataset import connect_readonly
def raw() -> Iterator[Row]:
last = max(inputs())
with connect_readonly(last) as db:
saved = db['saved']
with sqlite_connection(last, immutable=True, row_factory='dict') as conn:
yield from conn.execute('SELECT * FROM saved ORDER BY time')
# TODO wonder if it's 'save time' or creation time?
yield from saved.all(order_by='time')
def saves() -> Iterator[Saved]:

View file

@ -1,11 +1,11 @@
'''
[[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data
'''
from datetime import datetime
from typing import NamedTuple, Dict, Optional, Iterable
from .core import get_files
from my.core import get_files, stat, Stats
from my.core.sqlite import sqlite_connection
from my.config import taplog as user_config
@ -46,10 +46,9 @@ class Entry(NamedTuple):
def entries() -> Iterable[Entry]:
last = max(get_files(user_config.export_path))
from .core.dataset import connect_readonly
db = connect_readonly(last)
with sqlite_connection(last, immutable=True, row_factory='dict') as db:
# todo is it sorted by timestamp?
for row in db['Log'].all():
for row in db.execute('SELECT * FROM Log'):
yield Entry(row)
@ -60,6 +59,5 @@ def by_button(button: str) -> Iterable[Entry]:
yield e
from .core import stat, Stats
def stats() -> Stats:
    # summary statistics over entries(), via the core stat helper
    return stat(entries)

View file

@ -3,19 +3,18 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde
"""
from __future__ import annotations
REQUIRES = ['dataset']
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from itertools import chain
from pathlib import Path
import sqlite3
from typing import Sequence, Iterator, Union, Dict, List, Mapping
from more_itertools import unique_everseen
from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware
from my.core.dataset import connect_readonly, DatabaseT
from my.core.sqlite import sqlite_connection
from my.config import tinder as user_config
@ -73,6 +72,8 @@ class Message(_BaseMessage):
to: Person
# todo hmm I have a suspicion it might be cumulative?
# although still possible that the user might remove/install app back, so need to keep that in mind
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
@ -83,40 +84,43 @@ Entity = Union[Person, Match, Message]
def _entities() -> Iterator[Res[_Entity]]:
for db_file in inputs():
with connect_readonly(db_file) as db:
with sqlite_connection(db_file, immutable=True, row_factory='row') as db:
yield from _handle_db(db)
def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]:
def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]:
# profile_user_view contains our own user id
for row in chain(db['profile_user_view'], db['match_person']):
for row in chain(
db.execute('SELECT * FROM profile_user_view'),
db.execute('SELECT * FROM match_person'),
):
try:
yield _parse_person(row)
except Exception as e:
# todo attach error contex?
yield e
for row in db['match']:
for row in db.execute('SELECT * FROM match'):
try:
yield _parse_match(row)
except Exception as e:
yield e
for row in db['message']:
for row in db.execute('SELECT * FROM message'):
try:
yield _parse_msg(row)
except Exception as e:
yield e
def _parse_person(row) -> Person:
def _parse_person(row: sqlite3.Row) -> Person:
    # only these two columns are consumed from profile_user_view/match_person rows
    person_id = row['id']
    person_name = row['name']
    return Person(id=person_id, name=person_name)
def _parse_match(row) -> _Match:
def _parse_match(row: sqlite3.Row) -> _Match:
return _Match(
id=row['id'],
person_id=row['person_id'],
@ -124,7 +128,7 @@ def _parse_match(row) -> _Match:
)
def _parse_msg(row) -> _Message:
def _parse_msg(row: sqlite3.Row) -> _Message:
# note it also has raw_message_data -- not sure which is best to use..
sent = row['sent_date']
return _Message(

View file

@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime
from datetime import datetime, timezone
import re
from typing import Iterator, Sequence, Optional, Dict
import sqlite3
from typing import Iterator, Sequence, Union
import pytz
from more_itertools import unique_everseen
from my.core import Paths, Res, datetime_aware, get_files
from my.core.sqlite import sqlite_connection
from .common import TweetId, permalink
from my.config import twitter as user_config
from ..core import Paths, Res, datetime_aware
@dataclass
class config(user_config.talon):
# paths[s]/glob to the exported sqlite databases
export_path: Paths
from ..core import get_files
from pathlib import Path
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
from .common import TweetId, permalink
@dataclass(unsafe_hash=True)
class Tweet:
id_str: TweetId
@ -51,8 +52,6 @@ class _IsFavorire:
tweet: Tweet
from typing import Union
from ..core.dataset import connect_readonly
Entity = Union[_IsTweet, _IsFavorire]
def _entities() -> Iterator[Res[Entity]]:
for f in inputs():
@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]:
fname = f.name
handler = handlers.get(fname)
if handler is None:
yield RuntimeError(f"Coulnd't find handler for {fname}")
yield RuntimeError(f"Could not find handler for {fname}")
return
with connect_readonly(f) as db:
with sqlite_connection(f, immutable=True, row_factory='row') as db:
yield from handler(db)
def _process_user_tweets(db) -> Iterator[Res[Entity]]:
def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
# dunno why it's called 'lists'
for r in db['lists'].all(order_by='time'):
for r in db.execute('SELECT * FROM lists ORDER BY time'):
try:
yield _IsTweet(_parse_tweet(r))
except Exception as e:
yield e
def _process_favorite_tweets(db) -> Iterator[Res[Entity]]:
for r in db['favorite_tweets'].all(order_by='time'):
def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'):
try:
yield _IsFavorire(_parse_tweet(r))
except Exception as e:
yield e
def _parse_tweet(row) -> Tweet:
def _parse_tweet(row: sqlite3.Row) -> Tweet:
# ok so looks like it's tz aware..
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
# uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
# and it's created here, so looks like it's properly parsed from the api
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc)
text = row['text']
# try explanding URLs.. sadly there are no positions in the db
@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet:
)
from more_itertools import unique_everseen
def tweets() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
if isinstance(x, Exception):
@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]:
elif isinstance(x, _IsTweet):
yield x.tweet
def likes() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
if isinstance(x, Exception):

View file

@ -1,12 +1,16 @@
"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
REQUIRES = ['dataset']
from ..core.common import Paths
from ..core.error import Res
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import NamedTuple, Iterator, List
from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats
from my.core.cfg import make_config
from my.core.sqlite import sqlite_connection
from my.config import twint as user_config
# TODO move to twitter.twint config structure
@ -17,16 +21,9 @@ class twint(user_config):
####
from ..core.cfg import make_config
config = make_config(twint)
from datetime import datetime, timezone
from typing import NamedTuple, Iterator, List
from pathlib import Path
from ..core.common import get_files, LazyLogger, Json, datetime_aware
log = LazyLogger(__name__)
@ -110,25 +107,19 @@ WHERE {where}
ORDER BY T.created_at
'''
def _get_db():
from ..core.dataset import connect_readonly
db_path = get_db_path()
return connect_readonly(db_path)
def tweets() -> Iterator[Res[Tweet]]:
db = _get_db()
res = db.query(_QUERY.format(where='F.tweet_id IS NULL'))
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
res = db.execute(_QUERY.format(where='F.tweet_id IS NULL'))
yield from map(Tweet, res)
def likes() -> Iterator[Res[Tweet]]:
db = _get_db()
res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL'))
yield from map(Tweet, res)
from ..core import stat, Stats
def stats() -> Stats:
return {
**stat(tweets),