diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 0f4a416..3c1902d 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -1,17 +1,19 @@ from .common import assert_subpackage; assert_subpackage(__name__) +from contextlib import contextmanager from pathlib import Path import shutil import sqlite3 from tempfile import TemporaryDirectory +from typing import Tuple, Any, Iterator, Callable, Optional, Union -from .common import PathIsh +from .common import PathIsh, assert_never +from .compat import Literal def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection: - # https://www.sqlite.org/draft/uri.html#uriimmutable return sqlite3.connect(f'file:{db}?immutable=1', uri=True) @@ -30,6 +32,42 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None: conn.execute('DROP TABLE testtable') +SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any] + +def dict_factory(cursor, row): + fields = [column[0] for column in cursor.description] + return {key: value for key, value in zip(fields, row)} + + +Factory = Union[SqliteRowFactory, Literal['row', 'dict']] + +@contextmanager +def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]: + dbp = f'file:{db}' + # https://www.sqlite.org/draft/uri.html#uriimmutable + if immutable: + dbp = f'{dbp}?immutable=1' + row_factory_: Any = None + if row_factory is not None: + if callable(row_factory): + row_factory_ = row_factory + elif row_factory == 'row': + row_factory_ = sqlite3.Row + elif row_factory == 'dict': + row_factory_ = dict_factory + else: + assert_never(row_factory) + + conn = sqlite3.connect(dbp, uri=True) + try: + conn.row_factory = row_factory_ + with conn: + yield conn + finally: + # Connection context manager isn't actually closing the connection, only keeps transaction + conn.close() + + # TODO come up with a better name? 
# NOTE: this is tested by tests/sqlite.py::test_sqlite_read_with_wal def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: @@ -52,8 +90,6 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: return dest -from typing import Tuple, Any, Iterator - # NOTE hmm, so this kinda works # V = TypeVar('V', bound=Tuple[Any, ...]) # def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]: diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index a8078d6..ef3711a 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -3,25 +3,27 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab """ from __future__ import annotations -REQUIRES = ['dataset'] - from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Sequence, Optional, Dict +import json +from pathlib import Path +import sqlite3 +from typing import Iterator, Sequence, Optional, Dict, Union +from more_itertools import unique_everseen + +from my.core import get_files, Paths, datetime_naive, Res, assert_never +from my.core.sqlite import sqlite_connection from my.config import fbmessenger as user_config -from ..core import Paths @dataclass class config(user_config.android): # paths[s]/glob to the exported sqlite databases export_path: Paths -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -38,7 +40,6 @@ class Thread: name: Optional[str] # todo not sure about order of fields... 
-from ..core import datetime_naive @dataclass class _BaseMessage: id: str @@ -63,22 +64,18 @@ class Message(_BaseMessage): reply_to: Optional[Message] -import json -from typing import Union -from ..core import Res, assert_never -from ..core.dataset import connect_readonly, DatabaseT Entity = Union[Sender, Thread, _Message] def _entities() -> Iterator[Res[Entity]]: for f in inputs(): - with connect_readonly(f) as db: + with sqlite_connection(f, immutable=True, row_factory='row') as db: yield from _process_db(db) -def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]: +def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: # works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user threadkey2id = lambda key: key.split(':')[1] - for r in db['threads'].find(): + for r in db.execute('SELECT * FROM threads'): try: yield Thread( id=threadkey2id(r['thread_key']), @@ -88,7 +85,7 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]: yield e continue - for r in db['messages'].find(order_by='timestamp_ms'): + for r in db.execute('SELECT * FROM messages ORDER BY timestamp_ms'): mtype: int = r['msg_type'] if mtype == -1: # likely immediately deleted or something? 
doesn't have any data at all @@ -133,7 +130,6 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]: yield e -from more_itertools import unique_everseen def messages() -> Iterator[Res[Message]]: senders: Dict[str, Sender] = {} msgs: Dict[str, Message] = {} diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py index 7329690..462cbc0 100644 --- a/my/hackernews/dogsheep.py +++ b/my/hackernews/dogsheep.py @@ -5,13 +5,15 @@ from __future__ import annotations from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Sequence, Optional, Dict +from pathlib import Path +from typing import Iterator, Sequence, Optional +from my.core import get_files, Paths, Res +from my.core.sqlite import sqlite_connection from my.config import hackernews as user_config -from ..core import Paths @dataclass class config(user_config.dogsheep): # paths[s]/glob to the dogsheep database @@ -20,8 +22,6 @@ class config(user_config.dogsheep): # todo so much boilerplate... really need some common wildcard imports?... # at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -44,15 +44,15 @@ class Item: @property def permalink(self) -> str: return hackernews_link(self.id) +# TODO hmm kinda annoying that permalink isn't getting serialized +# maybe won't be such a big problem if we used hpi query directly on objects, without jsons? 
+# so we could just take .permalink thing -from ..core.error import Res -from ..core.dataset import connect_readonly def items() -> Iterator[Res[Item]]: f = max(inputs()) - with connect_readonly(f) as db: - items = db['items'] - for r in items.all(order_by='time'): + with sqlite_connection(f, immutable=True, row_factory='row') as conn: + for r in conn.execute('SELECT * FROM items ORDER BY time'): yield Item( id=r['id'], type=r['type'], diff --git a/my/hackernews/materialistic.py b/my/hackernews/materialistic.py index 65a1cb6..e0d634a 100644 --- a/my/hackernews/materialistic.py +++ b/my/hackernews/materialistic.py @@ -1,20 +1,17 @@ """ [[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews """ - -REQUIRES = ['dataset'] - -from datetime import datetime +from datetime import datetime, timezone +from pathlib import Path from typing import Any, Dict, Iterator, NamedTuple, Sequence -import pytz +from my.core import get_files +from my.core.sqlite import sqlite_connection from my.config import materialistic as config # todo migrate config to my.hackernews.materialistic -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -28,7 +25,7 @@ class Saved(NamedTuple): @property def when(self) -> datetime: ts = int(self.row['time']) / 1000 - return datetime.fromtimestamp(ts, tz=pytz.utc) + return datetime.fromtimestamp(ts, tz=timezone.utc) @property def uid(self) -> str: @@ -47,13 +44,11 @@ class Saved(NamedTuple): return hackernews_link(self.uid) -from ..core.dataset import connect_readonly def raw() -> Iterator[Row]: last = max(inputs()) - with connect_readonly(last) as db: - saved = db['saved'] + with sqlite_connection(last, immutable=True, row_factory='dict') as conn: + yield from conn.execute('SELECT * FROM saved ORDER BY time') # TODO wonder if it's 'save time' or creation time? 
- yield from saved.all(order_by='time') def saves() -> Iterator[Saved]: diff --git a/my/taplog.py b/my/taplog.py index f668a10..6353c14 100644 --- a/my/taplog.py +++ b/my/taplog.py @@ -1,11 +1,11 @@ ''' [[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data ''' - from datetime import datetime from typing import NamedTuple, Dict, Optional, Iterable -from .core import get_files +from my.core import get_files, stat, Stats +from my.core.sqlite import sqlite_connection from my.config import taplog as user_config @@ -46,11 +46,10 @@ class Entry(NamedTuple): def entries() -> Iterable[Entry]: last = max(get_files(user_config.export_path)) - from .core.dataset import connect_readonly - db = connect_readonly(last) - # todo is it sorted by timestamp? - for row in db['Log'].all(): - yield Entry(row) + with sqlite_connection(last, immutable=True, row_factory='dict') as db: + # todo is it sorted by timestamp? + for row in db.execute('SELECT * FROM Log'): + yield Entry(row) # I guess worth having as top level considering it would be quite common? 
@@ -60,6 +59,5 @@ def by_button(button: str) -> Iterable[Entry]: yield e -from .core import stat, Stats def stats() -> Stats: return stat(entries) diff --git a/my/tinder/android.py b/my/tinder/android.py index e92f316..9f68992 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -3,19 +3,18 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde """ from __future__ import annotations -REQUIRES = ['dataset'] - from collections import defaultdict from dataclasses import dataclass from datetime import datetime, timezone from itertools import chain from pathlib import Path +import sqlite3 from typing import Sequence, Iterator, Union, Dict, List, Mapping from more_itertools import unique_everseen from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware -from my.core.dataset import connect_readonly, DatabaseT +from my.core.sqlite import sqlite_connection from my.config import tinder as user_config @@ -73,6 +72,8 @@ class Message(_BaseMessage): to: Person +# todo hmm I have a suspicion it might be cumulative? +# although still possible that the user might remove/install app back, so need to keep that in mind def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -83,40 +84,43 @@ Entity = Union[Person, Match, Message] def _entities() -> Iterator[Res[_Entity]]: for db_file in inputs(): - with connect_readonly(db_file) as db: + with sqlite_connection(db_file, immutable=True, row_factory='row') as db: yield from _handle_db(db) -def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]: +def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]: # profile_user_view contains our own user id - for row in chain(db['profile_user_view'], db['match_person']): + for row in chain( + db.execute('SELECT * FROM profile_user_view'), + db.execute('SELECT * FROM match_person'), + ): try: yield _parse_person(row) except Exception as e: # todo attach error contex? 
yield e - for row in db['match']: + for row in db.execute('SELECT * FROM match'): try: yield _parse_match(row) except Exception as e: yield e - for row in db['message']: + for row in db.execute('SELECT * FROM message'): try: yield _parse_msg(row) except Exception as e: yield e -def _parse_person(row) -> Person: +def _parse_person(row: sqlite3.Row) -> Person: return Person( id=row['id'], name=row['name'], ) -def _parse_match(row) -> _Match: +def _parse_match(row: sqlite3.Row) -> _Match: return _Match( id=row['id'], person_id=row['person_id'], @@ -124,7 +128,7 @@ def _parse_match(row) -> _Match: ) -def _parse_msg(row) -> _Message: +def _parse_msg(row: sqlite3.Row) -> _Message: # note it also has raw_message_data -- not sure which is best to use.. sent = row['sent_date'] return _Message( diff --git a/my/twitter/talon.py b/my/twitter/talon.py index 81137d6..e43f600 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter from __future__ import annotations from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone import re -from typing import Iterator, Sequence, Optional, Dict +import sqlite3 +from typing import Iterator, Sequence, Union -import pytz +from more_itertools import unique_everseen + +from my.core import Paths, Res, datetime_aware, get_files +from my.core.sqlite import sqlite_connection + +from .common import TweetId, permalink from my.config import twitter as user_config -from ..core import Paths, Res, datetime_aware @dataclass class config(user_config.talon): # paths[s]/glob to the exported sqlite databases export_path: Paths -from ..core import get_files from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) -from .common import TweetId, permalink - - @dataclass(unsafe_hash=True) class Tweet: id_str: TweetId @@ -51,8 +52,6 @@ class _IsFavorire: tweet: Tweet -from typing 
import Union -from ..core.dataset import connect_readonly Entity = Union[_IsTweet, _IsFavorire] def _entities() -> Iterator[Res[Entity]]: for f in inputs(): @@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]: fname = f.name handler = handlers.get(fname) if handler is None: - yield RuntimeError(f"Coulnd't find handler for {fname}") + yield RuntimeError(f"Could not find handler for {fname}") return - with connect_readonly(f) as db: + with sqlite_connection(f, immutable=True, row_factory='row') as db: yield from handler(db) -def _process_user_tweets(db) -> Iterator[Res[Entity]]: +def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]: # dunno why it's called 'lists' - for r in db['lists'].all(order_by='time'): + for r in db.execute('SELECT * FROM lists ORDER BY time'): try: yield _IsTweet(_parse_tweet(r)) except Exception as e: yield e -def _process_favorite_tweets(db) -> Iterator[Res[Entity]]: - for r in db['favorite_tweets'].all(order_by='time'): +def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]: + for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'): try: yield _IsFavorire(_parse_tweet(r)) except Exception as e: yield e -def _parse_tweet(row) -> Tweet: + +def _parse_tweet(row: sqlite3.Row) -> Tweet: # ok so looks like it's tz aware.. 
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95 # uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime() # and it's created here, so looks like it's properly parsed from the api # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79 - created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc) + created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc) text = row['text'] # try explanding URLs.. sadly there are no positions in the db @@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet: ) -from more_itertools import unique_everseen def tweets() -> Iterator[Res[Tweet]]: for x in unique_everseen(_entities()): if isinstance(x, Exception): @@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]: elif isinstance(x, _IsTweet): yield x.tweet + def likes() -> Iterator[Res[Tweet]]: for x in unique_everseen(_entities()): if isinstance(x, Exception): diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 5ba0460..54c7f91 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -1,12 +1,16 @@ """ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export. 
""" - -REQUIRES = ['dataset'] - -from ..core.common import Paths -from ..core.error import Res from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import NamedTuple, Iterator, List + + +from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats +from my.core.cfg import make_config +from my.core.sqlite import sqlite_connection + from my.config import twint as user_config # TODO move to twitter.twint config structure @@ -17,16 +21,9 @@ class twint(user_config): #### -from ..core.cfg import make_config config = make_config(twint) -from datetime import datetime, timezone -from typing import NamedTuple, Iterator, List -from pathlib import Path - -from ..core.common import get_files, LazyLogger, Json, datetime_aware - log = LazyLogger(__name__) @@ -110,25 +107,19 @@ WHERE {where} ORDER BY T.created_at ''' -def _get_db(): - from ..core.dataset import connect_readonly - db_path = get_db_path() - return connect_readonly(db_path) - def tweets() -> Iterator[Res[Tweet]]: - db = _get_db() - res = db.query(_QUERY.format(where='F.tweet_id IS NULL')) - yield from map(Tweet, res) + with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db: + res = db.execute(_QUERY.format(where='F.tweet_id IS NULL')) + yield from map(Tweet, res) def likes() -> Iterator[Res[Tweet]]: - db = _get_db() - res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL')) - yield from map(Tweet, res) + with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db: + res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL')) + yield from map(Tweet, res) -from ..core import stat, Stats def stats() -> Stats: return { **stat(tweets),