diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c45d99a..8b23921 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,12 +50,12 @@ jobs: - run: bash scripts/ci/run - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: .coverage.mypy-misc_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-misc/ - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: .coverage.mypy-core_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-core/ diff --git a/my/body/blood.py b/my/body/blood.py index c1d66e2..e282068 100644 --- a/my/body/blood.py +++ b/my/body/blood.py @@ -13,7 +13,7 @@ import pandas as pd # type: ignore import orgparse -from my.config import blood as config +from my.config import blood as config # type: ignore[attr-defined] class Entry(NamedTuple): diff --git a/my/body/weight.py b/my/body/weight.py index 28688b6..659b759 100644 --- a/my/body/weight.py +++ b/my/body/weight.py @@ -10,7 +10,7 @@ from ..core.error import Res, set_error_datetime, extract_error_datetime from .. import orgmode -from my.config import weight as config +from my.config import weight as config # type: ignore[attr-defined] log = LazyLogger('my.body.weight') diff --git a/my/books/kobo.py b/my/books/kobo.py index d5f5416..2a469d0 100644 --- a/my/books/kobo.py +++ b/my/books/kobo.py @@ -4,4 +4,4 @@ warnings.high('my.books.kobo is deprecated! Please use my.kobo instead!') from ..core.util import __NOT_HPI_MODULE__ -from ..kobo import * +from ..kobo import * # type: ignore[no-redef] diff --git a/my/coding/codeforces.py b/my/coding/codeforces.py index 3793988..a4c7de2 100644 --- a/my/coding/codeforces.py +++ b/my/coding/codeforces.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -from my.config import codeforces as config +from my.config import codeforces as config # type: ignore[attr-defined] + from datetime import datetime, timezone from typing import NamedTuple import json from typing import Dict, Iterator + from ..core import get_files, Res, unwrap from ..core.compat import cached_property from ..core.konsume import ignore, wrap diff --git a/my/coding/topcoder.py b/my/coding/topcoder.py index 5711254..32a9ff8 100644 --- a/my/coding/topcoder.py +++ b/my/coding/topcoder.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -from my.config import topcoder as config +from my.config import topcoder as config # type: ignore[attr-defined] + from datetime import datetime from typing import NamedTuple import json from typing import Dict, Iterator + from ..core import get_files, Res, unwrap, Json from ..core.compat import cached_property from ..core.error import Res, unwrap diff --git a/my/config.py b/my/config.py index 3310c37..58aadfc 100644 --- a/my/config.py +++ b/my/config.py @@ -14,8 +14,14 @@ from my.core import init ### +from datetime import tzinfo +from pathlib import Path +from typing import List, Optional + + from my.core import Paths, PathIsh + class hypothesis: # expects outputs from https://github.com/karlicoss/hypexport # (it's just the standard Hypothes.is export format) @@ -141,9 +147,14 @@ class hackernews: export_path: Paths +class materialistic: + export_path: Paths + + class fbmessenger: class fbmessengerexport: export_db: PathIsh + facebook_id: Optional[str] class android: export_path: Paths @@ -156,8 +167,87
@@ class twitter: class talon: export_path: Paths + +class twint: + export_path: Paths + + class browser: class export: export_path: Paths = '' class active_browser: export_path: Paths = '' + + +class telegram: + class telegram_backup: + export_path: PathIsh = '' + + +class demo: + data_path: Paths + username: str + timezone: tzinfo + + +class simple: + count: int + + +class vk_messages_backup: + storage_path: Path + + +class kobo: + export_path: Paths + + +class feedly: + export_path: Paths + + +class feedbin: + export_path: Paths + + +class taplog: + export_path: Paths + + +class lastfm: + export_path: Paths + + +class rescuetime: + export_path: Paths + + +class runnerup: + export_path: Paths + + +class emfit: + export_path: Path + timezone: tzinfo + excluded_sids: List[str] + + +class foursquare: + export_path: Paths + + +class rtm: + export_path: Paths + + +class imdb: + export_path: Paths + + +class roamresearch: + export_path: Paths + username: str + + + + diff --git a/my/core/__main__.py b/my/core/__main__.py index d8e9ebd..11f32fc 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -344,8 +344,8 @@ def _requires(modules: Sequence[str]) -> Sequence[str]: reqs = mod.requires if reqs is None: - error(f"Module {mod.name} has no REQUIRES specification") - sys.exit(1) + warning(f"Module {mod.name} has no REQUIRES specification") + continue for r in reqs: if r not in res: res.append(r) @@ -369,6 +369,10 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) - requirements = _requires(module) + if len(requirements) == 0: + warning('requirements list is empty, no need to install anything') + return + pre_cmd = [ sys.executable, '-m', 'pip', 'install', diff --git a/my/core/cfg.py b/my/core/cfg.py index 4b5cbed..3321a4c 100644 --- a/my/core/cfg.py +++ b/my/core/cfg.py @@ -28,7 +28,7 @@ F = TypeVar('F') from contextlib import contextmanager from typing import Iterator @contextmanager -def override_config(config: F) -> Iterator[F]: +def _override_config(config: F) -> Iterator[F]: ''' Temporary override for config's parameters, useful for testing/fake data/etc. ''' @@ -44,12 +44,53 @@ def override_config(config: F) -> Iterator[F]: delattr(config, k) -# helper for tests? not sure if could be useful elsewhere +import importlib +import sys +from typing import Optional, Set +ModuleRegex = str @contextmanager -def tmp_config(): - import my.config as C - with override_config(C): - yield C # todo not sure? 
+def _reload_modules(modules: ModuleRegex) -> Iterator[None]: + def loaded_modules() -> Set[str]: + return {name for name in sys.modules if re.fullmatch(modules, name)} + + modules_before = loaded_modules() + + for m in modules_before: + importlib.reload(sys.modules[m]) + + try: + yield + finally: + modules_after = loaded_modules() + for m in modules_after: + if m in modules_before: + # was previously loaded, so need to reload to pick up old config + importlib.reload(sys.modules[m]) + else: + # wasn't previously loaded, so need to unload it + # otherwise it might fail due to missing config etc + sys.modules.pop(m, None) + + +from contextlib import ExitStack +import re +@contextmanager +def tmp_config(*, modules: Optional[ModuleRegex]=None, config=None): + if modules is None: + assert config is None + if modules is not None: + assert config is not None + + import my.config + with ExitStack() as module_reload_stack, _override_config(my.config) as new_config: + if config is not None: + overrides = {k: v for k, v in vars(config).items() if not k.startswith('__')} + for k, v in overrides.items(): + setattr(new_config, k, v) + + if modules is not None: + module_reload_stack.enter_context(_reload_modules(modules)) + yield new_config def test_tmp_config() -> None: @@ -63,3 +104,8 @@ def test_tmp_config() -> None: # todo hmm. not sure what should do about new properties?? assert not hasattr(c, 'extra') assert c.google != 'whatever' + + +### +# todo properly deprecate, this isn't really meant for public use +override_config = _override_config diff --git a/my/core/core_config.py b/my/core/core_config.py index 48f3eb4..f87a1ba 100644 --- a/my/core/core_config.py +++ b/my/core/core_config.py @@ -123,8 +123,8 @@ from contextlib import contextmanager as ctx @ctx def _reset_config() -> Iterator[Config]: # todo maybe have this decorator for the whole of my.config? 
- from .cfg import override_config - with override_config(config) as cc: + from .cfg import _override_config + with _override_config(config) as cc: cc.enabled_modules = None cc.disabled_modules = None cc.cache_dir = None diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 0f4a416..7c02940 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -1,17 +1,19 @@ from .common import assert_subpackage; assert_subpackage(__name__) +from contextlib import contextmanager from pathlib import Path import shutil import sqlite3 from tempfile import TemporaryDirectory +from typing import Tuple, Any, Iterator, Callable, Optional, Union -from .common import PathIsh +from .common import PathIsh, assert_never +from .compat import Literal def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection: - # https://www.sqlite.org/draft/uri.html#uriimmutable return sqlite3.connect(f'file:{db}?immutable=1', uri=True) @@ -30,6 +32,44 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None: conn.execute('DROP TABLE testtable') +SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any] + +def dict_factory(cursor, row): + fields = [column[0] for column in cursor.description] + return {key: value for key, value in zip(fields, row)} + + +Factory = Union[SqliteRowFactory, Literal['row', 'dict']] + +@contextmanager +def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]: + dbp = f'file:{db}' + # https://www.sqlite.org/draft/uri.html#uriimmutable + if immutable: + # assert results in nicer error than sqlite3.OperationalError + assert Path(db).exists(), db + dbp = f'{dbp}?immutable=1' + row_factory_: Any = None + if row_factory is not None: + if callable(row_factory): + row_factory_ = row_factory + elif row_factory == 'row': + row_factory_ = sqlite3.Row + elif row_factory == 'dict': + row_factory_ = dict_factory + else: + assert_never(row_factory) + + conn = sqlite3.connect(dbp, uri=True) + try: + conn.row_factory = row_factory_ + with conn: + yield conn + finally: + # the Connection context manager doesn't actually close the connection, it only manages the transaction + conn.close() + + # TODO come up with a better name?
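For reference, sqlite_connection above is the helper that the modules further down migrate onto, replacing the dataset-based connect_readonly. A minimal usage sketch (the database path and table name here are hypothetical):

    from my.core.sqlite import sqlite_connection

    # row_factory='row' yields sqlite3.Row objects (indexable by column name);
    # row_factory='dict' yields plain dicts via dict_factory above
    with sqlite_connection('/path/to/export.db', immutable=True, row_factory='row') as conn:
        for row in conn.execute('SELECT * FROM messages ORDER BY time'):
            print(row['id'], row['text'])

With immutable=1, sqlite assumes the file cannot change and skips locking and journal handling; that is also why the helper asserts the path exists up front, to fail with a clearer error than sqlite3.OperationalError.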
# NOTE: this is tested by tests/sqlite.py::test_sqlite_read_with_wal def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: @@ -52,8 +92,6 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: return dest -from typing import Tuple, Any, Iterator - # NOTE hmm, so this kinda works # V = TypeVar('V', bound=Tuple[Any, ...]) # def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]: diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index 997ba6c..a081416 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -3,6 +3,11 @@ Consumes data exported by https://github.com/karlicoss/emfitexport """ + +REQUIRES = [ + 'git+https://github.com/karlicoss/emfitexport', +] + from pathlib import Path from typing import Dict, List, Iterable, Any, Optional @@ -140,16 +145,20 @@ def stats() -> Stats: from contextlib import contextmanager from typing import Iterator @contextmanager -def fake_data(nights: int=500) -> Iterator[None]: - from ..core.cfg import override_config +def fake_data(nights: int=500) -> Iterator: + from my.core.cfg import tmp_config from tempfile import TemporaryDirectory - with override_config(config) as cfg, TemporaryDirectory() as td: + with TemporaryDirectory() as td: tdir = Path(td) - cfg.export_path = tdir - gen = dal.FakeData() gen.fill(tdir, count=nights) - yield + + class override: + class emfit: + export_path = tdir + + with tmp_config(modules=__name__, config=override) as cfg: + yield cfg # TODO remove/deprecate it? I think used by timeline diff --git a/my/endomondo.py b/my/endomondo.py index 0df7aa9..0fa396f 100644 --- a/my/endomondo.py +++ b/my/endomondo.py @@ -87,20 +87,24 @@ def stats() -> Stats: # TODO make sure it's possible to 'advise' functions and override stuff from contextlib import contextmanager +from typing import Iterator @contextmanager -def fake_data(count: int=100): - from .core.cfg import override_config +def fake_data(count: int=100) -> Iterator: + from my.core.cfg import tmp_config from tempfile import TemporaryDirectory import json - with override_config(endomondo) as cfg, TemporaryDirectory() as td: + with TemporaryDirectory() as td: tdir = Path(td) - cfg.export_path = tdir - - # todo would be nice to somehow expose the generator so it's possible to hack from the outside? fd = dal.FakeData() data = fd.generate(count=count) jf = tdir / 'data.json' jf.write_text(json.dumps(data)) - yield + class override: + class endomondo: + export_path = tdir + + with tmp_config(modules=__name__, config=override) as cfg: + # todo would be nice to somehow expose the generator so it's possible to hack from the outside? 
+ yield cfg diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index a8078d6..69555cb 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -3,25 +3,37 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab """ from __future__ import annotations -REQUIRES = ['dataset'] - from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Sequence, Optional, Dict +from pathlib import Path +import sqlite3 +from typing import Iterator, Sequence, Optional, Dict, Union, List +from more_itertools import unique_everseen + +from my.core import get_files, Paths, datetime_naive, Res, assert_never, LazyLogger, make_config +from my.core.error import echain +from my.core.sqlite import sqlite_connection from my.config import fbmessenger as user_config -from ..core import Paths +logger = LazyLogger(__name__) + + @dataclass -class config(user_config.android): +class Config(user_config.android): # paths[s]/glob to the exported sqlite databases export_path: Paths + facebook_id: Optional[str] = None + + +# hmm. this is necessary for default value (= None) to work +# otherwise Config.facebook_id is always None.. +config = make_config(Config) + -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -35,10 +47,9 @@ class Sender: @dataclass(unsafe_hash=True) class Thread: id: str - name: Optional[str] + name: Optional[str] # isn't set for groups or one to one messages # todo not sure about order of fields... -from ..core import datetime_naive @dataclass class _BaseMessage: id: str @@ -63,77 +74,92 @@ class Message(_BaseMessage): reply_to: Optional[Message] -import json -from typing import Union -from ..core import Res, assert_never -from ..core.dataset import connect_readonly, DatabaseT Entity = Union[Sender, Thread, _Message] def _entities() -> Iterator[Res[Entity]]: - for f in inputs(): - with connect_readonly(f) as db: - yield from _process_db(db) + dbs = inputs() + for i, f in enumerate(dbs): + logger.debug(f'processing {f} {i}/{len(dbs)}') + with sqlite_connection(f, immutable=True, row_factory='row') as db: + try: + yield from _process_db(db) + except Exception as e: + yield echain(RuntimeError(f'While processing {f}'), cause=e) -def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]: +def _normalise_user_id(ukey: str) -> str: + # trying to match messages.author from fbchat + prefix = 'FACEBOOK:' + assert ukey.startswith(prefix), ukey + return ukey[len(prefix):] + + +def _normalise_thread_id(key) -> str: # works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user - threadkey2id = lambda key: key.split(':')[1] + return key.split(':')[1] - for r in db['threads'].find(): - try: - yield Thread( - id=threadkey2id(r['thread_key']), - name=r['name'], - ) - except Exception as e: - yield e + +def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: + senders: Dict[str, Sender] = {} + for r in db.execute('''SELECT * FROM thread_users'''): + # for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None + # but they are still referenced, so need to keep + name = r['name'] or '' + user_key = r['user_key'] + s = Sender( + id=_normalise_user_id(user_key), + name=name, + ) + senders[user_key] = s + yield s + + self_id = config.facebook_id + thread_users: Dict[str, List[Sender]] = {} + for r in db.execute('SELECT * from thread_participants'): + thread_key = r['thread_key'] + user_key = r['user_key'] + if self_id is not None and 
user_key == f'FACEBOOK:{self_id}': + # exclude yourself, otherwise it's just spammy to show up in all participants continue - for r in db['messages'].find(order_by='timestamp_ms'): - mtype: int = r['msg_type'] - if mtype == -1: - # likely immediately deleted or something? doesn't have any data at all + ll = thread_users.get(thread_key) + if ll is None: + ll = [] + thread_users[thread_key] = ll + ll.append(senders[user_key]) + + for r in db.execute('SELECT * FROM threads'): + thread_key = r['thread_key'] + thread_type = thread_key.split(':')[0] + if thread_type == 'MONTAGE': # no idea what this is? continue + name = r['name'] # seems that it's only set for some groups + if name is None: + users = thread_users[thread_key] + name = ', '.join([u.name for u in users]) + yield Thread( + id=_normalise_thread_id(thread_key), + name=name, + ) - user_id = None - try: - # todo could use thread_users? - sj = json.loads(r['sender']) - ukey: str = sj['user_key'] - prefix = 'FACEBOOK:' - assert ukey.startswith(prefix), ukey - user_id = ukey[len(prefix):] - yield Sender( - id=user_id, - name=sj['name'], - ) - except Exception as e: - yield e - continue - - thread_id = None - try: - thread_id = threadkey2id(r['thread_key']) - except Exception as e: - yield e - continue - - try: - assert user_id is not None - assert thread_id is not None - yield _Message( - id=r['msg_id'], - dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000), - # is_incoming=False, TODO?? - text=r['text'], - thread_id=thread_id, - sender_id=user_id, - reply_to_id=r['message_replied_to_id'] - ) - except Exception as e: - yield e + for r in db.execute(''' + SELECT *, json_extract(sender, "$.user_key") AS user_key FROM messages + WHERE msg_type NOT IN ( + -1, /* these don't have any data at all, likely immediately deleted or something? */ + 2 /* these are 'left group' system messages, also a bit annoying since they might reference nonexistent users */ + ) + ORDER BY timestamp_ms /* they aren't in order in the database, so need to sort */ + '''): + yield _Message( + id=r['msg_id'], + dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000), + # is_incoming=False, TODO?? + text=r['text'], + thread_id=_normalise_thread_id(r['thread_key']), + sender_id=_normalise_user_id(r['user_key']), + reply_to_id=r['message_replied_to_id'] + ) -from more_itertools import unique_everseen def messages() -> Iterator[Res[Message]]: senders: Dict[str, Sender] = {} msgs: Dict[str, Message] = {} @@ -150,12 +176,12 @@ def messages() -> Iterator[Res[Message]]: continue if isinstance(x, _Message): reply_to_id = x.reply_to_id + # hmm, reply_to may be missing due to the synthetic nature of the export, so have to be defensive + reply_to = None if reply_to_id is None else msgs.get(reply_to_id) + # also would be interesting to merge together entities rather than resulting messages from different sources.. + # then the merging thing could be moved to common? try: sender = senders[x.sender_id] - # hmm, reply_to be missing due to the synthetic nature of export - # also would be interesting to merge together entities rather than resuling messages from different sources.. - # then the merging thing could be moved to common?
- reply_to = None if reply_to_id is None else msgs[reply_to_id] thread = threads[x.thread_id] except Exception as e: yield e diff --git a/my/fbmessenger/export.py b/my/fbmessenger/export.py index 0edb571..3a9d227 100644 --- a/my/fbmessenger/export.py +++ b/my/fbmessenger/export.py @@ -7,10 +7,13 @@ REQUIRES = [ 'git+https://github.com/karlicoss/fbmessengerexport', ] +from contextlib import ExitStack, contextmanager from dataclasses import dataclass from pathlib import Path from typing import Iterator +from my.core import PathIsh, Res, stat, Stats +from my.core.warnings import high from my.config import fbmessenger as user_config import fbmessengerexport.dal as messenger @@ -22,7 +25,6 @@ _new_section = getattr(user_config, 'fbmessengerexport', None) _old_attr = getattr(user_config, 'export_db', None) if _new_section is None and _old_attr is not None: - from my.core.warnings import high high("""DEPRECATED! Please modify your fbmessenger config to look like: class fbmessenger: @@ -35,24 +37,26 @@ class fbmessenger: ### -from ..core import PathIsh @dataclass class config(user_config.fbmessengerexport): export_db: PathIsh -def _dal() -> messenger.DAL: - return messenger.DAL(config.export_db) +@contextmanager +def _dal() -> Iterator[messenger.DAL]: + model = messenger.DAL(config.export_db) + with ExitStack() as stack: + if hasattr(model, '__dal__'): # defensive to support legacy fbmessengerexport + stack.enter_context(model) + yield model -from ..core import Res def messages() -> Iterator[Res[messenger.Message]]: - model = _dal() - for t in model.iter_threads(): - yield from t.iter_messages() + with _dal() as model: + for t in model.iter_threads(): + yield from t.iter_messages() -from ..core import stat, Stats def stats() -> Stats: return stat(messages) @@ -75,11 +79,9 @@ def dump_chat_history(where: PathIsh) -> None: p = Path(where) assert not p.exists() or p.is_dir() - model = _dal() - from shutil import rmtree from tempfile import TemporaryDirectory - with TemporaryDirectory() as tdir: + with TemporaryDirectory() as tdir, _dal() as model: td = Path(tdir) _dump_helper(model, td) diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py index 7329690..462cbc0 100644 --- a/my/hackernews/dogsheep.py +++ b/my/hackernews/dogsheep.py @@ -5,13 +5,15 @@ from __future__ import annotations from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Sequence, Optional, Dict +from pathlib import Path +from typing import Iterator, Sequence, Optional +from my.core import get_files, Paths, Res +from my.core.sqlite import sqlite_connection from my.config import hackernews as user_config -from ..core import Paths @dataclass class config(user_config.dogsheep): # paths[s]/glob to the dogsheep database @@ -20,8 +22,6 @@ class config(user_config.dogsheep): # todo so much boilerplate... really need some common wildcard imports?... # at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -44,15 +44,15 @@ class Item: @property def permalink(self) -> str: return hackernews_link(self.id) +# TODO hmm kinda annoying that permalink isn't getting serialized +# maybe won't be such a big problem if we used hpi query directly on objects, without jsons? 
+# so we could just take .permalink thing -from ..core.error import Res -from ..core.dataset import connect_readonly def items() -> Iterator[Res[Item]]: f = max(inputs()) - with connect_readonly(f) as db: - items = db['items'] - for r in items.all(order_by='time'): + with sqlite_connection(f, immutable=True, row_factory='row') as conn: + for r in conn.execute('SELECT * FROM items ORDER BY time'): yield Item( id=r['id'], type=r['type'], diff --git a/my/hackernews/materialistic.py b/my/hackernews/materialistic.py index 65a1cb6..e0d634a 100644 --- a/my/hackernews/materialistic.py +++ b/my/hackernews/materialistic.py @@ -1,20 +1,17 @@ """ [[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews """ - -REQUIRES = ['dataset'] - -from datetime import datetime +from datetime import datetime, timezone +from pathlib import Path from typing import Any, Dict, Iterator, NamedTuple, Sequence -import pytz +from my.core import get_files +from my.core.sqlite import sqlite_connection from my.config import materialistic as config # todo migrate config to my.hackernews.materialistic -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -28,7 +25,7 @@ class Saved(NamedTuple): @property def when(self) -> datetime: ts = int(self.row['time']) / 1000 - return datetime.fromtimestamp(ts, tz=pytz.utc) + return datetime.fromtimestamp(ts, tz=timezone.utc) @property def uid(self) -> str: @@ -47,13 +44,11 @@ class Saved(NamedTuple): return hackernews_link(self.uid) -from ..core.dataset import connect_readonly def raw() -> Iterator[Row]: last = max(inputs()) - with connect_readonly(last) as db: - saved = db['saved'] + with sqlite_connection(last, immutable=True, row_factory='dict') as conn: + yield from conn.execute('SELECT * FROM saved ORDER BY time') # TODO wonder if it's 'save time' or creation time? - yield from saved.all(order_by='time') def saves() -> Iterator[Saved]: diff --git a/my/instagram/android.py b/my/instagram/android.py index a34660c..8e44ebe 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -119,15 +119,17 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: # todo use TypedDict? for f in inputs(): with sqlite_connect_immutable(f) as db: - for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db): j = json.loads(thread_json) # todo in principle should leave the thread attached to the message? # since thread is a group of users? # inviter usually contains our own user for r in [j['inviter'], *j['recipients']]: + # id disappeared and seems that pk_id is in use now (around december 2022) + uid = r.get('id') or r.get('pk_id') + assert uid is not None yield User( - id=str(r['id']), # for some reason it's int in the db + id=str(uid), # for some reason it's int in the db full_name=r['full_name'], username=r['username'], ) diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 50932bf..89f104a 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -10,7 +10,7 @@ from ..core.common import LazyLogger logger = LazyLogger(__name__) -from my.config import jawbone as config +from my.config import jawbone as config # type: ignore[attr-defined] BDIR = config.export_dir diff --git a/my/jawbone/plots.py b/my/jawbone/plots.py index 195ddb5..5332fe6 100755 --- a/my/jawbone/plots.py +++ b/my/jawbone/plots.py @@ -85,7 +85,7 @@ def iter_useful(data_file: str): # TODO <<< hmm. 
these files do contain deep and light sleep?? # also steps stats?? -from my.config import jawbone as config +from my.config import jawbone as config # type: ignore[attr-defined] p = config.export_dir / 'old_csv' # TODO with_my? @@ -95,7 +95,7 @@ files = [ p / "2017.csv", ] -from kython import concat, parse_date +from kython import concat, parse_date # type: ignore useful = concat(*(list(iter_useful(str(f))) for f in files)) # for u in useful: @@ -108,7 +108,7 @@ dates = [parse_date(u.date, yearfirst=True, dayfirst=False) for u in useful] # TODO filter outliers? # TODO don't need this anymore? it's gonna be in dashboards package -from kython.plotting import plot_timestamped +from kython.plotting import plot_timestamped # type: ignore for attr, lims, mavg, fig in [ # type: ignore ('light', (0, 400), 5, None), ('deep', (0, 600), 5, None), diff --git a/my/photos/main.py b/my/photos/main.py index 6be3163..69e5a46 100644 --- a/my/photos/main.py +++ b/my/photos/main.py @@ -19,7 +19,7 @@ from ..core.common import LazyLogger, mcachew, fastermime from ..core.error import Res, sort_res_by from ..core.cachew import cache_dir -from my.config import photos as config +from my.config import photos as config # type: ignore[attr-defined] logger = LazyLogger(__name__) diff --git a/my/rescuetime.py b/my/rescuetime.py index 5d64375..40aa6b7 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -58,22 +58,27 @@ def stats() -> Stats: # basically, hack config and populate it with fake data? fake data generated by DAL, but the rest is handled by this? -from typing import Iterator from contextlib import contextmanager +from typing import Iterator # todo take seed, or what? @contextmanager -def fake_data(rows: int=1000) -> Iterator[None]: +def fake_data(rows: int=1000) -> Iterator: # todo also disable cachew automatically for such things? - from .core.cachew import disabled_cachew - from .core.cfg import override_config + from my.core.cfg import tmp_config + from my.core.cachew import disabled_cachew from tempfile import TemporaryDirectory - with disabled_cachew(), override_config(config) as cfg, TemporaryDirectory() as td: + import json + with disabled_cachew(), TemporaryDirectory() as td: tdir = Path(td) - cfg.export_path = tdir f = tdir / 'rescuetime.json' - import json f.write_text(json.dumps(dal.fake_data_generator(rows=rows))) - yield + + class override: + class rescuetime: + export_path = tdir + + with tmp_config(modules=__name__, config=override) as cfg: + yield cfg # TODO ok, now it's something that actually could run on CI! # todo would be kinda nice if doctor could run against the fake data, to have a basic health check of the module? 
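The fake_data helpers in my.emfit, my.endomondo and my.rescuetime now all share the same shape: generate a fake export into a temporary directory, then hand an override config to tmp_config, which also reloads the module so it picks the override up. A sketch of how a test might consume one of them (assuming entries() is the module's main accessor):

    from my.rescuetime import fake_data

    def test_against_fake_data() -> None:
        with fake_data(rows=100):
            # the module has been reloaded against the fake export, so
            # accessors resolved inside the block see the generated data
            import my.rescuetime
            assert len(list(my.rescuetime.entries())) > 0

On exit, tmp_config restores the previous config and reloads the module once more, so tests like this don't leak state into each other.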
diff --git a/my/runnerup.py b/my/runnerup.py index 8e31770..6140236 100644 --- a/my/runnerup.py +++ b/my/runnerup.py @@ -13,7 +13,7 @@ from typing import Iterable from .core import Res, get_files from .core.common import isoparse, Json -import tcxparser +import tcxparser # type: ignore[import] from my.config import runnerup as config diff --git a/my/simple.py b/my/simple.py new file mode 100644 index 0000000..7462291 --- /dev/null +++ b/my/simple.py @@ -0,0 +1,21 @@ +''' +Just a demo module for testing and documentation purposes +''' +from dataclasses import dataclass +from typing import Iterator + +from my.core import make_config + +from my.config import simple as user_config + + +@dataclass +class simple(user_config): + count: int + + +config = make_config(simple) + + +def items() -> Iterator[int]: + yield from range(config.count) diff --git a/my/taplog.py b/my/taplog.py index f668a10..6353c14 100644 --- a/my/taplog.py +++ b/my/taplog.py @@ -1,11 +1,11 @@ ''' [[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data ''' - from datetime import datetime from typing import NamedTuple, Dict, Optional, Iterable -from .core import get_files +from my.core import get_files, stat, Stats +from my.core.sqlite import sqlite_connection from my.config import taplog as user_config @@ -46,11 +46,10 @@ class Entry(NamedTuple): def entries() -> Iterable[Entry]: last = max(get_files(user_config.export_path)) - from .core.dataset import connect_readonly - db = connect_readonly(last) - # todo is it sorted by timestamp? - for row in db['Log'].all(): - yield Entry(row) + with sqlite_connection(last, immutable=True, row_factory='dict') as db: + # todo is it sorted by timestamp? + for row in db.execute('SELECT * FROM Log'): + yield Entry(row) # I guess worth having as top level considering it would be quite common? @@ -60,6 +59,5 @@ def by_button(button: str) -> Iterable[Entry]: yield e -from .core import stat, Stats def stats() -> Stats: return stat(entries) diff --git a/my/telegram/telegram_backup.py b/my/telegram/telegram_backup.py new file mode 100644 index 0000000..3e2d6a7 --- /dev/null +++ b/my/telegram/telegram_backup.py @@ -0,0 +1,103 @@ +""" +Telegram data via [fabianonline/telegram_backup](https://github.com/fabianonline/telegram_backup) tool +""" + +from dataclasses import dataclass +from datetime import datetime, timezone +import sqlite3 +from typing import Dict, Iterator, Optional + +from my.core import datetime_aware, PathIsh +from my.core.sqlite import sqlite_connection + +from my.config import telegram as user_config + + +@dataclass +class config(user_config.telegram_backup): + # path to the export database.sqlite + export_path: PathIsh + + +@dataclass +class Chat: + id: str + name: Optional[str] + # not all users have short handle + groups don't have them either? + # TODO hmm some groups have it -- it's just the tool doesn't dump them?? + handle: Optional[str] + # not sure if need type? + + +@dataclass +class User: + id: str + name: Optional[str] + + +@dataclass +class Message: + # NOTE: message id is NOT unique globally -- only with respect to chat! 
+ id: int + time: datetime_aware + chat: Chat + sender: User + text: str + + @property + def permalink(self) -> str: + handle = self.chat.handle + if handle is None: + clink = str(self.chat.id) + else: + # FIXME add c/ + clink = f'{handle}' + + # NOTE: don't think deep links to messages work for private conversations sadly https://core.telegram.org/api/links#message-links + # NOTE: doesn't look like this works with private groups at all, doesn't even jump into it + return f'https://t.me/{clink}/{self.id}' + + + +Chats = Dict[str, Chat] +def _message_from_row(r: sqlite3.Row, *, chats: Chats) -> Message: + ts = r['time'] + time = datetime.fromtimestamp(ts, tz=timezone.utc) + chat = chats[r['source_id']] + sender = chats[r['sender_id']] + return Message( + id=r['message_id'], + time=time, + chat=chat, + sender=User(id=sender.id, name=sender.name), + text=r['text'], + ) + + +def messages() -> Iterator[Message]: + with sqlite_connection(config.export_path, immutable=True, row_factory='row') as db: + + chats: Chats = {} + for r in db.execute('SELECT * FROM chats'): + chat = Chat(id=r['id'], name=r['name'], handle=None) + assert chat.id not in chats + chats[chat.id] = chat + + for r in db.execute('SELECT * FROM users'): + first = r["first_name"] + last = r["last_name"] + name: Optional[str] + if first is not None and last is not None: + name = f'{first} {last}' + else: + name = first or last + + chat = Chat(id=r['id'], name=name, handle=r['username']) + assert chat.id not in chats + chats[chat.id] = chat + + # TODO order by? not sure + for r in db.execute('SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message")'): + # seems like the only remaining have message_type = 'message' + yield _message_from_row(r, chats=chats) + diff --git a/my/tinder/android.py b/my/tinder/android.py index e92f316..18b59d8 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -3,19 +3,21 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde """ from __future__ import annotations -REQUIRES = ['dataset'] - from collections import defaultdict from dataclasses import dataclass from datetime import datetime, timezone from itertools import chain from pathlib import Path +import sqlite3 from typing import Sequence, Iterator, Union, Dict, List, Mapping from more_itertools import unique_everseen -from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware -from my.core.dataset import connect_readonly, DatabaseT +from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware, LazyLogger +from my.core.sqlite import sqlite_connection + + +logger = LazyLogger(__name__) from my.config import tinder as user_config @@ -39,7 +41,7 @@ class _BaseMatch: id: str -@dataclass +@dataclass(unsafe_hash=True) class _Match(_BaseMatch): person_id: str @@ -59,7 +61,7 @@ class _BaseMessage: text: str -@dataclass +@dataclass(unsafe_hash=True) class _Message(_BaseMessage): match_id: str from_id: str @@ -73,6 +75,8 @@ class Message(_BaseMessage): to: Person +# todo hmm I have a suspicion it might be cumulative? 
+# although still possible that the user might remove/install app back, so need to keep that in mind def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -82,41 +86,46 @@ Entity = Union[Person, Match, Message] def _entities() -> Iterator[Res[_Entity]]: - for db_file in inputs(): - with connect_readonly(db_file) as db: + dbs = inputs() + for i, db_file in enumerate(dbs): + logger.debug(f'processing {db_file} {i}/{len(dbs)}') + with sqlite_connection(db_file, immutable=True, row_factory='row') as db: yield from _handle_db(db) -def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]: +def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]: # profile_user_view contains our own user id - for row in chain(db['profile_user_view'], db['match_person']): + for row in chain( + db.execute('SELECT * FROM profile_user_view'), + db.execute('SELECT * FROM match_person'), + ): try: yield _parse_person(row) except Exception as e: # todo attach error context? yield e - for row in db['match']: + for row in db.execute('SELECT * FROM match'): try: yield _parse_match(row) except Exception as e: yield e - for row in db['message']: + for row in db.execute('SELECT * FROM message'): try: yield _parse_msg(row) except Exception as e: yield e -def _parse_person(row) -> Person: +def _parse_person(row: sqlite3.Row) -> Person: return Person( id=row['id'], name=row['name'], ) -def _parse_match(row) -> _Match: +def _parse_match(row: sqlite3.Row) -> _Match: return _Match( id=row['id'], person_id=row['person_id'], @@ -124,7 +133,7 @@ ) -def _parse_msg(row) -> _Message: +def _parse_msg(row: sqlite3.Row) -> _Message: # note it also has raw_message_data -- not sure which is best to use.. sent = row['sent_date'] return _Message( diff --git a/my/twitter/archive.py b/my/twitter/archive.py index c59d7a1..bdd1497 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -12,7 +12,7 @@ except ImportError as ie: # must be caused by something else raise ie try: - from my.config import twitter as user_config # type: ignore[misc] + from my.config import twitter as user_config # type: ignore[misc,assignment] except ImportError: raise ie # raise the original exception..
must be something else else: diff --git a/my/twitter/talon.py b/my/twitter/talon.py index 81137d6..e43f600 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter from __future__ import annotations from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone import re -from typing import Iterator, Sequence, Optional, Dict +import sqlite3 +from typing import Iterator, Sequence, Union -import pytz +from more_itertools import unique_everseen + +from my.core import Paths, Res, datetime_aware, get_files +from my.core.sqlite import sqlite_connection + +from .common import TweetId, permalink from my.config import twitter as user_config -from ..core import Paths, Res, datetime_aware @dataclass class config(user_config.talon): # paths[s]/glob to the exported sqlite databases export_path: Paths -from ..core import get_files from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) -from .common import TweetId, permalink - - @dataclass(unsafe_hash=True) class Tweet: id_str: TweetId @@ -51,8 +52,6 @@ class _IsFavorire: tweet: Tweet -from typing import Union -from ..core.dataset import connect_readonly Entity = Union[_IsTweet, _IsFavorire] def _entities() -> Iterator[Res[Entity]]: for f in inputs(): @@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]: fname = f.name handler = handlers.get(fname) if handler is None: - yield RuntimeError(f"Coulnd't find handler for {fname}") + yield RuntimeError(f"Could not find handler for {fname}") return - with connect_readonly(f) as db: + with sqlite_connection(f, immutable=True, row_factory='row') as db: yield from handler(db) -def _process_user_tweets(db) -> Iterator[Res[Entity]]: +def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]: # dunno why it's called 'lists' - for r in db['lists'].all(order_by='time'): + for r in db.execute('SELECT * FROM lists ORDER BY time'): try: yield _IsTweet(_parse_tweet(r)) except Exception as e: yield e -def _process_favorite_tweets(db) -> Iterator[Res[Entity]]: - for r in db['favorite_tweets'].all(order_by='time'): +def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]: + for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'): try: yield _IsFavorire(_parse_tweet(r)) except Exception as e: yield e -def _parse_tweet(row) -> Tweet: + +def _parse_tweet(row: sqlite3.Row) -> Tweet: # ok so looks like it's tz aware.. # https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95 # uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime() # and it's created here, so looks like it's properly parsed from the api # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79 - created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc) + created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc) text = row['text'] # try expanding URLs..
sadly there are no positions in the db @@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet: ) -from more_itertools import unique_everseen def tweets() -> Iterator[Res[Tweet]]: for x in unique_everseen(_entities()): if isinstance(x, Exception): @@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]: elif isinstance(x, _IsTweet): yield x.tweet + def likes() -> Iterator[Res[Tweet]]: for x in unique_everseen(_entities()): if isinstance(x, Exception): diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 5ba0460..54c7f91 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -1,12 +1,16 @@ """ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export. """ - -REQUIRES = ['dataset'] - -from ..core.common import Paths -from ..core.error import Res from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import NamedTuple, Iterator, List + + +from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats +from my.core.cfg import make_config +from my.core.sqlite import sqlite_connection + from my.config import twint as user_config # TODO move to twitter.twint config structure @@ -17,16 +21,9 @@ class twint(user_config): #### -from ..core.cfg import make_config config = make_config(twint) -from datetime import datetime, timezone -from typing import NamedTuple, Iterator, List -from pathlib import Path - -from ..core.common import get_files, LazyLogger, Json, datetime_aware - log = LazyLogger(__name__) @@ -110,25 +107,19 @@ WHERE {where} ORDER BY T.created_at ''' -def _get_db(): - from ..core.dataset import connect_readonly - db_path = get_db_path() - return connect_readonly(db_path) - def tweets() -> Iterator[Res[Tweet]]: - db = _get_db() - res = db.query(_QUERY.format(where='F.tweet_id IS NULL')) - yield from map(Tweet, res) + with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db: + res = db.execute(_QUERY.format(where='F.tweet_id IS NULL')) + yield from map(Tweet, res) def likes() -> Iterator[Res[Tweet]]: - db = _get_db() - res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL')) - yield from map(Tweet, res) + with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db: + res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL')) + yield from map(Tweet, res) -from ..core import stat, Stats def stats() -> Stats: return { **stat(tweets), diff --git a/my/vk/favorites.py b/my/vk/favorites.py index e6ccbf3..eb1a89b 100644 --- a/my/vk/favorites.py +++ b/my/vk/favorites.py @@ -3,7 +3,8 @@ from datetime import datetime import json from typing import NamedTuple, Iterable, Sequence, Optional -from my.config import vk as config + +from my.config import vk as config # type: ignore[attr-defined] class Favorite(NamedTuple): diff --git a/tests/bluemaestro.py b/tests/bluemaestro.py index 1416900..283bd77 100644 --- a/tests/bluemaestro.py +++ b/tests/bluemaestro.py @@ -1,13 +1,26 @@ -#!/usr/bin/env python3 from pathlib import Path +from typing import TYPE_CHECKING, Iterator, Any + from more_itertools import one -import pytest # type: ignore +import pytest + + +if TYPE_CHECKING: + from my.bluemaestro import Measurement +else: + Measurement = Any + + +def ok_measurements() -> Iterator[Measurement]: + from my.bluemaestro import measurements + for m in measurements(): + assert not isinstance(m, Exception) + yield m def test() -> None: - from my.bluemaestro import measurements - res2020 = [m for m in measurements() if '2020' in 
str(m.dt)] + res2020 = [m for m in ok_measurements() if '2020' in str(m.dt)] tp = [x for x in res2020 if x.temp == 2.1] assert len(tp) > 0 @@ -24,8 +37,7 @@ def test() -> None: def test_old_db() -> None: - from my.bluemaestro import measurements - res = list(measurements()) + res = list(ok_measurements()) r1 = one(x for x in res if x.dt.strftime('%Y%m%d %H:%M:%S') == '20181003 09:07:00') r2 = one(x for x in res if x.dt.strftime('%Y%m%d %H:%M:%S') == '20181003 09:19:00') diff --git a/tests/jawbone.py b/tests/jawbone.py index c53459d..776ac50 100644 --- a/tests/jawbone.py +++ b/tests/jawbone.py @@ -4,7 +4,7 @@ from datetime import date, time # todo private test.. move away def test_tz() -> None: - from my.jawbone import sleeps_by_date + from my.jawbone import sleeps_by_date # type: ignore[attr-defined] sleeps = sleeps_by_date() for s in sleeps.values(): assert s.sleep_start.tzinfo is not None diff --git a/tests/pdfs.py b/tests/pdfs.py index d5134bf..ae6318d 100644 --- a/tests/pdfs.py +++ b/tests/pdfs.py @@ -23,7 +23,8 @@ def test_with_error(with_config, tmp_path: Path) -> None: g = root / 'garbage.pdf' g.write_text('garbage') from my.config import pdfs - del pdfs.roots # meh. otherwise legacy config value 'wins' + # meh. otherwise legacy config value 'wins' + del pdfs.roots # type: ignore[attr-defined] pdfs.paths = (root,) from my.pdfs import annotations diff --git a/tests/takeout.py b/tests/takeout.py index f45a51d..7cc2164 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -13,7 +13,7 @@ from more_itertools import ilen def test_location_perf() -> None: # 2.80 s for 10 iterations and 10K points # TODO try switching to jq and see how it goes? not sure.. - print(ilen(islice(LT.iter_locations(), 0, 10000))) + print(ilen(islice(LT.iter_locations(), 0, 10000))) # type: ignore # in theory should support any HTML takeout file? diff --git a/tests/test_tmp_config.py b/tests/test_tmp_config.py new file mode 100644 index 0000000..197d3f7 --- /dev/null +++ b/tests/test_tmp_config.py @@ -0,0 +1,33 @@ +from pathlib import Path +import tempfile + +from my.core.cfg import tmp_config + +import pytest + + +def _init_default_config() -> None: + import my.config + class default_config: + count = 5 + my.config.simple = default_config # type: ignore[attr-defined,assignment,misc] + + +def test_tmp_config() -> None: + ## ugh. ideally this would be on the top level (would be a better test) + ## but pytest imports everything first, executes hooks, and some reset_modules() fixtures mess stuff up + ## later would be nice to be a bit more careful about them + _init_default_config() + from my.simple import items + ## + + assert len(list(items())) == 5 + + class config: + class simple: + count = 3 + + with tmp_config(modules='my.simple', config=config): + assert len(list(items())) == 3 + + assert len(list(items())) == 5
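One detail worth noting about the new tmp_config: the modules parameter is treated as a regex and matched with re.fullmatch against every name in sys.modules, so a single override can reload several related modules at once. A hypothetical sketch:

    from my.core.cfg import tmp_config

    class config:
        class fbmessenger:
            class android:
                export_path = '/tmp/fake-fbmessenger'  # hypothetical path

    # reloads my.fbmessenger.android (if already loaded) so it sees the override
    with tmp_config(modules=r'my\.fbmessenger\.android', config=config):
        from my.fbmessenger.android import messages
        print(len(list(messages())))

Matching modules that were first imported inside the block are unloaded on exit, while previously loaded ones are reloaded against the restored config.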