From f641dbb305b02dd4bc1498752f7135de7e2cc400 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 15 Apr 2022 14:20:33 +0100 Subject: [PATCH 001/302] ci: attempt to use pip cache to speedup https://github.blog/changelog/2021-11-23-github-actions-setup-python-now-supports-dependency-caching/ UPD: hmm it doesn't seem to work, complains that requirements.txt is missing this might be relevant... https://github.com/actions/setup-python/issues/361 --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 35cddb2..a763cd4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,6 +30,7 @@ jobs: - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - uses: actions/checkout@v2 with: @@ -61,6 +62,7 @@ jobs: - uses: actions/setup-python@v2 with: python-version: '3.7' + cache: 'pip' - uses: actions/checkout@v2 with: From 599a8b0dd732a3abadbd1c3fc5624f2813a3de17 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 15 Apr 2022 13:34:09 +0100 Subject: [PATCH 002/302] ZipPath: support hash, iterdir and proper / operator --- my/core/kompress.py | 34 +++++++++++++++++++++++++--------- tests/core/test_kompress.py | 11 ++++++++++- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/my/core/kompress.py b/my/core/kompress.py index b8e7724..8725bcf 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -6,7 +6,7 @@ from __future__ import annotations import pathlib from pathlib import Path import sys -from typing import Union, IO, Sequence, Any +from typing import Union, IO, Sequence, Any, Iterator import io PathIsh = Union[Path, str] @@ -139,18 +139,22 @@ class ZipPath(ZipPathBase): root: zipfile.ZipFile @property - def filename(self) -> str: + def filepath(self) -> Path: res = self.root.filename assert res is not None # make mypy happy - return res + return Path(res) + + @property + def subpath(self) -> Path: + return Path(self.at) def absolute(self) -> ZipPath: - return ZipPath(Path(self.filename).absolute(), self.at) + return ZipPath(self.filepath.absolute(), self.at) def exists(self) -> bool: if self.at == '': # special case, the base class returns False in this case for some reason - return Path(self.filename).exists() + return self.filepath.exists() return super().exists() def rglob(self, glob: str) -> Sequence[ZipPath]: @@ -162,16 +166,25 @@ class ZipPath(ZipPathBase): def relative_to(self, other: ZipPath) -> Path: assert self.root == other.root, (self.root, other.root) - return Path(self.at).relative_to(Path(other.at)) + return self.subpath.relative_to(other.subpath) @property def parts(self) -> Sequence[str]: # messy, but might be ok.. 
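        # (e.g. for ZipPath('/tmp/x.zip', 'a/b') this would presumably
        #  give ('/', 'tmp', 'x.zip', 'a', 'b'))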
- return Path(self.filename).parts + Path(self.at).parts + return self.filepath.parts + self.subpath.parts + + def __truediv__(self, key) -> ZipPath: + # need to implement it so the return type is not zipfile.Path + s = super().__truediv__(key) + return ZipPath(s.root, s.at) # type: ignore[attr-defined] + + def iterdir(self) -> Iterator[ZipPath]: + for s in super().iterdir(): + yield ZipPath(s.root, s.at) # type: ignore[attr-defined] @property def stem(self) -> str: - return Path(self.at).stem + return self.subpath.stem @property # type: ignore[misc] def __class__(self): @@ -181,4 +194,7 @@ class ZipPath(ZipPathBase): # hmm, super class doesn't seem to treat as equals unless they are the same object if not isinstance(other, ZipPath): return False - return self.filename == other.filename and Path(self.at) == Path(other.at) + return (self.filepath, self.subpath) == (other.filepath, other.subpath) + + def __hash__(self) -> int: + return hash((self.filepath, self.subpath)) diff --git a/tests/core/test_kompress.py b/tests/core/test_kompress.py index 3561444..949a7f1 100644 --- a/tests/core/test_kompress.py +++ b/tests/core/test_kompress.py @@ -63,12 +63,17 @@ def test_zippath() -> None: # magic! convenient to make third party libraries agnostic of ZipPath assert isinstance(zp, Path) + assert isinstance(zp, ZipPath) + assert isinstance(zp / 'subpath', Path) # TODO maybe change __str__/__repr__? since it's a bit misleading: # Path('/code/hpi/tests/core/structure_data/gdpr_export.zip', 'gdpr_export/') assert ZipPath(target) == ZipPath(target) assert zp.absolute() == zp + # shouldn't crash + hash(zp) + assert zp.exists() assert (zp / 'gdpr_export/comments').exists() # check str constructor just in case @@ -77,7 +82,7 @@ def test_zippath() -> None: matched = list(zp.rglob('*')) assert len(matched) > 0 - assert all(p.filename == str(target) for p in matched), matched + assert all(p.filepath == target for p in matched), matched rpaths = [str(p.relative_to(zp)) for p in matched] assert rpaths == [ @@ -106,3 +111,7 @@ def test_zippath() -> None: ] assert list(zp.rglob('mes*')) == [ZipPath(target, 'gdpr_export/messages')] + + iterdir_res = list((zp / 'gdpr_export').iterdir()) + assert len(iterdir_res) == 3 + assert all(isinstance(p, Path) for p in iterdir_res) From 382f2054298d92077d796f6eccebf467c6f04143 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 4 Jul 2021 21:02:23 +0100 Subject: [PATCH 003/302] my.body.sleep: fix issue with attaching temperature seems that the index operator only works when boundaries are in the dataframe --- my/body/sleep/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/my/body/sleep/common.py b/my/body/sleep/common.py index 0b6fa1c..a07b3fa 100644 --- a/my/body/sleep/common.py +++ b/my/body/sleep/common.py @@ -23,8 +23,9 @@ class Combine: if pd.isna(start) or pd.isna(end): return None + between = (start <= temp.index) & (temp.index <= end) # on no temp data, returns nan, ok - return temp[start: end].mean() + return temp[between].mean() df['avg_temp'] = df.apply(calc_avg_temperature, axis=1) return df From 6e921627d328b354d2ac70771c0ef3c51b4a2538 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 15 Apr 2022 22:46:01 +0100 Subject: [PATCH 004/302] compat: workaround for Literal to work in runtime in python<3.8 previously it would crash with: SyntaxError: Forward reference must be an expression -- got 'yield' (reproducible via python3 -c 'from typing import Union; Union[int, "yield"]' ) --- my/core/compat.py | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/my/core/compat.py b/my/core/compat.py index 8fc3ef5..a4175b6 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -58,9 +58,11 @@ else: if TYPE_CHECKING: from typing_extensions import Literal else: - from typing import Union # erm.. I guess as long as it's not crashing, whatever... - Literal = Union + class _Literal: + def __getitem__(self, args): + pass + Literal = _Literal() import os From f9f73dda24ebdf086f0eb9ebbfef7affb8083ce2 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 15 Apr 2022 12:52:15 +0100 Subject: [PATCH 005/302] my.google.takeout.parser: new takeout parser, using https://github.com/seanbreckenridge/google_takeout_parser adapted from https://github.com/seanbreckenridge/HPI/blob/master/my/google_takeout.py additions: - pass my.core.time.user_forced() to google_takeout_parser without it, BST gets weird results for me, e.g. US/Aleutian - support ZipPath via a config switch - flexible error handling via a config switch --- my/core/error.py | 3 + my/google/takeout/parser.py | 126 ++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 my/google/takeout/parser.py diff --git a/my/core/error.py b/my/core/error.py index ee63277..cf3feac 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -6,6 +6,8 @@ See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail from itertools import tee from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast +from .compat import Literal + T = TypeVar('T') E = TypeVar('E', bound=Exception) # TODO make covariant? @@ -14,6 +16,7 @@ ResT = Union[T, E] Res = ResT[T, Exception] +ErrorPolicy = Literal["yield", "raise", "drop"] def notnone(x: Optional[T]) -> T: assert x is not None diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py new file mode 100644 index 0000000..94ac876 --- /dev/null +++ b/my/google/takeout/parser.py @@ -0,0 +1,126 @@ +""" +Parses my Google Takeout using https://github.com/seanbreckenridge/google_takeout_parser + +can set DISABLE_TAKEOUT_CACHE as an environment +variable to disable caching for individual exports +in ~/.cache/google_takeout_parser + see https://github.com/seanbreckenridge/google_takeout_parser +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] + +from contextlib import ExitStack +import os +from typing import List, Sequence, cast +from pathlib import Path +from my.core import make_config, dataclass +from my.core.common import Stats, LazyLogger, mcachew, get_files, Paths +from my.core.error import ErrorPolicy +from my.core.structure import match_structure + +from my.core.time import user_forced +from google_takeout_parser.parse_html.html_time_utils import ABBR_TIMEZONES +ABBR_TIMEZONES.extend(user_forced()) + +from google_takeout_parser.path_dispatch import TakeoutParser +from google_takeout_parser.merge import GoogleEventSet, CacheResults + +# see https://github.com/seanbreckenridge/dotfiles/blob/master/.config/my/my/config/__init__.py for an example +from my.config import google as user_config + + +@dataclass +class google(user_config): + # directory to unzipped takeout data + takeout_path: Paths + + error_policy: ErrorPolicy = 'yield' + + # experimental flag to use core.kompress.ZipPath + # instead of unpacking to a tmp dir via match_structure + _use_zippath: bool = False + + +config = make_config(google) + + +logger = LazyLogger(__name__, level="warning") + +# patch TAKEOUT_LOGS to match HPI_LOGS +if "HPI_LOGS" in 
os.environ: + from google_takeout_parser.log import setup as setup_takeout_logger + from my.core.logging import mklevel + + setup_takeout_logger(mklevel(os.environ["HPI_LOGS"])) + + +DISABLE_TAKEOUT_CACHE = "DISABLE_TAKEOUT_CACHE" in os.environ + + +def inputs() -> Sequence[Path]: + return get_files(config.takeout_path) + + +EXPECTED = ( + "My Activity", + "Chrome", + "Location History", + "Youtube", + "YouTube and YouTube Music", +) + + +def _cachew_depends_on() -> List[str]: + return sorted([str(p) for p in inputs()]) + + +# ResultsType is a Union of all of the models in google_takeout_parser +@mcachew(depends_on=_cachew_depends_on, logger=logger, force_file=True) +def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: + error_policy = config.error_policy + count = 0 + emitted = GoogleEventSet() + # reversed shouldn't really matter? but logic is to use newer + # takeouts if they're named according to date, since JSON Activity + # is nicer than HTML Activity + for path in reversed(inputs()): + with ExitStack() as exit_stack: + if config._use_zippath: + from my.core.kompress import ZipPath + # for later takeouts it's just 'Takeout' dir, + # but for older (pre 2015) it contains email/date in the subdir name + results = tuple(cast(Sequence[Path], ZipPath(path).iterdir())) + else: + results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True)) + for m in results: + # e.g. /home/sean/data/google_takeout/Takeout-1634932457.zip") -> 'Takeout-1634932457' + # means that zipped takeouts have nice filenames from cachew + cw_id, _, _ = path.name.rpartition(".") + # each takeout result is cached as well, in individual databases per-type + tk = TakeoutParser(m, cachew_identifier=cw_id, error_policy=error_policy) + # TODO might be nice to pass hpi cache dir? + for event in tk.parse(cache=not disable_takeout_cache): + count += 1 + if isinstance(event, Exception): + if error_policy == 'yield': + yield event + elif error_policy == 'raise': + raise event + elif error_policy == 'drop': + pass + continue + if event in emitted: + continue + emitted.add(event) + yield event # type: ignore[misc] + logger.debug( + f"HPI Takeout merge: from a total of {count} events, removed {count - len(emitted)} duplicates" + ) + + +def stats() -> Stats: + from my.core import stat + + return { + **stat(events), + } From 915cfe69b3a581acaa2e3ad90bbdb008424d257f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 18 Apr 2022 18:49:47 +0100 Subject: [PATCH 006/302] kompress.ZipPath: support stat().st_mtime --- my/core/kompress.py | 20 ++++++++++++++++++++ tests/core/test_kompress.py | 8 ++++++++ 2 files changed, 28 insertions(+) diff --git a/my/core/kompress.py b/my/core/kompress.py index 8725bcf..93a19a0 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -3,6 +3,7 @@ Various helpers for compression """ from __future__ import annotations +from datetime import datetime import pathlib from pathlib import Path import sys @@ -198,3 +199,22 @@ class ZipPath(ZipPathBase): def __hash__(self) -> int: return hash((self.filepath, self.subpath)) + + def stat(self) -> os.stat_result: + # NOTE: zip datetimes have no notion of time zone, usually they just keep local time? + # see https://en.wikipedia.org/wiki/ZIP_(file_format)#Structure + dt = datetime(*self.root.getinfo(str(self.subpath)).date_time) + ts = int(dt.timestamp()) + params = dict( + st_mode=0, + st_ino=0, + st_dev=0, + st_nlink=1, + st_uid=1000, + st_gid=1000, + st_size=0, # todo compute it properly? 
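+            # (hypothetical: st_size could presumably be derived from the archive
+            #  metadata instead, e.g. self.root.getinfo(str(self.subpath)).file_size)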
+ st_atime=ts, + st_mtime=ts, + st_ctime=ts, + ) + return os.stat_result(tuple(params.values())) diff --git a/tests/core/test_kompress.py b/tests/core/test_kompress.py index 949a7f1..481a025 100644 --- a/tests/core/test_kompress.py +++ b/tests/core/test_kompress.py @@ -1,3 +1,4 @@ +from datetime import datetime import lzma from pathlib import Path import sys @@ -115,3 +116,10 @@ def test_zippath() -> None: iterdir_res = list((zp / 'gdpr_export').iterdir()) assert len(iterdir_res) == 3 assert all(isinstance(p, Path) for p in iterdir_res) + + # date recorded in the zip archive + assert (zp / 'gdpr_export/comments/comments.json').stat().st_mtime > 1625000000 + # TODO ugh. + # unzip -l shows the date as 2021-07-01 09:43 + # however, python reads it as 2021-07-01 01:43 ?? + # don't really feel like dealing with this for now, it's not tz aware anyway From 78f6ae96d12aca8852a6d259c08064cff762ae44 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 20 Apr 2022 21:58:10 +0100 Subject: [PATCH 007/302] my.youtube: use new my.google.takeout.parser module for its data - fallback on the old logic if google_takeout_parser isn't available - move to my.youtube.takeout (possibly mixing in other sources later) - keep my.media.youtube, but issue deprecation warning currently used in orger etc, so doesn't hurt to keep - also fixes https://github.com/karlicoss/HPI/issues/113 --- my/core/compat.py | 7 +++ my/media/__init__.py | 0 my/media/youtube.py | 46 ++-------------- my/youtube/takeout.py | 120 ++++++++++++++++++++++++++++++++++++++++++ tests/youtube.py | 32 +++++++---- 5 files changed, 154 insertions(+), 51 deletions(-) delete mode 100644 my/media/__init__.py mode change 100755 => 100644 my/media/youtube.py create mode 100755 my/youtube/takeout.py diff --git a/my/core/compat.py b/my/core/compat.py index a4175b6..4dc8865 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -83,3 +83,10 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa dest.cursor().executescript(tempfile.read()) dest.commit() + + +# can remove after python3.9 +def removeprefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + return text[len(prefix):] + return text diff --git a/my/media/__init__.py b/my/media/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/my/media/youtube.py b/my/media/youtube.py old mode 100755 new mode 100644 index 8212f12..efaa74b --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -1,43 +1,5 @@ -#!/usr/bin/env python3 -from datetime import datetime -from typing import NamedTuple, List, Iterable - -from ..google.takeout.html import read_html -from ..google.takeout.paths import get_last_takeout - - -class Watched(NamedTuple): - url: str - title: str - when: datetime - - @property - def eid(self) -> str: - return f'{self.url}-{self.when.isoformat()}' - - -def watched() -> Iterable[Watched]: - # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/ - path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last - # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json - last = get_last_takeout(path=path) - if last is None: - return [] - - - watches: List[Watched] = [] - for dt, url, title in read_html(last, path): - watches.append(Watched(url=url, title=title, when=dt)) - - # TODO hmm they already come sorted.. wonder if should just rely on it.. 
- return list(sorted(watches, key=lambda e: e.when)) - - -from ..core import stat, Stats -def stats() -> Stats: - return stat(watched) - - -# todo deprecate -get_watched = watched +from ..core.warnings import high +high("DEPRECATED! Please use my.youtube.takeout instead.") +from ..core.util import __NOT_HPI_MODULE__ +from ..youtube.takeout import * diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py new file mode 100755 index 0000000..3d284b6 --- /dev/null +++ b/my/youtube/takeout.py @@ -0,0 +1,120 @@ +from typing import NamedTuple, List, Iterable + +from ..core import datetime_aware, Res, LazyLogger +from ..core.compat import removeprefix + + +logger = LazyLogger(__name__) + + +class Watched(NamedTuple): + url: str + title: str + when: datetime_aware + + @property + def eid(self) -> str: + return f'{self.url}-{self.when.isoformat()}' + + +# todo define error policy? +# although it has one from google takeout module.. so not sure + +def watched() -> Iterable[Res[Watched]]: + try: + from ..google.takeout.parser import events + from google_takeout_parser.models import Activity + except ModuleNotFoundError as ex: + logger.exception(ex) + from ..core.warnings import high + high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.") + yield from _watched_legacy() + return + + YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v=' + + # TODO would be nice to filter, e.g. it's kinda pointless to process Location events + for e in events(): + if isinstance(e, Exception): + yield e + + if not isinstance(e, Activity): + continue + + url = e.titleUrl + header = e.header + title = e.title + + if url is None: + continue + + if header in {'Image Search', 'Search', 'Chrome'}: + # sometimes results in youtube links.. but definitely not watch history + continue + + if header not in {'YouTube', 'youtube.com'}: + # TODO hmm -- wonder if these would end up in dupes in takeout? would be nice to check + # perhaps this would be easier once we have universal ids + if YOUTUBE_VIDEO_LINK in url: + # TODO maybe log in this case or something? + pass + continue + + if header == 'youtube.com' and title.startswith('Visited '): + continue + + if title.startswith('Searched for') and url.startswith('https://www.youtube.com/results'): + # search activity, don't need it here + continue + + if title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'): + # todo might be interesting to process somwhere? + continue + + # all titles contain it, so pointless to include 'Watched ' + # also compatible with legacy titles + title = removeprefix(title, 'Watched ') + + if YOUTUBE_VIDEO_LINK not in url: + if e.details == ['From Google Ads']: + # weird, sometimes results in odd + continue + if title == 'Used YouTube' and e.products == ['Android']: + continue + + yield RuntimeError(f'Unexpected url: {e}') + continue + + yield Watched( + url=url, + title=title, + when=e.time, + ) + + +from ..core import stat, Stats +def stats() -> Stats: + return stat(watched) + + +### deprecated stuff (keep in my.media.youtube) + +get_watched = watched + + +def _watched_legacy() -> Iterable[Watched]: + from ..google.takeout.html import read_html + from ..google.takeout.paths import get_last_takeout + + # todo looks like this one doesn't have retention? 
so enough to use the last + path = 'Takeout/My Activity/YouTube/MyActivity.html' + last = get_last_takeout(path=path) + if last is None: + return [] + + watches: List[Watched] = [] + for dt, url, title in read_html(last, path): + watches.append(Watched(url=url, title=title, when=dt)) + + # todo hmm they already come sorted.. wonder if should just rely on it.. + return list(sorted(watches, key=lambda e: e.when)) diff --git a/tests/youtube.py b/tests/youtube.py index d514061..4864ee9 100644 --- a/tests/youtube.py +++ b/tests/youtube.py @@ -1,22 +1,36 @@ # TODO move elsewhere? # these tests would only make sense with some existing data? although some of them would work for everyone.. # not sure what's a good way of handling this.. +from datetime import datetime +import pytz +from more_itertools import bucket + + from .common import skip_if_not_karlicoss as pytestmark # TODO ugh. if i uncomment this here (on top level), then this test vvv fails # from my.media.youtube import get_watched, Watched # HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py -def test() -> None: - from my.media.youtube import get_watched, Watched - watched = list(get_watched()) - assert len(watched) > 1000 - from datetime import datetime - import pytz - w = Watched( +def test() -> None: + from my.youtube.takeout import watched, Watched + videos = [w for w in watched() if not isinstance(w, Exception)] + assert len(videos) > 1000 + + # results in nicer errors, otherwise annoying to check against thousands of videos + grouped = bucket(videos, key=lambda w: (w.url, w.title)) + + w1 = Watched( url='https://www.youtube.com/watch?v=hTGJfRPLe08', title='Jamie xx - Gosh', - when=datetime(year=2018, month=6, day=21, hour=5, minute=48, second=34, tzinfo=pytz.utc), + when=pytz.timezone('Europe/London').localize(datetime(year=2018, month=6, day=21, hour=6, minute=48, second=34)), ) - assert w in watched + assert w1 in list(grouped[(w1.url, w1.title)]) + + w2 = Watched( + url='https://www.youtube.com/watch?v=IZ_8b_Ydsv0', + title='Why LESS Sensitive Tests Might Be Better', + when=pytz.utc.localize(datetime(year=2021, month=1, day=15, hour=17, minute=54, second=12)), + ) + assert w2 in list(grouped[(w2.url, w2.title)]) From 66a00c6ada841088fd53934d756cc879cca573ec Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 24 Apr 2022 18:10:56 -0700 Subject: [PATCH 008/302] docs: add docs for google_takeout_parser --- doc/MODULES.org | 46 +++++++++++++++++++++++++------------ my/google/takeout/parser.py | 17 +++++++++----- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/doc/MODULES.org b/doc/MODULES.org index a160ecb..e4bcdad 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -118,17 +118,17 @@ import importlib # from lint import all_modules # meh # TODO figure out how to discover configs automatically... 
modules = [ - ('google' , 'my.google.takeout.paths'), - ('hypothesis' , 'my.hypothesis' ), - ('pocket' , 'my.pocket' ), - ('twint' , 'my.twitter.twint' ), - ('twitter_archive', 'my.twitter.archive' ), - ('lastfm' , 'my.lastfm' ), - ('polar' , 'my.polar' ), - ('instapaper' , 'my.instapaper' ), - ('github' , 'my.github.gdpr' ), - ('github' , 'my.github.ghexport' ), - ('kobo' , 'my.kobo' ), + ('google' , 'my.google.takeout.parser'), + ('hypothesis' , 'my.hypothesis' ), + ('pocket' , 'my.pocket' ), + ('twint' , 'my.twitter.twint' ), + ('twitter_archive', 'my.twitter.archive' ), + ('lastfm' , 'my.lastfm' ), + ('polar' , 'my.polar' ), + ('instapaper' , 'my.instapaper' ), + ('github' , 'my.github.gdpr' ), + ('github' , 'my.github.ghexport' ), + ('kobo' , 'my.kobo' ), ] def indent(s, spaces=4): @@ -164,13 +164,29 @@ for cls, p in modules: #+RESULTS: -** [[file:../my/google/takeout/paths.py][my.google.takeout.paths]] +** [[file:../my/google/takeout/parser.py][my.google.takeout.parser]] - Module for locating and accessing [[https://takeout.google.com][Google Takeout]] data + Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] + + See [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] for more information about how to export and organize your takeouts + + If the =DISABLE_TAKEOUT_CACHE= environment variable is set, this won't + cache individual exports in =~/.cache/google_takeout_parser= + + The directory set as takeout_path can be unpacked directories, or + zip files of the exports, which are temporarily unpacked while creating + the cachew cache #+begin_src python - class google: - takeout_path: Paths # path/paths/glob for the takeout zips + class google(user_config): + # directory which includes unpacked/zipped takeouts + takeout_path: Paths + + error_policy: ErrorPolicy = 'yield' + + # experimental flag to use core.kompress.ZipPath + # instead of unpacking to a tmp dir via match_structure + _use_zippath: bool = False #+end_src ** [[file:../my/hypothesis.py][my.hypothesis]] diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 94ac876..a6ea81c 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -1,10 +1,15 @@ """ -Parses my Google Takeout using https://github.com/seanbreckenridge/google_takeout_parser +Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] -can set DISABLE_TAKEOUT_CACHE as an environment -variable to disable caching for individual exports -in ~/.cache/google_takeout_parser - see https://github.com/seanbreckenridge/google_takeout_parser +See [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] for more information +about how to export and organize your takeouts + +If the DISABLE_TAKEOUT_CACHE environment variable is set, this won't cache individual +exports in ~/.cache/google_takeout_parser + +The directory set as takeout_path can be unpacked directories, or +zip files of the exports, which are temporarily unpacked while creating +the cachew cache """ REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] @@ -31,7 +36,7 @@ from my.config import google as user_config @dataclass class google(user_config): - # directory to unzipped takeout data + # directory which includes unpacked/zipped takeouts takeout_path: Paths error_policy: ErrorPolicy = 'yield' From 2cb836181b091449e437dd6958c2208374776068 Mon Sep 17 00:00:00 2001 From: 
seanbreckenridge Date: Tue, 26 Apr 2022 13:11:35 -0700 Subject: [PATCH 009/302] location: add all.py, using takeout/gpslogger/ip (#237) * location: add all.py, using takeout/gpslogger/ip, update docs --- doc/MODULES.org | 82 ++++++++++++++++++++++- my/config.py | 11 ++- my/ip/all.py | 29 ++++++++ my/ip/common.py | 39 +++++++++++ my/location/all.py | 48 +++++++++++++ my/location/common.py | 17 +++++ my/location/google.py | 7 ++ my/location/google_takeout.py | 33 +++++++++ my/location/gpslogger.py | 74 ++++++++++++++++++++ my/location/home.py | 8 +-- my/location/via_ip.py | 39 +++++++++++ my/time/tz/common.py | 11 +-- my/time/tz/via_location.py | 123 ++++++++++++++++++++++++++-------- tests/tz.py | 6 +- tox.ini | 7 ++ 15 files changed, 488 insertions(+), 46 deletions(-) create mode 100644 my/ip/all.py create mode 100644 my/ip/common.py create mode 100644 my/location/all.py create mode 100644 my/location/common.py create mode 100644 my/location/google_takeout.py create mode 100644 my/location/gpslogger.py create mode 100644 my/location/via_ip.py diff --git a/doc/MODULES.org b/doc/MODULES.org index e4bcdad..a6dcd9d 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -16,9 +16,12 @@ If you have some issues with the setup, see [[file:SETUP.org::#troubleshooting][ - [[#toc][TOC]] - [[#intro][Intro]] - [[#configs][Configs]] - - [[#mygoogletakeoutpaths][my.google.takeout.paths]] + - [[#mygoogletakeoutparser][my.google.takeout.parser]] - [[#myhypothesis][my.hypothesis]] - [[#myreddit][my.reddit]] + - [[#mybrowser][my.browser]] + - [[#mylocation][my.location]] + - [[#mytimetzvia_location][my.time.tz.via_location]] - [[#mypocket][my.pocket]] - [[#mytwittertwint][my.twitter.twint]] - [[#mytwitterarchive][my.twitter.archive]] @@ -90,12 +93,12 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http export_path: Paths #+end_src + ** [[file:../my/browser/][my.browser]] Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]] #+begin_src python - @dataclass class browser: class export: # path[s]/glob to your backed up browser history sqlite files @@ -108,6 +111,80 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http # active_databases = Firefox.locate_database() export_path: Paths #+end_src +** [[file:../my/location][my.location]] + + Merged location history from lots of sources. + + The main sources here are + [[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (XML) files, and + google takeout (using =my.google.takeout.parser=), with a fallback on + manually defined home locations. + + You might also be able to use [[file:../my/location/via_ip.py][my.location.via_ip]] which uses =my.ip.all= to + provide geolocation data for an IPs (though no IPs are provided from any + of the sources here). 
For an example of usage, see [[https://github.com/seanbreckenridge/HPI/tree/master/my/ip][here]] + + #+begin_src python + class location: + home = ( + # supports ISO strings + ('2005-12-04' , (42.697842, 23.325973)), # Bulgaria, Sofia + # supports date/datetime objects + (date(year=1980, month=2, day=15) , (40.7128 , -74.0060 )), # NY + (datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558 , 37.6173 )), # Moscow, Russia + ) + # note: order doesn't matter, will be sorted in the data provider + + class gpslogger: + # path[s]/glob to the exported gpx files + export_path: Paths + + # default accuracy for gpslogger + accuracy: float = 50.0 + + class via_ip: + # guess ~15km accuracy for IP addresses + accuracy: float = 15_000 + #+end_src +** [[file:../my/time/tz/via_location.py][my.time.tz.via_location]] + + Uses the =my.location= module to determine the timezone for a location. + + This can be used to 'localize' timezones. Most modules here return + datetimes in UTC, to prevent confusion whether or not its a local + timezone, one from UTC, or one in your timezone. + + Depending on the specific data provider and your level of paranoia you might expect different behaviour.. E.g.: + - if your objects already have tz info, you might not need to call localize() at all + - it's safer when either all of your objects are tz aware or all are tz unware, not a mixture + - you might trust your original timezone, or it might just be UTC, and you want to use something more reasonable + + #+begin_src python + TzPolicy = Literal[ + 'keep' , # if datetime is tz aware, just preserve it + 'convert', # if datetime is tz aware, convert to provider's tz + 'throw' , # if datetime is tz aware, throw exception + ] + #+end_src + + This is still a work in progress, plan is to integrate it with =hpi query= + so that you can easily convert/localize timezones for some module/data + + #+begin_src python + class time: + class tz: + policy = 'keep' + + class via_location: + # less precise, but faster + fast: bool = True + + # if the accuracy for the location is more than 5km (this + # isn't an accurate location, so shouldn't use it to determine + # timezone), don't use + require_accuracy: float = 5_000 + #+end_src + # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh. @@ -163,7 +240,6 @@ for cls, p in modules: #+RESULTS: - ** [[file:../my/google/takeout/parser.py][my.google.takeout.parser]] Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] diff --git a/my/config.py b/my/config.py index 5bb316f..0746803 100644 --- a/my/config.py +++ b/my/config.py @@ -72,10 +72,19 @@ class location: # and we can't import the types from the module itself, otherwise would be circular. common module? 
home: Union[LatLon, Sequence[Tuple[DateIsh, LatLon]]] = (1.0, -1.0) + class via_ip: + accuracy: float + + class gpslogger: + export_path: Paths = '' + accuracy: float + class time: class tz: - pass + class via_location: + fast: bool + require_accuracy: float class orgmode: diff --git a/my/ip/all.py b/my/ip/all.py new file mode 100644 index 0000000..b21b543 --- /dev/null +++ b/my/ip/all.py @@ -0,0 +1,29 @@ +""" +An example all.py stub module that provides ip data + +To use this, you'd add IP providers that yield IPs to the 'ips' function + +For an example of how this could be used, see https://github.com/seanbreckenridge/HPI/tree/master/my/ip +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + + +from typing import Iterator + +from my.core.common import Stats, warn_if_empty + +from .common import IP + + +@warn_if_empty +def ips() -> Iterator[IP]: + yield from () + + +def stats() -> Stats: + from my.core import stat + + return { + **stat(ips), + } diff --git a/my/ip/common.py b/my/ip/common.py new file mode 100644 index 0000000..82008e2 --- /dev/null +++ b/my/ip/common.py @@ -0,0 +1,39 @@ +""" +Provides location/timezone data from IP addresses, using [[https://github.com/seanbreckenridge/ipgeocache][ipgeocache]] +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + +from my.core import __NOT_HPI_MODULE__ + +import ipaddress +from typing import NamedTuple, Iterator +from datetime import datetime + +import ipgeocache + +from my.core import Json + + +class IP(NamedTuple): + dt: datetime + addr: str # an IP address + + # TODO: could cache? not sure if it's worth it + def ipgeocache(self) -> Json: + return ipgeocache.get(self.addr) + + @property + def tzname(self) -> str: + tz: str = self.ipgeocache()["timezone"] + return tz + + +def drop_private(ips: Iterator[IP]) -> Iterator[IP]: + """ + Helper function that can be used to filter out private IPs + """ + for ip in ips: + if ipaddress.ip_address(ip.addr).is_private: + continue + yield ip diff --git a/my/location/all.py b/my/location/all.py new file mode 100644 index 0000000..bd9364e --- /dev/null +++ b/my/location/all.py @@ -0,0 +1,48 @@ +""" +Merges location data from multiple sources +""" + +from typing import Iterator + +from my.core import Stats, LazyLogger +from my.core.source import import_source + +from my.location.via_ip import locations + +from .common import Location + + +logger = LazyLogger(__name__, level="warning") + + +def locations() -> Iterator[Location]: + # can add/comment out sources here to disable them, or use core.disabled_modules + yield from _takeout_locations() + yield from _gpslogger_locations() + yield from _ip_locations() + + +@import_source(module_name="my.location.google_takeout") +def _takeout_locations() -> Iterator[Location]: + from . import google_takeout + yield from google_takeout.locations() + + +@import_source(module_name="my.location.gpslogger") +def _gpslogger_locations() -> Iterator[Location]: + from . import gpslogger + yield from gpslogger.locations() + + +@import_source(module_name="my.location.via_ip") +def _ip_locations() -> Iterator[Location]: + from . 
import via_ip + yield from via_ip.locations() + + +def stats() -> Stats: + from my.core import stat + + return { + **stat(locations), + } diff --git a/my/location/common.py b/my/location/common.py new file mode 100644 index 0000000..b0676ec --- /dev/null +++ b/my/location/common.py @@ -0,0 +1,17 @@ +from datetime import date, datetime +from typing import Union, Tuple, NamedTuple, Optional + +from my.core import __NOT_HPI_MODULE__ + +DateIsh = Union[datetime, date, str] + +LatLon = Tuple[float, float] + + +# TODO: add timezone to this? can use timezonefinder in tz provider instead though +class Location(NamedTuple): + lat: float + lon: float + dt: datetime + accuracy: Optional[float] + elevation: Optional[float] diff --git a/my/location/google.py b/my/location/google.py index f196301..21ba3ed 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -1,6 +1,9 @@ """ Location data from Google Takeout + +DEPRECATED: setup my.google.takeout.parser and use my.location.google_takeout instead """ + REQUIRES = [ 'geopy', # checking that coordinates are valid 'ijson', @@ -20,6 +23,10 @@ from ..core.common import LazyLogger, mcachew from ..core.cachew import cache_dir from ..core import kompress +from my.core.warnings import high + +high("Please set up my.google.takeout.parser module for better takeout support") + # otherwise uses ijson # todo move to config?? diff --git a/my/location/google_takeout.py b/my/location/google_takeout.py new file mode 100644 index 0000000..80b31cb --- /dev/null +++ b/my/location/google_takeout.py @@ -0,0 +1,33 @@ +""" +Extracts locations using google_takeout_parser -- no shared code with the deprecated my.location.google +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] + +from typing import Iterator + +from my.google.takeout.parser import events, _cachew_depends_on +from google_takeout_parser.models import Location as GoogleLocation + +from my.core.common import mcachew, LazyLogger, Stats +from .common import Location + +logger = LazyLogger(__name__) + + +@mcachew( + depends_on=_cachew_depends_on, + logger=logger, +) +def locations() -> Iterator[Location]: + for g in events(): + if isinstance(g, GoogleLocation): + yield Location( + lon=g.lng, lat=g.lat, dt=g.dt, accuracy=g.accuracy, elevation=None + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py new file mode 100644 index 0000000..95f4474 --- /dev/null +++ b/my/location/gpslogger.py @@ -0,0 +1,74 @@ +""" +Parse [[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (xml) files +""" + +REQUIRES = ["gpxpy"] + +from my.config import location +from my.core import Paths, dataclass + + +@dataclass +class config(location.gpslogger): + # path[s]/glob to the synced gpx (XML) files + export_path: Paths + + # default accuracy for gpslogger + accuracy: float = 50.0 + + +from itertools import chain +from datetime import datetime, timezone +from pathlib import Path +from typing import Iterator, Sequence, List + +import gpxpy # type: ignore[import] +from more_itertools import unique_everseen + +from my.core import Stats, LazyLogger +from my.core.common import get_files, mcachew +from .common import Location + + +logger = LazyLogger(__name__, level="warning") + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path, glob="*.gpx") + + +def _cachew_depends_on() -> List[float]: + return [p.stat().st_mtime for p in inputs()] + + +# TODO: could use a better cachew key/this 
has to recompute every file whenever the newest one changes +@mcachew(depends_on=_cachew_depends_on, logger=logger) +def locations() -> Iterator[Location]: + yield from unique_everseen( + chain(*map(_extract_locations, inputs())), key=lambda loc: loc.dt + ) + + +def _extract_locations(path: Path) -> Iterator[Location]: + with path.open("r") as gf: + gpx_obj = gpxpy.parse(gf) + for track in gpx_obj.tracks: + for segment in track.segments: + for point in segment.points: + if point.time is None: + continue + # hmm - for gpslogger, seems that timezone is always SimpleTZ('Z'), which + # specifies UTC -- see https://github.com/tkrajina/gpxpy/blob/cb243b22841bd2ce9e603fe3a96672fc75edecf2/gpxpy/gpxfield.py#L38 + yield Location( + lat=point.latitude, + lon=point.longitude, + accuracy=config.accuracy, + elevation=point.elevation, + dt=datetime.replace(point.time, tzinfo=timezone.utc), + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/location/home.py b/my/location/home.py index dd7209f..ac0fcb8 100644 --- a/my/location/home.py +++ b/my/location/home.py @@ -2,17 +2,13 @@ Simple location provider, serving as a fallback when more detailed data isn't available ''' from dataclasses import dataclass -from datetime import datetime, date, time, timezone +from datetime import datetime, time, timezone from functools import lru_cache from typing import Sequence, Tuple, Union, cast from my.config import location as user_config - -DateIsh = Union[datetime, date, str] - -# todo hopefully reasonable? might be nice to add name or something too -LatLon = Tuple[float, float] +from my.location.common import LatLon, DateIsh @dataclass class Config(user_config): diff --git a/my/location/via_ip.py b/my/location/via_ip.py new file mode 100644 index 0000000..e882cdb --- /dev/null +++ b/my/location/via_ip.py @@ -0,0 +1,39 @@ +""" +Converts IP addresses provided by my.location.ip to estimated locations +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + +from my.core import dataclass, Stats +from my.config import location + + +@dataclass +class config(location.via_ip): + # no real science to this, just a guess of ~15km accuracy for IP addresses + accuracy: float = 15_000.0 + + +from typing import Iterator + +from .common import Location +from my.ip.all import ips + + +def locations() -> Iterator[Location]: + for ip in ips(): + loc: str = ip.ipgeocache()["loc"] + lat, _, lon = loc.partition(",") + yield Location( + lat=float(lat), + lon=float(lon), + dt=ip.dt, + accuracy=config.accuracy, + elevation=None, + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/time/tz/common.py b/my/time/tz/common.py index b6ebbe5..e2c428d 100644 --- a/my/time/tz/common.py +++ b/my/time/tz/common.py @@ -10,24 +10,27 @@ Depending on the specific data provider and your level of paranoia you might exp - it's safer when either all of your objects are tz aware or all are tz unware, not a mixture - you might trust your original timezone, or it might just be UTC, and you want to use something more reasonable ''' -Policy = Literal[ +TzPolicy = Literal[ 'keep' , # if datetime is tz aware, just preserve it 'convert', # if datetime is tz aware, convert to provider's tz 'throw' , # if datetime is tz aware, throw exception # todo 'warn'? 
not sure if very useful ] -def default_policy() -> Policy: +# backwards compatibility +Policy = TzPolicy + +def default_policy() -> TzPolicy: try: from my.config import time as user_config - return cast(Policy, user_config.tz.policy) + return cast(TzPolicy, user_config.tz.policy) except Exception as e: # todo meh.. need to think how to do this more carefully # rationale: do not mess with user's data unless they want return 'keep' -def localize_with_policy(lfun: Callable[[datetime], tzdatetime], dt: datetime, policy: Policy=default_policy()) -> tzdatetime: +def localize_with_policy(lfun: Callable[[datetime], tzdatetime], dt: datetime, policy: TzPolicy=default_policy()) -> tzdatetime: tz = dt.tzinfo if tz is None: return lfun(dt) diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index e390c43..0e91193 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -7,27 +7,34 @@ REQUIRES = [ ] +from my.config import time +from my.core import dataclass + + +@dataclass +class config(time.tz.via_location): + # less precise, but faster + fast: bool = True + + # if the accuracy for the location is more than 5km, don't use + require_accuracy: float = 5_000 + + from collections import Counter from datetime import date, datetime from functools import lru_cache from itertools import groupby -from typing import Iterator, NamedTuple, Optional +from typing import Iterator, NamedTuple, Optional, Tuple, Any, List from more_itertools import seekable import pytz -from ...core.common import LazyLogger, mcachew, tzdatetime -from ...core.cachew import cache_dir -from ...location.google import locations +from my.core.common import LazyLogger, mcachew, tzdatetime +logger = LazyLogger(__name__, level='warning') -logger = LazyLogger(__name__, level='debug') - - -# todo should move to config? not sure -_FASTER: bool = True @lru_cache(2) -def _timezone_finder(fast: bool): +def _timezone_finder(fast: bool) -> Any: if fast: # less precise, but faster from timezonefinder import TimezoneFinderL as Finder # type: ignore @@ -46,39 +53,89 @@ class DayWithZone(NamedTuple): zone: Zone -def _iter_local_dates(start=0, stop=None) -> Iterator[DayWithZone]: - finder = _timezone_finder(fast=_FASTER) # rely on the default - pdt = None +from my.location.common import LatLon + +# for backwards compatibility +def _locations() -> Iterator[Tuple[LatLon, datetime]]: + try: + import my.location.all + for loc in my.location.all.locations(): + if loc.accuracy is not None and loc.accuracy > config.require_accuracy: + continue + yield ((loc.lat, loc.lon), loc.dt) + + except Exception as e: + from my.core.warnings import high + logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implemetation", exc_info=e) + high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data") + + import my.location.google + + for gloc in my.location.google.locations(): + yield ((gloc.lat, gloc.lon), gloc.dt) + +# TODO: could use heapmerge or sort the underlying iterators somehow? 
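+# (hypothetical sketch: if each underlying source yielded locations already
+#  sorted by datetime, heapq.merge(*sources, key=lambda pair: pair[1]) could
+#  merge them lazily instead of materializing the whole list)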
+# see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934 +def _sorted_locations() -> List[Tuple[LatLon, datetime]]: + return list(sorted(_locations(), key=lambda x: x[1])) + + +# Note: this takes a while, as the upstream since _locations isn't sorted, so this +# has to do an iterative sort of the entire my.locations.all list +def _iter_local_dates() -> Iterator[DayWithZone]: + finder = _timezone_finder(fast=config.fast) # rely on the default + #pdt = None + # TODO: warnings doesnt actually warn? warnings = [] # todo allow to skip if not noo many errors in row? - for l in locations(start=start, stop=stop): + for (lat, lon), dt in _sorted_locations(): # TODO right. its _very_ slow... - zone = finder.timezone_at(lng=l.lon, lat=l.lat) + zone = finder.timezone_at(lat=lat, lng=lon) if zone is None: - warnings.append(f"Couldn't figure out tz for {l}") + warnings.append(f"Couldn't figure out tz for {lat}, {lon}") continue tz = pytz.timezone(zone) # TODO this is probably a bit expensive... test & benchmark - ldt = l.dt.astimezone(tz) + ldt = dt.astimezone(tz) ndate = ldt.date() - if pdt is not None and ndate < pdt.date(): - # TODO for now just drop and collect the stats - # I guess we'd have minor drops while air travel... - warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}") - continue - pdt = ldt + #if pdt is not None and ndate < pdt.date(): + # # TODO for now just drop and collect the stats + # # I guess we'd have minor drops while air travel... + # warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}") + # continue + #pdt = ldt z = tz.zone; assert z is not None yield DayWithZone(day=ndate, zone=z) -def most_common(l): - res, count = Counter(l).most_common(1)[0] # type: ignore[var-annotated] +def most_common(lst: List[DayWithZone]) -> DayWithZone: + res, _ = Counter(lst).most_common(1)[0] # type: ignore[var-annotated] return res -@mcachew(cache_path=cache_dir()) +def _iter_tz_depends_on() -> str: + """ + Since you might get new data which specifies a new timezone sometime + in the day, this causes _iter_tzs to refresh every 6 hours, like: + 2022-04-26_00 + 2022-04-26_06 + 2022-04-26_12 + 2022-04-26_18 + """ + day = str(date.today()) + hr = datetime.now().hour + hr_truncated = hr // 6 * 6 + return "{}_{}".format(day, hr_truncated) + + +# refresh _iter_tzs every 6 hours -- don't think a better depends_on is possible dynamically +@mcachew(logger=logger, depends_on=_iter_tz_depends_on) def _iter_tzs() -> Iterator[DayWithZone]: - for d, gr in groupby(_iter_local_dates(), key=lambda p: p.day): + # since we have no control over what order the locations are returned, + # we need to sort them first before we can do a groupby + local_dates: List[DayWithZone] = list(_iter_local_dates()) + local_dates.sort(key=lambda p: p.day) + for d, gr in groupby(local_dates, key=lambda p: p.day): logger.info('processed %s', d) zone = most_common(list(gr)).zone yield DayWithZone(day=d, zone=zone) @@ -106,6 +163,7 @@ def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]: break return None if zone is None else pytz.timezone(zone) + # ok to cache, there are only a few home locations? @lru_cache(maxsize=None) def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]: @@ -119,8 +177,10 @@ def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]: return pytz.timezone(zone) -# TODO expose? to main as well? def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: + ''' + Given a datetime, returns the timezone for that date. 
+ ''' res = _get_day_tz(d=dt.date()) if res is not None: return res @@ -129,6 +189,9 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: loc = home.get_location(dt) return _get_home_tz(loc=loc) +# expose as 'public' function +get_tz = _get_tz + def localize(dt: datetime) -> tzdatetime: tz = _get_tz(dt) @@ -144,11 +207,13 @@ def stats() -> Stats: # TODO not sure what would be a good stat() for this module... # might be nice to print some actual timezones? # there aren't really any great iterables to expose + import os + VIA_LOCATION_START_YEAR = int(os.environ.get("VIA_LOCATION_START_YEAR", 1990)) def localized_years(): last = datetime.now().year + 2 # note: deliberately take + 2 years, so the iterator exhausts. otherwise stuff might never get cached # need to think about it... - for Y in range(1990, last): + for Y in range(VIA_LOCATION_START_YEAR, last): dt = datetime.fromisoformat(f'{Y}-01-01 01:01:01') yield localize(dt) return stat(localized_years) diff --git a/tests/tz.py b/tests/tz.py index cb8c513..0ea2b40 100644 --- a/tests/tz.py +++ b/tests/tz.py @@ -1,6 +1,5 @@ from datetime import datetime, timedelta, date, timezone from pathlib import Path -import sys import pytest # type: ignore import pytz # type: ignore @@ -80,7 +79,7 @@ def prepare(tmp_path: Path): from .common import reset_modules reset_modules() - LTZ._FASTER = True + LTZ.config.fast = True from .location import _prepare_google_config google = _prepare_google_config(tmp_path) @@ -98,7 +97,8 @@ def prepare(tmp_path: Path): class time: class tz: - pass # just rely on the default.. + class via_location: + pass # just rely on the defaults... import my.core.cfg as C with C.tmp_config() as config: diff --git a/tox.ini b/tox.ini index b8c89db..52bfdfb 100644 --- a/tox.ini +++ b/tox.ini @@ -100,6 +100,9 @@ commands = hpi module install my.goodreads hpi module install my.pdfs hpi module install my.smscalls + hpi module install my.location.gpslogger + hpi module install my.location.via_ip + hpi module install my.google.takeout.parser # todo fuck. -p my.github isn't checking the subpackages?? wtf... # guess it wants .pyi file?? @@ -118,6 +121,10 @@ commands = -p my.body.exercise.cross_trainer \ -p my.bluemaestro \ -p my.location.google \ + -p my.location.google_takeout \ + -p my.location.via_ip \ + -p my.location.gpslogger \ + -p my.ip.common \ -p my.time.tz.via_location \ -p my.calendar.holidays \ -p my.arbtt \ From f43eedd52a73b3e02b1f4c5cc0d40ee768d34271 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Tue, 26 Apr 2022 23:12:45 -0700 Subject: [PATCH 010/302] docs: describe the all.py/import_source pattern --- README.org | 1 + doc/MODULES.org | 10 +++++ doc/MODULE_DESIGN.org | 94 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 102 insertions(+), 3 deletions(-) diff --git a/README.org b/README.org index 865ca42..4843a9f 100644 --- a/README.org +++ b/README.org @@ -12,6 +12,7 @@ If you're in a hurry, feel free to jump straight to the [[#usecases][demos]]. 
- see [[https://github.com/karlicoss/HPI/tree/master/doc/SETUP.org][SETUP]] for the *installation/configuration guide* - see [[https://github.com/karlicoss/HPI/tree/master/doc/DEVELOPMENT.org][DEVELOPMENT]] for the *development guide* - see [[https://github.com/karlicoss/HPI/tree/master/doc/DESIGN.org][DESIGN]] for the *design goals* +- see [[https://github.com/karlicoss/HPI/tree/master/doc/MODULES.org][MODULES]] for *module-specific setup* - see [[https://github.com/karlicoss/HPI/tree/master/doc/MODULE_DESIGN.org][MODULE_DESIGN]] for some thoughts on structuring modules, and possibly *extending HPI* - see [[https://beepb00p.xyz/exobrain/projects/hpi.html][exobrain/HPI]] for some of my raw thoughts and todos on the project diff --git a/doc/MODULES.org b/doc/MODULES.org index a6dcd9d..239a2be 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -60,6 +60,16 @@ Some explanations: For more thoughts on modules and their structure, see [[file:MODULE_DESIGN.org][MODULE_DESIGN]] +* all.py + +Some modules have lots of different sources for data. For example, +~my.location~ (location data) has lots of possible sources -- from +~my.google.takeout.parser~, using the ~gpslogger~ android app, or through +geolocating ~my.ip~ addresses. If you only plan on using one the modules, you +can just import from the individual module, (e.g. ~my.google.takeout.parser~) +or you can disable the others using the ~core~ config -- See the +[[https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy][MODULE_DESIGN]] docs for more details. + * Configs The config snippets below are meant to be modified accordingly and *pasted into your private configuration*, e.g =$MY_CONFIG/my/config.py=. diff --git a/doc/MODULE_DESIGN.org b/doc/MODULE_DESIGN.org index d51b677..b17526d 100644 --- a/doc/MODULE_DESIGN.org +++ b/doc/MODULE_DESIGN.org @@ -2,6 +2,64 @@ Some thoughts on modules, how to structure them, and adding your own/extending H This is slightly more advanced, and would be useful if you're trying to extend HPI by developing your own modules, or contributing back to HPI +* all.py + +Some modules have lots of different sources for data. For example, ~my.location~ (location data) has lots of possible sources -- from ~my.google.takeout.parser~, using the ~gpslogger~ android app, or through geo locating ~my.ip~ addresses. For a module with multiple possible sources, its common to split it into files like: + + #+begin_src + my/location + ├── all.py -- specifies all possible sources/combines/merges data + ├── common.py -- defines shared code, e.g. to merge data from across entries, a shared model (namedtuple/dataclass) or protocol + ├── google_takeout.py -- source for data using my.google.takeout.parser + ├── gpslogger.py -- source for data using gpslogger + ├── home.py -- fallback source + └── via_ip.py -- source using my.ip + #+end_src + +Its common for each of those sources to have their own file, like ~my.location.google_takeout~, ~my.location.gpslogger~ and ~my.location.via_ip~, and then they all get merged into a single function in ~my.location.all~, like: + + #+begin_src python + from .common import Location + + def locations() -> Iterator[Location]: + # can add/comment out sources here to enable/disable them + yield from _takeout_locations() + yield from _gpslogger_locations() + + + @import_source(module_name="my.location.google_takeout") + def _takeout_locations() -> Iterator[Location]: + from . 
import google_takeout + yield from google_takeout.locations() + + + @import_source(module_name="my.location.gpslogger") + def _gpslogger_locations() -> Iterator[Location]: + from . import gpslogger + yield from gpslogger.locations() + #+end_src + +If you want to disable a source, you have a few options. + + - If you're using a local editable install or just want to quickly troubleshoot, you can just comment out the line in the ~locations~ function + - Since these are decorated behind ~import_source~, they automatically catch import/config errors, so instead of fatally erroring and crashing if you don't have a module setup, it'll warn you and continue to process the other sources. To get rid of the warnings, you can add the module you're not planning on using to your core config, like: + +#+begin_src python + class core: + disabled_modules = ( + "my.location.gpslogger", + "my.location.via_ip", + ) +#+end_src + +... that suppresses the warning message and lets you use ~my.location.all~ without having to change any lines of code + +Another benefit is that all the custom sources/data is localized to the ~all.py~ file, so a user can override the ~all.py~ (see the sections below on ~namespace packages~) file in their own HPI repository, adding additional sources without having to maintain a fork and patching in changes as things eventually change. For a 'real world' example of that, see [[https://github.com/seanbreckenridge/HPI#partially-in-usewith-overrides][seanbreckenridge]]s location and ip modules. + +This is of course not required for personal or single file modules, its just the pattern that seems to have the least amount of friction for the user, while being extendable, and without using a bulky plugin system to let users add additional sources. + +Another common way an ~all.py~ file is used is to merge data from a periodic export, and a GDPR export (e.g. see the ~stackexchange~, or ~github~ modules) + * module count Having way too many modules could end up being an issue. For now, I'm basically happy to merge new modules - With the current module count, things don't seem to break much, and most of them are modules I use myself, so they get tested with my own data. @@ -49,18 +107,32 @@ As an example of this, take a look at the [[https://github.com/karlicoss/HPI/tre - Cons: - Leads to some code duplication, as you can no longer use helper functions from ~my.core~ in the new repository - Additional boilerplate - instructions, installation scripts, testing. It's not required, but typically you want to leverage ~setuptools~ to allows ~pip install git+https...~ type installs, which are used in ~hpi module install~ + - Is difficult to convert to a namespace module/directory down the road Not all HPI Modules are currently at that level of complexity -- some are simple enough that one can understand the file by just reading it top to bottom. Some wouldn't make sense to split off into separate modules for one reason or another. A related concern is how to structure namespace packages to allow users to easily extend them, and how this conflicts with single file modules (Keep reading below for more information on namespace packages/extension) If a module is converted from a single file module to a namespace with multiple files, it seems this is a breaking change, see [[https://github.com/karlicoss/HPI/issues/89][#89]] for an example of this. 
 * module count
 
 Having way too many modules could end up being an issue. For now, I'm basically happy to merge new modules -- with the current module count, things don't seem to break much, and most of them are modules I use myself, so they get tested with my own data.
@@ -49,18 +107,32 @@ As an example of this, take a look at the [[https://github.com/karlicoss/HPI/tre
   - Cons:
     - Leads to some code duplication, as you can no longer use helper functions from ~my.core~ in the new repository
     - Additional boilerplate - instructions, installation scripts, testing. It's not required, but typically you want to leverage ~setuptools~ to allow ~pip install git+https...~ type installs, which are used in ~hpi module install~
+    - Is difficult to convert to a namespace module/directory down the road
 
 Not all HPI Modules are currently at that level of complexity -- some are simple enough that one can understand the file by just reading it top to bottom. Some wouldn't make sense to split off into separate modules for one reason or another.
 
 A related concern is how to structure namespace packages to allow users to easily extend them, and how this conflicts with single file modules (keep reading below for more information on namespace packages/extension). If a module is converted from a single file module to a namespace with multiple files, it seems this is a breaking change, see [[https://github.com/karlicoss/HPI/issues/89][#89]] for an example of this. The current workaround is to leave it a regular python package with an =__init__.py= for some amount of time and send a deprecation warning, and then eventually remove the =__init__.py= file to convert it into a namespace package. For an example, see the [[https://github.com/karlicoss/HPI/blob/8422c6e420f5e274bd1da91710663be6429c666c/my/reddit/__init__.py][reddit init file]].
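Such a transitional =__init__.py= could look roughly like the sketch below (a hypothetical reconstruction, not the linked file verbatim; ~warnings.high~ is the helper from ~my.core~):

  #+begin_src python
  # my/reddit/__init__.py -- temporary shim kept around during the transition
  from my.core import warnings

  warnings.high(
      "DEPRECATED! my.reddit is becoming a namespace package, "
      "import from the concrete source (e.g. my.reddit.rexport) instead"
  )

  # keep old `import my.reddit` usages working in the meantime
  from .rexport import *
  #+end_src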
+It's quite a pain to have to convert a file from a single file module to a namespace module, so if there's *any* possibility that you might convert it to a namespace package, might as well just start it off as one, to avoid the pain down the road. As an example, say you were creating something to parse ~zsh~ history. Instead of creating ~my/zsh.py~, it would be better to create ~my/zsh/parser.py~. That lets users override the file using editable/namespace packages, and it also means in the future it's much more straightforward to extend it to something like:
+
+  #+begin_src
+  my/zsh
+  ├── all.py -- e.g. combined/unique/sorted zsh history
+  ├── aliases.py -- parse zsh alias files
+  ├── common.py -- shared models/merging code
+  ├── compdump.py -- parse zsh compdump files
+  └── parser.py -- parse individual zsh history files
+  #+end_src
+
+There's no requirement to follow this entire structure when you start off; the entire module could live in ~my/zsh/parser.py~, including all the merging/parsing/locating code. It just avoids the trouble in the future, and the only downside is having to type a bit more when importing from it.
+
 #+html:

* Adding new modules

  As always, if the changes you wish to make are small, or you just want to add a few modules, you can clone and edit an editable install of HPI. See [[file:SETUP.org][SETUP]] for more information

-  The "proper way" (unless you want to contribute to the upstream) is to create a separate file hierarchy and add your module to =PYTHONPATH=.
+  The "proper way" (unless you want to contribute to the upstream) is to create a separate file hierarchy and add your module to =PYTHONPATH= (or use 'editable namespace packages' as described below, which also modifies your computed ~sys.path~)

# TODO link to 'overlays' documentation?
   You can check my own [[https://github.com/karlicoss/hpi-personal-overlay][personal overlay]] as a reference.
@@ -137,7 +209,7 @@ You may use the other modules or [[https://github.com/karlicoss/hpi-personal-ove
 
 In this context, 'overlay'/'override' means you create your own namespace package/file structure like described above, and since your files are in front of the upstream repository files in the computed ~sys.path~ (either by using namespace modules, the ~PYTHONPATH~ or ~with_my~), your file overrides the upstream repository
 
-This isn't set in stone, and is currently being discussed in multiple issues: [[https://github.com/karlicoss/HPI/issues/102][#102]], [[https://github.com/karlicoss/HPI/issues/89][#89]], [[https://github.com/karlicoss/HPI/issues/154][#154]]
+Related issues: [[https://github.com/karlicoss/HPI/issues/102][#102]], [[https://github.com/karlicoss/HPI/issues/89][#89]], [[https://github.com/karlicoss/HPI/issues/154][#154]]
 
 The main goals are:
@@ -145,4 +217,20 @@
 - good interop: e.g. ability to keep with the upstream, use modules coming from separate repositories, etc.
 - ideally mypy friendly. This kind of means 'not too dynamic and magical', which is ultimately a good thing even if you don't care about mypy.
 
-# TODO: add example with overriding 'all'
+~all.py~ using modules/sources behind ~import_source~ is the solution we've arrived at in HPI, because it meets all of these goals:
+
+  - it doesn't require an additional plugin system, it's just python imports and
+    namespace packages
+  - is generally mypy friendly (the only exception is the ~import_source~
+    decorator, but that typically returns nothing if the import failed)
+  - doesn't require you to maintain a fork of this repository, though you can maintain a separate HPI repository (so no patching/merge conflicts)
+  - allows you to easily add/remove sources to the ~all.py~ module (see the sketch below), either by:
+    - overriding an ~all.py~ in your own repository
+    - just commenting out the source/adding 2 lines to import and ~yield
+      from~ your new source
+    - doing nothing! (~import_source~ will catch the error and just warn you
+      and continue to work without changing any code)
+
+It could be argued that namespace packages and editable installs are a bit complex for a new user to get the hang of, and this is true. But fortunately ~import_source~ means a user who is just using HPI only needs to follow the instructions when a warning is printed, or peruse the docs here a bit -- there's no need to clone or create your own override to just use the ~all.py~ file.
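For instance, the 'override an ~all.py~ in your own repository' option above could look something like this sketch, where ~my.location.car_gps~ is a made-up source module living in your own repository, not something that ships with HPI:

  #+begin_src python
  # your-overlay-repo/my/location/all.py -- shadows the upstream all.py on sys.path
  from typing import Iterator

  from my.core.source import import_source
  from my.location.common import Location

  def locations() -> Iterator[Location]:
      yield from _takeout_locations()
      yield from _car_gps_locations()  # your extra source, on top of the upstream ones


  @import_source(module_name="my.location.google_takeout")
  def _takeout_locations() -> Iterator[Location]:
      from my.location import google_takeout
      yield from google_takeout.locations()


  @import_source(module_name="my.location.car_gps")
  def _car_gps_locations() -> Iterator[Location]:
      from my.location import car_gps  # hypothetical module, also in your overlay
      yield from car_gps.locations()
  #+end_src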
+
+There's no requirement to use this for individual modules; it just seems to be the best solution we've arrived at so far

From 0ce44bf0d18f13b43e787030251f2a3cfdfa6045 Mon Sep 17 00:00:00 2001
From: seanbreckenridge
Date: Sun, 1 May 2022 16:13:05 -0700
Subject: [PATCH 011/302] doctor: better quick option propagation for stats
 (#239)

doctor: better quick option propagation for stats

* use contextmanager for quick stats instead of editing global state directly
* send quick to lots of stat related functions, so they could possibly be used
  without doctor, if someone wanted to
* if a stats function has a 'quick' kwarg, send the value there as well
* add an option to sort locations in my.time.tz.via_location
---
 doc/MODULES.org            |  4 ++++
 my/config.py               |  1 +
 my/core/__main__.py        | 16 +++++++++++-----
 my/core/common.py          | 29 +++++++++++++++++++++++------
 my/core/stats.py           |  4 ++--
 my/core/util.py            |  2 +-
 my/time/tz/via_location.py | 21 ++++++++++++++++++---
 7 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/doc/MODULES.org b/doc/MODULES.org
index 239a2be..2bcb052 100644
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@@ -189,6 +189,10 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http
     # less precise, but faster
     fast: bool = True
 
+    # sort locations by date
+    # in case multiple sources provide them out of order
+    sort_locations: bool = True
+
     # if the accuracy for the location is more than 5km (this
     # isn't an accurate location, so shouldn't use it to determine
     # timezone), don't use
diff --git a/my/config.py b/my/config.py
index 0746803..7d31f5d 100644
--- a/my/config.py
+++ b/my/config.py
@@ -84,6 +84,7 @@ class time:
     class tz:
         class via_location:
             fast: bool
+            sort_locations: bool
             require_accuracy: float
diff --git a/my/core/__main__.py b/my/core/__main__.py
index 22068a6..faff852 100644
--- a/my/core/__main__.py
+++ b/my/core/__main__.py
@@ -215,11 +215,11 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li
         verbose = True
     vw = '' if verbose else '; pass --verbose to print more information'
 
-    from . import common
-    common.QUICK_STATS = quick # dirty, but hopefully OK for cli
-
     tabulate_warnings()
 
+    import contextlib
+
+    from .common import quick_stats
     from .util import get_stats, HPIModule
     from .stats import guess_stats
     from .error import warn_my_config_import_error
@@ -256,15 +256,21 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li
         stats = get_stats(m)
         if stats is None:
             # then try guessing.. not sure if should log somehow?
-            stats = guess_stats(m)
+            stats = guess_stats(m, quick=quick)
 
         if stats is None:
             eprint("   - no 'stats' function, can't check the data")
             # todo point to a readme on the module structure or something?
continue + quick_context = quick_stats() if quick else contextlib.nullcontext() + try: - res = stats() + kwargs = {} + if callable(stats) and 'quick' in inspect.signature(stats).parameters: + kwargs['quick'] = quick + with quick_context: + res = stats(**kwargs) assert res is not None, 'stats() returned None' except Exception as ee: warning(f' - {click.style("stats:", fg="red")} computing failed{vw}') diff --git a/my/core/common.py b/my/core/common.py index 92806d2..b7db362 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -2,6 +2,7 @@ from glob import glob as do_glob from pathlib import Path from datetime import datetime import functools +from contextlib import contextmanager import types from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple, TYPE_CHECKING import warnings @@ -425,16 +426,32 @@ def warn_if_empty(f): return wrapped # type: ignore -# hacky hook to speed up for 'hpi doctor' -# todo think about something better +# global state that turns on/off quick stats +# can use the 'quick_stats' contextmanager +# to enable/disable this in cli so that module 'stats' +# functions don't have to implement custom 'quick' logic QUICK_STATS = False +# incase user wants to use the stats functions/quick option +# elsewhere -- can use this decorator instead of editing +# the global state directly +@contextmanager +def quick_stats(): + global QUICK_STATS + prev = QUICK_STATS + try: + QUICK_STATS = True + yield + finally: + QUICK_STATS = prev + + C = TypeVar('C') Stats = Dict[str, Any] StatsFun = Callable[[], Stats] # todo not sure about return type... -def stat(func: Union[Callable[[], Iterable[C]], Iterable[C]]) -> Stats: +def stat(func: Union[Callable[[], Iterable[C]], Iterable[C]], quick: bool=False) -> Stats: if callable(func): fr = func() fname = func.__name__ @@ -451,13 +468,13 @@ def stat(func: Union[Callable[[], Iterable[C]], Iterable[C]]) -> Stats: rows=len(df), ) else: - res = _stat_iterable(fr) + res = _stat_iterable(fr, quick=quick) return { fname: res, } -def _stat_iterable(it: Iterable[C]) -> Any: +def _stat_iterable(it: Iterable[C], quick: bool=False) -> Any: from more_itertools import ilen, take, first # todo not sure if there is something in more_itertools to compute this? @@ -476,7 +493,7 @@ def _stat_iterable(it: Iterable[C]) -> Any: eit = funcit() count: Any - if QUICK_STATS: + if quick or QUICK_STATS: initial = take(100, eit) count = len(initial) if first(eit, None) is not None: # todo can actually be none... diff --git a/my/core/stats.py b/my/core/stats.py index 9750061..3a93f68 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -13,12 +13,12 @@ from .common import StatsFun, Stats, stat # TODO maybe could be enough to annotate OUTPUTS or something like that? # then stats could just use them as hints? -def guess_stats(module_name: str) -> Optional[StatsFun]: +def guess_stats(module_name: str, quick: bool=False) -> Optional[StatsFun]: providers = guess_data_providers(module_name) if len(providers) == 0: return None def auto_stats() -> Stats: - return {k: stat(v) for k, v in providers.items()} + return {k: stat(v, quick=quick) for k, v in providers.items()} return auto_stats diff --git a/my/core/util.py b/my/core/util.py index 222cdec..a6204d9 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -20,7 +20,7 @@ def get_stats(module: str) -> Optional[StatsFun]: # todo detect via ast? 
try: mod = import_module(module) - except Exception as e: + except Exception: return None return getattr(mod, 'stats', None) diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 0e91193..d31f04b 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -16,6 +16,10 @@ class config(time.tz.via_location): # less precise, but faster fast: bool = True + # sort locations by date + # incase multiple sources provide them out of order + sort_locations: bool = True + # if the accuracy for the location is more than 5km, don't use require_accuracy: float = 5_000 @@ -24,7 +28,7 @@ from collections import Counter from datetime import date, datetime from functools import lru_cache from itertools import groupby -from typing import Iterator, NamedTuple, Optional, Tuple, Any, List +from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable from more_itertools import seekable import pytz @@ -87,8 +91,12 @@ def _iter_local_dates() -> Iterator[DayWithZone]: #pdt = None # TODO: warnings doesnt actually warn? warnings = [] + + locs: Iterable[Tuple[LatLon, datetime]] + locs = _sorted_locations() if config.sort_locations else _locations() + # todo allow to skip if not noo many errors in row? - for (lat, lon), dt in _sorted_locations(): + for (lat, lon), dt in locs: # TODO right. its _very_ slow... zone = finder.timezone_at(lat=lat, lng=lon) if zone is None: @@ -203,7 +211,14 @@ def localize(dt: datetime) -> tzdatetime: from ...core import stat, Stats -def stats() -> Stats: +def stats(quick: bool=False) -> Stats: + if quick: + prev, config.sort_locations = config.sort_locations, False + res = { + 'first': next(_iter_local_dates()) + } + config.sort_locations = prev + return res # TODO not sure what would be a good stat() for this module... # might be nice to print some actual timezones? # there aren't really any great iterables to expose From 80c5be7293293c29b3b12ae845172a2bab402ccc Mon Sep 17 00:00:00 2001 From: Maxim Efremov Date: Mon, 2 May 2022 13:30:35 +0600 Subject: [PATCH 012/302] Adding bots file type to reduce parsing issues --- my/github/gdpr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/my/github/gdpr.py b/my/github/gdpr.py index fe8a64b..a676b1b 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -35,6 +35,7 @@ def events() -> Iterable[Res[Event]]: 'issue_events_': None, # eh, doesn't seem to have any useful bodies 'attachments_' : None, # not sure if useful 'users' : None, # just contains random users + 'bots' : None, # just contains random bots 'repositories_' : _parse_repository, 'issue_comments_': _parse_issue_comment, 'issues_' : _parse_issue, From 637982a5babe383c82fa2fc303775737785ae8d0 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 2 May 2022 17:35:15 +0100 Subject: [PATCH 013/302] ci: update ci configs - add windows runner - update actions versions - other minor enhancements --- .github/workflows/main.yml | 35 ++++++++++++++++++++++------------- pytest.ini | 2 ++ scripts/ci/run | 32 ++++++++++++++++++++++++-------- setup.py | 2 +- tox.ini | 3 +++ 5 files changed, 52 insertions(+), 22 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 35cddb2..acace3e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,31 +7,37 @@ on: tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". 
pull_request: # needed to trigger on others' PRs + # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". workflow_dispatch: # needed to trigger workflows manually # todo cron? -env: - # useful for scripts & sometimes tests to know - CI: true jobs: build: strategy: matrix: - platform: [ubuntu-latest, macos-latest] # TODO windows-latest?? - python-version: [3.7, 3.8, 3.9] + platform: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.7', '3.8', '3.9'] + exclude: [ + # windows runners are pretty scarce, so let's only run one of them.. + {platform: windows-latest, python-version: '3.7'}, + {platform: windows-latest, python-version: '3.9'}, + ] runs-on: ${{ matrix.platform }} + # TODO let's at least start running windows for now, will fix later + continue-on-error: ${{ matrix.platform == 'windows-latest' }} + steps: # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation - run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: recursive fetch-depth: 0 # nicer to have all git history when debugging/for tests @@ -39,13 +45,16 @@ jobs: # uncomment for SSH debugging # - uses: mxschmitt/action-tmate@v3 - - run: scripts/ci/run + # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd... + - run: bash scripts/ci/run - - uses: actions/upload-artifact@v2 + - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms + uses: actions/upload-artifact@v2 with: name: .coverage.mypy-misc_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-misc/ - - uses: actions/upload-artifact@v2 + - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms + uses: actions/upload-artifact@v2 with: name: .coverage.mypy-core_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-core/ @@ -58,11 +67,11 @@ jobs: # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation - run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v3 with: - python-version: '3.7' + python-version: '3.8' - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: recursive diff --git a/pytest.ini b/pytest.ini index b6406e2..aaa3df2 100644 --- a/pytest.ini +++ b/pytest.ini @@ -9,3 +9,5 @@ addopts = # otherwise it won't discover doctests # eh? importing too much # --doctest-modules + # show all test durations (unless they are too short) + --durations=0 diff --git a/scripts/ci/run b/scripts/ci/run index a7ea3ba..47014ec 100755 --- a/scripts/ci/run +++ b/scripts/ci/run @@ -1,7 +1,8 @@ -#!/bin/bash -eu +#!/bin/bash +set -eu cd "$(dirname "$0")" -cd ../.. +cd .. # git root if ! command -v sudo; then # CI or Docker sometimes doesn't have it, so useful to have a dummy @@ -10,16 +11,31 @@ if ! command -v sudo; then } fi -if ! [ -z "$CI" ]; then +if [ -n "${CI-}" ]; then # install OS specific stuff here - if [[ "$OSTYPE" == "darwin"* ]]; then + case "$OSTYPE" in + darwin*) # macos brew install fd - else + ;; + cygwin* | msys* | win*) + # windows + : + ;; + *) + # must be linux? 
        sudo apt update
        sudo apt install fd-find
-    fi
+        ;;
+    esac
 fi
-
-pip3 install --user tox
-tox
+
+PY_BIN="python3"
+# some systems might have python pointing to python3
+if ! command -v python3 &> /dev/null; then
+    PY_BIN="python"
+fi
+
+"$PY_BIN" -m pip install --user tox
+"$PY_BIN" -m tox
diff --git a/setup.py b/setup.py
index 1163008..31fc393 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@ INSTALL_REQUIRES = [
 ]
 
 
-def main():
+def main() -> None:
     pkg = 'my'
     subpackages = find_namespace_packages('.', include=('my.*',))
     setup(
diff --git a/tox.ini b/tox.ini
index 52bfdfb..8c22eb3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,8 @@
 [tox]
 minversion = 3.5
+# https://github.com/tox-dev/tox/issues/20#issuecomment-247788333
+# hack to prevent .tox from crapping to the project directory
+toxworkdir={env:TOXWORKDIR_BASE:}{toxinidir}/.tox
 
 [testenv]
 passenv = CI CI_*

From 64a4782f0ed5ec7322354a9f2035834e1803dc1b Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Mon, 2 May 2022 18:26:22 +0100
Subject: [PATCH 014/302] core/ci: fix windows-specific issues

- use portable separators
- paths should be prepended with r' (so a backslash isn't treated as escaping)
- sqlite connections should be closed (otherwise windows fails to remove the
  underlying db file)
- workaround for emojis via PYTHONUTF8=1 test for now
- make ZipPath portable
- properly use tox python environment everywhere

  this was causing issues on Windows, e.g.

  WARNING: test command found but not installed in testenv
    cmd: C:\hostedtoolcache\windows\Python\3.9.12\x64\python3.EXE
---
 demo.py                     | 17 +++++++++++------
 my/config.py                |  2 +-
 my/core/discovery_pure.py   |  5 +++--
 my/core/kompress.py         |  4 +++-
 my/core/sqlite.py           |  1 +
 my/core/util.py             |  4 ++--
 tests/cli.py                | 12 +++++++++++-
 tests/commits.py            |  7 ++++++-
 tests/core/test_kompress.py | 32 +++++++++++++++++---------------
 tests/sqlite.py             |  4 ++++
 tox.ini                     | 16 ++++++++--------
 11 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/demo.py b/demo.py
index ae0ba06..3c08cce 100755
--- a/demo.py
+++ b/demo.py
@@ -3,6 +3,7 @@ from subprocess import check_call, DEVNULL
 from shutil import copy, copytree
 import os
 from os.path import abspath
+from sys import executable as python
 from pathlib import Path
 
 my_repo = Path(__file__).absolute().parent
@@ -18,12 +19,12 @@ def run():
     # 2. prepare repositories you'd be using. For this demo we only set up Hypothesis
     tox = 'TOX' in os.environ
     if tox: # tox doesn't like --user flag
-        check_call('pip3 install git+https://github.com/karlicoss/hypexport.git'.split())
+        check_call(f'{python} -m pip install git+https://github.com/karlicoss/hypexport.git'.split())
     else:
         try:
             import hypexport
         except ModuleNotFoundError:
-            check_call('pip3 install --user git+https://github.com/karlicoss/hypexport.git'.split())
+            check_call(f'{python} -m pip install --user git+https://github.com/karlicoss/hypexport.git'.split())
 
     # 3. prepare some demo Hypothesis data
@@ -48,7 +49,7 @@ def run():
 
     # 4. now we can use it!
os.chdir(my_repo) - check_call(['python3', '-c', ''' + check_call([python, '-c', ''' import my.hypothesis pages = my.hypothesis.pages() @@ -106,13 +107,17 @@ def named_temp_dir(name: str): """ Fixed name tmp dir """ - td = (Path('/tmp') / name) + import tempfile + td = Path(tempfile.gettempdir()) / name try: td.mkdir(exist_ok=False) yield td finally: - import shutil - shutil.rmtree(str(td)) + import os, shutil + skip_cleanup = 'CI' in os.environ and os.name == 'nt' + # TODO hmm for some reason cleanup on windows causes AccessError + if not skip_cleanup: + shutil.rmtree(str(td)) def main(): diff --git a/my/config.py b/my/config.py index 7d31f5d..b841d31 100644 --- a/my/config.py +++ b/my/config.py @@ -19,7 +19,7 @@ from my.core import Paths, PathIsh class hypothesis: # expects outputs from https://github.com/karlicoss/hypexport # (it's just the standard Hypothes.is export format) - export_path: Paths = '/path/to/hypothesis/data' + export_path: Paths = r'/path/to/hypothesis/data' class instapaper: export_path: Paths = '' diff --git a/my/core/discovery_pure.py b/my/core/discovery_pure.py index 17a1976..dbd07b9 100644 --- a/my/core/discovery_pure.py +++ b/my/core/discovery_pure.py @@ -16,6 +16,7 @@ NOT_HPI_MODULE_VAR = '__NOT_HPI_MODULE__' ### import ast +import os from typing import Optional, Sequence, List, NamedTuple, Iterable, cast, Any from pathlib import Path import re @@ -151,7 +152,7 @@ def _modules_under_root(my_root: Path) -> Iterable[HPIModule]: mp = f.relative_to(my_root.parent) if mp.name == '__init__.py': mp = mp.parent - m = str(mp.with_suffix('')).replace('/', '.') + m = str(mp.with_suffix('')).replace(os.sep, '.') if ignored(m): continue a: ast.Module = ast.parse(f.read_text()) @@ -192,7 +193,7 @@ def test() -> None: def test_demo() -> None: demo = module_by_name('my.demo') assert demo.doc is not None - assert str(demo.file) == 'my/demo.py' + assert demo.file == Path('my', 'demo.py') assert demo.requires is None diff --git a/my/core/kompress.py b/my/core/kompress.py index 93a19a0..60b8b78 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -203,7 +203,9 @@ class ZipPath(ZipPathBase): def stat(self) -> os.stat_result: # NOTE: zip datetimes have no notion of time zone, usually they just keep local time? # see https://en.wikipedia.org/wiki/ZIP_(file_format)#Structure - dt = datetime(*self.root.getinfo(str(self.subpath)).date_time) + # note: seems that zip always uses forward slash, regardless OS? + zip_subpath = '/'.join(self.subpath.parts) + dt = datetime(*self.root.getinfo(zip_subpath).date_time) ts = int(dt.timestamp()) params = dict( st_mode=0, diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 1b38869..5253607 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -48,4 +48,5 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: with sqlite3.connect(str(tdir / dp.name)) as conn: from .compat import sqlite_backup sqlite_backup(source=conn, dest=dest) + conn.close() return dest diff --git a/my/core/util.py b/my/core/util.py index a6204d9..0ffc3a7 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -229,9 +229,9 @@ def test_bad_modules(tmp_path: Path) -> None: (par / 'malicious.py').write_text(f''' from pathlib import Path -Path('{xx}').write_text('aaand your data is gone!') +Path(r'{xx}').write_text('aaand your data is gone!') -raise RuntimeError("FAIL ON IMPORT! naughy.") +raise RuntimeError("FAIL ON IMPORT! 
naughty.") def stats(): return [1, 2, 3] diff --git a/tests/cli.py b/tests/cli.py index 1e3c560..fce53b7 100644 --- a/tests/cli.py +++ b/tests/cli.py @@ -1,4 +1,14 @@ +import os from subprocess import check_call def test_lists_modules() -> None: - check_call(['hpi', 'modules']) + # hack PYTHONUTF8 for windows + # see https://github.com/karlicoss/promnesia/issues/274 + # https://memex.zulipchat.com/#narrow/stream/279600-promnesia/topic/indexing.3A.20utf8.28emoji.29.20filenames.20in.20Windows + # necessary for this test cause emooji is causing trouble + # TODO need to fix it properly + env = { + **os.environ, + 'PYTHONUTF8': '1', + } + check_call(['hpi', 'modules'], env=env) diff --git a/tests/commits.py b/tests/commits.py index ab4e2c7..1aa7aa0 100644 --- a/tests/commits.py +++ b/tests/commits.py @@ -1,9 +1,14 @@ -# TODO need fdfind on CI? from pathlib import Path from more_itertools import bucket import pytest +import os +pytestmark = pytest.mark.skipif( + os.name == 'nt', + reason='TODO figure out how to install fd-find on Windows', +) + def test() -> None: from my.coding.commits import commits diff --git a/tests/core/test_kompress.py b/tests/core/test_kompress.py index 481a025..97539cb 100644 --- a/tests/core/test_kompress.py +++ b/tests/core/test_kompress.py @@ -76,24 +76,25 @@ def test_zippath() -> None: hash(zp) assert zp.exists() - assert (zp / 'gdpr_export/comments').exists() + assert (zp / 'gdpr_export' / 'comments').exists() # check str constructor just in case - assert (ZipPath(str(target)) / 'gdpr_export/comments').exists() + assert (ZipPath(str(target)) / 'gdpr_export' / 'comments').exists() assert not (ZipPath(str(target)) / 'whatever').exists() matched = list(zp.rglob('*')) assert len(matched) > 0 assert all(p.filepath == target for p in matched), matched - rpaths = [str(p.relative_to(zp)) for p in matched] + rpaths = [p.relative_to(zp) for p in matched] + gdpr_export = Path('gdpr_export') assert rpaths == [ - 'gdpr_export', - 'gdpr_export/comments', - 'gdpr_export/comments/comments.json', - 'gdpr_export/profile', - 'gdpr_export/profile/settings.json', - 'gdpr_export/messages', - 'gdpr_export/messages/index.csv', + gdpr_export, + gdpr_export / 'comments', + gdpr_export / 'comments' / 'comments.json', + gdpr_export / 'profile', + gdpr_export / 'profile' / 'settings.json', + gdpr_export / 'messages', + gdpr_export / 'messages' / 'index.csv', ], rpaths @@ -103,14 +104,15 @@ def test_zippath() -> None: # same for this one # assert ZipPath(Path('test'), 'whatever').absolute() == ZipPath(Path('test').absolute(), 'whatever') - assert (ZipPath(target) / 'gdpr_export/comments').exists() + assert (ZipPath(target) / 'gdpr_export' / 'comments').exists() - jsons = [str(p.relative_to(zp / 'gdpr_export')) for p in zp.rglob('*.json')] + jsons = [p.relative_to(zp / 'gdpr_export') for p in zp.rglob('*.json')] assert jsons == [ - 'comments/comments.json', - 'profile/settings.json', + Path('comments','comments.json'), + Path('profile','settings.json'), ] + # NOTE: hmm interesting, seems that ZipPath is happy with forward slash regardless OS? assert list(zp.rglob('mes*')) == [ZipPath(target, 'gdpr_export/messages')] iterdir_res = list((zp / 'gdpr_export').iterdir()) @@ -118,7 +120,7 @@ def test_zippath() -> None: assert all(isinstance(p, Path) for p in iterdir_res) # date recorded in the zip archive - assert (zp / 'gdpr_export/comments/comments.json').stat().st_mtime > 1625000000 + assert (zp / 'gdpr_export' / 'comments' / 'comments.json').stat().st_mtime > 1625000000 # TODO ugh. 
# unzip -l shows the date as 2021-07-01 09:43 # however, python reads it as 2021-07-01 01:43 ?? diff --git a/tests/sqlite.py b/tests/sqlite.py index 1b423da..f80636e 100644 --- a/tests/sqlite.py +++ b/tests/sqlite.py @@ -43,20 +43,24 @@ def _test_do_copy(db: Path) -> None: shutil.copy(db, cdb) with sqlite3.connect(str(cdb)) as conn_copy: assert len(list(conn_copy.execute('SELECT * FROM testtable'))) == 5 + conn_copy.close() def _test_do_immutable(db: Path) -> None: # in readonly mode doesn't touch with sqlite_connect_immutable(db) as conn_imm: assert len(list(conn_imm.execute('SELECT * FROM testtable'))) == 5 + conn_imm.close() def _test_do_copy_and_open(db: Path) -> None: with sqlite_copy_and_open(db) as conn_mem: assert len(list(conn_mem.execute('SELECT * FROM testtable'))) == 10 + conn_mem.close() def _test_open_asis(db: Path) -> None: # NOTE: this also works... but leaves some potential for DB corruption with sqlite3.connect(str(db)) as conn_db_2: assert len(list(conn_db_2.execute('SELECT * FROM testtable'))) == 10 + conn_db_2.close() diff --git a/tox.ini b/tox.ini index 8c22eb3..33c2c71 100644 --- a/tox.ini +++ b/tox.ini @@ -12,7 +12,7 @@ passenv = CI CI_* [testenv:tests-core] commands = pip install -e .[testing] - python3 -m pytest \ + {envpython} -m pytest \ tests/core.py \ tests/sqlite.py \ tests/get_files.py \ @@ -29,7 +29,7 @@ commands = # installed to test my.core.serialize while using simplejson and not orjson pip install simplejson - python3 -m pytest \ + {envpython} -m pytest \ tests/serialize_simplejson.py \ {posargs} @@ -52,28 +52,28 @@ commands = hpi module install my.reddit.rexport - python3 -m pytest tests \ + {envpython} -m pytest tests \ # ignore some tests which might take a while to run on ci.. --ignore tests/takeout.py \ --ignore tests/extra/polar.py \ # dont run simplejson compatibility test since orjson is now installed - --ignore tests/serialize_simplejson.py + --ignore tests/serialize_simplejson.py \ {posargs} [testenv:demo] commands = pip install git+https://github.com/karlicoss/hypexport - ./demo.py + {envpython} ./demo.py [testenv:mypy-core] -whitelist_externals = cat +allowlist_externals = cat commands = pip install -e .[testing,optional] pip install orgparse # used it core.orgmode? # todo add tests? - python3 -m mypy --install-types --non-interactive \ + {envpython} -m mypy --install-types --non-interactive \ -p my.core \ --txt-report .coverage.mypy-core \ --html-report .coverage.mypy-core \ @@ -109,7 +109,7 @@ commands = # todo fuck. -p my.github isn't checking the subpackages?? wtf... # guess it wants .pyi file?? - python3 -m mypy --install-types --non-interactive \ + {envpython} -m mypy --install-types --non-interactive \ -p my.browser \ -p my.endomondo \ -p my.github.ghexport \ From 76a497f2bbeebf06122516d5b66ac29def3d7ab6 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Tue, 3 May 2022 19:11:23 +0100 Subject: [PATCH 015/302] general,ci: fix python 3.10 issues, add to CI (#242) --- .github/workflows/main.yml | 11 ++++++----- my/core/kompress.py | 30 ++++++++++++++++-------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index acace3e..c45d99a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,17 +17,18 @@ jobs: strategy: matrix: platform: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.7', '3.8', '3.9'] + python-version: ['3.7', '3.8', '3.9', '3.10'] exclude: [ # windows runners are pretty scarce, so let's only run one of them.. 
- {platform: windows-latest, python-version: '3.7'}, - {platform: windows-latest, python-version: '3.9'}, + {platform: windows-latest, python-version: '3.7' }, + {platform: windows-latest, python-version: '3.9' }, + {platform: windows-latest, python-version: '3.10'}, ] runs-on: ${{ matrix.platform }} - # TODO let's at least start running windows for now, will fix later - continue-on-error: ${{ matrix.platform == 'windows-latest' }} + # useful for 'optional' pipelines + # continue-on-error: ${{ matrix.platform == 'windows-latest' }} steps: # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation diff --git a/my/core/kompress.py b/my/core/kompress.py index 60b8b78..e5c910d 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -124,20 +124,20 @@ def kexists(path: PathIsh, subpath: str) -> bool: import zipfile if sys.version_info[:2] >= (3, 8): # meh... zipfile.Path is not available on 3.7 - ZipPathBase = zipfile.Path + zipfile_Path = zipfile.Path else: if typing.TYPE_CHECKING: - ZipPathBase = Any + zipfile_Path = Any else: - ZipPathBase = object + zipfile_Path = object -class ZipPath(ZipPathBase): +class ZipPath(zipfile_Path): # NOTE: is_dir/is_file might not behave as expected, the base class checks it only based on the slash in path - # seems that at/root are not exposed in the docs, so might be an implementation detail - at: str + # seems that root/at are not exposed in the docs, so might be an implementation detail root: zipfile.ZipFile + at: str @property def filepath(self) -> Path: @@ -156,7 +156,11 @@ class ZipPath(ZipPathBase): if self.at == '': # special case, the base class returns False in this case for some reason return self.filepath.exists() - return super().exists() + return super().exists() or self._as_dir().exists() + + def _as_dir(self) -> zipfile_Path: + # note: seems that zip always uses forward slash, regardless OS? + return zipfile_Path(self.root, self.at + '/') def rglob(self, glob: str) -> Sequence[ZipPath]: # note: not 100% sure about the correctness, but seem fine? @@ -166,7 +170,7 @@ class ZipPath(ZipPathBase): return [ZipPath(self.root, p) for p in rpaths] def relative_to(self, other: ZipPath) -> Path: - assert self.root == other.root, (self.root, other.root) + assert self.filepath == other.filepath, (self.filepath, other.filepath) return self.subpath.relative_to(other.subpath) @property @@ -176,11 +180,11 @@ class ZipPath(ZipPathBase): def __truediv__(self, key) -> ZipPath: # need to implement it so the return type is not zipfile.Path - s = super().__truediv__(key) - return ZipPath(s.root, s.at) # type: ignore[attr-defined] + tmp = zipfile_Path(self.root) / self.at / key + return ZipPath(self.root, tmp.at) # type: ignore[attr-defined] def iterdir(self) -> Iterator[ZipPath]: - for s in super().iterdir(): + for s in self._as_dir().iterdir(): yield ZipPath(s.root, s.at) # type: ignore[attr-defined] @property @@ -203,9 +207,7 @@ class ZipPath(ZipPathBase): def stat(self) -> os.stat_result: # NOTE: zip datetimes have no notion of time zone, usually they just keep local time? # see https://en.wikipedia.org/wiki/ZIP_(file_format)#Structure - # note: seems that zip always uses forward slash, regardless OS? 
-        zip_subpath = '/'.join(self.subpath.parts)
-        dt = datetime(*self.root.getinfo(zip_subpath).date_time)
+        dt = datetime(*self.root.getinfo(self.at).date_time)
         ts = int(dt.timestamp())
         params = dict(
             st_mode=0,

From eae0e1a61426abcaaa8e19d00b199be992bfbd4e Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Sun, 22 May 2022 15:21:08 +0100
Subject: [PATCH 016/302] my.time.tz.via_location: provide default (empty)
 config if user doesn't have time config defined
---
 my/config.py               |  3 +++
 my/time/tz/via_location.py | 24 +++++++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/my/config.py b/my/config.py
index b841d31..b1c17d2 100644
--- a/my/config.py
+++ b/my/config.py
@@ -80,8 +80,11 @@ class location:
     accuracy: float
 
 
+from my.core.compat import Literal
 class time:
     class tz:
+        policy: Literal['keep', 'convert', 'throw']
+
         class via_location:
             fast: bool
             sort_locations: bool
diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py
index d31f04b..6b8e835 100644
--- a/my/time/tz/via_location.py
+++ b/my/time/tz/via_location.py
@@ -7,12 +7,30 @@ REQUIRES = [
 ]
 
 
-from my.config import time
+## user might not have tz config section, so makes sense to be more defensive about it
+# todo might be useful to extract a helper for this
+try:
+    from my.config import time
+except ImportError as ie:
+    if ie.name != 'time':
+        raise ie
+else:
+    try:
+        user_config = time.tz.via_location
+    except AttributeError as ae:
+        if not ("'tz'" in str(ae) or "'via_location'" in str(ae)):
+            raise ae
+
+# deliberately dynamic to prevent confusing mypy
+if 'user_config' not in globals():
+    globals()['user_config'] = object
+##
+
+
 from my.core import dataclass
 
-
 @dataclass
-class config(time.tz.via_location):
+class config(user_config):
     # less precise, but faster
     fast: bool = True

From 19da373a0a3572eb61f1f0ed3221ac2257083edf Mon Sep 17 00:00:00 2001
From: Sean Breckenridge
Date: Fri, 27 May 2022 14:34:32 -0700
Subject: [PATCH 017/302] location: remove duplicate via_ip import
---
 my/location/all.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/my/location/all.py b/my/location/all.py
index bd9364e..eec4bcc 100644
--- a/my/location/all.py
+++ b/my/location/all.py
@@ -7,8 +7,6 @@ from typing import Iterator
 from my.core import Stats, LazyLogger
 from my.core.source import import_source
 
-from my.location.via_ip import locations
-
 from .common import Location

From de7972be051885f1470fb6f5929d2847c806f867 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Mon, 30 May 2022 16:57:08 +0100
Subject: [PATCH 018/302] twitter: add permalink to Talon objects; extract
 shared method
---
 my/twitter/archive.py | 22 +++++++++++++---------
 my/twitter/common.py  |  5 +++++
 my/twitter/talon.py   |  8 +++++++-
 my/twitter/twint.py   |  9 +++++----
 4 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/my/twitter/archive.py b/my/twitter/archive.py
index 1362137..6533f60 100644
--- a/my/twitter/archive.py
+++ b/my/twitter/archive.py
@@ -49,7 +49,7 @@ def inputs() -> Sequence[Path]:
     return get_files(config.export_path)[-1:]
 
 
-Tid = str
+from .common import TweetId, permalink
 
 
 # TODO make sure it's not used anywhere else and simplify interface
@@ -58,7 +58,7 @@ class Tweet(NamedTuple):
     screen_name: str
 
     @property
-    def id_str(self) -> str:
+    def id_str(self) -> TweetId:
         return self.raw['id_str']
 
     @property
@@ -68,7 +68,7 @@ class Tweet(NamedTuple):
 
     @property
     def permalink(self) -> str:
-        return f'https://twitter.com/{self.screen_name}/status/{self.tid}'
+        return permalink(screen_name=self.screen_name, id=self.id_str)
@property def text(self) -> str: @@ -92,11 +92,11 @@ class Tweet(NamedTuple): # TODO deprecate tid? @property - def tid(self) -> Tid: + def tid(self) -> TweetId: return self.id_str @property - def dt(self) -> datetime: + def dt(self) -> datetime_aware: return self.created_at @@ -104,14 +104,13 @@ class Like(NamedTuple): raw: Json screen_name: str - # TODO need to make permalink/link/url consistent across my stuff.. @property def permalink(self) -> str: # doesn'tseem like link it export is more specific... - return f'https://twitter.com/{self.screen_name}/status/{self.tid}' + return permalink(screen_name=self.screen_name, id=self.id_str) @property - def id_str(self) -> Tid: + def id_str(self) -> TweetId: return self.raw['tweetId'] @property @@ -121,13 +120,14 @@ class Like(NamedTuple): # TODO deprecate? @property - def tid(self) -> Tid: + def tid(self) -> TweetId: return self.id_str from functools import lru_cache class ZipExport: def __init__(self, archive_path: Path) -> None: + # TODO use ZipPath self.epath = archive_path self.old_format = False # changed somewhere around 2020.03 @@ -189,3 +189,7 @@ def stats() -> Stats: **stat(tweets), **stat(likes), } + + +## Deprecated stuff +Tid = TweetId diff --git a/my/twitter/common.py b/my/twitter/common.py index 5fd7daa..4661c6d 100644 --- a/my/twitter/common.py +++ b/my/twitter/common.py @@ -8,6 +8,7 @@ from more_itertools import unique_everseen # TODO add proper Protocol for Tweet Tweet = Any +TweetId = str from my.core import warn_if_empty, Res @@ -19,3 +20,7 @@ def merge_tweets(*sources: Iterator[Res[Tweet]]) -> Iterator[Res[Tweet]]: else: return r.id_str yield from unique_everseen(chain(*sources), key=key) + + +def permalink(*, screen_name: str, id: str) -> str: + return f'https://twitter.com/{screen_name}/status/{id}' diff --git a/my/twitter/talon.py b/my/twitter/talon.py index 4b42b1f..3ff9ddf 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -25,15 +25,21 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) +from .common import TweetId, permalink + @dataclass(unsafe_hash=True) class Tweet: - id_str: str + id_str: TweetId created_at: datetime_aware screen_name: str text: str urls: Sequence[str] + @property + def permalink(self) -> str: + return permalink(screen_name=self.screen_name, id=self.id_str) + # meh... just wrappers to tell apart tweets from favorites... @dataclass(unsafe_hash=True) diff --git a/my/twitter/twint.py b/my/twitter/twint.py index ee84ea1..f20be42 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -35,11 +35,14 @@ def get_db_path() -> Path: return max(get_files(config.export_path)) +from .common import TweetId, permalink + + class Tweet(NamedTuple): row: Json @property - def id_str(self) -> str: + def id_str(self) -> TweetId: return self.row['id_str'] @property @@ -50,7 +53,6 @@ class Tweet(NamedTuple): dt = datetime.fromtimestamp(seconds, tz=tz) return dt - # TODO permalink -- take user into account? 
@property def screen_name(self) -> str: return self.row['screen_name'] @@ -66,10 +68,9 @@ class Tweet(NamedTuple): return [] return ustr.split(',') - # TODO move to common @property def permalink(self) -> str: - return f'https://twitter.com/{self.screen_name}/status/{self.id_str}' + return permalink(screen_name=self.screen_name, id=self.id_str) # TODO urls From d65e1b5245ca04049c42ced4153b6deef8cb061c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 17:24:09 +0100 Subject: [PATCH 019/302] twitter.twint: localize timestamps correctly same issue as discussed here https://memex.zulipchat.com/#narrow/stream/279610-data/topic/google.20takeout.20timestamps also see corresponding changes for google_takeout_parser - https://github.com/seanbreckenridge/google_takeout_parser/pull/28/files - https://github.com/seanbreckenridge/google_takeout_parser/pull/30/files --- my/core/time.py | 28 ++++++++++++++++++++-------- my/twitter/twint.py | 7 +++---- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/my/core/time.py b/my/core/time.py index b55fae3..7698332 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -1,8 +1,9 @@ from functools import lru_cache -from datetime import tzinfo -from typing import Sequence +from typing import Sequence, Dict -import pytz # type: ignore +import pytz + +from .common import datetime_aware, datetime_naive def user_forced() -> Sequence[str]: @@ -17,12 +18,12 @@ def user_forced() -> Sequence[str]: @lru_cache(1) -def _abbr_to_timezone_map(): +def _abbr_to_timezone_map() -> Dict[str, pytz.BaseTzInfo]: # also force UTC to always correspond to utc # this makes more sense than Zulu it ends up by default timezones = pytz.all_timezones + ['UTC'] + list(user_forced()) - res = {} + res: Dict[str, pytz.BaseTzInfo] = {} for tzname in timezones: tz = pytz.timezone(tzname) infos = getattr(tz, '_tzinfos', []) # not sure if can rely on attr always present? @@ -41,12 +42,23 @@ def _abbr_to_timezone_map(): return res -# todo dammit, lru_cache interferes with mypy? -@lru_cache(None) -def abbr_to_timezone(abbr: str) -> tzinfo: +@lru_cache(maxsize=None) +def abbr_to_timezone(abbr: str) -> pytz.BaseTzInfo: return _abbr_to_timezone_map()[abbr] +def localize_with_abbr(dt: datetime_naive, *, abbr: str) -> datetime_aware: + if abbr.lower() == 'utc': + # best to shortcut here to avoid complications + return pytz.utc.localize(dt) + + tz = abbr_to_timezone(abbr) + # this will compute the correct UTC offset + tzinfo = tz.localize(dt).tzinfo + assert tzinfo is not None # make mypy happy + return tz.normalize(dt.replace(tzinfo=tzinfo)) + + def zone_to_countrycode(zone: str) -> str: # todo make optional? 
return _zones_to_countrycode()[zone] diff --git a/my/twitter/twint.py b/my/twitter/twint.py index f20be42..a40c5bb 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -26,7 +26,7 @@ from typing import NamedTuple, Iterator, List from pathlib import Path from ..core.common import get_files, LazyLogger, Json, datetime_aware -from ..core.time import abbr_to_timezone +from ..core.time import localize_with_abbr log = LazyLogger(__name__) @@ -49,9 +49,8 @@ class Tweet(NamedTuple): def created_at(self) -> datetime_aware: seconds = self.row['created_at'] / 1000 tz_abbr = self.row['timezone'] - tz = abbr_to_timezone(tz_abbr) - dt = datetime.fromtimestamp(seconds, tz=tz) - return dt + naive = datetime.fromtimestamp(seconds) + return localize_with_abbr(naive, abbr=tz_abbr) @property def screen_name(self) -> str: From 4104f821fa29d04ea5e871a184ac96138acdccd4 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 19:47:42 +0100 Subject: [PATCH 020/302] twitter.twint: actually need to treat created_at is UTC --- my/twitter/twint.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/my/twitter/twint.py b/my/twitter/twint.py index a40c5bb..13b63cc 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -21,12 +21,11 @@ from ..core.cfg import make_config config = make_config(twint) -from datetime import datetime +from datetime import datetime, timezone from typing import NamedTuple, Iterator, List from pathlib import Path from ..core.common import get_files, LazyLogger, Json, datetime_aware -from ..core.time import localize_with_abbr log = LazyLogger(__name__) @@ -48,9 +47,18 @@ class Tweet(NamedTuple): @property def created_at(self) -> datetime_aware: seconds = self.row['created_at'] / 1000 - tz_abbr = self.row['timezone'] - naive = datetime.fromtimestamp(seconds) - return localize_with_abbr(naive, abbr=tz_abbr) + tz = timezone.utc + # NOTE: UTC seems to be the case at least for the older version of schema I was using + # in twint, it was extracted from "data-time-ms" field in the scraped HML + # https://github.com/twintproject/twint/blob/e3345426eb24154ff084be22e4fed5cfa4631930/twint/tweet.py#L85 + # + # I checked against twitter archive which is definitely UTC, and it seems to match + # also seems that other people are treating it as utc, e.g. + # https://github.com/thomasancheriyil/Red-Tide-Detection-based-on-Twitter/blob/beb200be60cc66dcbc394e670513715509837812/python/twitterGapParse.py#L61-L62 + # + # twint is also saving 'timezone', but this is local machine timezone at the time of scraping? + # perhaps they thought date-time-ms was local time... 
or just kept it just in case (they are keepin lots on unnecessary stuff in the db) + return datetime.fromtimestamp(seconds, tz=tz) @property def screen_name(self) -> str: From 44a6b17ec3aa605ee65ab3964083d69e9b6e55c7 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 19:53:35 +0100 Subject: [PATCH 021/302] twitter: use created_at as an extra key for merging --- my/twitter/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/my/twitter/common.py b/my/twitter/common.py index 4661c6d..258216f 100644 --- a/my/twitter/common.py +++ b/my/twitter/common.py @@ -18,9 +18,13 @@ def merge_tweets(*sources: Iterator[Res[Tweet]]) -> Iterator[Res[Tweet]]: if isinstance(r, Exception): return str(r) else: - return r.id_str + # using both fields as key makes it a bit easier to spot TZ issues + return (r.id_str, r.created_at) yield from unique_everseen(chain(*sources), key=key) def permalink(*, screen_name: str, id: str) -> str: return f'https://twitter.com/{screen_name}/status/{id}' + +# NOTE: tweets from archive are coming sorted by created_at +# NOTE: tweets from twint are also sorted by created_at? From 1e2fc3bec78edef1e2985726c74e3062735d5c60 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 20:24:24 +0100 Subject: [PATCH 022/302] twitter.archive: unescape stuff like </> --- my/twitter/archive.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 6533f60..342e05b 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -18,6 +18,7 @@ except ImportError as e: from dataclasses import dataclass +import html from ..core.common import Paths, datetime_aware from ..core.error import Res @@ -72,7 +73,10 @@ class Tweet(NamedTuple): @property def text(self) -> str: - return self.raw['full_text'] + res = self.raw['full_text'] + # replace stuff like </> + res = html.unescape(res) + return res @property def urls(self) -> List[str]: @@ -116,7 +120,11 @@ class Like(NamedTuple): @property def text(self) -> Optional[str]: # ugh. I think none means that tweet was deleted? - return self.raw.get('fullText') + res = self.raw.get('fullText') + if res is None: + return None + res = html.unescape(res) + return res # TODO deprecate? @property From bb6201bf2dd97806d1ae440a47f0fe31cf317972 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 21:08:25 +0100 Subject: [PATCH 023/302] my.twitter.archive: expand entities in tweet text --- my/twitter/archive.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 342e05b..70f55db 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -74,6 +74,29 @@ class Tweet(NamedTuple): @property def text(self) -> str: res = self.raw['full_text'] + + ## replace shortened URLS + repls = [] # from, to, what + for ue in self.entities['urls']: + [fr, to] = map(int, ue['indices']) + repls.append((fr, to, ue['expanded_url'])) + # seems that media field isn't always set + for me in self.entities.get('media', []): + [fr, to] = map(int, me['indices']) + repls.append((fr, to, me['display_url'])) + # todo not sure, maybe use media_url_https instead? 
+ # for now doing this for compatibility with twint + repls = list(sorted(repls)) + parts = [] + idx = 0 + for fr, to, what in repls: + parts.append(res[idx: fr]) + parts.append(what) + idx = to + parts.append(res[idx:]) + res = ''.join(parts) + ## + # replace stuff like </> res = html.unescape(res) return res @@ -86,6 +109,7 @@ class Tweet(NamedTuple): @property def entities(self) -> Json: + # todo hmm what is 'extended_entities' return self.raw['entities'] def __str__(self) -> str: @@ -119,6 +143,7 @@ class Like(NamedTuple): @property def text(self) -> Optional[str]: + # NOTE: likes basically don't have anything except text and url # ugh. I think none means that tweet was deleted? res = self.raw.get('fullText') if res is None: From bb4c77612be4aaa6a7a7f818e9b147e14e47a970 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 21:24:23 +0100 Subject: [PATCH 024/302] twitter.twint: fix missing mentions in tweet text --- my/twitter/twint.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 13b63cc..5ba0460 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -66,7 +66,17 @@ class Tweet(NamedTuple): @property def text(self) -> str: - return self.row['tweet'] + text = self.row['tweet'] + mentions_s = self.row['mentions'] + if len(mentions_s) > 0: + # at some point for no apparent reasions mentions stopped appearing from tweet text in twint + # note that the order is still inconsisnent against twitter archive, but not much we can do + mentions = mentions_s.split(',') + for m in mentions: + # ugh. sometimes they appear as lowercase in text, sometimes not.. + if m.lower() not in text.lower(): + text = f'@{m} ' + text + return text @property def urls(self) -> List[str]: From 946daf40d08b711ad636f75bd70f5703dd843868 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 21:40:50 +0100 Subject: [PATCH 025/302] twitter: prefer archive data over twidump for tweets also add a script to check twitter data --- misc/check-twitter.sh | 86 +++++++++++++++++++++++++++++++++++++++++++ my/twitter/all.py | 4 +- my/twitter/talon.py | 2 + 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100755 misc/check-twitter.sh diff --git a/misc/check-twitter.sh b/misc/check-twitter.sh new file mode 100755 index 0000000..f5f26ce --- /dev/null +++ b/misc/check-twitter.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# just a hacky script to check twitter module behaviour w.r.t. merging and normalising data +# this checks against orger output for @karlicoss data + +set -eu + +FILE="$1" + +function check() { + x="$1" + if [[ $(rg --count "$x" "$FILE") != "1" ]]; then + echo "FAILED! $x" + fi +} + +# only in old twitter archive data + test mentions +check '2010-03-24 Wed 10:02.*@GDRussia подлагивает' + +# check that old twitter archive data replaces </> +check '2011-05-12 Thu 17:51.*set ><' +# this would probs be from twint or something? 
+check '2013-06-01 Sat 18:48.* Iterator[Res[Tweet]]: def tweets() -> Iterator[Res[Tweet]]: + # for tweets, archive data is higher quality yield from merge_tweets( - _tweets_twint(), _tweets_archive(), + _tweets_twint(), ) def likes() -> Iterator[Res[Tweet]]: + # for likes, archive data barely has anything so twint is preferred yield from merge_tweets( _likes_twint(), _likes_archive(), diff --git a/my/twitter/talon.py b/my/twitter/talon.py index 3ff9ddf..f540d14 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -124,3 +124,5 @@ def likes() -> Iterator[Res[Tweet]]: elif isinstance(x, _IsFavorire): yield x.tweet + +# TODO maybe should combine all public iterators into a stats() From ef120bc6435de77a255590ed90661599df2f7812 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 23:20:35 +0100 Subject: [PATCH 026/302] twitter.talon: expland URLs --- misc/check-twitter.sh | 6 ++++++ my/twitter/talon.py | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/misc/check-twitter.sh b/misc/check-twitter.sh index f5f26ce..d4cf830 100755 --- a/misc/check-twitter.sh +++ b/misc/check-twitter.sh @@ -83,4 +83,10 @@ check 'It would be a really good time for countries' # https://twitter.com/karlicoss/status/1530303537476947968 check 'so there is clearly a pattern' + +# https://twitter.com/karlicoss/status/1488942357303238673 +# check URL expansion for Talon +check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode' + + # TODO check likes as well diff --git a/my/twitter/talon.py b/my/twitter/talon.py index f540d14..175a3fe 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -5,6 +5,7 @@ from __future__ import annotations from dataclasses import dataclass from datetime import datetime +import re from typing import Iterator, Sequence, Optional, Dict import pytz @@ -98,12 +99,27 @@ def _parse_tweet(row) -> Tweet: # and it's created here, so looks like it's properly parsed from the api # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79 created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc) + text = row['text'] + + # try explanding URLs.. sadly there are no positions in the db + urls = row['other_url'].split() + if len(urls) > 0: + ellipsis = '...' + # might have something collapsed + # e.g. deepmind.com/blog/article/Comp... + # NOTE: need a one character of lookahead to split on ellipsis.. hence ?= + for short in re.findall(r'(?:^|\s)([\S]+)' + re.escape(ellipsis) + r'(?=\s|$)', text): + for full in urls: + if short in full: + text = text.replace(short + ellipsis, full) + break + # return Tweet( id_str=str(row['tweet_id']), created_at=created_at, screen_name=row['screen_name'], - text=row['text'], + text=text, # todo hmm text sometimes is trimmed with ellipsis? 
at least urls urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0), ) From d092608002e856190654d7f8caccf36c10cca754 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 May 2022 23:45:59 +0100 Subject: [PATCH 027/302] twitter.talon: make retweets more compatible with twitter archive --- misc/check-twitter.sh | 5 +++++ my/twitter/talon.py | 15 +++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/misc/check-twitter.sh b/misc/check-twitter.sh index d4cf830..1aec2fb 100755 --- a/misc/check-twitter.sh +++ b/misc/check-twitter.sh @@ -89,4 +89,9 @@ check 'so there is clearly a pattern' check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode' +# https://twitter.com/karlicoss/status/349168455964033024 +# check link which is only in twidump +check '2013-06-24 Mon 14:13.*RT @gorod095: Нашел недавно в букинист' + + # TODO check likes as well diff --git a/my/twitter/talon.py b/my/twitter/talon.py index 175a3fe..81137d6 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -90,9 +90,6 @@ def _process_favorite_tweets(db) -> Iterator[Res[Entity]]: yield e def _parse_tweet(row) -> Tweet: - # TODO row['retweeter] if not empty, would be user's name and means retweet? - # screen name would be the actual tweet's author - # ok so looks like it's tz aware.. # https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95 # uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime() @@ -115,10 +112,20 @@ def _parse_tweet(row) -> Tweet: break # + screen_name = row['screen_name'] + # considering id_str is referring to the retweeter's tweet (rather than the original tweet) + # makes sense for the permalink to contain the retweeter as well + # also makes it more compatible to twitter archive + # a bit sad to lose structured information about RT, but then again we could always just parse it.. + retweeter = row['retweeter'] + if len(retweeter) > 0: + text = f'RT @{screen_name}: {text}' + screen_name = retweeter + return Tweet( id_str=str(row['tweet_id']), created_at=created_at, - screen_name=row['screen_name'], + screen_name=screen_name, text=text, # todo hmm text sometimes is trimmed with ellipsis? 
at least urls urls=tuple(u for u in row['other_url'].split(' ') if len(u.strip()) > 0), From 711157e0f5c063d52a75c8a566206365bee632bc Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 31 May 2022 12:46:21 +0100 Subject: [PATCH 028/302] my.twitter.archive: switch to zippath, add config section, better mypy coverage --- misc/check-twitter.sh | 8 ++++++++ my/config.py | 4 ++++ my/twitter/archive.py | 40 +++++++++++++++++++++++----------------- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/misc/check-twitter.sh b/misc/check-twitter.sh index 1aec2fb..318ff71 100755 --- a/misc/check-twitter.sh +++ b/misc/check-twitter.sh @@ -93,5 +93,13 @@ check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article # check link which is only in twidump check '2013-06-24 Mon 14:13.*RT @gorod095: Нашел недавно в букинист' +# some older statuses, useful to test that all input data is properly detected +check '2010-04-01 Thu 11:34' +check '2010-06-28 Mon 23:42' + +# https://twitter.com/karlicoss/status/22916704915 +# this one is weird, just disappeared for no reason between 2021-12-22 and 2022-03-15 +# and the account isn't suspended etc. maybe it was temporary private or something? +check '2010-09-03 Fri 20:11.*Джобс' # TODO check likes as well diff --git a/my/config.py b/my/config.py index b1c17d2..1a8e49a 100644 --- a/my/config.py +++ b/my/config.py @@ -139,6 +139,10 @@ class fbmessenger: export_path: Paths +class twitter_archive: + export_path: Paths + + class twitter: class talon: export_path: Paths diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 70f55db..0583214 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -4,23 +4,28 @@ Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-do # before this config was named 'twitter', doesn't make too much sense for archive -# try to import it defensively.. +# todo unify with other code like this, e.g. time.tz.via_location try: from my.config import twitter_archive as user_config -except ImportError as e: +except ImportError as ie: + if ie.name != 'twitter_archive': + raise ie try: - from my.config import twitter as user_config + from my.config import twitter as user_config # type: ignore[misc] except ImportError: - raise e # raise the original exception.. must be something else + raise ie # raise the original exception.. must be something else else: from ..core import warnings warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config') +## from dataclasses import dataclass +from functools import lru_cache import html from ..core.common import Paths, datetime_aware from ..core.error import Res +from ..core.kompress import ZipPath @dataclass class twitter_archive(user_config): @@ -39,7 +44,6 @@ from pathlib import Path import json from ..core.common import get_files, LazyLogger, Json -from ..core import kompress @@ -47,7 +51,7 @@ logger = LazyLogger(__name__, level="warning") def inputs() -> Sequence[Path]: - return get_files(config.export_path)[-1:] + return get_files(config.export_path) from .common import TweetId, permalink @@ -73,7 +77,7 @@ class Tweet(NamedTuple): @property def text(self) -> str: - res = self.raw['full_text'] + res: str = self.raw['full_text'] ## replace shortened URLS repls = [] # from, to, what @@ -145,7 +149,7 @@ class Like(NamedTuple): def text(self) -> Optional[str]: # NOTE: likes basically don't have anything except text and url # ugh. I think none means that tweet was deleted? 
- res = self.raw.get('fullText') + res: Optional[str] = self.raw.get('fullText') if res is None: return None res = html.unescape(res) @@ -157,27 +161,27 @@ class Like(NamedTuple): return self.id_str -from functools import lru_cache class ZipExport: def __init__(self, archive_path: Path) -> None: - # TODO use ZipPath - self.epath = archive_path + # todo maybe this should be insude get_files instead, perhps covered with a flag? + self.zpath = ZipPath(archive_path) + if (self.zpath / 'tweets.csv').exists(): + from ..core.warnings import high + high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.") self.old_format = False # changed somewhere around 2020.03 - if not kompress.kexists(self.epath, 'Your archive.html'): + if not (self.zpath / 'Your archive.html').exists(): self.old_format = True - - def raw(self, what: str): # TODO Json in common? - logger.info('processing: %s %s', self.epath, what) + def raw(self, what: str) -> Iterator[Json]: + logger.info('processing: %s %s', self.zpath, what) path = what if not self.old_format: path = 'data/' + path path += '.js' - with kompress.kopen(self.epath, path) as fo: - ddd = fo.read() + ddd = (self.zpath / path).read_text() start = ddd.index('[') ddd = ddd[start:] for j in json.loads(ddd): @@ -194,6 +198,8 @@ class ZipExport: return acc['username'] def tweets(self) -> Iterator[Tweet]: + # NOTE: for some reason, created_at doesn't seem to be in order + # it mostly is, but there are a bunch of one-off random tweets where the time decreases (typically at the very end) for r in self.raw('tweet'): yield Tweet(r, screen_name=self.screen_name()) From 4e59a65f9acbb7172b21d2ecf84e5da71b125534 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 31 May 2022 13:06:29 +0100 Subject: [PATCH 029/302] core/general: move cached_property into compat, use standard implementation from python3.8 --- my/coding/codeforces.py | 10 +++++----- my/coding/topcoder.py | 11 ++++++----- my/core/common.py | 10 +++------- my/core/compat.py | 13 +++++++++++++ my/rtm.py | 13 +++++++------ my/twitter/archive.py | 8 ++++---- 6 files changed, 38 insertions(+), 27 deletions(-) diff --git a/my/coding/codeforces.py b/my/coding/codeforces.py index 659a2d9..1ac6ba4 100644 --- a/my/coding/codeforces.py +++ b/my/coding/codeforces.py @@ -6,8 +6,8 @@ from typing import NamedTuple import json from typing import Dict, Iterator -from ..common import cproperty, get_files -from ..error import Res, unwrap +from ..core import get_files, Res, unwrap +from ..core.compat import cached_property from ..core.konsume import ignore, wrap from kython import fget @@ -46,18 +46,18 @@ class Competition(NamedTuple): contest: str cmap: Cmap - @cproperty + @cached_property def uid(self) -> Cid: return self.contest_id def __hash__(self): return hash(self.contest_id) - @cproperty + @cached_property def when(self) -> datetime: return self.cmap[self.uid].when - @cproperty + @cached_property def summary(self) -> str: return f'participated in {self.contest}' # TODO diff --git a/my/coding/topcoder.py b/my/coding/topcoder.py index 43a2c8a..2577dd1 100644 --- a/my/coding/topcoder.py +++ b/my/coding/topcoder.py @@ -6,8 +6,9 @@ from typing import NamedTuple import json from typing import Dict, Iterator -from ..common import cproperty, get_files -from ..error import Res, unwrap +from ..core import get_files, Res, unwrap +from ..core.compat import cached_property +from ..core.error import Res, unwrap # TODO get rid of fget? 
from kython import fget @@ -26,18 +27,18 @@ class Competition(NamedTuple): percentile: float dates: str - @cproperty + @cached_property def uid(self) -> str: return self.contest_id def __hash__(self): return hash(self.contest_id) - @cproperty + @cached_property def when(self) -> datetime: return datetime.strptime(self.dates, '%Y-%m-%dT%H:%M:%S.%fZ') - @cproperty + @cached_property def summary(self) -> str: return f'participated in {self.contest}: {self.percentile:.0f}' diff --git a/my/core/common.py b/my/core/common.py index b7db362..c72fc77 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -128,13 +128,6 @@ def test_make_dict() -> None: assert d == {0: 0, 1: 1, 2: 0, 3: 1, 4: 0} -Cl = TypeVar('Cl') -R = TypeVar('R') - -def cproperty(f: Callable[[Cl], R]) -> R: - return property(functools.lru_cache(maxsize=1)(f)) # type: ignore - - # https://stackoverflow.com/a/12377059/706389 def listify(fn=None, wrapper=list): """ @@ -638,3 +631,6 @@ class DummyExecutor(Executor): def shutdown(self, wait: bool=True) -> None: # type: ignore[override] self._shutdown = True + +# legacy deprecated import +from .compat import cached_property as cproperty diff --git a/my/core/compat.py b/my/core/compat.py index 4dc8865..a2a627c 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -90,3 +90,16 @@ def removeprefix(text: str, prefix: str) -> str: if text.startswith(prefix): return text[len(prefix):] return text + + +# can remove after python3.8 +if sys.version_info[:2] >= (3, 8): + from functools import cached_property +else: + from typing import TypeVar, Callable + Cl = TypeVar('Cl') + R = TypeVar('R') + def cached_property(f: Callable[[Cl], R]) -> R: + return property(functools.lru_cache(maxsize=1)(f)) # type: ignore + del Cl + del R diff --git a/my/rtm.py b/my/rtm.py index 2fc783f..2731049 100755 --- a/my/rtm.py +++ b/my/rtm.py @@ -10,7 +10,8 @@ import re from typing import Dict, List, Iterator from datetime import datetime -from .common import LazyLogger, get_files, group_by_key, cproperty, make_dict +from .core.common import LazyLogger, get_files, group_by_key, make_dict +from .core.compat import cached_property from my.config import rtm as config @@ -28,14 +29,14 @@ class MyTodo: self.todo = todo self.revision = revision - @cproperty + @cached_property def notes(self) -> List[str]: # TODO can there be multiple?? desc = self.todo['DESCRIPTION'] notes = re.findall(r'---\n\n(.*?)\n\nUpdated:', desc, flags=re.DOTALL) return notes - @cproperty + @cached_property def tags(self) -> List[str]: desc = self.todo['DESCRIPTION'] [tags_str] = re.findall(r'\nTags: (.*?)\n', desc, flags=re.DOTALL) @@ -44,11 +45,11 @@ class MyTodo: tags = [t.strip() for t in tags_str.split(',')] return tags - @cproperty + @cached_property def uid(self) -> str: return str(self.todo['UID']) - @cproperty + @cached_property def title(self) -> str: return str(self.todo['SUMMARY']) @@ -59,7 +60,7 @@ class MyTodo: return str(self.todo['STATUS']) # TODO tz? 
- @cproperty + @cached_property def time(self) -> datetime: t1 = self.todo['DTSTAMP'].dt t2 = self.todo['LAST-MODIFIED'].dt diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 0583214..9975e6e 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -21,9 +21,9 @@ except ImportError as ie: from dataclasses import dataclass -from functools import lru_cache import html from ..core.common import Paths, datetime_aware +from ..core.compat import cached_property from ..core.error import Res from ..core.kompress import ZipPath @@ -192,7 +192,7 @@ class ZipExport: # older format yield j - @lru_cache(1) + @cached_property def screen_name(self) -> str: [acc] = self.raw('account') return acc['username'] @@ -201,14 +201,14 @@ class ZipExport: # NOTE: for some reason, created_at doesn't seem to be in order # it mostly is, but there are a bunch of one-off random tweets where the time decreases (typically at the very end) for r in self.raw('tweet'): - yield Tweet(r, screen_name=self.screen_name()) + yield Tweet(r, screen_name=self.screen_name) def likes(self) -> Iterator[Like]: # TODO ugh. would be nice to unify Tweet/Like interface # however, akeout only got tweetId, full text and url for r in self.raw('like'): - yield Like(r, screen_name=self.screen_name()) + yield Like(r, screen_name=self.screen_name) # todo not sure about list and sorting? although can't hurt considering json is not iterative? From 5799c062a5e901c2aea1ee9241d7efcfb3dac0d0 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 31 May 2022 13:54:28 +0100 Subject: [PATCH 030/302] my.zulip.organization: use tarfile instead of kopen/kompress potentially will extract some common interface here like ZipPath relevant to https://github.com/karlicoss/HPI/issues/20 --- my/zulip/organization.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/my/zulip/organization.py b/my/zulip/organization.py index b9bd190..3cfe0df 100644 --- a/my/zulip/organization.py +++ b/my/zulip/organization.py @@ -79,17 +79,22 @@ class Message: from typing import Union from itertools import count import json -from ..core.error import Res -from ..core.kompress import kopen, kexists -# TODO cache it +from ..core import Res +# todo cache it def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: # TODO hmm -- not sure if max lexicographically will actually be latest? last = max(inputs()) - no_suffix = last.name.split('.')[0] - # TODO check that it also works with unpacked dirs??? - with kopen(last, f'{no_suffix}/realm.json') as f: - rj = json.load(f) + subdir = last.with_suffix('').stem # there is a directory inside tar.gz + + # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath + # I guess makes sense to have a special implementation for .tar.gz considering how common are they + import tarfile + from ..core.error import notnone + + tfile = tarfile.open(last) + with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo: + rj = json.load(fo) [sj] = rj['zerver_realm'] server = Server( @@ -126,11 +131,12 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: for idx in count(start=1, step=1): fname = f'messages-{idx:06}.json' - fpath = f'{no_suffix}/{fname}' - if not kexists(last, fpath): + fpath = f'{subdir}/{fname}' + if fpath not in tfile.getnames(): + # tarfile doesn't have .exists? 
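+            # so checking membership in getnames() is the closest alternative; a missing chunk means we're past the last messages-NNNNNN.json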
break - with kopen(last, fpath) as f: - mj = json.load(f) + with notnone(tfile.extractfile(fpath)) as fo: + mj = json.load(fo) # TODO handle zerver_usermessage for j in mj['zerver_message']: try: From 2025d7ad1aaecf094ba610bd1cfc87a616d30800 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 31 May 2022 19:08:39 +0100 Subject: [PATCH 031/302] general: minor cleanup - get rid of unnecessary globs in get_files (they should be in config if the user wishes) - get rid of some old kython imports - do not convert Path twice in foursquare (so CPath works correctly) --- my/arbtt.py | 1 + my/coding/__init__.py | 0 my/coding/codeforces.py | 29 +++-------------------------- my/coding/topcoder.py | 28 ++++------------------------ my/fbmessenger/__init__.py | 4 ++++ my/foursquare.py | 4 ++-- my/jawbone/__init__.py | 2 +- my/media/imdb.py | 4 ++-- my/roamresearch.py | 2 +- tests/core.py | 4 ++-- 10 files changed, 20 insertions(+), 58 deletions(-) delete mode 100644 my/coding/__init__.py diff --git a/my/arbtt.py b/my/arbtt.py index e672e5b..02e06db 100644 --- a/my/arbtt.py +++ b/my/arbtt.py @@ -3,6 +3,7 @@ ''' REQUIRES = ['ijson', 'cffi'] +# NOTE likely also needs libyajl2 from apt or elsewhere? from pathlib import Path diff --git a/my/coding/__init__.py b/my/coding/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/my/coding/codeforces.py b/my/coding/codeforces.py index 1ac6ba4..3793988 100644 --- a/my/coding/codeforces.py +++ b/my/coding/codeforces.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from my.config import codeforces as config -from datetime import datetime +from datetime import datetime, timezone from typing import NamedTuple import json from typing import Dict, Iterator @@ -10,10 +10,6 @@ from ..core import get_files, Res, unwrap from ..core.compat import cached_property from ..core.konsume import ignore, wrap -from kython import fget -# TODO remove -from kython.kdatetime import as_utc - Cid = int @@ -25,7 +21,7 @@ class Contest(NamedTuple): def make(cls, j) -> 'Contest': return cls( cid=j['id'], - when=as_utc(j['startTimeSeconds']), + when=datetime.fromtimestamp(j['startTimeSeconds'], tz=timezone.utc), ) Cmap = Dict[Cid, Contest] @@ -91,23 +87,4 @@ def iter_data() -> Iterator[Res[Competition]]: def get_data(): - return list(sorted(iter_data(), key=fget(Competition.when))) - - -def test(): - assert len(get_data()) > 10 - - -def main(): - for d in iter_data(): - try: - d = unwrap(d) - except Exception as e: - print(f'ERROR! {d}') - else: - print(f'{d.when}: {d.summary}') - - - -if __name__ == '__main__': - main() + return list(sorted(iter_data(), key=Competition.when.fget)) diff --git a/my/coding/topcoder.py b/my/coding/topcoder.py index 2577dd1..5711254 100644 --- a/my/coding/topcoder.py +++ b/my/coding/topcoder.py @@ -6,18 +6,14 @@ from typing import NamedTuple import json from typing import Dict, Iterator -from ..core import get_files, Res, unwrap +from ..core import get_files, Res, unwrap, Json from ..core.compat import cached_property from ..core.error import Res, unwrap - -# TODO get rid of fget? -from kython import fget from ..core.konsume import zoom, wrap, ignore -# TODO json type?? 
-def _get_latest() -> Dict: - pp = max(get_files(config.export_path, glob='*.json')) +def _get_latest() -> Json: + pp = max(get_files(config.export_path)) return json.loads(pp.read_text()) @@ -82,21 +78,5 @@ def iter_data() -> Iterator[Res[Competition]]: def get_data(): - return list(sorted(iter_data(), key=fget(Competition.when))) + return list(sorted(iter_data(), key=Competition.when.fget)) - -def test(): - assert len(get_data()) > 10 - -def main(): - for d in iter_data(): - try: - d = unwrap(d) - except Exception as e: - print(f'ERROR! {d}') - else: - print(d.summary) - - -if __name__ == '__main__': - main() diff --git a/my/fbmessenger/__init__.py b/my/fbmessenger/__init__.py index 910d7a6..2e60d17 100644 --- a/my/fbmessenger/__init__.py +++ b/my/fbmessenger/__init__.py @@ -53,3 +53,7 @@ if legacy: REQUIRES = [ 'git+https://github.com/karlicoss/fbmessengerexport', ] + + +# to prevent it from apprearing in modules list/doctor +from ..core import __NOT_HPI_MODULE__ diff --git a/my/foursquare.py b/my/foursquare.py index 7325f3c..b50ab0e 100755 --- a/my/foursquare.py +++ b/my/foursquare.py @@ -17,7 +17,7 @@ logger = LazyLogger(__name__) def inputs(): - return get_files(config.export_path, '*.json') + return get_files(config.export_path) class Checkin: @@ -61,7 +61,7 @@ class Place: def get_raw(fname=None): if fname is None: fname = max(inputs()) - j = json.loads(Path(fname).read_text()) + j = json.loads(fname.read_text()) assert isinstance(j, list) for chunk in j: diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 06cb262..28ef937 100755 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -110,7 +110,7 @@ def pre_dataframe() -> Iterable[Res[SleepEntry]]: sleeps = load_sleeps() # todo emit error if graph doesn't exist?? sleeps = [s for s in sleeps if s.graph.exists()] # TODO careful.. - from ..common import group_by_key + from ..core.common import group_by_key for dd, group in group_by_key(sleeps, key=lambda s: s.date_).items(): if len(group) == 1: yield group[0] diff --git a/my/media/imdb.py b/my/media/imdb.py index c7d5299..63531fe 100644 --- a/my/media/imdb.py +++ b/my/media/imdb.py @@ -3,12 +3,12 @@ import csv from datetime import datetime from typing import Iterator, List, NamedTuple -from ..common import get_files +from ..core import get_files from my.config import imdb as config def _get_last(): - return max(get_files(config.export_path, glob='*.csv')) + return max(get_files(config.export_path)) class Movie(NamedTuple): diff --git a/my/roamresearch.py b/my/roamresearch.py index 20a4391..0c1192f 100644 --- a/my/roamresearch.py +++ b/my/roamresearch.py @@ -17,7 +17,7 @@ logger = LazyLogger(__name__) def last() -> Path: - return max(get_files(config.export_path, '*.json')) + return max(get_files(config.export_path)) class Keys: diff --git a/tests/core.py b/tests/core.py index 72c16ef..339f786 100644 --- a/tests/core.py +++ b/tests/core.py @@ -3,9 +3,9 @@ NOTE: Sigh. it's nice to be able to define the tests next to the source code (so However, if you run 'pytest --pyargs my.core', it detects 'core' package name (because there is no my/__init__.py) (see https://docs.pytest.org/en/latest/goodpractices.html#tests-as-part-of-application-code) -This results in relative imports failing (e.g. from ..kython import...). +This results in relative imports failing (e.g. from ..core import...). -By using this helper file, pytest can detect the package name properly. 
A bit meh, but perhaps after kython is moved into the core, +By using this helper file, pytest can detect the package name properly. A bit meh, but perhaps later, we can run against the tests in my.core directly. ''' From 73e57b52d19d02baa563f936e69052170515bf6a Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 31 May 2022 20:37:16 +0100 Subject: [PATCH 032/302] general: cleanup -- remove main and executable bit where it's not necessary --- my/bluemaestro.py | 0 my/body/blood.py | 8 -------- my/emfit/__init__.py | 3 +-- my/emfit/plot.py | 0 my/foursquare.py | 0 my/jawbone/__init__.py | 1 - my/lastfm.py | 0 my/media/imdb.py | 8 -------- my/pdfs.py | 0 my/polar.py | 24 ++++++++---------------- my/reddit/rexport.py | 9 --------- my/rtm.py | 4 ---- my/youtube/takeout.py | 0 13 files changed, 9 insertions(+), 48 deletions(-) mode change 100755 => 100644 my/bluemaestro.py mode change 100755 => 100644 my/body/blood.py mode change 100755 => 100644 my/emfit/__init__.py mode change 100755 => 100644 my/emfit/plot.py mode change 100755 => 100644 my/foursquare.py mode change 100755 => 100644 my/jawbone/__init__.py mode change 100755 => 100644 my/lastfm.py mode change 100755 => 100644 my/pdfs.py mode change 100755 => 100644 my/polar.py mode change 100755 => 100644 my/reddit/rexport.py mode change 100755 => 100644 my/rtm.py mode change 100755 => 100644 my/youtube/takeout.py diff --git a/my/bluemaestro.py b/my/bluemaestro.py old mode 100755 new mode 100644 diff --git a/my/body/blood.py b/my/body/blood.py old mode 100755 new mode 100644 index 51a5114..c1d66e2 --- a/my/body/blood.py +++ b/my/body/blood.py @@ -130,11 +130,3 @@ def stats(): def test(): print(dataframe()) assert len(dataframe()) > 10 - - -def main(): - print(data()) - - -if __name__ == '__main__': - main() diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py old mode 100755 new mode 100644 index 3ad2b15..997ba6c --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ [[https://shop-eu.emfit.com/products/emfit-qs][Emfit QS]] sleep tracker @@ -29,7 +28,7 @@ def dir_hash(path: Path): # TODO take __file__ into account somehow? 
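 # note: dir_hash is passed as hashf below, presumably so the cache is invalidated whenever the export directory contents change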
-@mcachew(cache_path=cache_dir() / 'emfit.cache', hashf=lambda: dir_hash(config.export_path), logger=dal.log) +@mcachew(cache_path=cache_dir() / 'emfit.cache', hashf=lambda: dir_hash(config.export_path)) def datas() -> Iterable[Res[Emfit]]: import dataclasses diff --git a/my/emfit/plot.py b/my/emfit/plot.py old mode 100755 new mode 100644 diff --git a/my/foursquare.py b/my/foursquare.py old mode 100755 new mode 100644 diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py old mode 100755 new mode 100644 index 28ef937..50932bf --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 from typing import Dict, Any, List, Iterable import json from functools import lru_cache diff --git a/my/lastfm.py b/my/lastfm.py old mode 100755 new mode 100644 diff --git a/my/media/imdb.py b/my/media/imdb.py index 63531fe..b7ecbde 100644 --- a/my/media/imdb.py +++ b/my/media/imdb.py @@ -38,11 +38,3 @@ def get_movies() -> List[Movie]: def test(): assert len(get_movies()) > 10 - - -def main(): - for movie in get_movies(): - print(movie) - -if __name__ == '__main__': - main() diff --git a/my/pdfs.py b/my/pdfs.py old mode 100755 new mode 100644 diff --git a/my/polar.py b/my/polar.py old mode 100755 new mode 100644 index 2218c29..0f2ee82 --- a/my/polar.py +++ b/my/polar.py @@ -7,6 +7,7 @@ from typing import cast, TYPE_CHECKING import my.config +# todo use something similar to tz.via_location for config fallback if not TYPE_CHECKING: user_config = getattr(my.config, 'polar', None) else: @@ -40,9 +41,9 @@ from datetime import datetime from typing import List, Dict, Iterable, NamedTuple, Sequence, Optional import json -from .core import LazyLogger, Json +from .core import LazyLogger, Json, Res from .core.common import isoparse -from .error import Res, echain, sort_res_by +from .core.error import echain, sort_res_by from .core.konsume import wrap, Zoomable, Wdict @@ -108,7 +109,7 @@ class Loader: # TODO something nicer? notes = meta['notes'].zoom() else: - notes = [] # TODO FIXME dict? + notes = [] comments = list(meta['comments'].zoom().values()) if 'comments' in meta else [] meta['questions'].zoom() meta['flashcards'].zoom() @@ -191,7 +192,7 @@ class Loader: ) h.consume() - # TODO FIXME when I add defensive error policy, support it + # TODO when I add defensive error policy, support it # if len(cmap) > 0: # raise RuntimeError(f'Unconsumed comments: {cmap}') # TODO sort by date? @@ -209,10 +210,10 @@ class Loader: # TODO konsume here as well? di = j['docInfo'] added = di['added'] - filename = di['filename'] # TODO here + filename = di['filename'] title = di.get('title', None) tags_dict = di['tags'] - pm = j['pageMetas'] # TODO FIXME handle this too + pm = j['pageMetas'] # todo handle this too? # todo defensive? 
tags = tuple(t['label'] for t in tags_dict.values()) @@ -247,14 +248,5 @@ def get_entries() -> List[Result]: return list(sort_res_by(iter_entries(), key=lambda e: e.created)) -def main(): - for e in iter_entries(): - if isinstance(e, Exception): - logger.exception(e) - else: - logger.info('processed %s', e.uid) - for i in e.items: - logger.info(i) - - +## deprecated Error = Exception # for backwards compat with Orger; can remove later diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py old mode 100755 new mode 100644 index e7373cd..0924e55 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -238,12 +238,3 @@ def stats() -> Stats: **stat(upvoted ), } - -def main() -> None: - for e in events(parallel=False): - print(e) - - -if __name__ == '__main__': - main() - diff --git a/my/rtm.py b/my/rtm.py old mode 100755 new mode 100644 index 2731049..b4fc7a9 --- a/my/rtm.py +++ b/my/rtm.py @@ -114,7 +114,3 @@ def active_tasks() -> Iterator[MyTodo]: if not t.is_completed(): yield t - -def print_all_todos(): - for t in all_tasks(): - print(t) diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py old mode 100755 new mode 100644 From 1b4ca6ad1b695e0065fd050da5c5543b7acf5110 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 31 May 2022 21:03:41 +0100 Subject: [PATCH 033/302] github.gdpr: prepare for using .tag.gz --- my/github/gdpr.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/my/github/gdpr.py b/my/github/gdpr.py index a676b1b..0d75a87 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -3,10 +3,10 @@ Github data (uses [[https://github.com/settings/admin][official GDPR export]]) """ import json -from typing import Iterable, Dict, Any +from pathlib import Path +from typing import Iterable, Dict, Any, Sequence -from ..core.error import Res -from ..core import get_files +from ..core import get_files, Res from .common import Event, parse_dt, EventIds @@ -27,9 +27,24 @@ from ..core.cfg import make_config config = make_config(github) +def inputs() -> Sequence[Path]: + gdir = config.gdpr_dir + res = get_files(gdir) + schema_json = [f for f in res if f.name == 'schema.json'] + was_unpacked = len(schema_json) > 0 + if was_unpacked: + # legacy behaviour, we've been passed an extracted export directory + return [schema_json[0].parent] + # otherwise, should contain a bunch of archives? + # not sure if need to warn if any of them aren't .tar.gz? + assert False, "TODO not implemented yet" + return res + + def events() -> Iterable[Res[Event]]: - # TODO FIXME allow using archive here? - files = get_files(config.gdpr_dir, glob='*.json') + last = max(inputs()) + # TODO allow using archive here? 
+ files = last.glob('*.json') # looks like all files are in the root handler_map = { 'schema' : None, 'issue_events_': None, # eh, doesn't seem to have any useful bodies From 049820c82739ce4704e98fe059b115db84e65652 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 31 May 2022 21:54:11 +0100 Subject: [PATCH 034/302] my.github.gdpr: support uncompressed .tar.gz files related to https://github.com/karlicoss/HPI/issues/20 --- my/config.py | 2 ++ my/github/gdpr.py | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/my/config.py b/my/config.py index 1a8e49a..35e22fb 100644 --- a/my/config.py +++ b/my/config.py @@ -33,6 +33,8 @@ class pocket: class github: export_path: Paths = '' + gdpr_dir: Paths = '' + class reddit: class rexport: export_path: Paths = '' diff --git a/my/github/gdpr.py b/my/github/gdpr.py index 0d75a87..c41fb6c 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -4,9 +4,11 @@ Github data (uses [[https://github.com/settings/admin][official GDPR export]]) import json from pathlib import Path +import tarfile from typing import Iterable, Dict, Any, Sequence from ..core import get_files, Res +from ..core.error import notnone from .common import Event, parse_dt, EventIds @@ -23,6 +25,10 @@ class github(user_config): ### +from ..core import LazyLogger +logger = LazyLogger(__name__) + + from ..core.cfg import make_config config = make_config(github) @@ -33,18 +39,31 @@ def inputs() -> Sequence[Path]: schema_json = [f for f in res if f.name == 'schema.json'] was_unpacked = len(schema_json) > 0 if was_unpacked: - # legacy behaviour, we've been passed an extracted export directory + # 'legacy' behaviour, we've been passed an extracted export directory + # although in principle nothing wrong with running against a directory with several unpacked archives + # so need to think how to support that in the future as well return [schema_json[0].parent] # otherwise, should contain a bunch of archives? # not sure if need to warn if any of them aren't .tar.gz? - assert False, "TODO not implemented yet" return res def events() -> Iterable[Res[Event]]: last = max(inputs()) - # TODO allow using archive here? - files = last.glob('*.json') # looks like all files are in the root + + # a bit naughty and ad-hoc, but we will generify reading from tar.gz. 
once we have more examples + # another one is zulip archive + if last.is_dir(): + files = list(sorted(last.glob('*.json'))) # looks like all files are in the root + open_file = lambda f: f.open() + else: + # treat as .tar.gz + tfile = tarfile.open(last) + files = list(sorted(map(Path, tfile.getnames()))) + files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json'] + open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./ + + handler_map = { 'schema' : None, 'issue_events_': None, # eh, doesn't seem to have any useful bodies @@ -58,6 +77,12 @@ def events() -> Iterable[Res[Event]]: 'projects_' : _parse_project, 'releases_' : _parse_release, 'commit_comments': _parse_commit_comment, + ## TODO need to handle these + 'pull_request_review_comments_': None, + 'pull_request_review_threads_': None, + 'pull_request_reviews_': None, + ## + 'repository_files_': None, # repository artifacts, probs not very useful } for f in files: handler: Any @@ -74,7 +99,8 @@ def events() -> Iterable[Res[Event]]: # ignored continue - j = json.loads(f.read_text()) + with open_file(f) as fo: + j = json.load(fo) for r in j: try: yield handler(r) From 179b657eea029b0433d4389cb648ee01dec8cc7b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 31 May 2022 23:24:15 +0100 Subject: [PATCH 035/302] general: add a test for __init__.py fallback for modules which are switching to namespace packages for now just a manual ad-hoc test, will try to set it up on CI later relevant to the discussion here: https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/extending.20HPI/near/270465792 also potentially relevant to - https://github.com/karlicoss/HPI/issues/89 (will try to apply to this to reddit/__init__.py later) - https://github.com/karlicoss/HPI/issues/102 --- misc/check_legacy_init_py.py | 76 ++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100755 misc/check_legacy_init_py.py diff --git a/misc/check_legacy_init_py.py b/misc/check_legacy_init_py.py new file mode 100755 index 0000000..53eb169 --- /dev/null +++ b/misc/check_legacy_init_py.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# NOTE: prerequisites for this test: +# fbmessengerexport installed +# config configured (can set it to '' though) + +from pathlib import Path +from subprocess import Popen, run, PIPE +from tempfile import TemporaryDirectory + + +import logzero # type: ignore[import] +logger = logzero.logger + + +MSG = 'DEPRECATED! 
Instead of my.fbmessengerexport'
+
+def expect(*cmd: str, should_warn: bool=True) -> None:
+    res = run(cmd, stderr=PIPE)
+    errb = res.stderr; assert errb is not None
+    err = errb.decode('utf8')
+    if should_warn:
+        assert MSG in err, res
+    else:
+        assert MSG not in err, res
+    assert res.returncode == 0, res
+
+
+def _check(*cmd: str, should_warn: bool, run_as_cmd: bool=True) -> None:
+    expecter = lambda *cmd: expect(*cmd, should_warn=should_warn)
+    if cmd[0] == '-c':
+        [_, code] = cmd
+        if run_as_cmd:
+            expecter('python3', '-c', code)
+        # check as a script
+        with TemporaryDirectory() as tdir:
+            script = Path(tdir) / 'script.py'
+            script.write_text(code)
+            expecter('python3', str(script))
+    else:
+        expecter('python3', *cmd)
+    what = 'warns' if should_warn else '      '  # meh
+    logger.info(f"PASSED: {what}: {repr(cmd)}")
+
+
+def check_warn(*cmd: str, **kwargs) -> None:
+    _check(*cmd, should_warn=True, **kwargs)
+
+def check_ok(*cmd: str, **kwargs) -> None:
+    _check(*cmd, should_warn=False, **kwargs)
+
+
+# NOTE these three are actually sort of OK, they are allowed when it's a proper namespace package with all.py etc.
+# but more likely it means legacy behaviour or just misusing the package?
+# worst case it's just a warning I guess
+check_warn('-c', 'from my import fbmessenger')
+check_warn('-c', 'import my.fbmessenger')
+check_warn('-c', 'from my.fbmessenger import *')
+
+# note: dump_chat_history should really be deprecated, but it's a quick way to check we actually fell back to fbmessenger/export.py
+# NOTE: this is the most common legacy usecase
+check_warn('-c', 'from my.fbmessenger import messages, dump_chat_history')
+check_warn('-m', 'my.core', 'query' , 'my.fbmessenger.messages')
+check_warn('-m', 'my.core', 'doctor', 'my.fbmessenger')
+
+# todo kinda annoying it doesn't work when executed as -c (but does as script!)
+# presumably because it doesn't have proper line number information?
+# either way, it's a bit of a corner case, the script behaviour is more important
+check_ok ('-c', 'from my.fbmessenger import export', run_as_cmd=False)
+check_ok ('-c', 'import my.fbmessenger.export')
+check_ok ('-c', 'from my.fbmessenger.export import *')
+check_ok ('-c', 'from my.fbmessenger.export import messages, dump_chat_history')
+check_ok ('-m', 'my.core', 'query' , 'my.fbmessenger.export.messages')
+check_ok ('-m', 'my.core', 'doctor', 'my.fbmessenger.export')
+
+# TODO with reddit, currently these don't work properly at all
+# only when imported from scripts etc?
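+
+# a rough sketch of the corresponding reddit checks (hypothetical, deliberately left disabled:
+# they assume my.reddit gets the same all.py/namespace package treatment as my.fbmessenger above)
+# check_warn('-c', 'import my.reddit')
+# check_warn('-c', 'from my.reddit import saved')
+# check_ok ('-c', 'import my.reddit.rexport')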
From 8336d184348b166414f912618ca2ad85653db215 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 1 Jun 2022 21:57:36 +0100 Subject: [PATCH 036/302] general: add an adhoc test for checking mixin behaviour with namespace packages and __init__.py hack also use that hack in my.fbmessenger --- misc/check_legacy_init_py.py | 10 ++++++++-- misc/overlay_for_init_py_test/my/fbmessenger/all.py | 7 +++++++ .../my/fbmessenger/mixin.py | 2 ++ my/fbmessenger/__init__.py | 13 +++++++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 misc/overlay_for_init_py_test/my/fbmessenger/all.py create mode 100644 misc/overlay_for_init_py_test/my/fbmessenger/mixin.py diff --git a/misc/check_legacy_init_py.py b/misc/check_legacy_init_py.py index 53eb169..102b924 100755 --- a/misc/check_legacy_init_py.py +++ b/misc/check_legacy_init_py.py @@ -59,7 +59,7 @@ check_warn('-c', 'from my.fbmessenger import *') # note: dump_chat_history should really be deprecated, but it's a quick way to check we actually fell back to fbmessenger/export.py # NOTE: this is the most common legacy usecase check_warn('-c', 'from my.fbmessenger import messages, dump_chat_history') -check_warn('-m', 'my.core', 'query' , 'my.fbmessenger.messages') +check_warn('-m', 'my.core', 'query' , 'my.fbmessenger.messages', '-o', 'pprint', '--limit=10') check_warn('-m', 'my.core', 'doctor', 'my.fbmessenger') # todo kinda annoying it doesn't work when executed as -c (but does as script!) @@ -69,8 +69,14 @@ check_ok ('-c', 'from my.fbmessenger import export', run_as_cmd=False) check_ok ('-c', 'import my.fbmessenger.export') check_ok ('-c', 'from my.fbmessenger.export import *') check_ok ('-c', 'from my.fbmessenger.export import messages, dump_chat_history') -check_ok ('-m', 'my.core', 'query' , 'my.fbmessenger.export.messages') +check_ok ('-m', 'my.core', 'query' , 'my.fbmessenger.export.messages', '-o', 'pprint', '--limit=10') check_ok ('-m', 'my.core', 'doctor', 'my.fbmessenger.export') +# NOTE: +# to check that overlays work, run something like +# PYTHONPATH=misc/overlay_for_init_py_test/ hpi query my.fbmessenger.all.messages -s -o pprint --limit=10 +# you should see 1, 2, 3 from mixin.py +# TODO would be nice to add an automated test for this + # TODO with reddit, currently these don't work properly at all # only when imported from scripts etc? diff --git a/misc/overlay_for_init_py_test/my/fbmessenger/all.py b/misc/overlay_for_init_py_test/my/fbmessenger/all.py new file mode 100644 index 0000000..848de5f --- /dev/null +++ b/misc/overlay_for_init_py_test/my/fbmessenger/all.py @@ -0,0 +1,7 @@ +from my.fbmessenger import export +from . 
import mixin + + +def messages(): + yield from mixin.messages() + yield from export.messages() diff --git a/misc/overlay_for_init_py_test/my/fbmessenger/mixin.py b/misc/overlay_for_init_py_test/my/fbmessenger/mixin.py new file mode 100644 index 0000000..2f69480 --- /dev/null +++ b/misc/overlay_for_init_py_test/my/fbmessenger/mixin.py @@ -0,0 +1,2 @@ +def messages(): + yield from ['1', '2', '3'] diff --git a/my/fbmessenger/__init__.py b/my/fbmessenger/__init__.py index 2e60d17..2a3ba7f 100644 --- a/my/fbmessenger/__init__.py +++ b/my/fbmessenger/__init__.py @@ -57,3 +57,16 @@ REQUIRES = [ # to prevent it from apprearing in modules list/doctor from ..core import __NOT_HPI_MODULE__ + +### +# this is to trick mypy into treating this as a proper namespace package +# should only be used for backwards compatibility on packages that are convernted into namespace & all.py pattern +# - https://www.python.org/dev/peps/pep-0382/#namespace-packages-today +# - https://github.com/karlicoss/hpi_namespace_experiment +# - discussion here https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/extending.20HPI/near/269946944 +from pkgutil import extend_path +__path__ = extend_path(__path__, __name__) +# 'this' source tree ends up first in the pythonpath when we extend_path() +# so we need to move 'this' source tree towards the end to make sure we prioritize overlays +__path__ = __path__[1:] + __path__[:1] +### From 9461df6aa5ceb538fc52ecbf338a6ced132e0c0f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 1 Jun 2022 23:02:58 +0100 Subject: [PATCH 037/302] general: extract the hack to warn of legacy imports and fallback to core/legacy.py use it both in my.fbmessenger and my.reddit if in the future any new modules need to be switched to namespace package structure with all.py it should make it easy to do related: - https://github.com/karlicoss/HPI/issues/12 - https://github.com/karlicoss/HPI/issues/89 - https://github.com/karlicoss/HPI/issues/102 --- misc/check_legacy_init_py.py | 2 +- my/core/legacy.py | 55 +++++++++++++++++++++++++++ my/fbmessenger/__init__.py | 72 ++++++++---------------------------- my/reddit/__init__.py | 40 ++++++++------------ 4 files changed, 87 insertions(+), 82 deletions(-) create mode 100644 my/core/legacy.py diff --git a/misc/check_legacy_init_py.py b/misc/check_legacy_init_py.py index 102b924..c100368 100755 --- a/misc/check_legacy_init_py.py +++ b/misc/check_legacy_init_py.py @@ -12,7 +12,7 @@ import logzero # type: ignore[import] logger = logzero.logger -MSG = 'DEPRECATED! 
Instead of my.fbmessengerexport' +MSG = 'importing my.fbmessenger is DEPRECATED' def expect(*cmd: str, should_warn: bool=True) -> None: res = run(cmd, stderr=PIPE) diff --git a/my/core/legacy.py b/my/core/legacy.py new file mode 100644 index 0000000..21ec056 --- /dev/null +++ b/my/core/legacy.py @@ -0,0 +1,55 @@ +# I think 'compat' should be for python-specific compat stuff, whereas this for HPI specific backwards compatibility +import inspect +import re +from typing import List + +from my.core import warnings as W + + +def handle_legacy_import( + parent_module_name: str, + legacy_submodule_name: str, + parent_module_path: List[str], +) -> bool: + ### + # this is to trick mypy into treating this as a proper namespace package + # should only be used for backwards compatibility on packages that are convernted into namespace & all.py pattern + # - https://www.python.org/dev/peps/pep-0382/#namespace-packages-today + # - https://github.com/karlicoss/hpi_namespace_experiment + # - discussion here https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/extending.20HPI/near/269946944 + from pkgutil import extend_path + parent_module_path[:] = extend_path(parent_module_path, parent_module_name) + # 'this' source tree ends up first in the pythonpath when we extend_path() + # so we need to move 'this' source tree towards the end to make sure we prioritize overlays + parent_module_path[:] = parent_module_path[1:] + parent_module_path[:1] + ### + + # allow stuff like 'import my.module.submodule' and such + imported_as_parent = False + + # allow stuff like 'from my.module import submodule' + importing_submodule = False + + # some hacky traceback to inspect the current stack + # to see if the user is using the old style of importing + for f in inspect.stack(): + # seems that when a submodule is imported, at some point it'll call some internal import machinery + # with 'parent' set to the parent module + # if parent module is imported first (i.e. in case of deprecated usage), it won't be the case + args = inspect.getargvalues(f.frame) + if args.locals.get('parent') == parent_module_name: + imported_as_parent = True + + # this we can only detect from the code I guess + line = '\n'.join(f.code_context or []) + if re.match(rf'from\s+{parent_module_name}\s+import\s+{legacy_submodule_name}', line): + importing_submodule = True + + is_legacy_import = not (imported_as_parent or importing_submodule) + if is_legacy_import: + W.high(f'''\ +importing {parent_module_name} is DEPRECATED! \ +Instead, import from {parent_module_name}.{legacy_submodule_name} or {parent_module_name}.all \ +See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. +''') + return is_legacy_import diff --git a/my/fbmessenger/__init__.py b/my/fbmessenger/__init__.py index 2a3ba7f..3919c44 100644 --- a/my/fbmessenger/__init__.py +++ b/my/fbmessenger/__init__.py @@ -4,69 +4,29 @@ It should be removed in the future, and you should replace any imports like: from my.fbmessenger import ... to: -from my.fbmessenger.export import ... +from my.fbmessenger.all import ... since that allows for easier overriding using namespace packages -https://github.com/karlicoss/HPI/issues/102 +See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. 
""" -# TODO ^^ later, replace the above with from my.fbmessenger.all, when we add more data sources -import re -import inspect +# prevent it from apprearing in modules list/doctor +from ..core import __NOT_HPI_MODULE__ - -mname = __name__.split('.')[-1] - -# allow stuff like 'import my.module.submodule' and such -imported_as_parent = False - -# allow stuff like 'from my.module import submodule' -importing_submodule = False - -# some hacky traceback to inspect the current stack -# to see if the user is using the old style of importing -for f in inspect.stack(): - # seems that when a submodule is imported, at some point it'll call some internal import machinery - # with 'parent' set to the parent module - # if parent module is imported first (i.e. in case of deprecated usage), it won't be the case - args = inspect.getargvalues(f.frame) - if args.locals.get('parent') == f'my.{mname}': - imported_as_parent = True - - # this we can only detect from the code I guess - line = '\n'.join(f.code_context or []) - if re.match(rf'from\s+my\.{mname}\s+import\s+export', line): - # todo 'export' is hardcoded, not sure how to infer allowed objects anutomatically.. - importing_submodule = True - -legacy = not (imported_as_parent or importing_submodule) - -if legacy: - from my.core import warnings as W - # TODO: add link to instructions to migrate - W.high("DEPRECATED! Instead of my.fbmessengerexport, import from my.fbmessengerexport.export") - # only import in legacy mode - # otherswise might have unfortunate side effects (e.g. missing imports) - from .export import * - -# kinda annoying to keep it, but it's so legacy 'hpi module install my.fbmessenger' work -# needs to be on the top level (since it's extracted via ast module), but hopefully it doesn't hurt here +# kinda annoying to keep it, but it's so legacy 'hpi module install my.fbmessenger' works +# needs to be on the top level (since it's extracted via ast module) REQUIRES = [ 'git+https://github.com/karlicoss/fbmessengerexport', ] -# to prevent it from apprearing in modules list/doctor -from ..core import __NOT_HPI_MODULE__ +from my.core.legacy import handle_legacy_import +is_legacy_import = handle_legacy_import( + parent_module_name=__name__, + legacy_submodule_name='export', + parent_module_path=__path__, +) + +if is_legacy_import: + # todo not sure if possible to move this into legacy.py + from .export import * -### -# this is to trick mypy into treating this as a proper namespace package -# should only be used for backwards compatibility on packages that are convernted into namespace & all.py pattern -# - https://www.python.org/dev/peps/pep-0382/#namespace-packages-today -# - https://github.com/karlicoss/hpi_namespace_experiment -# - discussion here https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/extending.20HPI/near/269946944 -from pkgutil import extend_path -__path__ = extend_path(__path__, __name__) -# 'this' source tree ends up first in the pythonpath when we extend_path() -# so we need to move 'this' source tree towards the end to make sure we prioritize overlays -__path__ = __path__[1:] + __path__[:1] -### diff --git a/my/reddit/__init__.py b/my/reddit/__init__.py index aadd6a0..22813f1 100644 --- a/my/reddit/__init__.py +++ b/my/reddit/__init__.py @@ -6,36 +6,26 @@ from my.reddit import ... to: from my.reddit.all import ... since that allows for easier overriding using namespace packages -https://github.com/karlicoss/HPI/issues/102 +See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. 
""" -# For now, including this here, since importing the module -# causes .rexport to be imported, which requires rexport +# prevent it from apprearing in modules list/doctor +from ..core import __NOT_HPI_MODULE__ + +# kinda annoying to keep it, but it's so legacy 'hpi module install my.reddit' works +# needs to be on the top level (since it's extracted via ast module) REQUIRES = [ 'git+https://github.com/karlicoss/rexport', ] -import re -import traceback -# some hacky traceback to inspect the current stack -# to see if the user is using the old style of importing -warn = False -for f in traceback.extract_stack(): - line = f.line or '' # just in case it's None, who knows.. +from my.core.legacy import handle_legacy_import +is_legacy_import = handle_legacy_import( + parent_module_name=__name__, + legacy_submodule_name='rexport', + parent_module_path=__path__, +) - # cover the most common ways of previously interacting with the module - if 'import my.reddit ' in (line + ' '): - warn = True - elif 'from my import reddit' in line: - warn = True - elif re.match(r"from my\.reddit\simport\s(comments|saved|submissions|upvoted)", line): - warn = True - -# TODO: add link to instructions to migrate -if warn: - from my.core import warnings as W - W.high("DEPRECATED! Instead of my.reddit, import from my.reddit.all instead.") - - -from .rexport import * +if is_legacy_import: + # todo not sure if possible to move this into legacy.py + from .rexport import * From 186f561018ebc4317ba0c1ef8ec54048628cd3cd Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 2 Jun 2022 10:11:00 +0100 Subject: [PATCH 038/302] core: some cleanup for core/init and doctor; fix issue with compileall --- my/config.py | 2 +- my/core/__main__.py | 98 +++++++++++++++++++++++++++------------------ my/core/init.py | 37 ++++++----------- 3 files changed, 71 insertions(+), 66 deletions(-) diff --git a/my/config.py b/my/config.py index 35e22fb..3d96cc3 100644 --- a/my/config.py +++ b/my/config.py @@ -9,7 +9,7 @@ This file is used for: - mypy: this file provides some type annotations - for loading the actual user config ''' -#### vvvv you won't need this VVV in your personal config +#### NOTE: you won't need this line VVVV in your personal config from my.core import init ### diff --git a/my/core/__main__.py b/my/core/__main__.py index faff852..81242eb 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -2,7 +2,9 @@ import functools import importlib import inspect import os +import shutil import sys +import tempfile import traceback from typing import Optional, Sequence, Iterable, List, Type, Any, Callable from pathlib import Path @@ -16,31 +18,25 @@ def mypy_cmd() -> Optional[Sequence[str]]: try: # preferably, use mypy from current python env import mypy - return [sys.executable, '-m', 'mypy'] except ImportError: pass + else: + return [sys.executable, '-m', 'mypy'] # ok, not ideal but try from PATH - import shutil if shutil.which('mypy'): return ['mypy'] warning("mypy not found, so can't check config with it. See https://github.com/python/mypy#readme if you want to install it and retry") return None -from types import ModuleType -def run_mypy(pkg: ModuleType) -> Optional[CompletedProcess]: - from .preinit import get_mycfg_dir - mycfg_dir = get_mycfg_dir() - # todo ugh. not sure how to extract it from pkg? - +def run_mypy(cfg_path: Path) -> Optional[CompletedProcess]: # todo dunno maybe use the same mypy config in repository? # I'd need to install mypy.ini then?? 
env = {**os.environ} mpath = env.get('MYPYPATH') - mpath = str(mycfg_dir) + ('' if mpath is None else f':{mpath}') + mpath = str(cfg_path) + ('' if mpath is None else f':{mpath}') env['MYPYPATH'] = mpath - cmd = mypy_cmd() if cmd is None: return None @@ -52,7 +48,7 @@ def run_mypy(pkg: ModuleType) -> Optional[CompletedProcess]: '--show-error-codes', '--show-error-context', '--check-untyped-defs', - '-p', pkg.__name__, + '-p', 'my.config', ], stderr=PIPE, stdout=PIPE, env=env) return mres @@ -128,10 +124,11 @@ class example: sys.exit(1) -# TODO return the config as a result? +# todo return the config as a result? def config_ok() -> bool: errors: List[Exception] = [] + # at this point 'my' should already be imported, so doesn't hurt to extract paths from it import my try: paths: List[str] = list(my.__path__) # type: ignore[attr-defined] @@ -142,23 +139,17 @@ def config_ok() -> bool: else: info(f'import order: {paths}') - try: - import my.config as cfg - except Exception as e: - errors.append(e) - error("failed to import the config") - tb(e) - # todo yield exception here? so it doesn't fail immediately.. - # I guess it's fairly critical and worth exiting immediately - sys.exit(1) + # first try doing as much as possible without actually imporing my.config + from .preinit import get_mycfg_dir + cfg_path = get_mycfg_dir() + # alternative is importing my.config and then getting cfg_path from its __file__/__path__ + # not sure which is better tbh - cfg_path = cfg.__file__# todo might be better to use __path__? - info(f"config file : {cfg_path}") - - import my.core as core + ## check we're not using stub config + import my.core try: - core_pkg_path = str(Path(core.__path__[0]).parent) # type: ignore[attr-defined] - if cfg_path.startswith(core_pkg_path): + core_pkg_path = str(Path(my.core.__path__[0]).parent) # type: ignore[attr-defined] + if str(cfg_path).startswith(core_pkg_path): error(f''' Seems that the stub config is used ({cfg_path}). This is likely not going to work. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-modules for more information @@ -167,25 +158,53 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module except Exception as e: errors.append(e) tb(e) + else: + info(f"config path : {cfg_path}") + ## - # todo for some reason compileall.compile_file always returns true?? - try: - cmd = [sys.executable, '-m', 'compileall', str(cfg_path)] - check_call(cmd) - info('syntax check: ' + ' '.join(cmd)) - except Exception as e: - errors.append(e) + ## check syntax + with tempfile.TemporaryDirectory() as td: + # use a temporary directory, useful because + # - compileall ignores -B, so always craps with .pyc files (annoyng on RO filesystems) + # - compileall isn't following symlinks, just silently ignores them + # note: ugh, annoying that copytree requires a non-existing dir before 3.8. + # once we have min version 3.8, can use dirs_exist_ok=True param + tdir = Path(td) / 'cfg' + # this will resolve symlinks when copying + shutil.copytree(cfg_path, tdir) + # NOTE: compileall still returns code 0 if the path doesn't exist.. 
+ # but in our case hopefully it's not an issue + cmd = [sys.executable, '-m', 'compileall', '-q', str(tdir)] - mres = run_mypy(cfg) - if mres is not None: # has mypy - rc = mres.returncode + try: + check_call(cmd) + info('syntax check: ' + ' '.join( cmd)) + except Exception as e: + errors.append(e) + tb(e) + ## + + ## check types + mypy_res = run_mypy(cfg_path) + if mypy_res is not None: # has mypy + rc = mypy_res.returncode if rc == 0: info('mypy check : success') else: error('mypy check: failed') errors.append(RuntimeError('mypy failed')) - sys.stderr.write(indent(mres.stderr.decode('utf8'))) - sys.stderr.write(indent(mres.stdout.decode('utf8'))) + sys.stderr.write(indent(mypy_res.stderr.decode('utf8'))) + sys.stderr.write(indent(mypy_res.stdout.decode('utf8'))) + ## + + ## finally, try actually importing the config (it should use same cfg_path) + try: + import my.config + except Exception as e: + errors.append(e) + error("failed to import the config") + tb(e) + ## if len(errors) > 0: error(f'config check: {len(errors)} errors') @@ -512,7 +531,6 @@ def main(debug: bool) -> None: # to avoid importing relative modules by accident during development # maybe can be removed later if theres more test coverage/confidence that nothing # would happen? - import tempfile # use a particular directory instead of a random one, since # click being decorator based means its more complicated diff --git a/my/core/init.py b/my/core/init.py index 1fc9e88..9e1fc4d 100644 --- a/my/core/init.py +++ b/my/core/init.py @@ -1,29 +1,15 @@ ''' A hook to insert user's config directory into Python's search path. -- Ideally that would be in __init__.py (so it's executed without having to import explicityly) - But, with namespace packages, we can't have __init__.py in the parent subpackage - (see http://python-notes.curiousefficiency.org/en/latest/python_concepts/import_traps.html#the-init-py-trap) +Ideally that would be in __init__.py (so it's executed without having to import explicityly) +But, with namespace packages, we can't have __init__.py in the parent subpackage +(see http://python-notes.curiousefficiency.org/en/latest/python_concepts/import_traps.html#the-init-py-trap) - Please let me know if you are aware of a better way of dealing with this! +Instead, this is imported in the stub config (in this repository), so if the stub config is used, it triggers import of the 'real' config. + +Please let me know if you are aware of a better way of dealing with this! ''' -from types import ModuleType - -# TODO not ideal to keep it here, but this should really be a leaf in the import tree -# TODO maybe I don't even need it anymore? -def assign_module(parent: str, name: str, module: ModuleType) -> None: - import sys - import importlib - parent_module = importlib.import_module(parent) - sys.modules[parent + '.' + name] = module - if sys.version_info.minor == 6: - # ugh. not sure why it's necessary in py36... - # TODO that crap should be tested... 
I guess will get it for free when I run rest of tests in the matrix - setattr(parent_module, name, module) - -del ModuleType - # separate function to present namespace pollution def setup_config() -> None: @@ -45,16 +31,17 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-mo # hopefully it doesn't cause any issues sys.path.insert(0, mpath) - # remove the stub and insert reimport hte 'real' config + # remove the stub and reimport the 'real' config + # likely my.config will always be in sys.modules, but defensive just in case if 'my.config' in sys.modules: - # TODO FIXME make sure this method isn't called twice... del sys.modules['my.config'] + # this should import from mpath now try: - # todo import_from instead?? dunno import my.config except ImportError as ex: - # just in case... who knows what crazy setup users have in mind. - # todo log? + # just in case... who knows what crazy setup users have + import logging + logging.exception(ex) warnings.warn(f""" Importing 'my.config' failed! (error: {ex}). This is likely to result in issues. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. From 3faebdd629361028518e74bbac49121dc7b2791c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 2 Jun 2022 12:43:02 +0100 Subject: [PATCH 039/302] core: add Protocol/TypedDict to compat --- my/core/compat.py | 54 +++++++++++++++++++++++++++------------- my/fbmessenger/common.py | 15 +++-------- my/reddit/common.py | 17 +++---------- tox.ini | 7 +++++- 4 files changed, 51 insertions(+), 42 deletions(-) diff --git a/my/core/compat.py b/my/core/compat.py index a2a627c..a7775c8 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -1,8 +1,8 @@ ''' Some backwards compatibility stuff/deprecation helpers ''' +import sys from types import ModuleType -from typing import Callable from . import warnings from .common import LazyLogger @@ -49,22 +49,6 @@ def _get_dal(cfg, module_name: str): return import_module(f'my.config.repos.{module_name}.dal') -import sys -from typing import TYPE_CHECKING - -if sys.version_info[:2] >= (3, 8): - from typing import Literal -else: - if TYPE_CHECKING: - from typing_extensions import Literal - else: - # erm.. I guess as long as it's not crashing, whatever... - class _Literal: - def __getitem__(self, args): - pass - Literal = _Literal() - - import os windows = os.name == 'nt' @@ -103,3 +87,39 @@ else: return property(functools.lru_cache(maxsize=1)(f)) # type: ignore del Cl del R + + +from typing import TYPE_CHECKING + + +if sys.version_info[:2] >= (3, 8): + from typing import Literal +else: + if TYPE_CHECKING: + from typing_extensions import Literal + else: + # erm.. I guess as long as it's not crashing, whatever... + class _Literal: + def __getitem__(self, args): + pass + Literal = _Literal() + + +if sys.version_info[:2] >= (3, 8): + from typing import Protocol +else: + if TYPE_CHECKING: + from typing_extensions import Protocol # type: ignore[misc] + else: + # todo could also use NamedTuple? 
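+        # at runtime it's only ever used as a base class, so a plain placeholder class is enough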
+ Protocol = object + + +if sys.version_info[:2] >= (3, 8): + from typing import TypedDict +else: + if TYPE_CHECKING: + from typing_extensions import TypedDict # type: ignore[misc] + else: + from typing import Dict + TypedDict = Dict diff --git a/my/fbmessenger/common.py b/my/fbmessenger/common.py index 5f8bd85..748b9d9 100644 --- a/my/fbmessenger/common.py +++ b/my/fbmessenger/common.py @@ -1,16 +1,9 @@ from my.core import __NOT_HPI_MODULE__ -from datetime import datetime -from typing import Iterator, Optional, TYPE_CHECKING +from typing import Iterator, Optional -if TYPE_CHECKING: - try: - from typing import Protocol - except ImportError: - # requirement of mypy - from typing_extensions import Protocol # type: ignore[misc] -else: - Protocol = object +from my.core.compat import Protocol +from my.core import datetime_aware class Thread(Protocol): @@ -26,7 +19,7 @@ class Message(Protocol): def id(self) -> str: ... @property - def dt(self) -> datetime: ... + def dt(self) -> datetime_aware: ... @property def text(self) -> Optional[str]: ... diff --git a/my/reddit/common.py b/my/reddit/common.py index 3bb0279..d33e02b 100644 --- a/my/reddit/common.py +++ b/my/reddit/common.py @@ -2,22 +2,13 @@ This defines Protocol classes, which make sure that each different type of shared models have a standardized interface """ +from my.core import __NOT_HPI_MODULE__ -from typing import Dict, Any, Set, Iterator, TYPE_CHECKING +from typing import Set, Iterator from itertools import chain -from my.core.common import datetime_aware - -Json = Dict[str, Any] - -if TYPE_CHECKING: - try: - from typing import Protocol - except ImportError: - # requirement of mypy - from typing_extensions import Protocol # type: ignore[misc] -else: - Protocol = object +from my.core.compat import Protocol +from my.core import datetime_aware, Json # common fields across all the Protocol classes, so generic code can be written diff --git a/tox.ini b/tox.ini index 33c2c71..427be71 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,11 @@ minversion = 3.5 toxworkdir={env:TOXWORKDIR_BASE:}{toxinidir}/.tox [testenv] -passenv = CI CI_* +passenv = +# useful for tests to know they are running under ci + CI CI_* +# respect user's cache dirs to prevent tox from crapping into project dir + MYPY_CACHE_DIR PYTHONPYCACHEPREFIX # just the very core tests with minimal dependencies @@ -113,6 +117,7 @@ commands = -p my.browser \ -p my.endomondo \ -p my.github.ghexport \ + -p my.github.gdpr \ -p my.hypothesis \ -p my.instapaper \ -p my.pocket \ From b96c9f45344fc79a0c4cf12e52dcbe9f9cc421e1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 2 Jun 2022 13:22:42 +0100 Subject: [PATCH 040/302] fbmessenger: use both id and timestamp for merging --- my/fbmessenger/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/my/fbmessenger/common.py b/my/fbmessenger/common.py index 748b9d9..a6549d5 100644 --- a/my/fbmessenger/common.py +++ b/my/fbmessenger/common.py @@ -39,5 +39,7 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: if isinstance(r, Exception): return str(r) else: - return r.id + # use both just in case, would be easier to spot tz issues + # similar to twitter, might make sense to generify/document as a pattern + return (r.id, r.dt) yield from unique_everseen(chain(*sources), key=key) From fd1a683d498e69c7a3de87e5d832b7621f93e570 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 2 Jun 2022 13:42:33 +0100 Subject: [PATCH 041/302] my.bumble: merge from all previous android exports --- 
my/bumble/android.py | 82 +++++++++++++++++++++++++++----------------- tox.ini | 1 + 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/my/bumble/android.py b/my/bumble/android.py index 31625b1..a2d2850 100644 --- a/my/bumble/android.py +++ b/my/bumble/android.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from datetime import datetime from typing import Iterator, Sequence, Optional, Dict +from more_itertools import unique_everseen from my.config import bumble as user_config @@ -53,46 +54,63 @@ class Message(_BaseMessage): import json from typing import Union -from ..core.error import Res +from ..core import Res import sqlite3 from ..core.sqlite import sqlite_connect_immutable -def _entities() -> Iterator[Res[Union[Person, _Message]]]: - last = max(inputs()) # TODO -- need to merge multiple? - with sqlite_connect_immutable(last) as db: - for row in db.execute(f'SELECT user_id, user_name FROM conversation_info'): - (user_id, user_name) = row - yield Person( - user_id=user_id, - user_name=user_name, - ) - # has sender_name, but it's always None - for row in db.execute(f''' - SELECT id, conversation_id, created_timestamp, is_incoming, payload_type, payload, reply_to_id - FROM message - ORDER BY created_timestamp - '''): - (id, conversation_id, created, is_incoming, payload_type, payload, reply_to_id) = row - try: - key = {'TEXT': 'text', 'QUESTION_GAME': 'text', 'IMAGE': 'url', 'GIF': 'url'}[payload_type] - text = json.loads(payload)[key] - yield _Message( - id=id, - # TODO not sure if utc?? - created=datetime.fromtimestamp(created / 1000), - is_incoming=bool(is_incoming), - text=text, - conversation_id=conversation_id, - reply_to_id=reply_to_id, - ) - except Exception as e: - yield e +EntitiesRes = Res[Union[Person, _Message]] + +def _entities() -> Iterator[EntitiesRes]: + for db_file in inputs(): + with sqlite_connect_immutable(db_file) as db: + yield from _handle_db(db) + + +def _handle_db(db) -> Iterator[EntitiesRes]: + for row in db.execute(f'SELECT user_id, user_name FROM conversation_info'): + (user_id, user_name) = row + yield Person( + user_id=user_id, + user_name=user_name, + ) + + # has sender_name, but it's always None + for row in db.execute(f''' + SELECT id, conversation_id, created_timestamp, is_incoming, payload_type, payload, reply_to_id + FROM message + ORDER BY created_timestamp + '''): + (id, conversation_id, created, is_incoming, payload_type, payload, reply_to_id) = row + try: + key = {'TEXT': 'text', 'QUESTION_GAME': 'text', 'IMAGE': 'url', 'GIF': 'url'}[payload_type] + text = json.loads(payload)[key] + yield _Message( + id=id, + # TODO not sure if utc?? + created=datetime.fromtimestamp(created / 1000), + is_incoming=bool(is_incoming), + text=text, + conversation_id=conversation_id, + reply_to_id=reply_to_id, + ) + except Exception as e: + yield e + + +def _key(r: EntitiesRes): + if isinstance(r, _Message): + if '&srv_width=' in r.text: + # ugh. seems that image URLs change all the time in the db? 
+ # can't access them without login anyway + # so use a different key for such messages + return (r.id, r.created) + return r def messages() -> Iterator[Res[Message]]: id2person: Dict[str, Person] = {} id2msg: Dict[str, Message] = {} - for x in _entities(): + for x in unique_everseen(_entities(), key=_key): if isinstance(x, Exception): yield x continue diff --git a/tox.ini b/tox.ini index 427be71..546df2f 100644 --- a/tox.ini +++ b/tox.ini @@ -139,6 +139,7 @@ commands = -p my.coding.commits \ -p my.goodreads \ -p my.pdfs \ + -p my.bumble.android \ --txt-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \ {posargs} From 7a1b7b15547ea747ca1bcf965b8e364e289d827f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 3 Jun 2022 20:31:13 +0100 Subject: [PATCH 042/302] core/general: add assert_never + typing annotations for dataset --- my/bumble/android.py | 11 ++++++++--- my/core/__init__.py | 1 + my/core/common.py | 8 +++++++- my/core/dataset.py | 20 +++++++++++++++++++- my/fbmessenger/android.py | 20 ++++++++++++-------- my/instagram/android.py | 5 +++-- my/instagram/gdpr.py | 4 ++-- my/zulip/organization.py | 4 ++-- 8 files changed, 54 insertions(+), 19 deletions(-) diff --git a/my/bumble/android.py b/my/bumble/android.py index a2d2850..21ac74d 100644 --- a/my/bumble/android.py +++ b/my/bumble/android.py @@ -54,7 +54,7 @@ class Message(_BaseMessage): import json from typing import Union -from ..core import Res +from ..core import Res, assert_never import sqlite3 from ..core.sqlite import sqlite_connect_immutable @@ -66,7 +66,12 @@ def _entities() -> Iterator[EntitiesRes]: yield from _handle_db(db) -def _handle_db(db) -> Iterator[EntitiesRes]: +def _handle_db(db: sqlite3.Connection) -> Iterator[EntitiesRes]: + # todo hmm not sure + # on the one hand kinda nice to use dataset.. + # on the other, it's somewhat of a complication, and + # would be nice to have something type-directed for sql queries though + # e.g. with typeddict or something, so the number of parameter to the sql query matches? for row in db.execute(f'SELECT user_id, user_name FROM conversation_info'): (user_id, user_name) = row yield Person( @@ -136,4 +141,4 @@ def messages() -> Iterator[Res[Message]]: id2msg[m.id] = m yield m continue - assert False, type(x) # should be unreachable + assert_never(x) diff --git a/my/core/__init__.py b/my/core/__init__.py index f680f37..ee80d98 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -5,6 +5,7 @@ from .common import LazyLogger from .common import warn_if_empty from .common import stat, Stats from .common import datetime_naive, datetime_aware +from .common import assert_never from .cfg import make_config from .util import __NOT_HPI_MODULE__ diff --git a/my/core/common.py b/my/core/common.py index c72fc77..92c32f5 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -4,7 +4,7 @@ from datetime import datetime import functools from contextlib import contextmanager import types -from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple, TYPE_CHECKING +from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple, TYPE_CHECKING, NoReturn import warnings from . 
import warnings as core_warnings @@ -632,5 +632,11 @@ class DummyExecutor(Executor): def shutdown(self, wait: bool=True) -> None: # type: ignore[override] self._shutdown = True + +# see https://hakibenita.com/python-mypy-exhaustive-checking#exhaustiveness-checking +def assert_never(value: NoReturn) -> NoReturn: + assert False, f'Unhandled value: {value} ({type(value).__name__})' + + # legacy deprecated import from .compat import cached_property as cproperty diff --git a/my/core/dataset.py b/my/core/dataset.py index c8591d4..070b9b3 100644 --- a/my/core/dataset.py +++ b/my/core/dataset.py @@ -1,11 +1,29 @@ +from __future__ import annotations from .common import assert_subpackage; assert_subpackage(__name__) from .common import PathIsh +from .compat import Protocol from .sqlite import sqlite_connect_immutable +## sadly dataset doesn't have any type definitions +from typing import Iterable, Iterator, Dict, Optional, Any +from contextlib import AbstractContextManager + + +# NOTE: may not be true in general, but will be in the vast majority of cases +row_type_T = Dict[str, Any] + + +class TableT(Iterable, Protocol): + def find(self, *, order_by: Optional[str]=None) -> Iterator[row_type_T]: ... + + +class DatabaseT(AbstractContextManager['DatabaseT'], Protocol): + def __getitem__(self, table: str) -> TableT: ... +## # TODO wonder if also need to open without WAL.. test this on read-only directory/db file -def connect_readonly(db: PathIsh): +def connect_readonly(db: PathIsh) -> DatabaseT: import dataset # type: ignore # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122 # todo not sure if mode=ro has any benefit, but it doesn't work on read-only filesystems diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index a7ed9d6..6d82002 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -3,6 +3,8 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab """ from __future__ import annotations +REQUIRES = ['dataset'] + from dataclasses import dataclass from datetime import datetime from typing import Iterator, Sequence, Optional, Dict @@ -61,8 +63,8 @@ class Message(_BaseMessage): import json from typing import Union -from ..core.error import Res -from ..core.dataset import connect_readonly +from ..core import Res, assert_never +from ..core.dataset import connect_readonly, DatabaseT Entity = Union[Sender, Thread, _Message] def _entities() -> Iterator[Res[Entity]]: for f in inputs(): @@ -70,11 +72,11 @@ def _entities() -> Iterator[Res[Entity]]: yield from _process_db(db) -def _process_db(db) -> Iterator[Res[Entity]]: +def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]: # works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user threadkey2id = lambda key: key.split(':')[1] - for r in db['threads']: + for r in db['threads'].find(): try: yield Thread( id=threadkey2id(r['thread_key']), @@ -84,8 +86,8 @@ def _process_db(db) -> Iterator[Res[Entity]]: yield e continue - for r in db['messages'].all(order_by='timestamp_ms'): - mtype = r['msg_type'] + for r in db['messages'].find(order_by='timestamp_ms'): + mtype: int = r['msg_type'] if mtype == -1: # likely immediately deleted or something? doesn't have any data at all continue @@ -94,7 +96,7 @@ def _process_db(db) -> Iterator[Res[Entity]]: try: # todo could use thread_users? 
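             # note: 'sender' is stored as a JSON blob; its 'user_key' looks like 'FACEBOOK:<user id>',
             # which is what the prefix stripping below relies on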
sj = json.loads(r['sender']) - ukey = sj['user_key'] + ukey: str = sj['user_key'] prefix = 'FACEBOOK:' assert ukey.startswith(prefix), ukey user_id = ukey[len(prefix):] @@ -167,4 +169,6 @@ def messages() -> Iterator[Res[Message]]: msgs[m.id] = m yield m continue - assert False, type(x) # should be unreachable + # NOTE: for some reason mypy coverage highlights it as red? + # but it actually works as expected: i.e. if you omit one of the clauses above, mypy will complain + assert_never(x) diff --git a/my/instagram/android.py b/my/instagram/android.py index c7a86e7..d99b047 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -88,12 +88,13 @@ def _parse_message(j: Json) -> Optional[_Message]: import json from typing import Union -from ..core.error import Res +from ..core import Res, assert_never import sqlite3 from ..core.sqlite import sqlite_connect_immutable def _entities() -> Iterator[Res[Union[User, _Message]]]: # NOTE: definitely need to merge multiple, app seems to recycle old messages # TODO: hmm hard to guarantee timestamp ordering when we use synthetic input data... + # todo use TypedDict? for f in inputs(): with sqlite_connect_immutable(f) as db: @@ -149,4 +150,4 @@ def messages() -> Iterator[Res[Message]]: user=user, ) continue - assert False, type(x) # should not happen + assert_never(x) diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 59b4b07..754a2e9 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -56,7 +56,7 @@ def _decode(s: str) -> str: import json from typing import Union -from ..core.error import Res +from ..core import Res, assert_never def _entities() -> Iterator[Res[Union[User, _Message]]]: from ..core.kompress import ZipPath last = ZipPath(max(inputs())) @@ -165,4 +165,4 @@ def messages() -> Iterator[Res[Message]]: user=user, ) continue - assert False, type(x) # should not happen + assert_never(x) diff --git a/my/zulip/organization.py b/my/zulip/organization.py index 3cfe0df..7ab49a1 100644 --- a/my/zulip/organization.py +++ b/my/zulip/organization.py @@ -79,7 +79,7 @@ class Message: from typing import Union from itertools import count import json -from ..core import Res +from ..core import Res, assert_never # todo cache it def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: # TODO hmm -- not sure if max lexicographically will actually be latest? 
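+    # (should be fine for the usual zero-padded timestamp filenames, where lexicographic order matches chronological)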
@@ -169,4 +169,4 @@ def messages() -> Iterator[Res[Message]]: content=x.content, ) continue - assert False # should be unreachable + assert_never(x) From bf3dd6e931d3527272b8b50fd9a22776d1bc16fd Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 3 Jun 2022 21:44:27 +0100 Subject: [PATCH 043/302] core/sqlite: experiment at typing SELECT query (to some extent) ideally would be cool to use TypedDict here somehow, but perhaps it'd only be possible after variadic generics https://peps.python.org/pep-0646 --- my/bumble/android.py | 22 ++++++++++++---------- my/core/sqlite.py | 40 ++++++++++++++++++++++++++++++++++++++++ my/instagram/android.py | 12 +++++------- 3 files changed, 57 insertions(+), 17 deletions(-) diff --git a/my/bumble/android.py b/my/bumble/android.py index 21ac74d..2fa6bd8 100644 --- a/my/bumble/android.py +++ b/my/bumble/android.py @@ -56,7 +56,7 @@ import json from typing import Union from ..core import Res, assert_never import sqlite3 -from ..core.sqlite import sqlite_connect_immutable +from ..core.sqlite import sqlite_connect_immutable, select EntitiesRes = Res[Union[Person, _Message]] @@ -72,20 +72,22 @@ def _handle_db(db: sqlite3.Connection) -> Iterator[EntitiesRes]: # on the other, it's somewhat of a complication, and # would be nice to have something type-directed for sql queries though # e.g. with typeddict or something, so the number of parameter to the sql query matches? - for row in db.execute(f'SELECT user_id, user_name FROM conversation_info'): - (user_id, user_name) = row + for (user_id, user_name) in select( + ('user_id', 'user_name'), + 'FROM conversation_info', + db=db, + ): yield Person( user_id=user_id, user_name=user_name, ) - # has sender_name, but it's always None - for row in db.execute(f''' - SELECT id, conversation_id, created_timestamp, is_incoming, payload_type, payload, reply_to_id - FROM message - ORDER BY created_timestamp - '''): - (id, conversation_id, created, is_incoming, payload_type, payload, reply_to_id) = row + # note: has sender_name, but it's always None + for ( id, conversation_id , created , is_incoming , payload_type , payload , reply_to_id) in select( + ('id', 'conversation_id', 'created_timestamp', 'is_incoming', 'payload_type', 'payload', 'reply_to_id'), + 'FROM message ORDER BY created_timestamp', + db=db + ): try: key = {'TEXT': 'text', 'QUESTION_GAME': 'text', 'IMAGE': 'url', 'GIF': 'url'}[payload_type] text = json.loads(payload)[key] diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 5253607..0f4a416 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -50,3 +50,43 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: sqlite_backup(source=conn, dest=dest) conn.close() return dest + + +from typing import Tuple, Any, Iterator + +# NOTE hmm, so this kinda works +# V = TypeVar('V', bound=Tuple[Any, ...]) +# def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]: +# but sadly when we pass columns (Tuple[str, ...]), it seems to bind this type to V? +# and then the return type ends up as Iterator[Tuple[str, ...]], which isn't desirable :( +# a bit annoying to have this copy-pasting, but hopefully not a big issue + +from typing import overload +@overload +def select(cols: Tuple[str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[Tuple[Any ]]: ... +@overload +def select(cols: Tuple[str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[Tuple[Any, Any ]]: ... 
+@overload +def select(cols: Tuple[str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[Tuple[Any, Any, Any ]]: ... +@overload +def select(cols: Tuple[str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[Tuple[Any, Any, Any, Any ]]: ... +@overload +def select(cols: Tuple[str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[Tuple[Any, Any, Any, Any, Any ]]: ... +@overload +def select(cols: Tuple[str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[Tuple[Any, Any, Any, Any, Any, Any ]]: ... +@overload +def select(cols: Tuple[str, str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[Tuple[Any, Any, Any, Any, Any, Any, Any ]]: ... +@overload +def select(cols: Tuple[str, str, str, str, str, str, str, str], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[Tuple[Any, Any, Any, Any, Any, Any, Any, Any]]: ... + +def select(cols, rest, *, db): + # db arg is last cause that results in nicer code formatting.. + return db.execute('SELECT ' + ','.join(cols) + ' ' + rest) diff --git a/my/instagram/android.py b/my/instagram/android.py index d99b047..21b9288 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -90,7 +90,7 @@ import json from typing import Union from ..core import Res, assert_never import sqlite3 -from ..core.sqlite import sqlite_connect_immutable +from ..core.sqlite import sqlite_connect_immutable, select def _entities() -> Iterator[Res[Union[User, _Message]]]: # NOTE: definitely need to merge multiple, app seems to recycle old messages # TODO: hmm hard to guarantee timestamp ordering when we use synthetic input data... @@ -98,15 +98,14 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: for f in inputs(): with sqlite_connect_immutable(f) as db: - for row in db.execute(f'SELECT user_id, thread_info FROM threads'): - (self_uid, js,) = row + for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db): # ugh wtf?? no easier way to extract your own user id/name?? yield User( id=str(self_uid), full_name='You', username='you', ) - j = json.loads(js) + j = json.loads(thread_json) for r in j['recipients']: yield User( id=str(r['id']), # for some reason it's int in the db @@ -114,10 +113,9 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: username=r['username'], ) - for row in db.execute(f'SELECT message FROM messages ORDER BY timestamp'): + for (msg_json,) in select(('message',), 'FROM messages ORDER BY timestamp', db=db): # eh, seems to contain everything in json? 
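+            # note: even a single-column select yields 1-tuples, hence the (msg_json,) unpacking below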
-            (js,) = row
-            j = json.loads(js)
+            j = json.loads(msg_json)
             try:
                 m = _parse_message(j)
                 if m is not None:

From b5f266c2bd2cb463f97bc68c627d754d6e7b6377 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Fri, 3 Jun 2022 23:26:04 +0100
Subject: [PATCH 044/302] my.instagram: add initial all.py + some experiments
 on nicer errors

---
 my/fbmessenger/all.py   |  3 +--
 my/instagram/all.py     | 33 +++++++++++++++++++++++++++++++++
 my/instagram/android.py | 24 ++++++++++++++++++++++--
 my/instagram/common.py  | 30 ++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+), 4 deletions(-)
 create mode 100644 my/instagram/all.py
 create mode 100644 my/instagram/common.py

diff --git a/my/fbmessenger/all.py b/my/fbmessenger/all.py
index ca7f064..f98b5f3 100644
--- a/my/fbmessenger/all.py
+++ b/my/fbmessenger/all.py
@@ -1,6 +1,5 @@
 from typing import Iterator
-from my.core import Res
-from my.core.common import Stats
+from my.core import Res, stat, Stats
 from my.core.source import import_source
 
 from .common import Message, _merge_messages
diff --git a/my/instagram/all.py b/my/instagram/all.py
new file mode 100644
index 0000000..4be2b5b
--- /dev/null
+++ b/my/instagram/all.py
@@ -0,0 +1,33 @@
+from typing import Iterator
+
+from my.core import Res, stat, Stats
+from my.core.source import import_source
+
+from .common import Message, _merge_messages
+
+
+src_gdpr = import_source(module_name='my.instagram.gdpr')
+@src_gdpr
+def _messages_gdpr() -> Iterator[Res[Message]]:
+    from . import gdpr
+    yield from gdpr.messages()
+
+
+src_android = import_source(module_name='my.instagram.android')
+@src_android
+def _messages_android() -> Iterator[Res[Message]]:
+    from . import android
+    yield from android.messages()
+
+
+def messages() -> Iterator[Res[Message]]:
+    # TODO in general best to prefer android, it has more data
+    # but for now prefer gdpr prefix until we figure out how to correlate conversation threads
+    yield from _merge_messages(
+        _messages_gdpr(),
+        _messages_android(),
+    )
+
+
+def stats() -> Stats:
+    return stat(messages)
diff --git a/my/instagram/android.py b/my/instagram/android.py
index 21b9288..fc2ac38 100644
--- a/my/instagram/android.py
+++ b/my/instagram/android.py
@@ -55,6 +55,27 @@ class Message(_BaseMessage):
     # reply_to: Optional[Message]
 
 
+# this is kinda experimental
+# basically just using RuntimeError(msg_id, *rest) has an unfortunate consequence:
+# there are way too many 'similar' errors (on different msg_id)
+# however passing msg_id is nice as a means of supplying extra context
+# so this is a compromise, the 'duplicate' errors will be filtered out by unique_everseen
+
+
+class MessageError(RuntimeError):
+    def __init__(self, msg_id: str, *rest: str) -> None:
+        super().__init__(msg_id, *rest)
+        self.rest = rest
+
+    def __hash__(self) -> int:
+        return hash(self.rest)
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, MessageError):
+            return False
+        return self.rest == other.rest
+
+
 from ..core import Json
 def _parse_message(j: Json) -> Optional[_Message]:
     id = j['item_id']
@@ -74,7 +95,7 @@ def _parse_message(j: Json) -> Optional[_Message]:
         # something like "X liked message" -- hardly useful?
         return None
     else:
-        raise RuntimeError(f"{id}: {t} isn't handled yet")
+        raise MessageError(id, f"{t} isn't handled yet")
 
     return _Message(
         id=id,
@@ -125,7 +146,6 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
 
 
 def messages() -> Iterator[Res[Message]]:
-    # TODO would be nicer to use a decorator for unique_everseen?
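+    # note: errors get deduplicated as well -- MessageError hashes/compares on self.rest, i.e. everything except msg_id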
id2user: Dict[str, User] = {} for x in unique_everseen(_entities()): if isinstance(x, Exception): diff --git a/my/instagram/common.py b/my/instagram/common.py new file mode 100644 index 0000000..23cefe5 --- /dev/null +++ b/my/instagram/common.py @@ -0,0 +1,30 @@ +from datetime import datetime +from itertools import chain +from typing import Iterator + +from my.core import warn_if_empty, Res +from my.core.compat import Protocol + +from more_itertools import unique_everseen + + +class Message(Protocol): + created: datetime + text: str + # TODO add some sort of thread id + + +@warn_if_empty +def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: + def key(r: Res[Message]): + if isinstance(r, Exception): + # NOTE: using str() against Exception is nice so exceptions with same args are treated the same.. + return str(r) + + dt = r.created + # seems that GDPR has millisecond resolution.. so best to strip them off when merging + round_us = dt.microsecond // 1000 * 1000 + without_us = r.created.replace(microsecond=round_us) + # using text as key is a bit crap.. but atm there are no better shared fields + return (without_us, r.text) + return unique_everseen(chain(*sources), key=key) From 7323e99504f92252b638b5afecfc87e7bed5e4cb Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 4 Jun 2022 01:43:34 -0700 Subject: [PATCH 045/302] zulip: add stats function --- my/zulip/organization.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/my/zulip/organization.py b/my/zulip/organization.py index 7ab49a1..64b5ae3 100644 --- a/my/zulip/organization.py +++ b/my/zulip/organization.py @@ -170,3 +170,11 @@ def messages() -> Iterator[Res[Message]]: ) continue assert_never(x) + + +from my.core import Stats +def stats() -> Stats: + from my.core import stat + return { + **stat(messages) + } From b9d788efd0e70d89ddbfd0ddc57a03e0f6c4f826 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 4 Jun 2022 10:29:50 +0100 Subject: [PATCH 046/302] some enhancements for facebook/instagram modules figured out that datetimes are naive better username handling + investigation of thread names --- my/fbmessenger/android.py | 4 +++- my/fbmessenger/common.py | 6 ++++++ my/instagram/android.py | 16 +++++++--------- my/instagram/gdpr.py | 16 +++++++++++++--- 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index 6d82002..a8078d6 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -38,10 +38,12 @@ class Thread: name: Optional[str] # todo not sure about order of fields... 
+from ..core import datetime_naive
 @dataclass
 class _BaseMessage:
     id: str
-    dt: datetime
+    # checked against a message sent on 4 may 2022, and it does look naive
+    dt: datetime_naive
     text: Optional[str]
 
diff --git a/my/fbmessenger/common.py b/my/fbmessenger/common.py
index a6549d5..1f82327 100644
--- a/my/fbmessenger/common.py
+++ b/my/fbmessenger/common.py
@@ -43,3 +43,9 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
         # similar to twitter, might make sense to generify/document as a pattern
         return (r.id, r.dt)
     yield from unique_everseen(chain(*sources), key=key)
+
+
+# TODO some notes about gdpr export (since there is no module yet)
+# ugh, messages seem to go from new to old in messages_N.json files as N increases :facepalm:
+# seems like it's storing local timestamp :facepalm:
+# checked against a message sent on 4 may 2022
diff --git a/my/instagram/android.py b/my/instagram/android.py
index fc2ac38..a34660c 100644
--- a/my/instagram/android.py
+++ b/my/instagram/android.py
@@ -32,11 +32,13 @@ class User:
     full_name: str
 
 
+from ..core import datetime_naive
 # todo not sure about order of fields...
 @dataclass
 class _BaseMessage:
     id: str
-    created: datetime
+    # NOTE: ffs, looks like they keep naive timestamps in the db (checked some random messages)
+    created: datetime_naive
     text: str
     thread_id: str
 
@@ -82,7 +84,6 @@ def _parse_message(j: Json) -> Optional[_Message]:
     t = j['item_type']
     tid = j['thread_key']['thread_id']
     uid = j['user_id']
-    # TODO not sure if utc??
     created = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000)
     text: str
     if t == 'text':
@@ -120,14 +121,11 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
         with sqlite_connect_immutable(f) as db:
 
             for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db):
-                # ugh wtf?? no easier way to extract your own user id/name??
-                yield User(
-                    id=str(self_uid),
-                    full_name='You',
-                    username='you',
-                )
                 j = json.loads(thread_json)
-                for r in j['recipients']:
+                # todo in principle should leave the thread attached to the message?
+                # since thread is a group of users?
+                # inviter usually contains our own user
+                for r in [j['inviter'], *j['recipients']]:
                     yield User(
                         id=str(r['id']),  # for some reason it's int in the db
                         full_name=r['full_name'],
                         username=r['username'],
                     )
diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py
index 754a2e9..3dfe352 100644
--- a/my/instagram/gdpr.py
+++ b/my/instagram/gdpr.py
@@ -31,12 +31,15 @@ class User:
     full_name: str
 
 
+from ..core import datetime_naive
 @dataclass
 class _BaseMessage:
-    # TODO id is missing?
-    created: datetime
+    # ugh, this is insane, but does look like it's just keeping local device time???
+    # checked against a message sent on 3 June, which should be UTC+1, but timestamp seems local
+    created: datetime_naive
     text: str
     thread_id: str
+    # NOTE: doesn't look like there are any meaningful message ids in the export
 
 
 @dataclass(unsafe_hash=True)
@@ -100,7 +103,14 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
         j = json.loads(ffile.read_text())
 
         id_len = 10
-        # NOTE: no match in android db/api responses?
+        # NOTE: I'm not actually sure it's the other user's id.., since it corresponds to the whole conversation
+        # but I stared a bit at these ids vs database ids and can't see any way to find the correspondence :(
+        # so basically the only way to merge is to actually try some magic and correlate timestamps/message texts?
+        # another option is perhaps to query user id from username with some free API
+        # it's still fragile: e.g. if user deletes themselves there is no more username (it becomes "instagramuser")
+        # if we use older exports we might be able to figure it out though... so think about it?
+        # it also names grouped ones like instagramuserchrisfoodishblogand25others_einihreoog
+        # so I feel like there is just no guaranteed way to correlate :(
         other_id = fname[-id_len:]
         # NOTE: no match in android db?
         other_username = fname[:-id_len - 1]

From fd0c65d17696b49a0fc76a8b446045e5a6d8f65f Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Sat, 4 Jun 2022 16:57:10 +0100
Subject: [PATCH 047/302] my.tinder: initial module for android databases

---
 my/config.py         |   5 +
 my/tinder/android.py | 218 +++++++++++++++++++++++++++++++++++++++++++
 tox.ini              |   2 +
 3 files changed, 225 insertions(+)
 create mode 100644 my/tinder/android.py

diff --git a/my/config.py b/my/config.py
index 3d96cc3..52af04d 100644
--- a/my/config.py
+++ b/my/config.py
@@ -122,6 +122,11 @@ class bumble:
         export_path: Paths
 
 
+class tinder:
+    class android:
+        export_path: Paths
+
+
 class instagram:
     class android:
         export_path: Paths
diff --git a/my/tinder/android.py b/my/tinder/android.py
new file mode 100644
index 0000000..e92f316
--- /dev/null
+++ b/my/tinder/android.py
@@ -0,0 +1,218 @@
+"""
+Tinder data from Android app database (in =/data/data/com.tinder/databases/tinder-3.db=)
+"""
+from __future__ import annotations
+
+REQUIRES = ['dataset']
+
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from itertools import chain
+from pathlib import Path
+from typing import Sequence, Iterator, Union, Dict, List, Mapping
+
+from more_itertools import unique_everseen
+
+from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware
+from my.core.dataset import connect_readonly, DatabaseT
+
+
+from my.config import tinder as user_config
+@dataclass
+class config(user_config.android):
+    # path[s]/glob to the exported sqlite databases
+    export_path: Paths
+
+
+@dataclass(unsafe_hash=True)
+class Person:
+    id: str
+    name: str
+    # todo bio? it might change, not sure what we want here
+
+
+@dataclass(unsafe_hash=True)
+class _BaseMatch:
+    # for android, checked directly shortly after a match
+    when: datetime_aware
+    id: str
+
+
+@dataclass
+class _Match(_BaseMatch):
+    person_id: str
+
+
+@dataclass(unsafe_hash=True)
+class Match(_BaseMatch):
+    person: Person
+
+
+# todo again, not sure what's the 'optimal' field order? perhaps the one which gives the most natural sort?
+# so either match id or datetime
+@dataclass
+class _BaseMessage:
+    # looks like gdpr takeout does contain GMT (compared against google maps data)
+    sent: datetime_aware
+    id: str
+    text: str
+
+
+@dataclass
+class _Message(_BaseMessage):
+    match_id: str
+    from_id: str
+    to_id: str
+
+
+@dataclass
+class Message(_BaseMessage):
+    match: Match
+    from_: Person
+    to: Person
+
+
+def inputs() -> Sequence[Path]:
+    return get_files(config.export_path)
+
+
+_Entity = Union[Person, _Match, _Message]
+Entity = Union[Person, Match, Message]
+
+
+def _entities() -> Iterator[Res[_Entity]]:
+    for db_file in inputs():
+        with connect_readonly(db_file) as db:
+            yield from _handle_db(db)
+
+
+def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]:
+    # profile_user_view contains our own user id
+    for row in chain(db['profile_user_view'], db['match_person']):
+        try:
+            yield _parse_person(row)
+        except Exception as e:
+            # todo attach error context?
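+            # (e.g. could wrap it into RuntimeError(row, e) so the offending row stays visible)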
+ yield e + + for row in db['match']: + try: + yield _parse_match(row) + except Exception as e: + yield e + + for row in db['message']: + try: + yield _parse_msg(row) + except Exception as e: + yield e + + +def _parse_person(row) -> Person: + return Person( + id=row['id'], + name=row['name'], + ) + + +def _parse_match(row) -> _Match: + return _Match( + id=row['id'], + person_id=row['person_id'], + when=datetime.fromtimestamp(row['creation_date'] / 1000, tz=timezone.utc), + ) + + +def _parse_msg(row) -> _Message: + # note it also has raw_message_data -- not sure which is best to use.. + sent = row['sent_date'] + return _Message( + sent=datetime.fromtimestamp(sent / 1000, tz=timezone.utc), + id=row['id'], + text=row['text'], + match_id=row['match_id'], + from_id=row['from_id'], + to_id=row['to_id'], + ) + + +# todo maybe it's rich_entities method? +def entities() -> Iterator[Res[Entity]]: + id2person: Dict[str, Person] = {} + id2match : Dict[str, Match ] = {} + for x in unique_everseen(_entities()): + if isinstance(x, Exception): + yield x + continue + if isinstance(x, Person): + id2person[x.id] = x + yield x + continue + if isinstance(x, _Match): + try: + person = id2person[x.person_id] + except Exception as e: + yield e + continue + m = Match( + id=x.id, + when=x.when, + person=person, + ) + id2match[x.id] = m + yield m + continue + if isinstance(x, _Message): + try: + match = id2match[x.match_id] + from_ = id2person[x.from_id] + to = id2person[x.to_id] + except Exception as e: + yield e + continue + yield Message( + sent=x.sent, + match=match, + id=x.id, + text=x.text, + from_=from_, + to=to, + ) + continue + assert_never(x) + + +def messages() -> Iterator[Res[Message]]: + for x in entities(): + if isinstance(x, (Exception, Message)): + yield x + continue + + +# todo not sure, maybe it's not fundamental enough to keep here... +def match2messages() -> Iterator[Res[Mapping[Match, Sequence[Message]]]]: + res: Dict[Match, List[Message]] = defaultdict(list) + for x in entities(): + if isinstance(x, Exception): + yield x + continue + if isinstance(x, Match): + # match might happen without messages so makes sense to handle here + res[x] # just trigger creation + continue + if isinstance(x, Message): + try: + ml = res[x.match] + except Exception as e: + yield e + continue + ml.append(x) + continue + yield res +# TODO maybe a more natural return type is Iterator[Res[Tuple[Key, Value]]] +# but this doesn't work straight away because the key might have no corresponding values + + +def stats() -> Stats: + return stat(messages) diff --git a/tox.ini b/tox.ini index 546df2f..bfa612a 100644 --- a/tox.ini +++ b/tox.ini @@ -101,6 +101,7 @@ commands = hpi module install my.reddit.rexport hpi module install my.reddit.pushshift hpi module install my.stackexchange.stexport + hpi module install my.tinder.android hpi module install my.pinboard hpi module install my.arbtt hpi module install my.coding.commits @@ -140,6 +141,7 @@ commands = -p my.goodreads \ -p my.pdfs \ -p my.bumble.android \ + -p my.tinder.android \ --txt-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \ {posargs} From 016f28250b3ef42b42bf07908532f08c8b969bc3 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 5 Jun 2022 18:43:36 +0100 Subject: [PATCH 048/302] general: initial flake8 checks (for now manual) fix fairly uncontroversial stuff in my.core like - line spacing, which isn't too annoying (e.g. 
unlike many inline whitespace checks that break vertical formatting)
- unused imports/variables
- too broad except
---
 misc/.flake8-karlicoss | 37 +++++++++++++++++++++++++++++++++++++
 my/core/__init__.py    | 25 +++++++++++++++++++++++--
 my/core/__main__.py    |  8 +++++---
 my/core/cachew.py      |  4 ++--
 my/core/common.py      | 22 +++++++++++++---------
 my/core/compat.py      |  4 +++-
 my/core/core_config.py |  4 +---
 my/core/error.py       |  4 +---
 my/core/influxdb.py    |  2 +-
 my/core/kompress.py    |  2 +-
 my/core/konsume.py     | 13 ++++++++-----
 my/core/pandas.py      |  7 ++++---
 my/core/query.py       |  5 +----
 my/core/query_range.py | 14 +++++++++-----
 my/core/serialize.py   |  3 ---
 my/core/source.py      |  9 +++++----
 my/core/stats.py       | 10 +++++-----
 my/core/util.py        |  2 +-
 my/core/warnings.py    |  7 ++++---
 19 files changed, 124 insertions(+), 58 deletions(-)
 create mode 100644 misc/.flake8-karlicoss

diff --git a/misc/.flake8-karlicoss b/misc/.flake8-karlicoss
new file mode 100644
index 0000000..3c98b96
--- /dev/null
+++ b/misc/.flake8-karlicoss
@@ -0,0 +1,37 @@
+[flake8]
+ignore =
+    ## these mess up vertical alignment
+    E126  # continuation line over-indented
+    E202  # whitespace before )
+    E203  # whitespace before ':' (e.g. in dict)
+    E221  # multiple spaces before operator
+    E241  # multiple spaces after ,
+    E251  # unexpected spaces after =
+    E261  # 2 spaces before comment. I actually think it's fine so TODO enable back later (TODO or not? still alignment)
+    E271  # multiple spaces after keyword
+    E272  # multiple spaces before keyword
+    ##
+    E266  # 'too many leading # in the comment' -- this is just unnecessary pickiness, sometimes it's nice to format a comment
+    E302  # 2 blank lines
+    E501  # 'line too long' -- kinda annoying and the default 79 is shit anyway
+    E702 E704  # multiple statements on one line -- messes with : ... type declarations + sometimes asserts
+    E731  # suggests always using def instead of lambda
+
+    E402  # FIXME module level import -- we want it later
+    E252  # TODO later -- whitespace around equals?
+# F541: f-string is missing placeholders -- perhaps too picky?
+
+# F841 is pretty useful (unused variables). maybe worth making it an error on CI
+
+
+# for imports: we might want to check these
+# F401 good: unused imports
+# E401: import order
+# F811: redefinition of unused import
+# todo from my.core import __NOT_HPI_MODULE__ this needs to be excluded from 'unused'
+#
+
+# as a reference:
+# https://github.com/seanbreckenridge/cookiecutter-template/blob/master/%7B%7Bcookiecutter.module_name%7D%7D/setup.cfg
+# and this https://github.com/karlicoss/HPI/pull/151
+# find ./my | entr flake8 --ignore=E402,E501,E741,W503,E266,E302,E305,E203,E261,E252,E251,E221,W291,E225,E303,E702,E202,F841,E731,E306,E127 E722,E231 my | grep -v __NOT_HPI_MODULE__
diff --git a/my/core/__init__.py b/my/core/__init__.py
index ee80d98..78e20e7 100644
--- a/my/core/__init__.py
+++ b/my/core/__init__.py
@@ -1,6 +1,6 @@
 # this file only keeps the most common & critical types/utility functions
-from .common import PathIsh, Paths, Json
-from .common import get_files
+from .common import get_files, PathIsh, Paths
+from .common import Json
 from .common import LazyLogger
 from .common import warn_if_empty
 from .common import stat, Stats
@@ -8,11 +8,32 @@ from .common import datetime_naive, datetime_aware
 from .common import assert_never
 
 from .cfg import make_config
+
 from .util import __NOT_HPI_MODULE__
 
 from .error import Res, unwrap
 
 
 # just for brevity in modules
+# todo not sure about these.. maybe best to rely on regular imports.. perhaps compare?
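+# (for now they are re-exported deliberately, so modules can do e.g. 'from my.core import dataclass, Path' in one line)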
from dataclasses import dataclass from pathlib import Path + + +__all__ = [ + 'get_files', 'PathIsh', 'Paths', + 'Json', + 'LazyLogger', + 'warn_if_empty', + 'stat', 'Stats', + 'datetime_aware', 'datetime_naive', + 'assert_never', + + 'make_config', + + '__NOT_HPI_MODULE__', + + 'Res', 'unwrap', + + 'dataclass', 'Path', +] diff --git a/my/core/__main__.py b/my/core/__main__.py index 81242eb..eb0921d 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -17,7 +17,7 @@ import click def mypy_cmd() -> Optional[Sequence[str]]: try: # preferably, use mypy from current python env - import mypy + import mypy # noqa: F401 fine not to use it except ImportError: pass else: @@ -63,6 +63,7 @@ def eprint(x: str) -> None: def indent(x: str) -> str: return ''.join(' ' + l for l in x.splitlines(keepends=True)) + OK = '✅' OFF = '🔲' @@ -178,7 +179,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module try: check_call(cmd) - info('syntax check: ' + ' '.join( cmd)) + info('syntax check: ' + ' '.join(cmd)) except Exception as e: errors.append(e) tb(e) @@ -258,7 +259,7 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li continue try: - mod = importlib.import_module(m) + mod = importlib.import_module(m) # noqa: F841 except Exception as e: # todo more specific command? error(f'{click.style("FAIL", fg="red")}: {m:<50} loading failed{vw}') @@ -322,6 +323,7 @@ def tabulate_warnings() -> None: ''' import warnings orig = warnings.formatwarning + def override(*args, **kwargs) -> str: res = orig(*args, **kwargs) return ''.join(' ' + x for x in res.splitlines(keepends=True)) diff --git a/my/core/cachew.py b/my/core/cachew.py index 4ecf51d..9959120 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -6,7 +6,7 @@ from typing import Optional def disable_cachew() -> None: try: - import cachew + import cachew # noqa: F401 # unused, it's fine except ImportError: # nothing to disable return @@ -19,7 +19,7 @@ from typing import Iterator @contextmanager def disabled_cachew() -> Iterator[None]: try: - import cachew + import cachew # noqa: F401 # unused, it's fine except ImportError: # nothing to disable yield diff --git a/my/core/common.py b/my/core/common.py index 92c32f5..a4dd4c9 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -12,8 +12,7 @@ from . import warnings as core_warnings PathIsh = Union[Path, str] # TODO only used in tests? not sure if useful at all. -# TODO port annotations to kython?.. -def import_file(p: PathIsh, name: Optional[str]=None) -> types.ModuleType: +def import_file(p: PathIsh, name: Optional[str] = None) -> types.ModuleType: p = Path(p) if name is None: name = p.stem @@ -48,11 +47,12 @@ T = TypeVar('T') K = TypeVar('K') V = TypeVar('V') +# TODO deprecate? 
more_itertools.one should be used def the(l: Iterable[T]) -> T: it = iter(l) try: first = next(it) - except StopIteration as ee: + except StopIteration: raise RuntimeError('Empty iterator?') assert all(e == first for e in it) return first @@ -234,6 +234,7 @@ if TYPE_CHECKING: # I guess, later just define pass through once this is fixed: https://github.com/python/typing/issues/270 # ok, that's actually a super nice 'pattern' F = TypeVar('F') + class McachewType(Protocol): def __call__( self, @@ -273,7 +274,7 @@ def mcachew(cache_path=_cache_path_dflt, **kwargs): # type: ignore[no-redef] try: # check that it starts with 'hack' path Path(cache_path).relative_to(_CACHE_DIR_NONE_HACK) - except: + except: # noqa: E722 bare except pass # no action needed, doesn't start with 'hack' string else: # todo show warning? tbh unclear how to detect when user stopped using 'old' way and using suffix instead? @@ -336,8 +337,7 @@ class classproperty(Generic[_R]): # def __get__(self) -> _R: # return self.f() -# for now just serves documentation purposes... but one day might make it statically verifiable where possible? -# TODO e.g. maybe use opaque mypy alias? +# TODO deprecate in favor of datetime_aware tzdatetime = datetime @@ -352,6 +352,8 @@ def isoparse(s: str) -> tzdatetime: s = s[:-1] + '+00:00' return datetime.fromisoformat(s) + +# legacy import -- we should use compat directly instead from .compat import Literal @@ -412,6 +414,7 @@ def warn_if_empty(f: FI) -> FI: ... def warn_if_empty(f): from functools import wraps + @wraps(f) def wrapped(*args, **kwargs): res = f(*args, **kwargs) @@ -474,6 +477,7 @@ def _stat_iterable(it: Iterable[C], quick: bool=False) -> Any: total = 0 errors = 0 last = None + def funcit(): nonlocal errors, last, total for x in it: @@ -542,7 +546,7 @@ def guess_datetime(x: Any) -> Optional[datetime]: # todo hmm implement withoutexception.. try: d = asdict(x) - except: + except: # noqa: E722 bare except return None for k, v in d.items(): if isinstance(v, datetime): @@ -591,8 +595,8 @@ def asdict(thing: Any) -> Json: raise TypeError(f'Could not convert object {thing} to dict') - - +# for now just serves documentation purposes... but one day might make it statically verifiable where possible? +# TODO e.g. maybe use opaque mypy alias? datetime_naive = datetime datetime_aware = datetime diff --git a/my/core/compat.py b/my/core/compat.py index a7775c8..3c825f2 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -29,7 +29,7 @@ def pre_pip_dal_handler( Specifying modules' dependencies in the config or in my/config/repos is deprecated! Please install {' '.join(requires)} as PIP packages (see the corresponding README instructions). 
     '''.strip(), stacklevel=2)
-    except ModuleNotFoundError as ee:
+    except ModuleNotFoundError:
         dal = None
 
     if dal is None:
@@ -83,7 +83,9 @@ else:
         from typing import TypeVar, Callable
         Cl = TypeVar('Cl')
         R = TypeVar('R')
+
         def cached_property(f: Callable[[Cl], R]) -> R:
+            import functools
             return property(functools.lru_cache(maxsize=1)(f))  # type: ignore
         del Cl
         del R
diff --git a/my/core/core_config.py b/my/core/core_config.py
index cc8b527..48f3eb4 100644
--- a/my/core/core_config.py
+++ b/my/core/core_config.py
@@ -95,8 +95,6 @@ class Config(user_config):
                 return spec
         return None
 
-        enabled = self.enabled_modules
-        disabled = self.disabled_modules
         on = matches(self.enabled_modules or [])
         off = matches(self.disabled_modules or [])
@@ -153,7 +151,7 @@ def test_active_modules() -> None:
     with reset() as cc:
         # if both are set, enable all
         cc.disabled_modules = ['my.body.*']
-        cc.enabled_modules = ['my.body.exercise'] 
+        cc.enabled_modules = ['my.body.exercise']
         assert cc._is_module_active('my.whatever' ) is None
         assert cc._is_module_active('my.core'     ) is None
         with pytest.warns(UserWarning, match=r"conflicting regexes") as record_warnings:
diff --git a/my/core/error.py b/my/core/error.py
index cf3feac..ba6368e 100644
--- a/my/core/error.py
+++ b/my/core/error.py
@@ -64,7 +64,7 @@ def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> List[Res[T]
         k: Optional[K]
         try:
             k = key(i)
-        except Exception as e:
+        except Exception:  # error while computing key? dunno, might be nice to handle...
             k = None
         group.append(i)
         if k is not None:
@@ -193,7 +193,6 @@ See {help_url} or check the corresponding module.py file for an example\
     return False
 
 
-
 def test_datetime_errors() -> None:
     import pytz
     dt_notz = datetime.now()
@@ -207,7 +206,6 @@ def test_datetime_errors() -> None:
     e2 = RuntimeError(f'something something {dt} something else')
     assert extract_error_datetime(e2) == dt
 
-
     e3 = RuntimeError(str(['one', '2019-11-27T08:56:00', 'three']))
     assert extract_error_datetime(e3) is not None
diff --git a/my/core/influxdb.py b/my/core/influxdb.py
index 3800dae..8407264 100644
--- a/my/core/influxdb.py
+++ b/my/core/influxdb.py
@@ -38,6 +38,7 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c
     # TODO need to take schema here...
     cache: Dict[str, bool] = {}
+
     def good(f, v) -> bool:
         c = cache.get(f)
         if c is not None:
@@ -79,7 +80,6 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c
             fields=fields,
         )
 
-
     from more_itertools import chunked
     # "The optimal batch size is 5000 lines of line protocol."
     # some chunking is def necessary, otherwise it fails
diff --git a/my/core/kompress.py b/my/core/kompress.py
index e5c910d..26e0bbd 100644
--- a/my/core/kompress.py
+++ b/my/core/kompress.py
@@ -78,7 +78,7 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
         tf = tarfile.open(pp)
         # TODO pass encoding?
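+        # extractfile returns None for members that aren't regular files, hence the assert below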
x = tf.extractfile(*args); assert x is not None - return x # type: ignore[return-value] + return x # type: ignore[return-value] else: return pp.open(mode, *args, **kwargs) diff --git a/my/core/konsume.py b/my/core/konsume.py index 76d629f..b4cf7b6 100644 --- a/my/core/konsume.py +++ b/my/core/konsume.py @@ -92,6 +92,7 @@ class Wvalue(Zoomable): def __repr__(self): return 'WValue{' + repr(self.value) + '}' + from typing import Tuple def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]: res: Zoomable @@ -118,6 +119,7 @@ def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]: else: raise RuntimeError(f'Unexpected type: {type(j)} {j}') + from contextlib import contextmanager from typing import Iterator @@ -142,8 +144,9 @@ Expected {c} to be fully consumed by the parser. # TODO log? pass + from typing import cast -def test_unconsumed(): +def test_unconsumed() -> None: import pytest # type: ignore with pytest.raises(UnconsumedError): with wrap({'a': 1234}) as w: @@ -155,7 +158,7 @@ def test_unconsumed(): w = cast(Wdict, w) d = w['c']['d'].zoom() -def test_consumed(): +def test_consumed() -> None: with wrap({'a': 1234}) as w: w = cast(Wdict, w) a = w['a'].zoom() @@ -165,7 +168,7 @@ def test_consumed(): c = w['c'].zoom() d = c['d'].zoom() -def test_types(): +def test_types() -> None: # (string, number, object, array, boolean or nul with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w: w = cast(Wdict, w) @@ -176,14 +179,14 @@ def test_types(): for x in list(w['list'].zoom()): # TODO eh. how to avoid the extra list thing? x.consume() -def test_consume_all(): +def test_consume_all() -> None: with wrap({'aaa': {'bbb': {'hi': 123}}}) as w: w = cast(Wdict, w) aaa = w['aaa'].zoom() aaa['bbb'].consume_all() -def test_consume_few(): +def test_consume_few() -> None: import pytest pytest.skip('Will think about it later..') with wrap({ diff --git a/my/core/pandas.py b/my/core/pandas.py index 9cf037f..370c119 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -26,7 +26,7 @@ else: def check_dateish(s) -> Iterable[str]: - import pandas as pd # type: ignore + import pandas as pd # type: ignore # noqa: F811 not actually a redefinition ctype = s.dtype if str(ctype).startswith('datetime64'): return @@ -140,7 +140,7 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataF # https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562 # same for NamedTuple -- seems that it takes whatever schema the first NT has # so we need to convert each individually... 
sigh - import pandas as pd + import pandas as pd # noqa: F811 not actually a redefinition columns = None if schema is None else list(_as_columns(schema).keys()) return pd.DataFrame(to_jsons(it), columns=columns) @@ -148,7 +148,7 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataF def test_as_dataframe() -> None: import pytest it = (dict(i=i, s=f'str{i}') for i in range(10)) - with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings: + with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings: # noqa: F841 df = as_dataframe(it) # todo test other error col policies assert list(df.columns) == ['i', 's', 'error'] @@ -156,6 +156,7 @@ def test_as_dataframe() -> None: assert len(as_dataframe([])) == 0 from dataclasses import dataclass + @dataclass class X: x: int diff --git a/my/core/query.py b/my/core/query.py index 385fe5f..43574d0 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -520,7 +520,6 @@ Will attempt to call iter() on the value""") return itr - # classes to use in tests, need to be defined at the top level # because of a mypy bug class _Int(NamedTuple): @@ -550,7 +549,7 @@ def test_basic_orders() -> None: random.shuffle(input_items) res = list(select(input_items, order_key="x")) - assert res == [_Int(1),_Int(2),_Int(3),_Int(4),_Int(5)] + assert res == [_Int(1), _Int(2), _Int(3), _Int(4), _Int(5)] # default int ordering def custom_order_by(obj: Any) -> Any: @@ -571,12 +570,10 @@ def test_order_key_multi_type() -> None: for v in range(1, 6): yield _Int(v) - def floaty_iter() -> Iterator[_Float]: for v in range(1, 6): yield _Float(float(v + 0.5)) - res = list(select(itertools.chain(basic_iter(), floaty_iter()), order_key="x")) assert res == [ _Int(1), _Float(1.5), diff --git a/my/core/query_range.py b/my/core/query_range.py index 04952d4..ea625e5 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -133,7 +133,8 @@ def _parse_range( end_parser: Converter, within_parser: Converter, parsed_range: Optional[RangeTuple] = None, - error_message: Optional[str] = None) -> Optional[RangeTuple]: + error_message: Optional[str] = None +) -> Optional[RangeTuple]: if parsed_range is not None: return parsed_range @@ -388,7 +389,6 @@ def test_filter_in_timeframe() -> None: _A(x=datetime(2009, 5, 10, 4, 10, 1), y=5, z=10), _B(y=datetime(year=2015, month=5, day=10, hour=4, minute=10, second=1))] - rng = RangeTuple(before=str(jan_1_2016), within="52w", after=None) # from 2016, going back 52 weeks (about a year?) 
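+    # i.e. roughly the window [jan_1_2016 - 52 weeks, jan_1_2016)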
@@ -438,8 +438,13 @@ def test_range_predicate() -> None: # convert any float values to ints coerce_int_parser = lambda o: int(float(o)) - int_filter_func = partial(_create_range_filter, attr_func=identity, end_parser=coerce_int_parser, - within_parser=coerce_int_parser, value_coercion_func=coerce_int_parser) + int_filter_func = partial( + _create_range_filter, + attr_func=identity, + end_parser=coerce_int_parser, + within_parser=coerce_int_parser, + value_coercion_func=coerce_int_parser, + ) # filter from 0 to 5 rn: Optional[RangeTuple] = RangeTuple("0", "5", None) @@ -517,4 +522,3 @@ def test_parse_datetime_float() -> None: # test parsing isoformat assert dt.timestamp() == parse_datetime_float(str(dt)) - diff --git a/my/core/serialize.py b/my/core/serialize.py index db65adb..fa038ae 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -151,8 +151,6 @@ def dumps( def test_serialize_fallback() -> None: import json as jsn # dont cause possible conflicts with module code - import pytest - # cant use a namedtuple here, since the default json.dump serializer # serializes namedtuples as tuples, which become arrays # just test with an array of mixed objects @@ -168,7 +166,6 @@ def test_serialize_fallback() -> None: assert res == [5, 5.0] - # this needs to be defined here to prevent a mypy bug # see https://github.com/python/mypy/issues/7281 class _A(NamedTuple): diff --git a/my/core/source.py b/my/core/source.py index 07ead1e..1882dd6 100644 --- a/my/core/source.py +++ b/my/core/source.py @@ -3,9 +3,11 @@ Decorator to gracefully handle importing a data source, or warning and yielding nothing (or a default) when its not available """ -from typing import Any, Iterator, TypeVar, Callable, Optional, Iterable, Any, cast -from my.core.warnings import medium, warn from functools import wraps +from typing import Any, Iterator, TypeVar, Callable, Optional, Iterable +import warnings + +from .warnings import medium # The factory function may produce something that has data # similar to the shared model, but not exactly, so not @@ -55,7 +57,7 @@ def import_source( medium(f"Module {factory_func.__qualname__} could not be imported, or isn't configured properly") else: medium(f"Module {module_name} ({factory_func.__qualname__}) could not be imported, or isn't configured properly") - warn(f"""If you don't want to use this module, to hide this message, add '{module_name}' to your core config disabled_modules in your config, like: + warnings.warn(f"""If you don't want to use this module, to hide this message, add '{module_name}' to your core config disabled_modules in your config, like: class core: disabled_modules = [{repr(module_name)}] @@ -71,4 +73,3 @@ class core: yield from default return wrapper return decorator - diff --git a/my/core/stats.py b/my/core/stats.py index 3a93f68..ba32be7 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -6,7 +6,7 @@ import importlib import inspect import sys import typing -from typing import Optional, Callable, Any, Iterator, Sequence, Dict +from typing import Optional, Callable, Any, Iterator, Sequence, Dict, List from .common import StatsFun, Stats, stat @@ -17,6 +17,7 @@ def guess_stats(module_name: str, quick: bool=False) -> Optional[StatsFun]: providers = guess_data_providers(module_name) if len(providers) == 0: return None + def auto_stats() -> Stats: return {k: stat(v, quick=quick) for k, v in providers.items()} return auto_stats @@ -69,17 +70,17 @@ def test_is_data_provider() -> None: assert not idp("x") def no_return_type(): - return [1, 2 ,3] + return 
[1, 2, 3] assert not idp(no_return_type) lam = lambda: [1, 2] assert not idp(lam) - def has_extra_args(count) -> typing.List[int]: + def has_extra_args(count) -> List[int]: return list(range(count)) assert not idp(has_extra_args) - def has_return_type() -> typing.Sequence[str]: + def has_return_type() -> Sequence[str]: return ['a', 'b', 'c'] assert idp(has_return_type) @@ -96,7 +97,6 @@ def test_is_data_provider() -> None: assert not idp(producer_inputs) - # return any parameters the user is required to provide - those which don't have default values def sig_required_params(sig: inspect.Signature) -> Iterator[inspect.Parameter]: for param in sig.parameters.values(): diff --git a/my/core/util.py b/my/core/util.py index 0ffc3a7..64bf6fe 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -161,7 +161,7 @@ def _walk_packages(path: Iterable[str], prefix: str='', onerror=None) -> Iterabl path = getattr(sys.modules[mname], '__path__', None) or [] # don't traverse path items we've seen before path = [p for p in path if not seen(p)] - yield from _walk_packages(path, mname+'.', onerror) + yield from _walk_packages(path, mname + '.', onerror) # deprecate? def get_modules() -> List[HPIModule]: diff --git a/my/core/warnings.py b/my/core/warnings.py index 9446fc0..b5c1a9b 100644 --- a/my/core/warnings.py +++ b/my/core/warnings.py @@ -12,9 +12,6 @@ import warnings import click -# just bring in the scope of this module for convenience -from warnings import warn - def _colorize(x: str, color: Optional[str]=None) -> str: if color is None: return x @@ -49,3 +46,7 @@ def high(message: str, *args, **kwargs) -> None: ''' kwargs['color'] = 'red' _warn(message, *args, **kwargs) + + +# NOTE: deprecated -- legacy import +from warnings import warn \ No newline at end of file From 5f0231c5ee3c27d5f35a8ac74feb8fe4a579425b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 5 Jun 2022 22:44:26 +0100 Subject: [PATCH 049/302] core/main: allow passing multiple packages to 'module install'/'module requires' subcommands --- my/core/__main__.py | 61 ++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/my/core/__main__.py b/my/core/__main__.py index eb0921d..e08378d 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -1,6 +1,7 @@ import functools import importlib import inspect +from itertools import chain import os import shutil import sys @@ -331,31 +332,41 @@ def tabulate_warnings() -> None: # TODO loggers as well? 
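
A note before the _requires rewrite below: it merges the REQUIRES lists of several modules, keeping first-seen order and dropping duplicates so a shared dependency is only installed once. A minimal standalone sketch of that merge, with illustrative names (this is not the actual helper):

```
from typing import List, Sequence

def merge_requirements(reqs_per_module: Sequence[Sequence[str]]) -> List[str]:
    # order-preserving dedup: first occurrence wins, shared deps appear once
    seen = set()
    res: List[str] = []
    for reqs in reqs_per_module:
        for r in reqs:
            if r not in seen:
                seen.add(r)
                res.append(r)
    return res

assert merge_requirements([['lxml', 'orjson'], ['orjson']]) == ['lxml', 'orjson']
```

The patch itself simply checks membership on the result list, which is fine for a handful of requirements.
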
-def _requires(module: str) -> Sequence[str]:
+def _requires(modules: Sequence[str]) -> Sequence[str]:
     from .discovery_pure import module_by_name
-    mod = module_by_name(module)
-    # todo handle when module is missing
-    r = mod.requires
-    if r is None:
-        error(f"Module {module} has no REQUIRES specification")
-        sys.exit(1)
-    return r
+    mods = [module_by_name(module) for module in modules]
+    res = []
+    for mod in mods:
+        reqs = mod.requires
+        if reqs is None:
+            error(f"Module {mod.name} has no REQUIRES specification")
+            sys.exit(1)
+        for r in reqs:
+            if r not in res:
+                res.append(r)
+    return res
 
 
-def module_requires(*, module: str) -> None:
-    rs = [f"'{x}'" for x in _requires(module)]
+def module_requires(*, module: Sequence[str]) -> None:
+    if isinstance(module, str):
+        # legacy behavior, used to take a single argument
+        module = [module]
+    rs = [f"'{x}'" for x in _requires(modules=module)]
     eprint(f'dependencies of {module}')
     for x in rs:
         click.echo(x)
 
 
-def module_install(*, user: bool, module: str) -> None:
+def module_install(*, user: bool, module: Sequence[str]) -> None:
+    if isinstance(module, str):
+        # legacy behavior, used to take a single argument
+        module = [module]
     # TODO hmm. not sure how it's gonna work -- presumably people use different means of installing...
     # how do I install into the 'same' environment??
     import shlex
     cmd = [
         sys.executable, '-m', 'pip', 'install',
         *(['--user'] if user else []), # meh
+        # todo maybe instead, forward all the remaining args to pip?
         *_requires(module),
     ]
     eprint('Running: ' + ' '.join(map(shlex.quote, cmd)))
     check_call(cmd)
@@ -456,9 +467,6 @@ def query_hpi_functions(
     raise_exceptions: bool,
     drop_exceptions: bool,
 ) -> None:
-
-    from itertools import chain
-
     from .query_range import select_range, RangeTuple
 
     # chain list of functions from user, in the order they wrote them on the CLI
@@ -608,27 +616,27 @@ def module_grp() -> None:
 
 
 @module_grp.command(name='requires', short_help='print module reqs')
-@click.argument('MODULE', shell_complete=_module_autocomplete)
-def module_requires_cmd(module: str) -> None:
+@click.argument('MODULES', shell_complete=_module_autocomplete, nargs=-1, required=True)
+def module_requires_cmd(modules: Sequence[str]) -> None:
     '''
-    Print MODULE requirements
+    Print MODULES requirements
 
-    MODULE is a specific module name (e.g. my.reddit.rexport)
+    MODULES is one or more specific module names (e.g. my.reddit.rexport)
     '''
-    module_requires(module=module)
+    module_requires(module=modules)
 
 
 @module_grp.command(name='install', short_help='install module deps')
 @click.option('--user', is_flag=True, help='same as pip --user')
-@click.argument('MODULE', shell_complete=_module_autocomplete)
-def module_install_cmd(user: bool, module: str) -> None:
+@click.argument('MODULES', shell_complete=_module_autocomplete, nargs=-1, required=True)
+def module_install_cmd(user: bool, modules: Sequence[str]) -> None:
     '''
-    Install dependencies for a module using pip
+    Install dependencies for modules using pip
 
-    MODULE is a specific module name (e.g. my.reddit.rexport)
+    MODULES is one or more specific module names (e.g. my.reddit.rexport)
     '''
     # todo could add functions to check specific module etc..
- module_install(user=user, module=module) + module_install(user=user, module=modules) @main.command(name='query', short_help='query the results of a HPI function') @@ -793,9 +801,10 @@ def query_cmd( def test_requires() -> None: from click.testing import CliRunner - result = CliRunner().invoke(main, ['module', 'requires', 'my.github.ghexport']) + result = CliRunner().invoke(main, ['module', 'requires', 'my.github.ghexport', 'my.browser.export']) assert result.exit_code == 0 assert "github.com/karlicoss/ghexport" in result.output + assert "browserexport" in result.output if __name__ == '__main__': From f0397b00ff0a1723825316f8d491ac2c27246f23 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 6 Jun 2022 00:00:40 +0100 Subject: [PATCH 050/302] core/main: experimental --parallel flag for hpi module install --- my/core/__main__.py | 49 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/my/core/__main__.py b/my/core/__main__.py index e08378d..ba9245a 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -1,15 +1,17 @@ +from contextlib import ExitStack import functools import importlib import inspect from itertools import chain import os +import shlex import shutil import sys import tempfile import traceback from typing import Optional, Sequence, Iterable, List, Type, Any, Callable from pathlib import Path -from subprocess import check_call, run, PIPE, CompletedProcess +from subprocess import check_call, run, PIPE, CompletedProcess, Popen import click @@ -357,20 +359,42 @@ def module_requires(*, module: Sequence[str]) -> None: click.echo(x) -def module_install(*, user: bool, module: Sequence[str]) -> None: +def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) -> None: if isinstance(module, str): # legacy behavior, used to take a since argument module = [module] - # TODO hmm. not sure how it's gonna work -- presumably people use different means of installing... - # how do I install into the 'same' environment?? - import shlex - cmd = [ - sys.executable, '-m', 'pip', 'install', + + requirements = _requires(module) + + pre_cmd = [ + sys.executable, '-m', 'pip', + 'install', *(['--user'] if user else []), # todo maybe instead, forward all the remaining args to pip? - *_requires(module), ] - eprint('Running: ' + ' '.join(map(shlex.quote, cmd))) - check_call(cmd) + + cmds = [] + if parallel: + # todo not really sure if it's safe to install in parallel like this + # but definitely doesn't hurt to experiment for e.g. mypy pipelines + # pip has '--use-feature=fast-deps', but it doesn't really work + # I think it only helps for pypi artifacts (not git!), + # and only if they weren't cached + for r in requirements: + cmds.append(pre_cmd + [r]) + else: + # install everything in one cmd + cmds.append(pre_cmd + list(requirements)) + + with ExitStack() as exit_stack: + popens = [] + for cmd in cmds: + eprint('Running: ' + ' '.join(map(shlex.quote, cmd))) + popen = exit_stack.enter_context(Popen(cmd)) + popens.append(popen) + + for popen in popens: + ret = popen.wait() + assert ret == 0, popen def _ui_getchar_pick(choices: Sequence[str], prompt: str = 'Select from: ') -> int: @@ -628,15 +652,16 @@ def module_requires_cmd(modules: Sequence[str]) -> None: @module_grp.command(name='install', short_help='install module deps') @click.option('--user', is_flag=True, help='same as pip --user') +@click.option('--parallel', is_flag=True, help='EXPERIMENTAL. 
Install dependencies in parallel.') @click.argument('MODULES', shell_complete=_module_autocomplete, nargs=-1, required=True) -def module_install_cmd(user: bool, modules: Sequence[str]) -> None: +def module_install_cmd(user: bool, parallel: bool, modules: Sequence[str]) -> None: ''' Install dependencies for modules using pip MODULES is one or more specific module names (e.g. my.reddit.rexport) ''' # todo could add functions to check specific module etc.. - module_install(user=user, module=modules) + module_install(user=user, module=modules, parallel=parallel) @main.command(name='query', short_help='query the results of a HPI function') From cef9b4c6d35eaad81ee6dba6625e78d3eca29a0c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 6 Jun 2022 00:02:06 +0100 Subject: [PATCH 051/302] ci: try using --parallel install for mypy pipeline `time tox -e mypy-misc` (removed the actual mypy call) before (each module in a separate 'hpi install' command) ``` real 1m45.901s user 1m19.555s sys 0m5.491s ``` in a single 'hpi install' command (multiple modules) ``` real 1m31.252s user 1m6.028s sys 0m5.065s ``` single 'hpi install' command with --parallel ``` real 0m15.674s user 0m50.986s sys 0m3.249s ``` --- tox.ini | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/tox.ini b/tox.ini index bfa612a..99c6ee3 100644 --- a/tox.ini +++ b/tox.ini @@ -91,26 +91,27 @@ commands = commands = pip install -e .[testing,optional] - hpi module install my.browser.export - hpi module install my.orgmode - hpi module install my.endomondo - hpi module install my.github.ghexport - hpi module install my.hypothesis - hpi module install my.instapaper - hpi module install my.pocket - hpi module install my.reddit.rexport - hpi module install my.reddit.pushshift - hpi module install my.stackexchange.stexport - hpi module install my.tinder.android - hpi module install my.pinboard - hpi module install my.arbtt - hpi module install my.coding.commits - hpi module install my.goodreads - hpi module install my.pdfs - hpi module install my.smscalls - hpi module install my.location.gpslogger - hpi module install my.location.via_ip - hpi module install my.google.takeout.parser + hpi module install --parallel \ + my.browser.export \ + my.orgmode \ + my.endomondo \ + my.github.ghexport \ + my.hypothesis \ + my.instapaper \ + my.pocket \ + my.reddit.rexport \ + my.reddit.pushshift \ + my.stackexchange.stexport \ + my.tinder.android \ + my.pinboard \ + my.arbtt \ + my.coding.commits \ + my.goodreads \ + my.pdfs \ + my.smscalls \ + my.location.gpslogger \ + my.location.via_ip \ + my.google.takeout.parser # todo fuck. -p my.github isn't checking the subpackages?? wtf... # guess it wants .pyi file?? 
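
The essence of the --parallel implementation in the two patches above: instead of one blocking check_call, start one pip process per requirement with Popen, then wait for all of them, with ExitStack guaranteeing that every process object is cleaned up even if an earlier wait() raises. A simplified sketch (the real code is module_install above; error handling kept minimal):

```
import sys
from contextlib import ExitStack
from subprocess import Popen
from typing import Sequence

def install_parallel(requirements: Sequence[str]) -> None:
    cmds = [[sys.executable, '-m', 'pip', 'install', r] for r in requirements]
    with ExitStack() as stack:
        # launch everything first so the installs overlap in time...
        popens = [stack.enter_context(Popen(cmd)) for cmd in cmds]
        # ...then wait for all of them
        for popen in popens:
            assert popen.wait() == 0, popen
```

The roughly 6x wall-clock improvement in the tox timings above comes from overlapping network and build time; as the inline comments note, parallel pip installs aren't obviously safe in general, hence the experimental flag.
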
From dbd15a7ee8fc3ec26fd8c012ee5f1b1f491dc2d2 Mon Sep 17 00:00:00 2001
From: Sean Breckenridge
Date: Tue, 7 Jun 2022 11:45:37 -0700
Subject: [PATCH 052/302] source: propagate help url for config errors

---
 my/core/error.py  | 4 +++-
 my/core/source.py | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/my/core/error.py b/my/core/error.py
index ba6368e..e6f76cd 100644
--- a/my/core/error.py
+++ b/my/core/error.py
@@ -155,7 +155,7 @@ def error_to_json(e: Exception) -> Json:
 MODULE_SETUP_URL = 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#private-configuration-myconfig'
 
 
-def warn_my_config_import_error(err: Union[ImportError, AttributeError], help_url: str = MODULE_SETUP_URL) -> bool:
+def warn_my_config_import_error(err: Union[ImportError, AttributeError], help_url: Optional[str] = None) -> bool:
     """
     If the user tried to import something from my.config but it failed,
     possibly due to missing the config block in my.config?
@@ -164,6 +164,8 @@ def warn_my_config_import_error(err: Union[ImportError, AttributeError], help_ur
     """
     import re
     import click
+    if help_url is None:
+        help_url = MODULE_SETUP_URL
     if type(err) == ImportError:
         if err.name != 'my.config':
             return False
diff --git a/my/core/source.py b/my/core/source.py
index 1882dd6..6d0f0fd 100644
--- a/my/core/source.py
+++ b/my/core/source.py
@@ -27,6 +27,7 @@ def import_source(
     *,
     default: Iterable[T] = _DEFAULT_ITR,
     module_name: Optional[str] = None,
+    help_url: Optional[str] = None,
 ) -> Callable[..., Callable[..., Iterator[T]]]:
     """
     doesn't really play well with types, but is used to catch
@@ -64,7 +65,7 @@ class core:
 """)
             # try to check if this is a config error or based on dependencies not being installed
             if isinstance(err, (ImportError, AttributeError)):
-                matched_config_err = warn_my_config_import_error(err)
+                matched_config_err = warn_my_config_import_error(err, help_url=help_url)
                 # if we determined this wasn't a config error, and it was an attribute error
                 # it could be *any* attribute error -- we should raise this since its otherwise a fatal error
                 # from some code in the module failing

From 119b295d719d418a2821d3ed07a8f472110403f2 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Tue, 7 Jun 2022 22:37:45 +0100
Subject: [PATCH 053/302] core: allow legacy modules to be used in 'hpi module install' for backwards compatibility

but show warning

kinda hacky, but hopefully we will simplify it further when we have more such legacy modules
---
 misc/check_legacy_init_py.py |  4 +++-
 my/core/__main__.py          |  3 +++
 my/core/discovery_pure.py    | 30 +++++++++++++++++++++++++---
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/misc/check_legacy_init_py.py b/misc/check_legacy_init_py.py
index c100368..1ba087e 100755
--- a/misc/check_legacy_init_py.py
+++ b/misc/check_legacy_init_py.py
@@ -12,7 +12,7 @@ import logzero  # type: ignore[import]
 logger = logzero.logger
 
 
-MSG = 'importing my.fbmessenger is DEPRECATED'
+MSG = 'my.fbmessenger is DEPRECATED'
 
 def expect(*cmd: str, should_warn: bool=True) -> None:
     res = run(cmd, stderr=PIPE)
@@ -61,6 +61,7 @@ check_warn('-c', 'from my.fbmessenger import *')
 check_warn('-c', 'from my.fbmessenger import messages, dump_chat_history')
 check_warn('-m', 'my.core', 'query' , 'my.fbmessenger.messages', '-o', 'pprint', '--limit=10')
 check_warn('-m', 'my.core', 'doctor', 'my.fbmessenger')
+check_warn('-m', 'my.core', 'module', 'requires', 'my.fbmessenger')
 
 # todo kinda annoying it doesn't work when executed as -c (but does as script!)
# presumably because doesn't have proper line number information? @@ -71,6 +72,7 @@ check_ok ('-c', 'from my.fbmessenger.export import *') check_ok ('-c', 'from my.fbmessenger.export import messages, dump_chat_history') check_ok ('-m', 'my.core', 'query' , 'my.fbmessenger.export.messages', '-o', 'pprint', '--limit=10') check_ok ('-m', 'my.core', 'doctor', 'my.fbmessenger.export') +check_ok ('-m', 'my.core', 'module', 'requires', 'my.fbmessenger.export') # NOTE: # to check that overlays work, run something like diff --git a/my/core/__main__.py b/my/core/__main__.py index ba9245a..4dfc021 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -339,6 +339,9 @@ def _requires(modules: Sequence[str]) -> Sequence[str]: mods = [module_by_name(module) for module in modules] res = [] for mod in mods: + if mod.legacy is not None: + warning(mod.legacy) + reqs = mod.requires if reqs is None: error(f"Module {mod.name} has no REQUIRES specification") diff --git a/my/core/discovery_pure.py b/my/core/discovery_pure.py index dbd07b9..5c9dbed 100644 --- a/my/core/discovery_pure.py +++ b/my/core/discovery_pure.py @@ -34,6 +34,7 @@ class HPIModule(NamedTuple): doc: Optional[str] = None file: Optional[Path] = None requires: Requires = None + legacy: Optional[str] = None # contains reason/deprecation warning def ignored(m: str) -> bool: @@ -75,9 +76,19 @@ def _is_not_module_src(src: Path) -> bool: def _is_not_module_ast(a: ast.Module) -> bool: + marker = NOT_HPI_MODULE_VAR return any( - getattr(node, 'name', None) == NOT_HPI_MODULE_VAR # direct definition - or any(getattr(n, 'name', None) == NOT_HPI_MODULE_VAR for n in getattr(node, 'names', [])) # import from + getattr(node, 'name', None) == marker # direct definition + or any(getattr(n, 'name', None) == marker for n in getattr(node, 'names', [])) # import from + for node in a.body + ) + + +def _is_legacy_module(a: ast.Module) -> bool: + marker = 'handle_legacy_import' + return any( + getattr(node, 'name', None) == marker # direct definition + or any(getattr(n, 'name', None) == marker for n in getattr(node, 'names', [])) # import from for node in a.body ) @@ -156,7 +167,11 @@ def _modules_under_root(my_root: Path) -> Iterable[HPIModule]: if ignored(m): continue a: ast.Module = ast.parse(f.read_text()) - if _is_not_module_ast(a): + + # legacy modules are 'forced' to be modules so 'hpi module install' still works for older modules + # a bit messy, will think how to fix it properly later + legacy_module = _is_legacy_module(a) + if _is_not_module_ast(a) and not legacy_module: continue doc = ast.get_docstring(a, clean=False) @@ -166,12 +181,15 @@ def _modules_under_root(my_root: Path) -> Iterable[HPIModule]: except Exception as e: logging.exception(e) + legacy = f'{m} is DEPRECATED. Please refer to the module documentation.' 
if legacy_module else None + yield HPIModule( name=m, skip_reason=None, doc=doc, file=f.relative_to(my_root.parent), requires=requires, + legacy=legacy, ) @@ -209,6 +227,12 @@ def test_requires() -> None: assert len(r) == 2 # fragile, but ok for now +def test_legacy_modules() -> None: + # shouldn't crash + module_by_name('my.reddit') + module_by_name('my.fbmessenger') + + def test_pure() -> None: """ We want to keep this module clean of other HPI imports From 7925ec81b622c30bc69b638a658f064df7080099 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Thu, 14 Jul 2022 23:16:08 -0700 Subject: [PATCH 054/302] docs: browser - fix examples for config --- doc/MODULES.org | 2 +- my/browser/active_browser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/MODULES.org b/doc/MODULES.org index 2bcb052..9e2dbcf 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -118,7 +118,7 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http # paths to sqlite database files which you use actively # to read from. For example: # from browserexport.browsers.all import Firefox - # active_databases = Firefox.locate_database() + # export_path = Firefox.locate_database() export_path: Paths #+end_src ** [[file:../my/location][my.location]] diff --git a/my/browser/active_browser.py b/my/browser/active_browser.py index 7005573..4dc52e4 100644 --- a/my/browser/active_browser.py +++ b/my/browser/active_browser.py @@ -14,7 +14,7 @@ class config(user_config.active_browser): # paths to sqlite database files which you use actively # to read from. For example: # from browserexport.browsers.all import Firefox - # active_databases = Firefox.locate_database() + # export_path = Firefox.locate_database() export_path: Paths From c8cf0272f96079b57efddc89f29a9b2b87277185 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 28 Aug 2022 23:08:35 +0100 Subject: [PATCH 055/302] instagram.gdpr: use new path to personal information --- my/instagram/gdpr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 3dfe352..4c54fbf 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -77,7 +77,12 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: # whereas here I don't need it.. 
# so for now will just implement this adhoc thing and think about properly fixing later - j = json.loads((last / 'account_information/personal_information.json').read_text()) + personal_info = last / 'personal_information' + if not personal_info.exists(): + # old path, used up to somewhere between feb-aug 2022 + personal_info = last / 'account_information' + + j = json.loads((personal_info / 'personal_information.json').read_text()) [profile] = j['profile_user'] pdata = profile['string_map_data'] username = pdata['Username']['value'] From ca91be8154bd74c928e21e5bad700fe3f487330c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 29 Aug 2022 01:27:26 +0100 Subject: [PATCH 056/302] twitter.archive: fix legacy config detection apparently .name contains the parent module so previously it was throwing the exception instead --- my/twitter/archive.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 9975e6e..5c37fd3 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -8,7 +8,8 @@ Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-do try: from my.config import twitter_archive as user_config except ImportError as ie: - if ie.name != 'twitter_archive': + if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)): + # must be caused by something else raise ie try: from my.config import twitter as user_config # type: ignore[misc] From 5f1d41fa527923e93cb91b181652e0aadfbb94f7 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 18 Oct 2022 23:01:38 +0100 Subject: [PATCH 057/302] my.twitter.archive: fix for newer format (tweets filename changed to tweets.js) --- my/twitter/archive.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 5c37fd3..c59d7a1 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -174,10 +174,10 @@ class ZipExport: if not (self.zpath / 'Your archive.html').exists(): self.old_format = True - def raw(self, what: str) -> Iterator[Json]: + def raw(self, what: str, *, fname: Optional[str]=None) -> Iterator[Json]: logger.info('processing: %s %s', self.zpath, what) - path = what + path = fname or what if not self.old_format: path = 'data/' + path path += '.js' @@ -195,20 +195,22 @@ class ZipExport: @cached_property def screen_name(self) -> str: - [acc] = self.raw('account') + [acc] = self.raw(what='account') return acc['username'] def tweets(self) -> Iterator[Tweet]: + fname = 'tweets' # since somewhere between mar and oct 2022 + if not (self.zpath / f'data/{fname}.js').exists(): + fname = 'tweet' # old name # NOTE: for some reason, created_at doesn't seem to be in order # it mostly is, but there are a bunch of one-off random tweets where the time decreases (typically at the very end) - for r in self.raw('tweet'): + for r in self.raw(what='tweet', fname=fname): yield Tweet(r, screen_name=self.screen_name) - def likes(self) -> Iterator[Like]: # TODO ugh. 
would be nice to unify Tweet/Like interface # however, akeout only got tweetId, full text and url - for r in self.raw('like'): + for r in self.raw(what='like'): yield Like(r, screen_name=self.screen_name) From 7098d6831f37667f7dd874704f23bf594abfa198 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 18 Oct 2022 23:33:20 +0100 Subject: [PATCH 058/302] fix mypy in _identity seems easier to just ignore considering it's "internal" function also a couple of tests to make sure it infers types correctly --- my/core/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/my/core/common.py b/my/core/common.py index a4dd4c9..6ad8146 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -69,7 +69,7 @@ def group_by_key(l: Iterable[T], key: Callable[[T], K]) -> Dict[K, List[T]]: return res -def _identity(v: T) -> V: +def _identity(v: T) -> V: # type: ignore[type-var] return cast(V, v) @@ -127,6 +127,10 @@ def test_make_dict() -> None: d = make_dict(it, key=lambda i: i, value=lambda i: i % 2) assert d == {0: 0, 1: 1, 2: 0, 3: 1, 4: 0} + # check type inference + d2: Dict[str, int ] = make_dict(it, key=lambda i: str(i)) + d3: Dict[str, bool] = make_dict(it, key=lambda i: str(i), value=lambda i: i % 2 == 0) + # https://stackoverflow.com/a/12377059/706389 def listify(fn=None, wrapper=list): From 716a2c82bab94d1ced734c736803286798206ca7 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 15 Oct 2022 01:50:53 -0700 Subject: [PATCH 059/302] core/serialize: serialize stdlib Decimal class --- my/core/serialize.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/my/core/serialize.py b/my/core/serialize.py index fa038ae..c0cbae9 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -1,6 +1,7 @@ import datetime import dataclasses from pathlib import Path +from decimal import Decimal from typing import Any, Optional, Callable, NamedTuple from functools import lru_cache @@ -36,6 +37,11 @@ def _default_encode(obj: Any) -> Any: return dataclasses.asdict(obj) if isinstance(obj, Exception): return error_to_json(obj) + # if something was stored as 'decimal', you likely + # don't want to convert it to float since you're + # storing as decimal to not lose the precision + if isinstance(obj, Decimal): + return str(obj) # note: _serialize would only be called for items which aren't already # serialized as a dataclass or namedtuple # discussion: https://github.com/karlicoss/HPI/issues/138#issuecomment-801704929 From ad52e131a0a2446a18ef2e26944264becc6ef5b9 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Fri, 11 Nov 2022 18:53:07 -0800 Subject: [PATCH 060/302] google.takeout.parser: recreate cache on upgrade https://github.com/seanbreckenridge/google_takeout_parser/pull/37 --- my/google/takeout/parser.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index a6ea81c..09cbe57 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -27,6 +27,7 @@ from my.core.time import user_forced from google_takeout_parser.parse_html.html_time_utils import ABBR_TIMEZONES ABBR_TIMEZONES.extend(user_forced()) +import google_takeout_parser from google_takeout_parser.path_dispatch import TakeoutParser from google_takeout_parser.merge import GoogleEventSet, CacheResults @@ -75,8 +76,13 @@ EXPECTED = ( ) +google_takeout_version = str(getattr(google_takeout_parser, '__version__', 'unknown')) + def _cachew_depends_on() -> List[str]: - return sorted([str(p) for p in inputs()]) + 
exports = sorted([str(p) for p in inputs()])
+    # add google takeout parser pip version to hash, so this re-creates on breaking changes
+    exports.insert(0, f"google_takeout_version: {google_takeout_version}")
+    return exports
 
 
 # ResultsType is a Union of all of the models in google_takeout_parser

From 54e6fe6ab5c3bbe20cba79c253f0dee88ab22464 Mon Sep 17 00:00:00 2001
From: Sean Breckenridge
Date: Fri, 11 Nov 2022 21:48:52 -0800
Subject: [PATCH 061/302] ci: try disabling parallel pip installs on windows

---
 my/core/__main__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/my/core/__main__.py b/my/core/__main__.py
index 4dfc021..d8e9ebd 100644
--- a/my/core/__main__.py
+++ b/my/core/__main__.py
@@ -376,7 +376,9 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) -
     ]
 
     cmds = []
-    if parallel:
+    # disable parallel on windows, sometimes throws a
+    # '[WinError 32] The process cannot access the file because it is being used by another process'
+    if parallel and sys.platform not in ['win32', 'cygwin']:
         # todo not really sure if it's safe to install in parallel like this
         # but definitely doesn't hurt to experiment for e.g. mypy pipelines
         # pip has '--use-feature=fast-deps', but it doesn't really work

From 11b6e51c90949fc850ce2f47bb911a464d2c3ac2 Mon Sep 17 00:00:00 2001
From: karlicoss
Date: Mon, 30 Jan 2023 23:37:43 +0000
Subject: [PATCH 062/302] ci: fix tox config

seems that after version 4.0 it's necessary to specify environments to run
previously it was picking them up automatically
---
 tox.ini | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tox.ini b/tox.ini
index 99c6ee3..ed2a084 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,6 @@
 [tox]
 minversion = 3.5
+envlist = tests-core,tests-all,demo,mypy-core,mypy-misc
 # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333
 # hack to prevent .tox from crapping to the project directory
 toxworkdir={env:TOXWORKDIR_BASE:}{toxinidir}/.tox
@@ -7,9 +8,11 @@ toxworkdir={env:TOXWORKDIR_BASE:}{toxinidir}/.tox
 [testenv]
 passenv =
     # useful for tests to know they are running under ci
-    CI CI_*
+    CI
+    CI_*
    # respect user's cache dirs to prevent tox from crapping into project dir
-    MYPY_CACHE_DIR PYTHONPYCACHEPREFIX
+    MYPY_CACHE_DIR
+    PYTHONPYCACHEPREFIX
 
 
 # just the very core tests with minimal dependencies

From 9c432027b58cde178fee9f058053d9ba319f436c Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Fri, 23 Dec 2022 02:27:11 +0000
Subject: [PATCH 063/302] instagram.android: fix missing id

---
 my/instagram/android.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/my/instagram/android.py b/my/instagram/android.py
index a34660c..8e44ebe 100644
--- a/my/instagram/android.py
+++ b/my/instagram/android.py
@@ -119,15 +119,17 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
     # todo use TypedDict?
     for f in inputs():
         with sqlite_connect_immutable(f) as db:
-
             for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db):
                 j = json.loads(thread_json)
                 # todo in principle should leave the thread attached to the message?
                 # since thread is a group of users?
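
An aside on PATCH 060 a little further up: embedding the parser's version in the cachew dependency list is a lightweight cache-busting trick -- upgrading the library changes the key, which forces the cache to be rebuilt instead of serving rows in a stale schema. The shape of it, paraphrased from that diff (inputs() is the module's own file enumerator):

```
from typing import List

def _cachew_depends_on() -> List[str]:
    import google_takeout_parser
    version = str(getattr(google_takeout_parser, '__version__', 'unknown'))
    exports = sorted(str(p) for p in inputs())  # inputs() as defined in the module
    return [f'google_takeout_version: {version}', *exports]
```
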
                # inviter usually contains our own user
                for r in [j['inviter'], *j['recipients']]:
+                    # id disappeared and seems that pk_id is in use now (around december 2022)
+                    uid = r.get('id') or r.get('pk_id')
+                    assert uid is not None
                     yield User(
-                        id=str(r['id']), # for some reason it's int in the db
+                        id=str(uid), # for some reason it's int in the db
                         full_name=r['full_name'],
                         username=r['username'],
                     )

From 5c82d0faa9c2d4f783b71ba5af62b91d3b6d30ba Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Tue, 7 Feb 2023 01:28:45 +0000
Subject: [PATCH 064/302] switch from using dataset to raw sqlite3 module

dataset is kinda unmaintained and currently broken due to sqlalchemy 2.0 changes

resolves https://github.com/karlicoss/HPI/issues/264
---
 my/core/sqlite.py              | 44 ++++++++++++++++++++++++++++++----
 my/fbmessenger/android.py      | 28 ++++++++++------------
 my/hackernews/dogsheep.py      | 18 +++++++-------
 my/hackernews/materialistic.py | 19 ++++++---------
 my/taplog.py                   | 14 +++++------
 my/tinder/android.py           | 26 +++++++++++---------
 my/twitter/talon.py            | 38 ++++++++++++++---------------
 my/twitter/twint.py            | 39 ++++++++++++------------------
 8 files changed, 123 insertions(+), 103 deletions(-)

diff --git a/my/core/sqlite.py b/my/core/sqlite.py
index 0f4a416..3c1902d 100644
--- a/my/core/sqlite.py
+++ b/my/core/sqlite.py
@@ -1,17 +1,19 @@
 from .common import assert_subpackage; assert_subpackage(__name__)
 
+from contextlib import contextmanager
 from pathlib import Path
 import shutil
 import sqlite3
 from tempfile import TemporaryDirectory
+from typing import Tuple, Any, Iterator, Callable, Optional, Union
 
 
-from .common import PathIsh
+from .common import PathIsh, assert_never
+from .compat import Literal
 
 
 def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection:
-    # https://www.sqlite.org/draft/uri.html#uriimmutable
     return sqlite3.connect(f'file:{db}?immutable=1', uri=True)
 
 
@@ -30,6 +32,42 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None:
         conn.execute('DROP TABLE testtable')
 
 
+SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]
+
+def dict_factory(cursor, row):
+    fields = [column[0] for column in cursor.description]
+    return {key: value for key, value in zip(fields, row)}
+
+
+Factory = Union[SqliteRowFactory, Literal['row', 'dict']]
+
+@contextmanager
+def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]:
+    dbp = f'file:{db}'
+    # https://www.sqlite.org/draft/uri.html#uriimmutable
+    if immutable:
+        dbp = f'{dbp}?immutable=1'
+    row_factory_: Any = None
+    if row_factory is not None:
+        if callable(row_factory):
+            row_factory_ = row_factory
+        elif row_factory == 'row':
+            row_factory_ = sqlite3.Row
+        elif row_factory == 'dict':
+            row_factory_ = dict_factory
+        else:
+            assert_never()
+
+    conn = sqlite3.connect(dbp, uri=True)
+    try:
+        conn.row_factory = row_factory_
+        with conn:
+            yield conn
+    finally:
+        # Connection context manager isn't actually closing the connection, only keeps transaction
+        conn.close()
+
+
 # TODO come up with a better name?
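
For reference, what using the sqlite_connection helper defined above looks like: row_factory accepts 'row' (sqlite3.Row, indexable by column name), 'dict' (plain dicts via dict_factory), or any callable matching SqliteRowFactory; immutable=True opens the file via SQLite's immutable URI mode, so a live app database can be read without taking locks. Table and column names below are illustrative:

```
from my.core.sqlite import sqlite_connection

def example(db_path: str) -> None:
    with sqlite_connection(db_path, immutable=True, row_factory='row') as conn:
        for row in conn.execute('SELECT * FROM messages ORDER BY timestamp_ms'):
            print(row['text'])  # sqlite3.Row supports lookup by column name
```
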
# NOTE: this is tested by tests/sqlite.py::test_sqlite_read_with_wal def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: @@ -52,8 +90,6 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: return dest -from typing import Tuple, Any, Iterator - # NOTE hmm, so this kinda works # V = TypeVar('V', bound=Tuple[Any, ...]) # def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]: diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index a8078d6..ef3711a 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -3,25 +3,27 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab """ from __future__ import annotations -REQUIRES = ['dataset'] - from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Sequence, Optional, Dict +import json +from pathlib import Path +import sqlite3 +from typing import Iterator, Sequence, Optional, Dict, Union +from more_itertools import unique_everseen + +from my.core import get_files, Paths, datetime_naive, Res, assert_never +from my.core.sqlite import sqlite_connection from my.config import fbmessenger as user_config -from ..core import Paths @dataclass class config(user_config.android): # paths[s]/glob to the exported sqlite databases export_path: Paths -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -38,7 +40,6 @@ class Thread: name: Optional[str] # todo not sure about order of fields... -from ..core import datetime_naive @dataclass class _BaseMessage: id: str @@ -63,22 +64,18 @@ class Message(_BaseMessage): reply_to: Optional[Message] -import json -from typing import Union -from ..core import Res, assert_never -from ..core.dataset import connect_readonly, DatabaseT Entity = Union[Sender, Thread, _Message] def _entities() -> Iterator[Res[Entity]]: for f in inputs(): - with connect_readonly(f) as db: + with sqlite_connection(f, immutable=True, row_factory='row') as db: yield from _process_db(db) -def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]: +def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: # works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user threadkey2id = lambda key: key.split(':')[1] - for r in db['threads'].find(): + for r in db.execute('SELECT * FROM threads'): try: yield Thread( id=threadkey2id(r['thread_key']), @@ -88,7 +85,7 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]: yield e continue - for r in db['messages'].find(order_by='timestamp_ms'): + for r in db.execute('SELECT * FROM messages ORDER BY timestamp_ms'): mtype: int = r['msg_type'] if mtype == -1: # likely immediately deleted or something? 
doesn't have any data at all @@ -133,7 +130,6 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]: yield e -from more_itertools import unique_everseen def messages() -> Iterator[Res[Message]]: senders: Dict[str, Sender] = {} msgs: Dict[str, Message] = {} diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py index 7329690..462cbc0 100644 --- a/my/hackernews/dogsheep.py +++ b/my/hackernews/dogsheep.py @@ -5,13 +5,15 @@ from __future__ import annotations from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Sequence, Optional, Dict +from pathlib import Path +from typing import Iterator, Sequence, Optional +from my.core import get_files, Paths, Res +from my.core.sqlite import sqlite_connection from my.config import hackernews as user_config -from ..core import Paths @dataclass class config(user_config.dogsheep): # paths[s]/glob to the dogsheep database @@ -20,8 +22,6 @@ class config(user_config.dogsheep): # todo so much boilerplate... really need some common wildcard imports?... # at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -44,15 +44,15 @@ class Item: @property def permalink(self) -> str: return hackernews_link(self.id) +# TODO hmm kinda annoying that permalink isn't getting serialized +# maybe won't be such a big problem if we used hpi query directly on objects, without jsons? +# so we could just take .permalink thing -from ..core.error import Res -from ..core.dataset import connect_readonly def items() -> Iterator[Res[Item]]: f = max(inputs()) - with connect_readonly(f) as db: - items = db['items'] - for r in items.all(order_by='time'): + with sqlite_connection(f, immutable=True, row_factory='row') as conn: + for r in conn.execute('SELECT * FROM items ORDER BY time'): yield Item( id=r['id'], type=r['type'], diff --git a/my/hackernews/materialistic.py b/my/hackernews/materialistic.py index 65a1cb6..e0d634a 100644 --- a/my/hackernews/materialistic.py +++ b/my/hackernews/materialistic.py @@ -1,20 +1,17 @@ """ [[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews """ - -REQUIRES = ['dataset'] - -from datetime import datetime +from datetime import datetime, timezone +from pathlib import Path from typing import Any, Dict, Iterator, NamedTuple, Sequence -import pytz +from my.core import get_files +from my.core.sqlite import sqlite_connection from my.config import materialistic as config # todo migrate config to my.hackernews.materialistic -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -28,7 +25,7 @@ class Saved(NamedTuple): @property def when(self) -> datetime: ts = int(self.row['time']) / 1000 - return datetime.fromtimestamp(ts, tz=pytz.utc) + return datetime.fromtimestamp(ts, tz=timezone.utc) @property def uid(self) -> str: @@ -47,13 +44,11 @@ class Saved(NamedTuple): return hackernews_link(self.uid) -from ..core.dataset import connect_readonly def raw() -> Iterator[Row]: last = max(inputs()) - with connect_readonly(last) as db: - saved = db['saved'] + with sqlite_connection(last, immutable=True, row_factory='dict') as conn: + yield from conn.execute('SELECT * FROM saved ORDER BY time') # TODO wonder if it's 'save time' or creation time? 
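
One detail worth calling out in the materialistic/talon conversions nearby: these Android databases store epoch milliseconds, and the pytz dependency is replaced with the stdlib, which is sufficient for plain UTC:

```
from datetime import datetime, timezone

ts_ms = 1_600_000_000_000  # illustrative value, as stored in the 'time' column
dt = datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc)  # aware datetime, no pytz needed
assert dt.tzinfo is timezone.utc
```
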
- yield from saved.all(order_by='time') def saves() -> Iterator[Saved]: diff --git a/my/taplog.py b/my/taplog.py index f668a10..6353c14 100644 --- a/my/taplog.py +++ b/my/taplog.py @@ -1,11 +1,11 @@ ''' [[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data ''' - from datetime import datetime from typing import NamedTuple, Dict, Optional, Iterable -from .core import get_files +from my.core import get_files, stat, Stats +from my.core.sqlite import sqlite_connection from my.config import taplog as user_config @@ -46,11 +46,10 @@ class Entry(NamedTuple): def entries() -> Iterable[Entry]: last = max(get_files(user_config.export_path)) - from .core.dataset import connect_readonly - db = connect_readonly(last) - # todo is it sorted by timestamp? - for row in db['Log'].all(): - yield Entry(row) + with sqlite_connection(last, immutable=True, row_factory='dict') as db: + # todo is it sorted by timestamp? + for row in db.execute('SELECT * FROM Log'): + yield Entry(row) # I guess worth having as top level considering it would be quite common? @@ -60,6 +59,5 @@ def by_button(button: str) -> Iterable[Entry]: yield e -from .core import stat, Stats def stats() -> Stats: return stat(entries) diff --git a/my/tinder/android.py b/my/tinder/android.py index e92f316..9f68992 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -3,19 +3,18 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde """ from __future__ import annotations -REQUIRES = ['dataset'] - from collections import defaultdict from dataclasses import dataclass from datetime import datetime, timezone from itertools import chain from pathlib import Path +import sqlite3 from typing import Sequence, Iterator, Union, Dict, List, Mapping from more_itertools import unique_everseen from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware -from my.core.dataset import connect_readonly, DatabaseT +from my.core.sqlite import sqlite_connection from my.config import tinder as user_config @@ -73,6 +72,8 @@ class Message(_BaseMessage): to: Person +# todo hmm I have a suspicion it might be cumulative? +# although still possible that the user might remove/install app back, so need to keep that in mind def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -83,40 +84,43 @@ Entity = Union[Person, Match, Message] def _entities() -> Iterator[Res[_Entity]]: for db_file in inputs(): - with connect_readonly(db_file) as db: + with sqlite_connection(db_file, immutable=True, row_factory='row') as db: yield from _handle_db(db) -def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]: +def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]: # profile_user_view contains our own user id - for row in chain(db['profile_user_view'], db['match_person']): + for row in chain( + db.execute('SELECT * FROM profile_user_view'), + db.execute('SELECT * FROM match_person'), + ): try: yield _parse_person(row) except Exception as e: # todo attach error contex? 
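
A note on deduplication, relevant to the tinder/talon/fbmessenger modules in this patch: they merge several database snapshots and rely on more_itertools.unique_everseen. unique_everseen tracks hashable items in a set but silently falls back to an O(n^2) list scan for unhashable ones, which is why the dataclasses here are declared with unsafe_hash=True (a later patch in this series adds it to the tinder message types for exactly this reason):

```
from dataclasses import dataclass
from more_itertools import unique_everseen

@dataclass(unsafe_hash=True)
class Row:  # illustrative stand-in for _Message/_Match etc.
    id: str

assert list(unique_everseen([Row('a'), Row('a'), Row('b')])) == [Row('a'), Row('b')]
```
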
yield e - for row in db['match']: + for row in db.execute('SELECT * FROM match'): try: yield _parse_match(row) except Exception as e: yield e - for row in db['message']: + for row in db.execute('SELECT * FROM message'): try: yield _parse_msg(row) except Exception as e: yield e -def _parse_person(row) -> Person: +def _parse_person(row: sqlite3.Row) -> Person: return Person( id=row['id'], name=row['name'], ) -def _parse_match(row) -> _Match: +def _parse_match(row: sqlite3.Row) -> _Match: return _Match( id=row['id'], person_id=row['person_id'], @@ -124,7 +128,7 @@ def _parse_match(row) -> _Match: ) -def _parse_msg(row) -> _Message: +def _parse_msg(row: sqlite3.Row) -> _Message: # note it also has raw_message_data -- not sure which is best to use.. sent = row['sent_date'] return _Message( diff --git a/my/twitter/talon.py b/my/twitter/talon.py index 81137d6..e43f600 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter from __future__ import annotations from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone import re -from typing import Iterator, Sequence, Optional, Dict +import sqlite3 +from typing import Iterator, Sequence, Union -import pytz +from more_itertools import unique_everseen + +from my.core import Paths, Res, datetime_aware, get_files +from my.core.sqlite import sqlite_connection + +from .common import TweetId, permalink from my.config import twitter as user_config -from ..core import Paths, Res, datetime_aware @dataclass class config(user_config.talon): # paths[s]/glob to the exported sqlite databases export_path: Paths -from ..core import get_files from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) -from .common import TweetId, permalink - - @dataclass(unsafe_hash=True) class Tweet: id_str: TweetId @@ -51,8 +52,6 @@ class _IsFavorire: tweet: Tweet -from typing import Union -from ..core.dataset import connect_readonly Entity = Union[_IsTweet, _IsFavorire] def _entities() -> Iterator[Res[Entity]]: for f in inputs(): @@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]: fname = f.name handler = handlers.get(fname) if handler is None: - yield RuntimeError(f"Coulnd't find handler for {fname}") + yield RuntimeError(f"Could not find handler for {fname}") return - with connect_readonly(f) as db: + with sqlite_connection(f, immutable=True, row_factory='row') as db: yield from handler(db) -def _process_user_tweets(db) -> Iterator[Res[Entity]]: +def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]: # dunno why it's called 'lists' - for r in db['lists'].all(order_by='time'): + for r in db.execute('SELECT * FROM lists ORDER BY time'): try: yield _IsTweet(_parse_tweet(r)) except Exception as e: yield e -def _process_favorite_tweets(db) -> Iterator[Res[Entity]]: - for r in db['favorite_tweets'].all(order_by='time'): +def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]: + for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'): try: yield _IsFavorire(_parse_tweet(r)) except Exception as e: yield e -def _parse_tweet(row) -> Tweet: + +def _parse_tweet(row: sqlite3.Row) -> Tweet: # ok so looks like it's tz aware.. 
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95 # uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime() # and it's created here, so looks like it's properly parsed from the api # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79 - created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc) + created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc) text = row['text'] # try explanding URLs.. sadly there are no positions in the db @@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet: ) -from more_itertools import unique_everseen def tweets() -> Iterator[Res[Tweet]]: for x in unique_everseen(_entities()): if isinstance(x, Exception): @@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]: elif isinstance(x, _IsTweet): yield x.tweet + def likes() -> Iterator[Res[Tweet]]: for x in unique_everseen(_entities()): if isinstance(x, Exception): diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 5ba0460..54c7f91 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -1,12 +1,16 @@ """ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export. """ - -REQUIRES = ['dataset'] - -from ..core.common import Paths -from ..core.error import Res from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import NamedTuple, Iterator, List + + +from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats +from my.core.cfg import make_config +from my.core.sqlite import sqlite_connection + from my.config import twint as user_config # TODO move to twitter.twint config structure @@ -17,16 +21,9 @@ class twint(user_config): #### -from ..core.cfg import make_config config = make_config(twint) -from datetime import datetime, timezone -from typing import NamedTuple, Iterator, List -from pathlib import Path - -from ..core.common import get_files, LazyLogger, Json, datetime_aware - log = LazyLogger(__name__) @@ -110,25 +107,19 @@ WHERE {where} ORDER BY T.created_at ''' -def _get_db(): - from ..core.dataset import connect_readonly - db_path = get_db_path() - return connect_readonly(db_path) - def tweets() -> Iterator[Res[Tweet]]: - db = _get_db() - res = db.query(_QUERY.format(where='F.tweet_id IS NULL')) - yield from map(Tweet, res) + with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db: + res = db.execute(_QUERY.format(where='F.tweet_id IS NULL')) + yield from map(Tweet, res) def likes() -> Iterator[Res[Tweet]]: - db = _get_db() - res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL')) - yield from map(Tweet, res) + with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db: + res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL')) + yield from map(Tweet, res) -from ..core import stat, Stats def stats() -> Stats: return { **stat(tweets), From bb5ad2b6ac61b402d5b73e921f11df3d99f9a802 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 7 Feb 2023 01:39:34 +0000 Subject: [PATCH 065/302] core: make hpi install more defensive, just warn on no requirements this is useful for backwards compatibility if modules remove their requirements --- my/core/__main__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff 
--git a/my/core/__main__.py b/my/core/__main__.py index d8e9ebd..11f32fc 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -344,8 +344,8 @@ def _requires(modules: Sequence[str]) -> Sequence[str]: reqs = mod.requires if reqs is None: - error(f"Module {mod.name} has no REQUIRES specification") - sys.exit(1) + warning(f"Module {mod.name} has no REQUIRES specification") + continue for r in reqs: if r not in res: res.append(r) @@ -369,6 +369,10 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) - requirements = _requires(module) + if len(requirements) == 0: + warning('requirements list is empty, no need to install anything') + return + pre_cmd = [ sys.executable, '-m', 'pip', 'install', From fb0c1289f06e8ddf9edc2434e986cd3af86d72f4 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 8 Feb 2023 01:44:20 +0000 Subject: [PATCH 066/302] my.fbmessenger.export: use context manager to properly close sqlite connection --- my/fbmessenger/export.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/my/fbmessenger/export.py b/my/fbmessenger/export.py index 0edb571..3a9d227 100644 --- a/my/fbmessenger/export.py +++ b/my/fbmessenger/export.py @@ -7,10 +7,13 @@ REQUIRES = [ 'git+https://github.com/karlicoss/fbmessengerexport', ] +from contextlib import ExitStack, contextmanager from dataclasses import dataclass from pathlib import Path from typing import Iterator +from my.core import PathIsh, Res, stat, Stats +from my.core.warnings import high from my.config import fbmessenger as user_config import fbmessengerexport.dal as messenger @@ -22,7 +25,6 @@ _new_section = getattr(user_config, 'fbmessengerexport', None) _old_attr = getattr(user_config, 'export_db', None) if _new_section is None and _old_attr is not None: - from my.core.warnings import high high("""DEPRECATED! 
Please modify your fbmessenger config to look like: class fbmessenger: @@ -35,24 +37,26 @@ class fbmessenger: ### -from ..core import PathIsh @dataclass class config(user_config.fbmessengerexport): export_db: PathIsh -def _dal() -> messenger.DAL: - return messenger.DAL(config.export_db) +@contextmanager +def _dal() -> Iterator[messenger.DAL]: + model = messenger.DAL(config.export_db) + with ExitStack() as stack: + if hasattr(model, '__dal__'): # defensive to support legacy fbmessengerexport + stack.enter_context(model) + yield model -from ..core import Res def messages() -> Iterator[Res[messenger.Message]]: - model = _dal() - for t in model.iter_threads(): - yield from t.iter_messages() + with _dal() as model: + for t in model.iter_threads(): + yield from t.iter_messages() -from ..core import stat, Stats def stats() -> Stats: return stat(messages) @@ -75,11 +79,9 @@ def dump_chat_history(where: PathIsh) -> None: p = Path(where) assert not p.exists() or p.is_dir() - model = _dal() - from shutil import rmtree from tempfile import TemporaryDirectory - with TemporaryDirectory() as tdir: + with TemporaryDirectory() as tdir, _dal() as model: td = Path(tdir) _dump_helper(model, td) From 5ac5636e7f366d75e103f66038b89bab7b0e2880 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 9 Feb 2023 00:13:56 +0000 Subject: [PATCH 067/302] core: better support for ad-hoc configs properly reload/unload the relevant modules so hopefully no more weird hacks should be required relevant - https://github.com/karlicoss/promnesia/issues/340 - https://github.com/karlicoss/HPI/issues/46 --- my/core/cfg.py | 51 ++++++++++++++++++++++++++++++++++++---- my/simple.py | 21 +++++++++++++++++ tests/test_tmp_config.py | 33 ++++++++++++++++++++++++++ tox.ini | 7 +++--- 4 files changed, 104 insertions(+), 8 deletions(-) create mode 100644 my/simple.py create mode 100644 tests/test_tmp_config.py diff --git a/my/core/cfg.py b/my/core/cfg.py index 4b5cbed..d69f356 100644 --- a/my/core/cfg.py +++ b/my/core/cfg.py @@ -44,12 +44,53 @@ def override_config(config: F) -> Iterator[F]: delattr(config, k) -# helper for tests? not sure if could be useful elsewhere +import importlib +import sys +from typing import Optional, Set +ModuleRegex = str @contextmanager -def tmp_config(): - import my.config as C - with override_config(C): - yield C # todo not sure? 
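
Regarding the _dal() change in PATCH 066 above: entering a context manager only when the object actually supports it is a handy shim while a dependency migrates its API. The patch probes a library-specific attribute; a generic version of the same idea, with illustrative names, might probe __exit__ instead:

```
from contextlib import ExitStack, contextmanager
from typing import Any, Iterator

@contextmanager
def maybe_enter(obj: Any) -> Iterator[Any]:
    with ExitStack() as stack:
        if hasattr(obj, '__exit__'):  # only newer versions are context managers
            stack.enter_context(obj)
        yield obj
```
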
+def _reload_modules(modules: ModuleRegex) -> Iterator[None]: + def loaded_modules() -> Set[str]: + return {name for name in sys.modules if re.fullmatch(modules, name)} + + modules_before = loaded_modules() + + for m in modules_before: + importlib.reload(sys.modules[m]) + + try: + yield + finally: + modules_after = loaded_modules() + for m in modules_after: + if m in modules_before: + # was previously loaded, so need to reload to pick up old config + importlib.reload(sys.modules[m]) + else: + # wasn't previously loaded, so need to unload it + # otherwise it might fail due to missing config etc + sys.modules.pop(m, None) + + +from contextlib import ExitStack +import re +@contextmanager +def tmp_config(*, modules: Optional[ModuleRegex]=None, config=None): + if modules is None: + assert config is None + if modules is not None: + assert config is not None + + import my.config + with ExitStack() as module_reload_stack, override_config(my.config) as new_config: + if config is not None: + overrides = {k: v for k, v in vars(config).items() if not k.startswith('__')} + for k, v in overrides.items(): + setattr(new_config, k, v) + + if modules is not None: + module_reload_stack.enter_context(_reload_modules(modules)) + yield new_config def test_tmp_config() -> None: diff --git a/my/simple.py b/my/simple.py new file mode 100644 index 0000000..7462291 --- /dev/null +++ b/my/simple.py @@ -0,0 +1,21 @@ +''' +Just a demo module for testing and documentation purposes +''' +from dataclasses import dataclass +from typing import Iterator + +from my.core import make_config + +from my.config import simple as user_config + + +@dataclass +class simple(user_config): + count: int + + +config = make_config(simple) + + +def items() -> Iterator[int]: + yield from range(config.count) diff --git a/tests/test_tmp_config.py b/tests/test_tmp_config.py new file mode 100644 index 0000000..eb26e54 --- /dev/null +++ b/tests/test_tmp_config.py @@ -0,0 +1,33 @@ +from pathlib import Path +import tempfile + +from my.core.cfg import tmp_config + +import pytest + + +def _init_default_config(): + import my.config + class default_config: + count = 5 + my.config.simple = default_config # type: ignore[attr-defined] + + +def test_tmp_config() -> None: + ## ugh. 
ideally this would be on the top level (would be a better test)
+    ## but pytest imports everything first, executes hooks, and some reset_modules() fixtures mess stuff up
+    ## later would be nice to be a bit more careful about them
+    _init_default_config()
+    from my.simple import items
+    ##
+
+    assert len(list(items())) == 5
+
+    class config:
+        class simple:
+            count = 3
+
+    with tmp_config(modules='my.simple', config=config):
+        assert len(list(items())) == 3
+
+    assert len(list(items())) == 5
diff --git a/tox.ini b/tox.ini
index ed2a084..5ae76f3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -20,9 +20,10 @@ passenv =
 commands =
     pip install -e .[testing]
     {envpython} -m pytest \
-        tests/core.py \
-        tests/sqlite.py \
-        tests/get_files.py \
+        tests/core.py            \
+        tests/sqlite.py          \
+        tests/get_files.py       \
+        tests/test_tmp_config.py \
         {posargs}

From 0e884fe166cc8086e114aa34be5220932b76370a Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Thu, 9 Feb 2023 00:53:24 +0000
Subject: [PATCH 068/302] core/modules: switch away from using override_config to tmp_config in some tests & fake data generators

---
 my/core/cfg.py         |  9 +++++++--
 my/core/core_config.py |  4 ++--
 my/emfit/__init__.py   | 21 +++++++++++++++------
 my/endomondo.py        | 18 +++++++++++-------
 my/rescuetime.py       | 21 +++++++++++++--------
 5 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/my/core/cfg.py b/my/core/cfg.py
index d69f356..3321a4c 100644
--- a/my/core/cfg.py
+++ b/my/core/cfg.py
@@ -28,7 +28,7 @@ F = TypeVar('F')
 from contextlib import contextmanager
 from typing import Iterator
 @contextmanager
-def override_config(config: F) -> Iterator[F]:
+def _override_config(config: F) -> Iterator[F]:
     '''
     Temporary override for config's parameters, useful for testing/fake data/etc.
     '''
@@ -82,7 +82,7 @@ def tmp_config(*, modules: Optional[ModuleRegex]=None, config=None):
         assert config is not None
 
     import my.config
-    with ExitStack() as module_reload_stack, override_config(my.config) as new_config:
+    with ExitStack() as module_reload_stack, _override_config(my.config) as new_config:
         if config is not None:
             overrides = {k: v for k, v in vars(config).items() if not k.startswith('__')}
             for k, v in overrides.items():
@@ -104,3 +104,8 @@ def test_tmp_config() -> None:
     # todo hmm. not sure what should do about new properties??
     assert not hasattr(c, 'extra')
     assert c.google != 'whatever'
+
+
+###
+# todo properly deprecate, this isn't really meant for public use
+override_config = _override_config
diff --git a/my/core/core_config.py b/my/core/core_config.py
index 48f3eb4..f87a1ba 100644
--- a/my/core/core_config.py
+++ b/my/core/core_config.py
@@ -123,8 +123,8 @@ from contextlib import contextmanager as ctx
 @ctx
 def _reset_config() -> Iterator[Config]:
     # todo maybe have this decorator for the whole of my.config?
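
For reference, the caller's side of the tmp_config from PATCH 067 above (this mirrors tests/test_tmp_config.py): modules matching the regex are reloaded against the overridden config on entry, then reloaded or unloaded on exit, so no manual cleanup is needed. As in that test, my.config still needs a default simple block outside the override:

```
from my.core.cfg import tmp_config

class overrides:
    class simple:
        count = 3

with tmp_config(modules='my.simple', config=overrides):
    from my.simple import items
    assert len(list(items())) == 3
# outside the block, my.simple sees the original config again
```
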
- from .cfg import override_config - with override_config(config) as cc: + from .cfg import _override_config + with _override_config(config) as cc: cc.enabled_modules = None cc.disabled_modules = None cc.cache_dir = None diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index 997ba6c..a081416 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -3,6 +3,11 @@ Consumes data exported by https://github.com/karlicoss/emfitexport """ + +REQUIRES = [ + 'git+https://github.com/karlicoss/emfitexport', +] + from pathlib import Path from typing import Dict, List, Iterable, Any, Optional @@ -140,16 +145,20 @@ def stats() -> Stats: from contextlib import contextmanager from typing import Iterator @contextmanager -def fake_data(nights: int=500) -> Iterator[None]: - from ..core.cfg import override_config +def fake_data(nights: int=500) -> Iterator: + from my.core.cfg import tmp_config from tempfile import TemporaryDirectory - with override_config(config) as cfg, TemporaryDirectory() as td: + with TemporaryDirectory() as td: tdir = Path(td) - cfg.export_path = tdir - gen = dal.FakeData() gen.fill(tdir, count=nights) - yield + + class override: + class emfit: + export_path = tdir + + with tmp_config(modules=__name__, config=override) as cfg: + yield cfg # TODO remove/deprecate it? I think used by timeline diff --git a/my/endomondo.py b/my/endomondo.py index 0df7aa9..0fa396f 100644 --- a/my/endomondo.py +++ b/my/endomondo.py @@ -87,20 +87,24 @@ def stats() -> Stats: # TODO make sure it's possible to 'advise' functions and override stuff from contextlib import contextmanager +from typing import Iterator @contextmanager -def fake_data(count: int=100): - from .core.cfg import override_config +def fake_data(count: int=100) -> Iterator: + from my.core.cfg import tmp_config from tempfile import TemporaryDirectory import json - with override_config(endomondo) as cfg, TemporaryDirectory() as td: + with TemporaryDirectory() as td: tdir = Path(td) - cfg.export_path = tdir - - # todo would be nice to somehow expose the generator so it's possible to hack from the outside? fd = dal.FakeData() data = fd.generate(count=count) jf = tdir / 'data.json' jf.write_text(json.dumps(data)) - yield + class override: + class endomondo: + export_path = tdir + + with tmp_config(modules=__name__, config=override) as cfg: + # todo would be nice to somehow expose the generator so it's possible to hack from the outside? + yield cfg diff --git a/my/rescuetime.py b/my/rescuetime.py index 5d64375..40aa6b7 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -58,22 +58,27 @@ def stats() -> Stats: # basically, hack config and populate it with fake data? fake data generated by DAL, but the rest is handled by this? -from typing import Iterator from contextlib import contextmanager +from typing import Iterator # todo take seed, or what? @contextmanager -def fake_data(rows: int=1000) -> Iterator[None]: +def fake_data(rows: int=1000) -> Iterator: # todo also disable cachew automatically for such things? 
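The three `fake_data` generators touched by this patch (emfit, endomondo, rescuetime) all land on the same shape, so the pattern is worth spelling out once. A sketch, assuming a hypothetical module whose config section exposes an `export_path` and whose DAL can write fake inputs:

```python
from contextlib import contextmanager
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Iterator

@contextmanager
def fake_data(count: int = 100) -> Iterator:
    from my.core.cfg import tmp_config
    with TemporaryDirectory() as td:
        tdir = Path(td)
        # 1. write `count` fake items into tdir (module specific, elided here)
        ...
        # 2. point the module's config section at the temporary directory
        class override:
            class hypothetical_module:  # must match the config section the module reads
                export_path = tdir
        # 3. reload this module under the temporary config for the duration of the block
        with tmp_config(modules=__name__, config=override) as cfg:
            yield cfg
```

Note that `TemporaryDirectory` is the outermost context manager on purpose: the fake inputs have to outlive the `tmp_config` block that reads them.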
- from .core.cachew import disabled_cachew - from .core.cfg import override_config + from my.core.cfg import tmp_config + from my.core.cachew import disabled_cachew from tempfile import TemporaryDirectory - with disabled_cachew(), override_config(config) as cfg, TemporaryDirectory() as td: + import json + with disabled_cachew(), TemporaryDirectory() as td: tdir = Path(td) - cfg.export_path = tdir f = tdir / 'rescuetime.json' - import json f.write_text(json.dumps(dal.fake_data_generator(rows=rows))) - yield + + class override: + class rescuetime: + export_path = tdir + + with tmp_config(modules=__name__, config=override) as cfg: + yield cfg # TODO ok, now it's something that actually could run on CI! # todo would be kinda nice if doctor could run against the fake data, to have a basic health check of the module? From 458633ea966a558128252886d0e7f36523cd3e5d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 18 Feb 2023 18:57:07 +0000 Subject: [PATCH 069/302] my.tinder.android: add a bit of logging --- my/tinder/android.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/my/tinder/android.py b/my/tinder/android.py index 9f68992..5b5fdf0 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -13,10 +13,13 @@ from typing import Sequence, Iterator, Union, Dict, List, Mapping from more_itertools import unique_everseen -from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware +from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware, LazyLogger from my.core.sqlite import sqlite_connection +logger = LazyLogger(__name__) + + from my.config import tinder as user_config @dataclass class config(user_config.android): @@ -83,7 +86,9 @@ Entity = Union[Person, Match, Message] def _entities() -> Iterator[Res[_Entity]]: - for db_file in inputs(): + dbs = inputs() + for i, db_file in enumerate(dbs): + logger.debug(f'processing {db_file} {i}/{len(dbs)}') with sqlite_connection(db_file, immutable=True, row_factory='row') as db: yield from _handle_db(db) From 6594ad24dc28575aa7204db46ddabf094e59438e Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 18 Feb 2023 22:34:06 +0000 Subject: [PATCH 070/302] my.tinder.android: speedup unique_everseen by adding unsafe_hash --- my/tinder/android.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/my/tinder/android.py b/my/tinder/android.py index 5b5fdf0..18b59d8 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -41,7 +41,7 @@ class _BaseMatch: id: str -@dataclass +@dataclass(unsafe_hash=True) class _Match(_BaseMatch): person_id: str @@ -61,7 +61,7 @@ class _BaseMessage: text: str -@dataclass +@dataclass(unsafe_hash=True) class _Message(_BaseMessage): match_id: str from_id: str From 6493859ba52e87ffe66c710fc6c2ee353d25d2a9 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 19 Feb 2023 00:23:28 +0000 Subject: [PATCH 071/302] my.telegram: initial module from telegram_backup --- my/config.py | 7 ++++ my/core/sqlite.py | 2 + my/telegram/telegram_backup.py | 77 ++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) create mode 100644 my/telegram/telegram_backup.py diff --git a/my/config.py b/my/config.py index 52af04d..7bfae09 100644 --- a/my/config.py +++ b/my/config.py @@ -159,3 +159,10 @@ class browser: export_path: Paths = '' class active_browser: export_path: Paths = '' + + +class telegram: + class telegram_backup: + export_path: PathIsh = '' + + diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 
3c1902d..7c02940 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -46,6 +46,8 @@ def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Option dbp = f'file:{db}' # https://www.sqlite.org/draft/uri.html#uriimmutable if immutable: + # assert results in nicer error than sqlite3.OperationalError + assert Path(db).exists(), db dbp = f'{dbp}?immutable=1' row_factory_: Any = None if row_factory is not None: diff --git a/my/telegram/telegram_backup.py b/my/telegram/telegram_backup.py new file mode 100644 index 0000000..6c33e39 --- /dev/null +++ b/my/telegram/telegram_backup.py @@ -0,0 +1,77 @@ +""" +Telegram data via [fabianonline/telegram_backup](https://github.com/fabianonline/telegram_backup) tool +""" + +from dataclasses import dataclass +from datetime import datetime, timezone +import sqlite3 +from typing import Dict, Iterator + +from my.core import datetime_aware, PathIsh +from my.core.sqlite import sqlite_connection + +from my.config import telegram as user_config + + +@dataclass +class config(user_config.telegram_backup): + # path to the export database.sqlite + export_path: PathIsh + + +@dataclass +class Chat: + id: str + name: str + # not sure if need type? + + +@dataclass +class User: + id: str + name: str + + +@dataclass +class Message: + id: int + time: datetime_aware + chat: Chat + sender: User + text: str + + +Chats = Dict[str, Chat] +def _message_from_row(r: sqlite3.Row, *, chats: Chats) -> Message: + ts = r['time'] + time = datetime.fromtimestamp(ts, tz=timezone.utc) + chat = chats[r['source_id']] + sender = chats[r['sender_id']] + return Message( + id=r['message_id'], + time=time, + chat=chat, + sender=sender, + text=r['text'], + ) + + +def messages() -> Iterator[Message]: + with sqlite_connection(config.export_path, immutable=True, row_factory='row') as db: + + chats: Chats = {} + for r in db.execute('SELECT * FROM chats'): + chat = Chat(id=r['id'], name=r['name']) + assert chat.id not in chats + chats[chat.id] = chat + + for r in db.execute('SELECT * FROM users'): + chat = Chat(id=r['id'], name=f'{r["first_name"]} {r["last_name"]}') + assert chat.id not in chats + chats[chat.id] = chat + + # TODO order by? 
not sure + for r in db.execute('SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message")'): + # seems like the only remaining have message_type = 'message' + yield _message_from_row(r, chats=chats) + From af874d2d759a91ecec8e6f0a53541a18a169385e Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 19 Feb 2023 02:45:08 +0000 Subject: [PATCH 072/302] my.fbmessenger.android: minor refactoring, comments & error handling --- my/fbmessenger/android.py | 119 +++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 61 deletions(-) diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index ef3711a..99afc15 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -5,19 +5,22 @@ from __future__ import annotations from dataclasses import dataclass from datetime import datetime -import json from pathlib import Path import sqlite3 from typing import Iterator, Sequence, Optional, Dict, Union from more_itertools import unique_everseen -from my.core import get_files, Paths, datetime_naive, Res, assert_never +from my.core import get_files, Paths, datetime_naive, Res, assert_never, LazyLogger +from my.core.error import echain from my.core.sqlite import sqlite_connection from my.config import fbmessenger as user_config +logger = LazyLogger(__name__) + + @dataclass class config(user_config.android): # paths[s]/glob to the exported sqlite databases @@ -66,68 +69,62 @@ class Message(_BaseMessage): Entity = Union[Sender, Thread, _Message] def _entities() -> Iterator[Res[Entity]]: - for f in inputs(): + dbs = inputs() + for i, f in enumerate(dbs): + logger.debug(f'processing {f} {i}/{len(dbs)}') with sqlite_connection(f, immutable=True, row_factory='row') as db: - yield from _process_db(db) + try: + yield from _process_db(db) + except Exception as e: + yield echain(RuntimeError(f'While processing {f}'), cause=e) + + +def _normalise_user_id(ukey: str) -> str: + # trying to match messages.author from fbchat + prefix = 'FACEBOOK:' + assert ukey.startswith(prefix), ukey + return ukey[len(prefix):] + + +def _normalise_thread_id(key) -> str: + # works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user + return key.split(':')[1] def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: - # works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user - threadkey2id = lambda key: key.split(':')[1] for r in db.execute('SELECT * FROM threads'): - try: - yield Thread( - id=threadkey2id(r['thread_key']), - name=r['name'], - ) - except Exception as e: - yield e - continue + yield Thread( + id=_normalise_thread_id(r['thread_key']), + name=r['name'], + ) + + for r in db.execute('''SELECT * FROM thread_users'''): + # for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None + # but they are still referenced, so need to keep + name = r['name'] or '' + yield Sender( + id=_normalise_user_id(r['user_key']), + name=name, + ) - for r in db.execute('SELECT * FROM messages ORDER BY timestamp_ms'): - mtype: int = r['msg_type'] - if mtype == -1: - # likely immediately deleted or something? doesn't have any data at all - continue - - user_id = None - try: - # todo could use thread_users? 
-            sj = json.loads(r['sender'])
-            ukey: str = sj['user_key']
-            prefix = 'FACEBOOK:'
-            assert ukey.startswith(prefix), ukey
-            user_id = ukey[len(prefix):]
-            yield Sender(
-                id=user_id,
-                name=sj['name'],
-            )
-        except Exception as e:
-            yield e
-            continue
-
-        thread_id = None
-        try:
-            thread_id = threadkey2id(r['thread_key'])
-        except Exception as e:
-            yield e
-            continue
-
-        try:
-            assert user_id is not None
-            assert thread_id is not None
-            yield _Message(
-                id=r['msg_id'],
-                dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000),
-                # is_incoming=False, TODO??
-                text=r['text'],
-                thread_id=thread_id,
-                sender_id=user_id,
-                reply_to_id=r['message_replied_to_id']
-            )
-        except Exception as e:
-            yield e
+    for r in db.execute('''
+    SELECT *, json_extract(sender, "$.user_key") AS user_key FROM messages
+    WHERE msg_type NOT IN (
+        -1,  /* these don't have any data at all, likely immediately deleted or something? */
+        2    /* these are 'left group' system messages, also a bit annoying since they might reference nonexistent users */
+    )
+    ORDER BY timestamp_ms  /* they aren't in order in the database, so need to sort */
+    '''):
+        yield _Message(
+            id=r['msg_id'],
+            dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000),
+            # is_incoming=False, TODO??
+            text=r['text'],
+            thread_id=_normalise_thread_id(r['thread_key']),
+            sender_id=_normalise_user_id(r['user_key']),
+            reply_to_id=r['message_replied_to_id']
+        )


 def messages() -> Iterator[Res[Message]]:
@@ -146,12 +143,12 @@ def messages() -> Iterator[Res[Message]]:
             continue
         if isinstance(x, _Message):
             reply_to_id = x.reply_to_id
+            # hmm, reply_to may be missing due to the synthetic nature of the export, so have to be defensive
+            reply_to = None if reply_to_id is None else msgs.get(reply_to_id)
+            # also would be interesting to merge together entities rather than resulting messages from different sources..
+            # then the merging thing could be moved to common?
             try:
                 sender = senders[x.sender_id]
-                # hmm, reply_to be missing due to the synthetic nature of export
-                # also would be interesting to merge together entities rather than resuling messages from different sources..
-                # then the merging thing could be moved to common?
- reply_to = None if reply_to_id is None else msgs[reply_to_id] thread = threads[x.thread_id] except Exception as e: yield e From eff9c02886bbab89fbdb17a78b77a147699fe232 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 19 Feb 2023 03:55:11 +0000 Subject: [PATCH 073/302] my.fbmessenger.android: add optional facebook_id --- my/config.py | 1 + my/fbmessenger/android.py | 59 ++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/my/config.py b/my/config.py index 7bfae09..313b464 100644 --- a/my/config.py +++ b/my/config.py @@ -142,6 +142,7 @@ class hackernews: class fbmessenger: class fbmessengerexport: export_db: PathIsh + facebook_id: Optional[str] class android: export_path: Paths diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index 99afc15..6a8a0eb 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -7,11 +7,11 @@ from dataclasses import dataclass from datetime import datetime from pathlib import Path import sqlite3 -from typing import Iterator, Sequence, Optional, Dict, Union +from typing import Iterator, Sequence, Optional, Dict, Union, List from more_itertools import unique_everseen -from my.core import get_files, Paths, datetime_naive, Res, assert_never, LazyLogger +from my.core import get_files, Paths, datetime_naive, Res, assert_never, LazyLogger, make_config from my.core.error import echain from my.core.sqlite import sqlite_connection @@ -22,10 +22,17 @@ logger = LazyLogger(__name__) @dataclass -class config(user_config.android): +class Config(user_config.android): # paths[s]/glob to the exported sqlite databases export_path: Paths + facebook_id: Optional[str] = None + + +# hmm. this is necessary for default value (= None) to work +# otherwise Config.facebook_id is always None.. +config = make_config(Config) + def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -40,7 +47,7 @@ class Sender: @dataclass(unsafe_hash=True) class Thread: id: str - name: Optional[str] + name: Optional[str] # isn't set for groups or one to one messages # todo not sure about order of fields... @dataclass @@ -92,19 +99,45 @@ def _normalise_thread_id(key) -> str: def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: - - for r in db.execute('SELECT * FROM threads'): - yield Thread( - id=_normalise_thread_id(r['thread_key']), - name=r['name'], - ) - + senders: Dict[str, Sender] = {} for r in db.execute('''SELECT * FROM thread_users'''): # for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None # but they are still referenced, so need to keep name = r['name'] or '' - yield Sender( - id=_normalise_user_id(r['user_key']), + user_key = r['user_key'] + s = Sender( + id=_normalise_user_id(user_key), + name=name, + ) + senders[user_key] = s + yield s + + self_id = config.facebook_id + thread_users: Dict[str, List[str]] = {} + for r in db.execute('SELECT * from thread_participants'): + thread_key = r['thread_key'] + user_key = r['user_key'] + if self_id is not None and user_key == f'FACEBOOK:{self_id}': + # exclude yourself, otherwise it's just spammy to show up in all participants + continue + + ll = thread_users.get(thread_key) + if ll is None: + ll = [] + thread_users[thread_key] = ll + ll.append(senders[user_key]) + + for r in db.execute('SELECT * FROM threads'): + thread_key = r['thread_key'] + thread_type = thread_key.split(':')[0] + if thread_type == 'MONTAGE': # no idea what this is? 
+ continue + name = r['name'] # seems that it's only set for some groups + if name is None: + users = thread_users[thread_key] + name = ', '.join([u.name for u in users]) + yield Thread( + id=_normalise_thread_id(thread_key), name=name, ) From c63177e18687c5a39dbcf36b0ee2aa723fc33189 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 20 Feb 2023 23:40:13 +0000 Subject: [PATCH 074/302] general/ci: clean up mypy-misc pipeline, only exclude specific files instead marked some module configs which aren't really ready for public use as type: ignore --- my/body/blood.py | 2 +- my/body/weight.py | 2 +- my/books/kobo.py | 2 +- my/coding/codeforces.py | 4 +- my/coding/topcoder.py | 4 +- my/config.py | 82 ++++++++++++++++++++++++++++++++++ my/fbmessenger/android.py | 2 +- my/jawbone/__init__.py | 2 +- my/jawbone/plots.py | 6 +-- my/photos/main.py | 2 +- my/runnerup.py | 2 +- my/telegram/telegram_backup.py | 2 +- my/twitter/archive.py | 2 +- my/vk/favorites.py | 3 +- tox.ini | 79 +++++++++++++------------------- 15 files changed, 132 insertions(+), 64 deletions(-) diff --git a/my/body/blood.py b/my/body/blood.py index c1d66e2..e282068 100644 --- a/my/body/blood.py +++ b/my/body/blood.py @@ -13,7 +13,7 @@ import pandas as pd # type: ignore import orgparse -from my.config import blood as config +from my.config import blood as config # type: ignore[attr-defined] class Entry(NamedTuple): diff --git a/my/body/weight.py b/my/body/weight.py index 28688b6..659b759 100644 --- a/my/body/weight.py +++ b/my/body/weight.py @@ -10,7 +10,7 @@ from ..core.error import Res, set_error_datetime, extract_error_datetime from .. import orgmode -from my.config import weight as config +from my.config import weight as config # type: ignore[attr-defined] log = LazyLogger('my.body.weight') diff --git a/my/books/kobo.py b/my/books/kobo.py index d5f5416..2a469d0 100644 --- a/my/books/kobo.py +++ b/my/books/kobo.py @@ -4,4 +4,4 @@ warnings.high('my.books.kobo is deprecated! 
Please use my.kobo instead!') from ..core.util import __NOT_HPI_MODULE__ -from ..kobo import * +from ..kobo import * # type: ignore[no-redef] diff --git a/my/coding/codeforces.py b/my/coding/codeforces.py index 3793988..a4c7de2 100644 --- a/my/coding/codeforces.py +++ b/my/coding/codeforces.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -from my.config import codeforces as config +from my.config import codeforces as config # type: ignore[attr-defined] + from datetime import datetime, timezone from typing import NamedTuple import json from typing import Dict, Iterator + from ..core import get_files, Res, unwrap from ..core.compat import cached_property from ..core.konsume import ignore, wrap diff --git a/my/coding/topcoder.py b/my/coding/topcoder.py index 5711254..32a9ff8 100644 --- a/my/coding/topcoder.py +++ b/my/coding/topcoder.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -from my.config import topcoder as config +from my.config import topcoder as config # type: ignore[attr-defined] + from datetime import datetime from typing import NamedTuple import json from typing import Dict, Iterator + from ..core import get_files, Res, unwrap, Json from ..core.compat import cached_property from ..core.error import Res, unwrap diff --git a/my/config.py b/my/config.py index 313b464..bfae86e 100644 --- a/my/config.py +++ b/my/config.py @@ -14,8 +14,14 @@ from my.core import init ### +from datetime import tzinfo +from pathlib import Path +from typing import List + + from my.core import Paths, PathIsh + class hypothesis: # expects outputs from https://github.com/karlicoss/hypexport # (it's just the standard Hypothes.is export format) @@ -139,6 +145,10 @@ class hackernews: export_path: Paths +class materialistic: + export_path: Paths + + class fbmessenger: class fbmessengerexport: export_db: PathIsh @@ -155,6 +165,11 @@ class twitter: class talon: export_path: Paths + +class twint: + export_path: Paths + + class browser: class export: export_path: Paths = '' @@ -167,3 +182,70 @@ class telegram: export_path: PathIsh = '' +class demo: + data_path: Paths + username: str + timezone: tzinfo + + +class simple: + count: int + + +class vk_messages_backup: + storage_path: Path + + +class kobo: + export_path: Paths + + +class feedly: + export_path: Paths + + +class feedbin: + export_path: Paths + + +class taplog: + export_path: Paths + + +class lastfm: + export_path: Paths + + +class rescuetime: + export_path: Paths + + +class runnerup: + export_path: Paths + + +class emfit: + export_path: Path + timezone: tzinfo + excluded_sids: List[str] + + +class foursquare: + export_path: Paths + + +class rtm: + export_path: Paths + + +class imdb: + export_path: Paths + + +class roamresearch: + export_path: Paths + username: str + + + + diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index 6a8a0eb..69555cb 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -113,7 +113,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: yield s self_id = config.facebook_id - thread_users: Dict[str, List[str]] = {} + thread_users: Dict[str, List[Sender]] = {} for r in db.execute('SELECT * from thread_participants'): thread_key = r['thread_key'] user_key = r['user_key'] diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 50932bf..89f104a 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -10,7 +10,7 @@ from ..core.common import LazyLogger logger = LazyLogger(__name__) -from my.config import jawbone as config +from my.config import jawbone as config # type: 
ignore[attr-defined] BDIR = config.export_dir diff --git a/my/jawbone/plots.py b/my/jawbone/plots.py index 195ddb5..5332fe6 100755 --- a/my/jawbone/plots.py +++ b/my/jawbone/plots.py @@ -85,7 +85,7 @@ def iter_useful(data_file: str): # TODO <<< hmm. these files do contain deep and light sleep?? # also steps stats?? -from my.config import jawbone as config +from my.config import jawbone as config # type: ignore[attr-defined] p = config.export_dir / 'old_csv' # TODO with_my? @@ -95,7 +95,7 @@ files = [ p / "2017.csv", ] -from kython import concat, parse_date +from kython import concat, parse_date # type: ignore useful = concat(*(list(iter_useful(str(f))) for f in files)) # for u in useful: @@ -108,7 +108,7 @@ dates = [parse_date(u.date, yearfirst=True, dayfirst=False) for u in useful] # TODO filter outliers? # TODO don't need this anymore? it's gonna be in dashboards package -from kython.plotting import plot_timestamped +from kython.plotting import plot_timestamped # type: ignore for attr, lims, mavg, fig in [ # type: ignore ('light', (0, 400), 5, None), ('deep', (0, 600), 5, None), diff --git a/my/photos/main.py b/my/photos/main.py index 6be3163..69e5a46 100644 --- a/my/photos/main.py +++ b/my/photos/main.py @@ -19,7 +19,7 @@ from ..core.common import LazyLogger, mcachew, fastermime from ..core.error import Res, sort_res_by from ..core.cachew import cache_dir -from my.config import photos as config +from my.config import photos as config # type: ignore[attr-defined] logger = LazyLogger(__name__) diff --git a/my/runnerup.py b/my/runnerup.py index 8e31770..6140236 100644 --- a/my/runnerup.py +++ b/my/runnerup.py @@ -13,7 +13,7 @@ from typing import Iterable from .core import Res, get_files from .core.common import isoparse, Json -import tcxparser +import tcxparser # type: ignore[import] from my.config import runnerup as config diff --git a/my/telegram/telegram_backup.py b/my/telegram/telegram_backup.py index 6c33e39..cd82577 100644 --- a/my/telegram/telegram_backup.py +++ b/my/telegram/telegram_backup.py @@ -51,7 +51,7 @@ def _message_from_row(r: sqlite3.Row, *, chats: Chats) -> Message: id=r['message_id'], time=time, chat=chat, - sender=sender, + sender=User(id=sender.id, name=sender.name), text=r['text'], ) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index c59d7a1..bdd1497 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -12,7 +12,7 @@ except ImportError as ie: # must be caused by something else raise ie try: - from my.config import twitter as user_config # type: ignore[misc] + from my.config import twitter as user_config # type: ignore[misc,assignment] except ImportError: raise ie # raise the original exception.. 
must be something else else: diff --git a/my/vk/favorites.py b/my/vk/favorites.py index e6ccbf3..eb1a89b 100644 --- a/my/vk/favorites.py +++ b/my/vk/favorites.py @@ -3,7 +3,8 @@ from datetime import datetime import json from typing import NamedTuple, Iterable, Sequence, Optional -from my.config import vk as config + +from my.config import vk as config # type: ignore[attr-defined] class Favorite(NamedTuple): diff --git a/tox.ini b/tox.ini index 5ae76f3..a8af102 100644 --- a/tox.ini +++ b/tox.ini @@ -96,57 +96,38 @@ commands = pip install -e .[testing,optional] hpi module install --parallel \ - my.browser.export \ - my.orgmode \ - my.endomondo \ - my.github.ghexport \ - my.hypothesis \ - my.instapaper \ - my.pocket \ - my.reddit.rexport \ - my.reddit.pushshift \ - my.stackexchange.stexport \ - my.tinder.android \ - my.pinboard \ - my.arbtt \ - my.coding.commits \ - my.goodreads \ - my.pdfs \ - my.smscalls \ - my.location.gpslogger \ - my.location.via_ip \ - my.google.takeout.parser + my.arbtt \ + my.coding.commits \ + my.browser.export \ + my.github.ghexport \ + my.emfit \ + my.endomondo \ + my.fbmessenger.export \ + my.goodreads \ + my.google.takeout.parser \ + my.orgmode \ + my.hypothesis \ + my.instapaper \ + my.kobo \ + my.location.gpslogger \ + my.location.via_ip \ + my.pdfs \ + my.pinboard \ + my.pocket \ + my.reddit.pushshift \ + my.reddit.rexport \ + my.rescuetime \ + my.runnerup \ + my.stackexchange.stexport \ + my.smscalls \ + my.tinder.android + - # todo fuck. -p my.github isn't checking the subpackages?? wtf... - # guess it wants .pyi file?? {envpython} -m mypy --install-types --non-interactive \ - -p my.browser \ - -p my.endomondo \ - -p my.github.ghexport \ - -p my.github.gdpr \ - -p my.hypothesis \ - -p my.instapaper \ - -p my.pocket \ - -p my.smscalls \ - -p my.reddit \ - -p my.stackexchange.stexport \ - -p my.pinboard \ - -p my.body.exercise.cardio \ - -p my.body.exercise.cross_trainer \ - -p my.bluemaestro \ - -p my.location.google \ - -p my.location.google_takeout \ - -p my.location.via_ip \ - -p my.location.gpslogger \ - -p my.ip.common \ - -p my.time.tz.via_location \ - -p my.calendar.holidays \ - -p my.arbtt \ - -p my.coding.commits \ - -p my.goodreads \ - -p my.pdfs \ - -p my.bumble.android \ - -p my.tinder.android \ + -p my \ + --exclude 'my/coding/codeforces.py' \ + --exclude 'my/coding/topcoder.py' \ + --exclude 'my/jawbone/.*' \ --txt-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \ {posargs} From 07e7c62d02e9a53426b7050869ba72649ed9f564 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 20 Feb 2023 23:57:31 +0000 Subject: [PATCH 075/302] general/ci: mypy check tests --- .github/workflows/main.yml | 4 ++-- tests/bluemaestro.py | 24 ++++++++++++++++++------ tests/jawbone.py | 2 +- tests/pdfs.py | 3 ++- tests/takeout.py | 2 +- tests/test_tmp_config.py | 4 ++-- tox.ini | 12 +++++++++++- 7 files changed, 37 insertions(+), 14 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c45d99a..8b23921 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,12 +50,12 @@ jobs: - run: bash scripts/ci/run - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: .coverage.mypy-misc_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-misc/ - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms - uses: actions/upload-artifact@v2 + 
uses: actions/upload-artifact@v3 with: name: .coverage.mypy-core_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-core/ diff --git a/tests/bluemaestro.py b/tests/bluemaestro.py index 1416900..283bd77 100644 --- a/tests/bluemaestro.py +++ b/tests/bluemaestro.py @@ -1,13 +1,26 @@ -#!/usr/bin/env python3 from pathlib import Path +from typing import TYPE_CHECKING, Iterator, Any + from more_itertools import one -import pytest # type: ignore +import pytest + + +if TYPE_CHECKING: + from my.bluemaestro import Measurement +else: + Measurement = Any + + +def ok_measurements() -> Iterator[Measurement]: + from my.bluemaestro import measurements + for m in measurements(): + assert not isinstance(m, Exception) + yield m def test() -> None: - from my.bluemaestro import measurements - res2020 = [m for m in measurements() if '2020' in str(m.dt)] + res2020 = [m for m in ok_measurements() if '2020' in str(m.dt)] tp = [x for x in res2020 if x.temp == 2.1] assert len(tp) > 0 @@ -24,8 +37,7 @@ def test() -> None: def test_old_db() -> None: - from my.bluemaestro import measurements - res = list(measurements()) + res = list(ok_measurements()) r1 = one(x for x in res if x.dt.strftime('%Y%m%d %H:%M:%S') == '20181003 09:07:00') r2 = one(x for x in res if x.dt.strftime('%Y%m%d %H:%M:%S') == '20181003 09:19:00') diff --git a/tests/jawbone.py b/tests/jawbone.py index c53459d..776ac50 100644 --- a/tests/jawbone.py +++ b/tests/jawbone.py @@ -4,7 +4,7 @@ from datetime import date, time # todo private test.. move away def test_tz() -> None: - from my.jawbone import sleeps_by_date + from my.jawbone import sleeps_by_date # type: ignore[attr-defined] sleeps = sleeps_by_date() for s in sleeps.values(): assert s.sleep_start.tzinfo is not None diff --git a/tests/pdfs.py b/tests/pdfs.py index d5134bf..ae6318d 100644 --- a/tests/pdfs.py +++ b/tests/pdfs.py @@ -23,7 +23,8 @@ def test_with_error(with_config, tmp_path: Path) -> None: g = root / 'garbage.pdf' g.write_text('garbage') from my.config import pdfs - del pdfs.roots # meh. otherwise legacy config value 'wins' + # meh. otherwise legacy config value 'wins' + del pdfs.roots # type: ignore[attr-defined] pdfs.paths = (root,) from my.pdfs import annotations diff --git a/tests/takeout.py b/tests/takeout.py index f45a51d..7cc2164 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -13,7 +13,7 @@ from more_itertools import ilen def test_location_perf() -> None: # 2.80 s for 10 iterations and 10K points # TODO try switching to jq and see how it goes? not sure.. - print(ilen(islice(LT.iter_locations(), 0, 10000))) + print(ilen(islice(LT.iter_locations(), 0, 10000))) # type: ignore # in theory should support any HTML takeout file? diff --git a/tests/test_tmp_config.py b/tests/test_tmp_config.py index eb26e54..197d3f7 100644 --- a/tests/test_tmp_config.py +++ b/tests/test_tmp_config.py @@ -6,11 +6,11 @@ from my.core.cfg import tmp_config import pytest -def _init_default_config(): +def _init_default_config() -> None: import my.config class default_config: count = 5 - my.config.simple = default_config # type: ignore[attr-defined] + my.config.simple = default_config # type: ignore[attr-defined,assignment,misc] def test_tmp_config() -> None: diff --git a/tox.ini b/tox.ini index a8af102..6e7ca23 100644 --- a/tox.ini +++ b/tox.ini @@ -80,7 +80,7 @@ allowlist_externals = cat commands = pip install -e .[testing,optional] pip install orgparse # used it core.orgmode? - # todo add tests? 
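One detail from this patch worth calling out: `tests/bluemaestro.py` aliases the `Measurement` type through `TYPE_CHECKING`, the standard trick for annotating against a config-dependent module without importing it at collection time. Generically (with a hypothetical `my.somemodule`):

```python
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # only evaluated by mypy, so the config-dependent import is safe here
    from my.somemodule import Item
else:
    # at runtime nothing is imported and the annotation degrades to Any
    Item = Any

def first_item() -> 'Item':
    from my.somemodule import items  # deferred until config has been set up
    return next(items())
```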
+ {envpython} -m mypy --install-types --non-interactive \ -p my.core \ --txt-report .coverage.mypy-core \ @@ -88,10 +88,16 @@ commands = {posargs} cat .coverage.mypy-core/index.txt + # todo hmm might be better to move modules test in a separate subpackage? + {envpython} -m mypy --install-types --non-interactive \ + tests \ + --exclude 'tests/(bluemaestro|emfit|takeout|pdfs|jawbone).py' + # specific modules that are known to be mypy compliant (to avoid false negatives) # todo maybe split into separate jobs? need to add comment how to run [testenv:mypy-misc] +allowlist_externals = cat commands = pip install -e .[testing,optional] @@ -132,6 +138,10 @@ commands = --html-report .coverage.mypy-misc \ {posargs} # txt report is a bit more convenient to view on CI + cat .coverage.mypy-misc/index.txt + + {envpython} -m mypy --install-types --non-interactive \ + tests # note: this comment doesn't seem relevant anymore, but keeping it in case the issue happens again # > ugh ... need to reset HOME, otherwise user's site-packages are somehow leaking into mypy's path... From 130c273513ce082f7d12f06793b8f55f5ac99714 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 21 Feb 2023 02:29:36 +0000 Subject: [PATCH 076/302] my.telegram.telegram_backup enhancements - add chat handle - add permalink - more precise types --- my/telegram/telegram_backup.py | 36 +++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/my/telegram/telegram_backup.py b/my/telegram/telegram_backup.py index cd82577..3e2d6a7 100644 --- a/my/telegram/telegram_backup.py +++ b/my/telegram/telegram_backup.py @@ -5,7 +5,7 @@ Telegram data via [fabianonline/telegram_backup](https://github.com/fabianonline from dataclasses import dataclass from datetime import datetime, timezone import sqlite3 -from typing import Dict, Iterator +from typing import Dict, Iterator, Optional from my.core import datetime_aware, PathIsh from my.core.sqlite import sqlite_connection @@ -22,24 +22,42 @@ class config(user_config.telegram_backup): @dataclass class Chat: id: str - name: str + name: Optional[str] + # not all users have short handle + groups don't have them either? + # TODO hmm some groups have it -- it's just the tool doesn't dump them?? + handle: Optional[str] # not sure if need type? @dataclass class User: id: str - name: str + name: Optional[str] @dataclass class Message: + # NOTE: message id is NOT unique globally -- only with respect to chat! 
id: int time: datetime_aware chat: Chat sender: User text: str + @property + def permalink(self) -> str: + handle = self.chat.handle + if handle is None: + clink = str(self.chat.id) + else: + # FIXME add c/ + clink = f'{handle}' + + # NOTE: don't think deep links to messages work for private conversations sadly https://core.telegram.org/api/links#message-links + # NOTE: doesn't look like this works with private groups at all, doesn't even jump into it + return f'https://t.me/{clink}/{self.id}' + + Chats = Dict[str, Chat] def _message_from_row(r: sqlite3.Row, *, chats: Chats) -> Message: @@ -61,12 +79,20 @@ def messages() -> Iterator[Message]: chats: Chats = {} for r in db.execute('SELECT * FROM chats'): - chat = Chat(id=r['id'], name=r['name']) + chat = Chat(id=r['id'], name=r['name'], handle=None) assert chat.id not in chats chats[chat.id] = chat for r in db.execute('SELECT * FROM users'): - chat = Chat(id=r['id'], name=f'{r["first_name"]} {r["last_name"]}') + first = r["first_name"] + last = r["last_name"] + name: Optional[str] + if first is not None and last is not None: + name = f'{first} {last}' + else: + name = first or last + + chat = Chat(id=r['id'], name=name, handle=r['username']) assert chat.id not in chats chats[chat.id] = chat From 02c98143d51509d028ac6496476bab24a39eb7dd Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 28 Feb 2023 02:49:14 +0000 Subject: [PATCH 077/302] vk_messages_backup: better structure & exract richer information --- my/config.py | 1 + my/vk/vk_messages_backup.py | 117 ++++++++++++++++++++++++------------ 2 files changed, 78 insertions(+), 40 deletions(-) diff --git a/my/config.py b/my/config.py index bfae86e..e9eafec 100644 --- a/my/config.py +++ b/my/config.py @@ -194,6 +194,7 @@ class simple: class vk_messages_backup: storage_path: Path + user_id: int class kobo: diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index 0e8dc45..df1d18e 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -2,95 +2,132 @@ VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) ''' # note: could reuse the original repo, but little point I guess since VK closed their API - - from datetime import datetime +from dataclasses import dataclass import json from typing import Dict, Iterable, NamedTuple import pytz -from ..core import Json +from my.core import stat, Stats, Json, Res, datetime_aware from my.config import vk_messages_backup as config -Uid = str -Name = str +# I think vk_messages_backup used this tz? +# not sure if vk actually used to return this tz in api? +TZ = pytz.timezone('Europe/Moscow') -Users = Dict[Uid, Name] +Uid = int +@dataclass(frozen=True) +class User: + id: Uid + first_name: str + last_name: str + + +@dataclass(frozen=True) +class Chat: + chat_id: str + title: str + + +@dataclass +class Message: + dt: datetime_aware + chat: Chat + id: str # todo not sure it's unique? + user: User + body: str + + +Users = Dict[Uid, User] def users() -> Users: - # todo cache? files = list(sorted(config.storage_path.glob('user_*.json'))) res = {} for f in files: j = json.loads(f.read_text()) uid = j['id'] - uf = j['first_name'] - ul = j['last_name'] - res[uid] = f'{uf} {ul}' + res[uid] = User( + id=uid, + first_name=j['first_name'], + last_name=j['last_name'], + ) return res -class Message(NamedTuple): - chat_id: str - dt: datetime - user: Name - body: str +# USERCHAT_TITLE = " ... 
" +def _parse_chat(*, msg: Json, udict: Users) -> Chat: + group_chat_id = msg.get('chat_id') + if group_chat_id is not None: + chat_id = group_chat_id + title = msg['title'] + else: + user_id = msg.get('user_id') or msg.get('from_id') + assert user_id is not None + user = udict[user_id] + chat_id = user_id + title = f'{user.first_name} {user.last_name}' + return Chat( + chat_id=chat_id, + title=title, + ) -msk_tz = pytz.timezone('Europe/Moscow') -# todo hmm, vk_messages_backup used this tz? not sure if vk actually used to return this tz in api? +def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: + mid = msg['id'] + md = msg['date'] -def _parse(x: Json, chat_id: str, udict: Users) -> Message: - mid = x['id'] # todo not sure if useful? - md = x['date'] - - dt = datetime.fromtimestamp(md, msk_tz) + dt = datetime.fromtimestamp(md, tz=TZ) # todo attachments? e.g. url could be an attachment # todo might be forwarded? - mb = x.get('body') + mb = msg.get('body') if mb is None: - mb = x.get('text') - assert mb is not None - - mu = x.get('user_id') or x.get('peer_id') - assert mu is not None - out = x['out'] == 1 - # todo use name from the config? - user = 'you' if out else udict[mu] - - # todo conversation id?? + mb = msg.get('text') + assert mb is not None, msg + out = msg['out'] == 1 + if out: + user = udict[config.user_id] + else: + mu = msg.get('user_id') or msg.get('from_id') + assert mu is not None, msg + user = udict[mu] return Message( - chat_id=chat_id, dt=dt, + chat=chat, + id=mid, user=user, body=mb, ) -from ..core.error import Res def messages() -> Iterable[Res[Message]]: udict = users() uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \ list(sorted(config.storage_path.glob('groupchat_*.json'))) for f in uchats: - chat_id = f.stem.split('_')[-1] j = json.loads(f.read_text()) - for x in j: + # extract chat from last message + try: + last = j[-1] + chat = _parse_chat(msg=last, udict=udict) + except Exception as e: + yield e + continue + + for msg in j: try: - yield _parse(x, chat_id=chat_id, udict=udict) + yield _parse_msg(msg=msg, chat=chat, udict=udict) except Exception as e: yield e -def stats(): - from ..core import stat +def stats() -> Stats: return { **stat(users), **stat(messages), From a7099e2efcc989b4d420489c2a6dca830988956c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 28 Feb 2023 03:38:11 +0000 Subject: [PATCH 078/302] vk_messages_backup: more correct handling of group chats & better chat ids --- my/vk/vk_messages_backup.py | 39 ++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index df1d18e..78b595e 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -58,17 +58,26 @@ def users() -> Users: return res -# USERCHAT_TITLE = " ... 
" +GROUP_CHAT_MIN_ID = 2000000000 def _parse_chat(*, msg: Json, udict: Users) -> Chat: - group_chat_id = msg.get('chat_id') - if group_chat_id is not None: - chat_id = group_chat_id + # exported with newer api, peer_id is a proper identifier both for users and chats + peer_id = msg.get('peer_id') + if peer_id is not None: + chat_id = peer_id + else: + group_chat_id = msg.get('chat_id') + if group_chat_id is not None: + chat_id = GROUP_CHAT_MIN_ID + group_chat_id + else: + chat_id = msg['user_id'] + + is_group_chat = chat_id >= GROUP_CHAT_MIN_ID + if is_group_chat: title = msg['title'] else: user_id = msg.get('user_id') or msg.get('from_id') assert user_id is not None user = udict[user_id] - chat_id = user_id title = f'{user.first_name} {user.last_name}' return Chat( chat_id=chat_id, @@ -112,12 +121,20 @@ def messages() -> Iterable[Res[Message]]: list(sorted(config.storage_path.glob('groupchat_*.json'))) for f in uchats: j = json.loads(f.read_text()) - # extract chat from last message - try: - last = j[-1] - chat = _parse_chat(msg=last, udict=udict) - except Exception as e: - yield e + # ugh. very annoying, sometimes not possible to extract title from last message + # due to newer api... + # so just do in defensively until we succeed... + chat = None + ex = None + for m in reversed(j): + try: + chat = _parse_chat(msg=m, udict=udict) + except Exception as e: + ex = e + continue + if chat is None: + assert ex is not None + yield ex continue for msg in j: From 6dc5e7575ffa8ffee3c4aa3cedcb70e99ad6a7dd Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 28 Feb 2023 03:44:10 +0000 Subject: [PATCH 079/302] vk_messages_backup: add unique_everseen to prevent duplicate messages --- my/vk/vk_messages_backup.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index 78b595e..089605b 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -5,8 +5,9 @@ VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totkton from datetime import datetime from dataclasses import dataclass import json -from typing import Dict, Iterable, NamedTuple +from typing import Dict, Iterator, NamedTuple +from more_itertools import unique_everseen import pytz from my.core import stat, Stats, Json, Res, datetime_aware @@ -34,7 +35,7 @@ class Chat: title: str -@dataclass +@dataclass(frozen=True) class Message: dt: datetime_aware chat: Chat @@ -114,7 +115,7 @@ def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: ) -def messages() -> Iterable[Res[Message]]: +def _messages() -> Iterator[Res[Message]]: udict = users() uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \ @@ -144,6 +145,11 @@ def messages() -> Iterable[Res[Message]]: yield e +def messages() -> Iterator[Res[Message]]: + # seems that during backup messages were sometimes duplicated.. 
+ yield from unique_everseen(_messages()) + + def stats() -> Stats: return { **stat(users), From 98b086f746fdc519cf0d51b86a777a265c359552 Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Mon, 27 Feb 2023 20:30:06 -0800 Subject: [PATCH 080/302] location fallback (#263) see https://github.com/karlicoss/HPI/issues/262 * move home to fallback/via_home.py * move via_ip to fallback * add fallback model * add stub via_ip file * add fallback_locations for via_ip * use protocol for locations * estimate_from helper, via_home estimator, all.py * via_home: add accuracy, cache history * add datasources to gpslogger/google_takeout * tz/via_location.py: update import to fallback * denylist docs/installation instructions * tz.via_location: let user customize cachew refresh time * add via_ip.estimate_location using binary search * use estimate_location in via_home.get_location * tests: add gpslogger to location config stub * tests: install tz related libs in test env * tz: add regression test for broken windows dates * vendorize bisect_left from python src doesnt have a 'key' parameter till python3.10 --- doc/DENYLIST.md | 130 ++++++++++++++++++++++ doc/MODULE_DESIGN.org | 3 +- my/config.py | 4 +- my/core/compat.py | 32 ++++++ my/core/denylist.py | 178 +++++++++++++++++++++++++++++++ my/ip/all.py | 2 +- my/ip/common.py | 8 +- my/location/all.py | 1 + my/location/common.py | 23 +++- my/location/fallback/all.py | 53 +++++++++ my/location/fallback/common.py | 120 +++++++++++++++++++++ my/location/fallback/via_home.py | 104 ++++++++++++++++++ my/location/fallback/via_ip.py | 99 +++++++++++++++++ my/location/google_takeout.py | 7 +- my/location/gpslogger.py | 10 +- my/location/home.py | 74 +------------ my/location/via_ip.py | 38 +------ my/time/tz/via_location.py | 95 ++++++++++++----- setup.py | 1 + tests/core/test_denylist.py | 106 ++++++++++++++++++ tests/location.py | 23 +--- tests/location_fallback.py | 125 ++++++++++++++++++++++ tests/shared_config.py | 65 +++++++++++ tests/tz.py | 47 +++----- tox.ini | 8 +- 25 files changed, 1166 insertions(+), 190 deletions(-) create mode 100644 doc/DENYLIST.md create mode 100644 my/core/denylist.py create mode 100644 my/location/fallback/all.py create mode 100644 my/location/fallback/common.py create mode 100644 my/location/fallback/via_home.py create mode 100644 my/location/fallback/via_ip.py create mode 100644 tests/core/test_denylist.py create mode 100644 tests/location_fallback.py create mode 100644 tests/shared_config.py diff --git a/doc/DENYLIST.md b/doc/DENYLIST.md new file mode 100644 index 0000000..d57b8b1 --- /dev/null +++ b/doc/DENYLIST.md @@ -0,0 +1,130 @@ +For code reference, see: [`my.core.denylist.py`](../my/core/denylist.py) + +A helper module for defining denylists for sources programmatically (in layman's terms, this lets you remove some particular output from a module you don't want) + +Lets you specify a class, an attribute to match on, +and a JSON file containing a list of values to deny/filter out + +As an example, this will use the `my.ip` module, as filtering incorrect IPs was the original use case for this module: + +```python +class IP(NamedTuple): + addr: str + dt: datetime +``` + +A possible denylist file would contain: + +```json +[ + { + "addr": "192.168.1.1", + }, + { + "dt": "2020-06-02T03:12:00+00:00", + } +] +``` + +Note that if the value being compared to is not a single (non-array/object) JSON primitive +(str, int, float, bool, None), it will be converted to a string before comparison + +To use this in code: + +```python +from 
my.ip.all import ips
+filtered = DenyList("~/data/ip_denylist.json").filter(ips())
+```
+
+To add items to the denylist, in Python (in a one-off script):
+
+```python
+from my.ip.all import ips
+from my.core.denylist import DenyList
+
+d = DenyList("~/data/ip_denylist.json")
+
+for ip in ips():
+    # some custom code you define
+    if ip.addr == ...:
+        d.deny(key="addr", value=ip.addr)
+d.write()
+```
+
+... or interactively, which requires [`fzf`](https://github.com/junegunn/fzf) and [`pyfzf-iter`](https://pypi.org/project/pyfzf-iter/) (`python3 -m pip install pyfzf-iter`) to be installed:
+
+```python
+from my.ip.all import ips
+from my.core.denylist import DenyList
+
+d = DenyList("~/data/ip_denylist.json")
+d.deny_cli(ips())  # automatically writes after each selection
+```
+
+That will open up an interactive `fzf` prompt, where you can select an item to add to the denylist.
+
+This is meant for relatively simple filters, where you want to filter items out
+based on a single attribute of a namedtuple/dataclass. If you want to do something
+more complex, I would recommend overriding the `all.py` file for that source and
+writing your own filter function there.
+
+For more info on all.py:
+
+https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy
+
+This would typically be used in an overridden `all.py` file, or in a one-off script
+in which you filter out some items from a source, progressively adding more
+items to the denylist as you go.
+
+A potential `my/ip/all.py` file might look like (Sidenote: `discord` module from [here](https://github.com/seanbreckenridge/HPI)):
+
+```python
+from typing import Iterator
+
+from my.ip.common import IP
+from my.core.denylist import DenyList
+
+deny = DenyList("~/data/ip_denylist.json")
+
+# all possible data from the source
+def _ips() -> Iterator[IP]:
+    from my.ip import discord
+    # could add other imports here
+
+    yield from discord.ips()
+
+
+# filtered data
+def ips() -> Iterator[IP]:
+    yield from deny.filter(_ips())
+```
+
+To add items to the denylist, you could create a `__main__.py` in your namespace package (in this case, `my/ip/__main__.py`), with contents like:
+
+```python
+from my.ip import all
+
+if __name__ == "__main__":
+    all.deny.deny_cli(all.ips())
+```
+
+Which could then be called like: `python3 -m my.ip`
+
+Or, you could just run it from the command line:
+
+```
+python3 -c 'from my.ip import all; all.deny.deny_cli(all.ips())'
+```
+
+To edit the `all.py`, you could either:
+
+- install it as editable (`python3 -m pip install --user -e ./HPI`), and then edit the file directly
+- or, create a namespace package, which splits the package across multiple directories. For info on that see [`MODULE_DESIGN`](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#namespace-packages), [`reorder_editable`](https://github.com/seanbreckenridge/reorder_editable), and possibly the [`HPI-template`](https://github.com/seanbreckenridge/HPI-template) to create your own HPI namespace package with its own `all.py` file.
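+
+One more convenience: `filter` accepts an `invert` flag, which yields only the items that *would* be denied. This is handy for sanity-checking a denylist before relying on it:
+
+```python
+from my.ip.all import ips
+from my.core.denylist import DenyList
+
+d = DenyList("~/data/ip_denylist.json")
+
+# everything the denylist currently filters out
+denied = list(d.filter(ips(), invert=True))
+print(f"{len(denied)} items are being denied")
+```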
+ +TODO: link to seanbreckenridge/HPI-personal for an example of this once this is merged/settled + +Sidenote: the reason why we want to specifically override +the all.py and not just create a script that filters out the items you're +not interested in is because we want to be able to import from `my.ip.all` +or `my.location.all` from other modules and get the filtered results, without +having to mix data filtering logic with parsing/loading/caching (the stuff HPI does) diff --git a/doc/MODULE_DESIGN.org b/doc/MODULE_DESIGN.org index b17526d..691dd1c 100644 --- a/doc/MODULE_DESIGN.org +++ b/doc/MODULE_DESIGN.org @@ -226,8 +226,7 @@ The main goals are: - doesn't require you to maintain a fork of this repository, though you can maintain a separate HPI repository (so no patching/merge conflicts) - allows you to easily add/remove sources to the ~all.py~ module, either by: - overriding an ~all.py~ in your own repository - - just commenting out the source/adding 2 lines to import and ~yield - from~ your new source + - just commenting out the source/adding 2 lines to import and ~yield from~ your new source - doing nothing! (~import_source~ will catch the error and just warn you and continue to work without changing any code) diff --git a/my/config.py b/my/config.py index e9eafec..7075d1d 100644 --- a/my/config.py +++ b/my/config.py @@ -72,16 +72,18 @@ class google: from typing import Sequence, Union, Tuple -from datetime import datetime, date +from datetime import datetime, date, timedelta DateIsh = Union[datetime, date, str] LatLon = Tuple[float, float] class location: # todo ugh, need to think about it... mypy wants the type here to be general, otherwise it can't deduce # and we can't import the types from the module itself, otherwise would be circular. common module? home: Union[LatLon, Sequence[Tuple[DateIsh, LatLon]]] = (1.0, -1.0) + home_accuracy = 30_000.0 class via_ip: accuracy: float + for_duration: timedelta class gpslogger: export_path: Paths = '' diff --git a/my/core/compat.py b/my/core/compat.py index 3c825f2..dcf97cc 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -125,3 +125,35 @@ else: else: from typing import Dict TypedDict = Dict + + +# bisect_left doesnt have a 'key' parameter (which we use) +# till python3.10 +if sys.version_info[:2] <= (3, 9): + from typing import List, TypeVar, Any, Optional, Callable + X = TypeVar('X') + # copied from python src + def bisect_left(a: List[Any], x: Any, lo: int=0, hi: Optional[int]=None, *, key: Optional[Callable[..., Any]]=None) -> int: + if lo < 0: + raise ValueError('lo must be non-negative') + if hi is None: + hi = len(a) + # Note, the comparison uses "<" to match the + # __lt__() logic in list.sort() and in heapq. 
+        if key is None:
+            while lo < hi:
+                mid = (lo + hi) // 2
+                if a[mid] < x:
+                    lo = mid + 1
+                else:
+                    hi = mid
+        else:
+            while lo < hi:
+                mid = (lo + hi) // 2
+                if key(a[mid]) < x:
+                    lo = mid + 1
+                else:
+                    hi = mid
+        return lo
+else:
+    from bisect import bisect_left  # type: ignore[misc]
diff --git a/my/core/denylist.py b/my/core/denylist.py
new file mode 100644
index 0000000..fcf3e2b
--- /dev/null
+++ b/my/core/denylist.py
@@ -0,0 +1,178 @@
+"""
+A helper module for defining denylists for sources programmatically
+(in layman's terms, this lets you remove some output from a module you don't want)
+
+For docs, see doc/DENYLIST.md
+"""
+
+import sys
+import json
+import functools
+from collections import defaultdict
+from typing import TypeVar, Set, Any, Mapping, Iterator, Dict, List
+from pathlib import Path
+
+import click
+from more_itertools import seekable
+from my.core.serialize import dumps
+from my.core.common import PathIsh
+from my.core.warnings import medium
+
+
+T = TypeVar("T")
+
+DenyMap = Mapping[str, Set[Any]]
+
+
+def _default_key_func(obj: T) -> str:
+    return str(obj)
+
+
+class DenyList:
+    def __init__(self, denylist_file: PathIsh):
+        self.file = Path(denylist_file).expanduser().absolute()
+        self._deny_raw_list: List[Dict[str, Any]] = []
+        self._deny_map: DenyMap = defaultdict(set)
+
+        # deny cli, user can override these
+        self.fzf_path = None
+        self._fzf_options = ()
+        self._deny_cli_key_func = None
+
+    def _load(self) -> None:
+        if not self.file.exists():
+            medium(f"denylist file {self.file} does not exist")
+            return
+
+        deny_map: DenyMap = defaultdict(set)
+        data: List[Dict[str, Any]] = json.loads(self.file.read_text())
+        self._deny_raw_list = data
+
+        for ignore in data:
+            for k, v in ignore.items():
+                deny_map[k].add(v)
+
+        self._deny_map = deny_map
+
+    def load(self) -> DenyMap:
+        self._load()
+        return self._deny_map
+
+    def write(self) -> None:
+        if not self._deny_raw_list:
+            medium("no denylist data to write")
+            return
+        self.file.write_text(json.dumps(self._deny_raw_list))
+
+    @classmethod
+    def _is_json_primitive(cls, val: Any) -> bool:
+        return isinstance(val, (str, int, float, bool, type(None)))
+
+    @classmethod
+    def _stringify_value(cls, val: Any) -> Any:
+        # if it's a primitive, just return it
+        if cls._is_json_primitive(val):
+            return val
+        # otherwise, stringify-and-back so we can compare to
+        # json data loaded from the denylist file
+        return json.loads(dumps(val))
+
+    @classmethod
+    def _allow(cls, obj: T, deny_map: DenyMap) -> bool:
+        for deny_key, deny_set in deny_map.items():
+            # this should be done separately and not as part of the getattr
+            # because 'null'/None could actually be a value in the denylist,
+            # and the user may define behavior to filter that out
+            if not hasattr(obj, deny_key):
+                return False
+            val = cls._stringify_value(getattr(obj, deny_key))
+            # deny the object if its value is in the denylist
+            if val in deny_set:
+                return False
+        # if we tried all the denylist keys and didn't return False,
+        # then this object is allowed
+        return True
+
+    def filter(
+        self,
+        itr: Iterator[T],
+        invert: bool = False,
+    ) -> Iterator[T]:
+        denyf = functools.partial(self._allow, deny_map=self.load())
+        if invert:
+            return filter(lambda x: not denyf(x), itr)
+        return filter(denyf, itr)
+
+    def deny(self, key: str, value: Any, write: bool = False) -> None:
+        '''
+        add a key/value pair to the denylist
+        '''
+        if not self._deny_raw_list:
+            self._load()
+        self._deny_raw({key: self._stringify_value(value)}, write=write)
+
+    def _deny_raw(self, data:
Dict[str, Any], write: bool = False) -> None: + self._deny_raw_list.append(data) + if write: + self.write() + + def _prompt_keys(self, item: T) -> str: + import pprint + + click.echo(pprint.pformat(item)) + # TODO: extract keys from item by checking if its dataclass/NT etc.? + resp = click.prompt("Key to deny on").strip() + if not hasattr(item, resp): + click.echo(f"Could not find key '{resp}' on item", err=True) + return self._prompt_keys(item) + return resp + + def _deny_cli_remember( + self, + items: Iterator[T], + mem: Dict[str, T], + ) -> Iterator[str]: + keyf = self._deny_cli_key_func or _default_key_func + # i.e., convert each item to a string, and map str -> item + for item in items: + key = keyf(item) + mem[key] = item + yield key + + def deny_cli(self, itr: Iterator[T]) -> None: + try: + from pyfzf import FzfPrompt + except ImportError: + click.echo("pyfzf is required to use the denylist cli, run 'python3 -m pip install pyfzf_iter'", err=True) + sys.exit(1) + + # wrap in seekable so we can use it multiple times + # progressively caches the items as we iterate over them + sit = seekable(itr) + + prompt_continue = True + + while prompt_continue: + # reset the iterator + sit.seek(0) + # so we can map the selected string from fzf back to the original objects + memory_map: Dict[str, T] = {} + picker = FzfPrompt( + executable_path=self.fzf_path, default_options="--no-multi" + ) + picked_l = picker.prompt( + self._deny_cli_remember(itr, memory_map), + "--read0", + *self._fzf_options, + delimiter="\0", + ) + assert isinstance(picked_l, list) + if picked_l: + picked: T = memory_map[picked_l[0]] + key = self._prompt_keys(picked) + self.deny(key, getattr(picked, key), write=True) + click.echo(f"Added {self._deny_raw_list[-1]} to denylist", err=True) + else: + click.echo("No item selected", err=True) + + prompt_continue = click.confirm("Continue?") diff --git a/my/ip/all.py b/my/ip/all.py index b21b543..f4cdb37 100644 --- a/my/ip/all.py +++ b/my/ip/all.py @@ -13,7 +13,7 @@ from typing import Iterator from my.core.common import Stats, warn_if_empty -from .common import IP +from my.ip.common import IP @warn_if_empty diff --git a/my/ip/common.py b/my/ip/common.py index 82008e2..b4bfc8e 100644 --- a/my/ip/common.py +++ b/my/ip/common.py @@ -7,7 +7,7 @@ REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] from my.core import __NOT_HPI_MODULE__ import ipaddress -from typing import NamedTuple, Iterator +from typing import NamedTuple, Iterator, Tuple from datetime import datetime import ipgeocache @@ -23,6 +23,12 @@ class IP(NamedTuple): def ipgeocache(self) -> Json: return ipgeocache.get(self.addr) + @property + def latlon(self) -> Tuple[float, float]: + loc: str = self.ipgeocache()["loc"] + lat, _, lon = loc.partition(",") + return float(lat), float(lon) + @property def tzname(self) -> str: tz: str = self.ipgeocache()["timezone"] diff --git a/my/location/all.py b/my/location/all.py index eec4bcc..8d51a82 100644 --- a/my/location/all.py +++ b/my/location/all.py @@ -32,6 +32,7 @@ def _gpslogger_locations() -> Iterator[Location]: yield from gpslogger.locations() +# TODO: remove, user should use fallback.estimate_location or fallback.fallback_locations instead @import_source(module_name="my.location.via_ip") def _ip_locations() -> Iterator[Location]: from . 
import via_ip
diff --git a/my/location/common.py b/my/location/common.py
index b0676ec..fa8bdad 100644
--- a/my/location/common.py
+++ b/my/location/common.py
@@ -1,17 +1,34 @@
 from datetime import date, datetime
-from typing import Union, Tuple, NamedTuple, Optional
+from typing import Union, Tuple, Optional
+from dataclasses import dataclass
 
 from my.core import __NOT_HPI_MODULE__
+from my.core.compat import Protocol
 
 DateIsh = Union[datetime, date, str]
 
 LatLon = Tuple[float, float]
 
 
-# TODO: add timezone to this? can use timezonefinder in tz provider instead though
-class Location(NamedTuple):
+class LocationProtocol(Protocol):
     lat: float
     lon: float
     dt: datetime
     accuracy: Optional[float]
     elevation: Optional[float]
+    datasource: Optional[str] = None  # which module provided this, useful for debugging
+
+
+# TODO: add timezone to this? can use timezonefinder in tz provider instead though
+
+
+# converted from namedtuple to a dataclass so datasource field can be added optionally
+# if we want, can eventually be converted back to a namedtuple when all datasources are compliant
+@dataclass(frozen=True, eq=True)
+class Location(LocationProtocol):
+    lat: float
+    lon: float
+    dt: datetime
+    accuracy: Optional[float]
+    elevation: Optional[float]
+    datasource: Optional[str] = None  # which module provided this, useful for debugging
diff --git a/my/location/fallback/all.py b/my/location/fallback/all.py
new file mode 100644
index 0000000..0c7b8cd
--- /dev/null
+++ b/my/location/fallback/all.py
@@ -0,0 +1,53 @@
+# TODO: add config here which passes kwargs to estimate_from (under_accuracy)
+# overridable by passing the kwarg name here to the top-level estimate_location
+
+from typing import Iterator, Optional
+
+from my.core.source import import_source
+from my.location.fallback.common import (
+    estimate_from,
+    FallbackLocation,
+    DateExact,
+    LocationEstimator,
+)
+
+
+def fallback_locations() -> Iterator[FallbackLocation]:
+    # can comment/uncomment sources here to enable/disable them
+    yield from _ip_fallback_locations()
+
+
+def fallback_estimators() -> Iterator[LocationEstimator]:
+    # can comment/uncomment estimators here to enable/disable them
+    # the order of the estimators determines priority if location accuracies are equal/unavailable
+    yield _ip_estimate
+    yield _home_estimate
+
+
+def estimate_location(dt: DateExact, first_match: bool=False, under_accuracy: Optional[int] = None) -> FallbackLocation:
+    loc = estimate_from(dt, estimators=list(fallback_estimators()), first_match=first_match, under_accuracy=under_accuracy)
+    # should never happen if the user has home configured
+    if loc is None:
+        raise ValueError("Could not estimate location")
+    return loc
+
+
+@import_source(module_name="my.location.fallback.via_home")
+def _home_estimate(dt: DateExact) -> Iterator[FallbackLocation]:
+    from my.location.fallback.via_home import estimate_location as via_home_estimate
+
+    yield from via_home_estimate(dt)
+
+
+@import_source(module_name="my.location.fallback.via_ip")
+def _ip_estimate(dt: DateExact) -> Iterator[FallbackLocation]:
+    from my.location.fallback.via_ip import estimate_location as via_ip_estimate
+
+    yield from via_ip_estimate(dt)
+
+
+@import_source(module_name="my.location.fallback.via_ip")
+def _ip_fallback_locations() -> Iterator[FallbackLocation]:
+    from my.location.fallback.via_ip import fallback_locations as via_ip_fallback
+
+    yield from via_ip_fallback()
diff --git a/my/location/fallback/common.py b/my/location/fallback/common.py
new file mode 100644
index 0000000..fa1d4c5
--- 
/dev/null
+++ b/my/location/fallback/common.py
@@ -0,0 +1,120 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional, Callable, Sequence, Iterator, List, Union
+from datetime import datetime, timedelta, timezone
+
+from ..common import LocationProtocol, Location
+DateExact = Union[datetime, float, int]  # float/int as epoch timestamps
+
+Second = float
+
+@dataclass
+class FallbackLocation(LocationProtocol):
+    lat: float
+    lon: float
+    dt: datetime
+    duration: Optional[Second] = None
+    accuracy: Optional[float] = None
+    elevation: Optional[float] = None
+    datasource: Optional[str] = None  # which module provided this, useful for debugging
+
+    def to_location(self, end: bool = False) -> Location:
+        '''
+        by default the start date is used for the location.
+        If end is True, the start date + duration is used
+        '''
+        dt: datetime = self.dt
+        if end and self.duration is not None:
+            dt += timedelta(seconds=self.duration)  # duration is in seconds
+        return Location(
+            lat=self.lat,
+            lon=self.lon,
+            dt=dt,
+            accuracy=self.accuracy,
+            elevation=self.elevation,
+            datasource=self.datasource,
+        )
+
+    @classmethod
+    def from_end_date(
+        cls,
+        *,
+        lat: float,
+        lon: float,
+        dt: datetime,
+        end_dt: datetime,
+        accuracy: Optional[float] = None,
+        elevation: Optional[float] = None,
+        datasource: Optional[str] = None,
+    ) -> FallbackLocation:
+        '''
+        Create FallbackLocation from a start date and an end date
+        '''
+        if end_dt < dt:
+            raise ValueError("end_dt must be after dt")
+        duration = (end_dt - dt).total_seconds()
+        return cls(
+            lat=lat,
+            lon=lon,
+            dt=dt,
+            duration=duration,
+            accuracy=accuracy,
+            elevation=elevation,
+            datasource=datasource,
+        )
+
+
+# a location estimator can return multiple fallbacks, in case there are
+# differing accuracies/to allow for possible matches to be computed
+# iteratively
+LocationEstimator = Callable[[DateExact], Iterator[FallbackLocation]]
+LocationEstimators = Sequence[LocationEstimator]
+
+# helper function, instead of dealing with datetimes while comparing, just use epoch timestamps
+def _datetime_timestamp(dt: DateExact) -> float:
+    if isinstance(dt, datetime):
+        try:
+            return dt.timestamp()
+        except ValueError:
+            # https://github.com/python/cpython/issues/75395
+            return dt.replace(tzinfo=timezone.utc).timestamp()
+    return float(dt)
+
+def _iter_estimate_from(
+    dt: DateExact,
+    estimators: LocationEstimators,
+) -> Iterator[FallbackLocation]:
+    for est in estimators:
+        yield from est(dt)
+
+
+def estimate_from(
+    dt: DateExact,
+    estimators: LocationEstimators,
+    *,
+    first_match: bool = False,
+    under_accuracy: Optional[int] = None,
+) -> Optional[FallbackLocation]:
+    '''
+    first_match: if True, return the first location found
+    under_accuracy: if set, only return locations with accuracy under this value
+    '''
+    found: List[FallbackLocation] = []
+    for loc in _iter_estimate_from(dt, estimators):
+        if under_accuracy is not None and loc.accuracy is not None and loc.accuracy > under_accuracy:
+            continue
+        if first_match:
+            return loc
+        found.append(loc)
+
+    if not found:
+        return None
+
+    # if all items have accuracy, return the one with the lowest accuracy
+    # otherwise, we should prefer the order that the estimators are passed in as
+    if all(loc.accuracy is not None for loc in found):
+        # return the location with the lowest accuracy
+        return min(found, key=lambda loc: loc.accuracy)  # type: ignore[return-value, arg-type]
+    else:
+        # return the first location
+        return found[0]
diff --git a/my/location/fallback/via_home.py 
b/my/location/fallback/via_home.py new file mode 100644 index 0000000..240da84 --- /dev/null +++ b/my/location/fallback/via_home.py @@ -0,0 +1,104 @@ +''' +Simple location provider, serving as a fallback when more detailed data isn't available +''' + +from dataclasses import dataclass +from datetime import datetime, time, timezone +from functools import lru_cache +from typing import Sequence, Tuple, Union, cast, List, Iterator + +from my.config import location as user_config + +from my.location.common import LatLon, DateIsh +from my.location.fallback.common import FallbackLocation, DateExact + +@dataclass +class Config(user_config): + home: Union[ + LatLon, # either single, 'current' location + Sequence[Tuple[ # or, a sequence of location history + DateIsh, # date when you moved to + LatLon, # the location + ]] + ] + + # default ~30km accuracy + # this is called 'home_accuracy' since it lives on the base location.config object, + # to differentiate it from accuracy for other providers + home_accuracy: float = 30_000 + + # TODO could make current Optional and somehow determine from system settings? + @property + def _history(self) -> Sequence[Tuple[datetime, LatLon]]: + home1 = self.home + # todo ugh, can't test for isnstance LatLon, it's a tuple itself + home2: Sequence[Tuple[DateIsh, LatLon]] + if isinstance(home1[0], tuple): + # already a sequence + home2 = cast(Sequence[Tuple[DateIsh, LatLon]], home1) + else: + # must be a pair of coordinates. also doesn't really matter which date to pick? + loc = cast(LatLon, home1) + home2 = [(datetime.min, loc)] + + # todo cache? + res = [] + for x, loc in home2: + dt: datetime + if isinstance(x, str): + dt = datetime.fromisoformat(x) + elif isinstance(x, datetime): + dt = x + else: + dt = datetime.combine(x, time.min) + # todo not sure about doing it here, but makes it easier to compare.. + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + res.append((dt, loc)) + res = list(sorted(res, key=lambda p: p[0])) + return res + + +from ...core.cfg import make_config +config = make_config(Config) + + +@lru_cache(maxsize=None) +def get_location(dt: datetime) -> LatLon: + ''' + Interpolates the location at dt + ''' + loc = list(estimate_location(dt)) + assert len(loc) == 1 + return loc[0].lat, loc[0].lon + + +# TODO: in python3.9, use functools.cached_property instead? 
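To make the interpolation above concrete, here is a minimal, self-contained sketch of what `Config._history` plus `get_location` amount to. The home values and the `where_was_i` helper are hypothetical; the real module reads `my.config.location.home` instead:

```python
from datetime import datetime, timezone

# hypothetical `home` config: a sequence of (moved-in date, (lat, lon)) pairs
home = [
    ('2005-12-04', (42.697842, 23.325973)),  # Sofia
    ('2020-01-01', (51.5074, -0.1278)),      # London
]

# normalise to sorted (datetime, latlon) pairs, like Config._history does
history = sorted(
    (datetime.fromisoformat(d).replace(tzinfo=timezone.utc), loc)
    for d, loc in home
)

def where_was_i(dt: datetime):
    # newest first: the most recent move that happened before dt wins
    for moved_at, loc in reversed(history):
        if dt >= moved_at:
            return loc
    # dt is before all known moves: fall back on the first home
    return history[0][1]

assert where_was_i(datetime(2010, 6, 1, tzinfo=timezone.utc)) == (42.697842, 23.325973)
assert where_was_i(datetime(2000, 1, 1, tzinfo=timezone.utc)) == (42.697842, 23.325973)
assert where_was_i(datetime(2021, 1, 1, tzinfo=timezone.utc)) == (51.5074, -0.1278)
```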
+@lru_cache(maxsize=None) +def homes_cached() -> List[Tuple[datetime, LatLon]]: + return list(config._history) + + +def estimate_location(dt: DateExact) -> Iterator[FallbackLocation]: + from my.location.fallback.common import _datetime_timestamp + d: float = _datetime_timestamp(dt) + hist = list(reversed(homes_cached())) + for pdt, (lat, lon) in hist: + if d >= pdt.timestamp(): + yield FallbackLocation( + lat=lat, + lon=lon, + accuracy=config.home_accuracy, + dt=datetime.fromtimestamp(d, timezone.utc), + datasource='via_home') + return + else: + # I guess the most reasonable is to fallback on the first location + lat, lon = hist[-1][1] + yield FallbackLocation( + lat=lat, + lon=lon, + accuracy=config.home_accuracy, + dt=datetime.fromtimestamp(d, timezone.utc), + datasource='via_home') + return diff --git a/my/location/fallback/via_ip.py b/my/location/fallback/via_ip.py new file mode 100644 index 0000000..1da2315 --- /dev/null +++ b/my/location/fallback/via_ip.py @@ -0,0 +1,99 @@ +""" +Converts IP addresses provided by my.location.ip to estimated locations +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + +from datetime import timedelta + +from my.core import dataclass, Stats, make_config +from my.config import location +from my.core.warnings import medium + + +@dataclass +class ip_config(location.via_ip): + # no real science to this, just a guess of ~15km accuracy for IP addresses + accuracy: float = 15_000.0 + # default to being accurate for a day + for_duration: timedelta = timedelta(hours=24) + + +# TODO: move config to location.fallback.via_location instead and add migration +config = make_config(ip_config) + + +from functools import lru_cache +from typing import Iterator, List + +from my.core.common import LazyLogger +from my.core.compat import bisect_left +from my.ip.all import ips +from my.location.common import Location +from my.location.fallback.common import FallbackLocation, DateExact, _datetime_timestamp + +logger = LazyLogger(__name__, level="warning") + + +def fallback_locations() -> Iterator[FallbackLocation]: + dur = config.for_duration.total_seconds() + for ip in ips(): + lat, lon = ip.latlon + yield FallbackLocation( + lat=lat, + lon=lon, + dt=ip.dt, + accuracy=config.accuracy, + duration=dur, + elevation=None, + datasource="via_ip", + ) + + +# for compatibility with my.location.via_ip, this shouldnt be used by other modules +def locations() -> Iterator[Location]: + medium("locations is deprecated, should use fallback_locations or estimate_location") + yield from map(FallbackLocation.to_location, fallback_locations()) + + +@lru_cache(1) +def _sorted_fallback_locations() -> List[FallbackLocation]: + fl = list(filter(lambda l: l.duration is not None, fallback_locations())) + logger.debug(f"Fallback locations: {len(fl)}, sorting...:") + fl.sort(key=lambda l: l.dt.timestamp()) + return fl + + +def estimate_location(dt: DateExact) -> Iterator[FallbackLocation]: + # logger.debug(f"Estimating location for: {dt}") + fl = _sorted_fallback_locations() + dt_ts = _datetime_timestamp(dt) + + # search to find the first possible location which contains dt (something that started up to + # config.for_duration ago, and ends after dt) + idx = bisect_left(fl, dt_ts - config.for_duration.total_seconds(), key=lambda l: l.dt.timestamp()) # type: ignore[operator,call-arg,type-var] + + # all items are before the given dt + if idx == len(fl): + return + + # iterate through in sorted order, until we find a location that is after the given dt + while idx < len(fl): + loc = 
fl[idx] + start_time = loc.dt.timestamp() + # loc.duration is filtered for in _sorted_fallback_locations + end_time = start_time + loc.duration # type: ignore[operator] + if start_time <= dt_ts <= end_time: + # logger.debug(f"Found location for {dt}: {loc}") + yield loc + # no more locations could possibly contain dt + if start_time > dt_ts: + # logger.debug(f"Passed start time: {end_time} > {dt_ts} ({datetime.fromtimestamp(end_time)} > {datetime.fromtimestamp(dt_ts)})") + break + idx += 1 + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/location/google_takeout.py b/my/location/google_takeout.py index 80b31cb..a1c1403 100644 --- a/my/location/google_takeout.py +++ b/my/location/google_takeout.py @@ -23,7 +23,12 @@ def locations() -> Iterator[Location]: for g in events(): if isinstance(g, GoogleLocation): yield Location( - lon=g.lng, lat=g.lat, dt=g.dt, accuracy=g.accuracy, elevation=None + lon=g.lng, + lat=g.lat, + dt=g.dt, + accuracy=g.accuracy, + elevation=None, + datasource="google_takeout", ) diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py index 95f4474..46fc381 100644 --- a/my/location/gpslogger.py +++ b/my/location/gpslogger.py @@ -32,9 +32,16 @@ from .common import Location logger = LazyLogger(__name__, level="warning") +def _input_sort_key(path: Path) -> str: + if "_" in path.name: + return path.name.split("_", maxsplit=1)[1] + return path.name + def inputs() -> Sequence[Path]: - return get_files(config.export_path, glob="*.gpx") + # gpslogger files can optionally be prefixed by a device id, + # like b5760c66102a5269_20211214142156.gpx + return sorted(get_files(config.export_path, glob="*.gpx", sort=False), key=_input_sort_key) def _cachew_depends_on() -> List[float]: @@ -65,6 +72,7 @@ def _extract_locations(path: Path) -> Iterator[Location]: accuracy=config.accuracy, elevation=point.elevation, dt=datetime.replace(point.time, tzinfo=timezone.utc), + datasource="gpslogger", ) diff --git a/my/location/home.py b/my/location/home.py index ac0fcb8..f6e6978 100644 --- a/my/location/home.py +++ b/my/location/home.py @@ -1,71 +1,7 @@ -''' -Simple location provider, serving as a fallback when more detailed data isn't available -''' -from dataclasses import dataclass -from datetime import datetime, time, timezone -from functools import lru_cache -from typing import Sequence, Tuple, Union, cast +from .fallback.via_home import * -from my.config import location as user_config +from my.core.warnings import high -from my.location.common import LatLon, DateIsh - -@dataclass -class Config(user_config): - home: Union[ - LatLon, # either single, 'current' location - Sequence[Tuple[ # or, a sequence of location history - DateIsh, # date when you moved to - LatLon, # the location - ]] - ] - # TODO could make current Optional and somehow determine from system settings? - @property - def _history(self) -> Sequence[Tuple[datetime, LatLon]]: - home1 = self.home - # todo ugh, can't test for isnstance LatLon, it's a tuple itself - home2: Sequence[Tuple[DateIsh, LatLon]] - if isinstance(home1[0], tuple): - # already a sequence - home2 = cast(Sequence[Tuple[DateIsh, LatLon]], home1) - else: - # must be a pair of coordinates. also doesn't really matter which date to pick? - loc = cast(LatLon, home1) - home2 = [(datetime.min, loc)] - - # todo cache? 
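The interval search in `via_ip.estimate_location` above deserves a small illustration. This is a sketch under simplifying assumptions (plain floats for timestamps instead of `FallbackLocation` objects); it also shows why locations must be sorted by start time, and why `my.core.compat` backports `bisect_left` with `key=` for Python < 3.10:

```python
from bisect import bisect_left

starts = [100.0, 200.0, 300.0]  # location start timestamps, sorted
DURATION = 50.0                 # each location stays valid for this many seconds

def containing(ts: float):
    # the earliest location that could still contain ts started at ts - DURATION
    idx = bisect_left(starts, ts - DURATION)
    out = []
    while idx < len(starts):
        start = starts[idx]
        if start > ts:  # sorted, so no later location can contain ts either
            break
        if start <= ts <= start + DURATION:
            out.append(start)
        idx += 1
    return out

assert containing(120.0) == [100.0]
assert containing(170.0) == []  # falls between the two validity windows
```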
- res = [] - for x, loc in home2: - dt: datetime - if isinstance(x, str): - dt = datetime.fromisoformat(x) - elif isinstance(x, datetime): - dt = x - else: - dt = datetime.combine(x, time.min) - # todo not sure about doing it here, but makes it easier to compare.. - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - res.append((dt, loc)) - res = list(sorted(res, key=lambda p: p[0])) - return res - - -from ..core.cfg import make_config -config = make_config(Config) - - -@lru_cache(maxsize=None) -def get_location(dt: datetime) -> LatLon: - ''' - Interpolates the location at dt - ''' - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - hist = list(reversed(config._history)) - for pdt, loc in hist: - if dt >= pdt: - return loc - else: - # I guess the most reasonable is to fallback on the first location - return hist[-1][1] +high( + "my.location.home is deprecated, use my.location.fallback.via_home instead, or estimate locations using the higher-level my.location.fallback.all.estimate_location" +) diff --git a/my/location/via_ip.py b/my/location/via_ip.py index e882cdb..df48f8b 100644 --- a/my/location/via_ip.py +++ b/my/location/via_ip.py @@ -1,39 +1,7 @@ -""" -Converts IP addresses provided by my.location.ip to estimated locations -""" - REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] -from my.core import dataclass, Stats -from my.config import location +from .fallback.via_ip import * +from my.core.warnings import high -@dataclass -class config(location.via_ip): - # no real science to this, just a guess of ~15km accuracy for IP addresses - accuracy: float = 15_000.0 - - -from typing import Iterator - -from .common import Location -from my.ip.all import ips - - -def locations() -> Iterator[Location]: - for ip in ips(): - loc: str = ip.ipgeocache()["loc"] - lat, _, lon = loc.partition(",") - yield Location( - lat=float(lat), - lon=float(lon), - dt=ip.dt, - accuracy=config.accuracy, - elevation=None, - ) - - -def stats() -> Stats: - from my.core import stat - - return {**stat(locations)} +high("my.location.via_ip is deprecated, use my.location.fallback.via_ip instead") diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 6b8e835..e111a4a 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -41,17 +41,23 @@ class config(user_config): # if the accuracy for the location is more than 5km, don't use require_accuracy: float = 5_000 + # how often (hours) to refresh the cachew timezone cache + # this may be removed in the future if we opt for dict-based caching + _iter_tz_refresh_time: int = 6 + from collections import Counter from datetime import date, datetime from functools import lru_cache from itertools import groupby -from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable +from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable, Set -from more_itertools import seekable +import heapq import pytz +from more_itertools import seekable from my.core.common import LazyLogger, mcachew, tzdatetime +from my.core.source import import_source logger = LazyLogger(__name__, level='warning') @@ -102,23 +108,13 @@ def _sorted_locations() -> List[Tuple[LatLon, datetime]]: return list(sorted(_locations(), key=lambda x: x[1])) -# Note: this takes a while, as the upstream since _locations isn't sorted, so this -# has to do an iterative sort of the entire my.locations.all list -def _iter_local_dates() -> Iterator[DayWithZone]: - finder = _timezone_finder(fast=config.fast) # rely on the 
default - #pdt = None - # TODO: warnings doesnt actually warn? - warnings = [] - - locs: Iterable[Tuple[LatLon, datetime]] - locs = _sorted_locations() if config.sort_locations else _locations() - - # todo allow to skip if not noo many errors in row? +def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> Iterator[DayWithZone]: for (lat, lon), dt in locs: # TODO right. its _very_ slow... zone = finder.timezone_at(lat=lat, lng=lon) + # todo allow to skip if not noo many errors in row? if zone is None: - warnings.append(f"Couldn't figure out tz for {lat}, {lon}") + # warnings.append(f"Couldn't figure out tz for {lat}, {lon}") continue tz = pytz.timezone(zone) # TODO this is probably a bit expensive... test & benchmark @@ -133,6 +129,33 @@ def _iter_local_dates() -> Iterator[DayWithZone]: z = tz.zone; assert z is not None yield DayWithZone(day=ndate, zone=z) +# Note: this takes a while, as the upstream since _locations isn't sorted, so this +# has to do an iterative sort of the entire my.locations.all list +def _iter_local_dates() -> Iterator[DayWithZone]: + finder = _timezone_finder(fast=config.fast) # rely on the default + #pdt = None + # TODO: warnings doesnt actually warn? + # warnings = [] + + locs: Iterable[Tuple[LatLon, datetime]] + locs = _sorted_locations() if config.sort_locations else _locations() + + yield from _find_tz_for_locs(finder, locs) + + +# my.location.fallback.estimate_location could be used here +# but iterating through all the locations is faster since this +# is saved behind cachew +@import_source(module_name="my.location.fallback.all") +def _iter_local_dates_fallback() -> Iterator[DayWithZone]: + from my.location.fallback.all import fallback_locations as flocs + + def _fallback_locations() -> Iterator[Tuple[LatLon, datetime]]: + for loc in sorted(flocs(), key=lambda x: x.dt): + yield ((loc.lat, loc.lon), loc.dt) + + yield from _find_tz_for_locs(_timezone_finder(fast=config.fast), _fallback_locations()) + def most_common(lst: List[DayWithZone]) -> DayWithZone: res, _ = Counter(lst).most_common(1)[0] # type: ignore[var-annotated] @@ -142,27 +165,43 @@ def most_common(lst: List[DayWithZone]) -> DayWithZone: def _iter_tz_depends_on() -> str: """ Since you might get new data which specifies a new timezone sometime - in the day, this causes _iter_tzs to refresh every 6 hours, like: + in the day, this causes _iter_tzs to refresh every _iter_tz_refresh_time hours + (default 6), like: 2022-04-26_00 2022-04-26_06 2022-04-26_12 2022-04-26_18 """ + mod = config._iter_tz_refresh_time + assert mod >= 1 day = str(date.today()) hr = datetime.now().hour - hr_truncated = hr // 6 * 6 + hr_truncated = hr // mod * mod return "{}_{}".format(day, hr_truncated) -# refresh _iter_tzs every 6 hours -- don't think a better depends_on is possible dynamically +# refresh _iter_tzs every few hours -- don't think a better depends_on is possible dynamically @mcachew(logger=logger, depends_on=_iter_tz_depends_on) def _iter_tzs() -> Iterator[DayWithZone]: # since we have no control over what order the locations are returned, # we need to sort them first before we can do a groupby local_dates: List[DayWithZone] = list(_iter_local_dates()) local_dates.sort(key=lambda p: p.day) - for d, gr in groupby(local_dates, key=lambda p: p.day): - logger.info('processed %s', d) + logger.debug(f"no. 
of items using exact locations: {len(local_dates)}")
+
+    local_dates_fallback: List[DayWithZone] = list(_iter_local_dates_fallback())
+    local_dates_fallback.sort(key=lambda p: p.day)
+
+    # find days that are in fallback but not in local_dates (i.e., missing days)
+    local_dates_set: Set[date] = set(d.day for d in local_dates)
+    use_fallback_days: List[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set]
+    logger.debug(f"no. of items being used from fallback locations: {len(use_fallback_days)}")
+
+    # combine local_dates and missing days from fallback into a sorted list
+    all_dates = heapq.merge(local_dates, use_fallback_days, key=lambda p: p.day)
+
+    for d, gr in groupby(all_dates, key=lambda p: p.day):
+        logger.info(f"processed {d}{', using fallback' if d not in local_dates_set else ''}")
         zone = most_common(list(gr)).zone
         yield DayWithZone(day=d, zone=zone)
 
@@ -192,7 +231,7 @@ def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]:
 
 # ok to cache, there are only a few home locations?
 @lru_cache(maxsize=None)
-def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]:
+def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]:
     (lat, lng) = loc
     finder = _timezone_finder(fast=False)  # ok to use slow here for better precision
     zone = finder.timezone_at(lat=lat, lng=lng)
@@ -211,9 +250,17 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
     if res is not None:
         return res
     # fallback to home tz
-    from ...location import home
-    loc = home.get_location(dt)
-    return _get_home_tz(loc=loc)
+    # note: the fallback to fallback.via_home.estimate_location is still needed, since
+    # _iter_local_dates_fallback only returns days which we actually have a datetime for
+    # (e.g. there was an IP address within a day of that datetime)
+    #
+    # given a datetime, fallback.via_home.estimate_location will find which home location
+    # that datetime is between, else fall back on your first home location, so it acts
+    # as a last resort
+    from my.location.fallback import via_home as home
+    loc = list(home.estimate_location(dt))
+    assert len(loc) == 1, f"should only have one home location, received {loc}"
+    return _get_home_tz(loc=(loc[0].lat, loc[0].lon))
 
 # expose as 'public' function
 get_tz = _get_tz
diff --git a/setup.py b/setup.py
index 31fc393..b0f4ab6 100644
--- a/setup.py
+++ b/setup.py
@@ -57,6 +57,7 @@ def main() -> None:
             # todo document these? 
'logzero', 'orjson', # for my.core.serialize + 'pyfzf_iter', # for my.core.denylist 'cachew>=0.8.0', 'mypy', # used for config checks ], diff --git a/tests/core/test_denylist.py b/tests/core/test_denylist.py new file mode 100644 index 0000000..d6f4c49 --- /dev/null +++ b/tests/core/test_denylist.py @@ -0,0 +1,106 @@ +import warnings + +import json +from pathlib import Path +from datetime import datetime +from typing import NamedTuple, Iterator + +from my.core.denylist import DenyList + + +class IP(NamedTuple): + addr: str + dt: datetime + + +def data() -> Iterator[IP]: + # random IP addresses + yield IP(addr="67.98.113.0", dt=datetime(2020, 1, 1)) + yield IP(addr="59.40.113.87", dt=datetime(2020, 2, 1)) + yield IP(addr="161.235.192.228", dt=datetime(2020, 3, 1)) + yield IP(addr="165.243.139.87", dt=datetime(2020, 4, 1)) + yield IP(addr="69.69.141.154", dt=datetime(2020, 5, 1)) + yield IP(addr="50.72.224.80", dt=datetime(2020, 6, 1)) + yield IP(addr="221.67.89.168", dt=datetime(2020, 7, 1)) + yield IP(addr="177.113.119.251", dt=datetime(2020, 8, 1)) + yield IP(addr="93.200.246.215", dt=datetime(2020, 9, 1)) + yield IP(addr="127.105.171.61", dt=datetime(2020, 10, 1)) + + +def test_denylist(tmp_path: Path) -> None: + tf = (tmp_path / "denylist.json").absolute() + with warnings.catch_warnings(record=True): + + # create empty denylist (though file does not have to exist for denylist to work) + tf.write_text("[]") + + d = DenyList(tf) + + d.load() + assert dict(d._deny_map) == {} + assert d._deny_raw_list == [] + + assert list(d.filter(data())) == list(data()) + # no data in denylist yet + assert len(d._deny_map) == 0 + assert len(d._deny_raw_list) == 0 + + # add some data + d.deny(key="addr", value="67.98.113.0") + # write and reload to update _deny_map, _deny_raw_list + d.write() + d.load() + + assert len(d._deny_map) == 1 + assert len(d._deny_raw_list) == 1 + + assert d._deny_raw_list == [{"addr": "67.98.113.0"}] + + filtered = list(d.filter(data())) + assert len(filtered) == 9 + assert "67.98.113.0" not in [i.addr for i in filtered] + + assert dict(d._deny_map) == {"addr": {"67.98.113.0"}} + + denied = list(d.filter(data(), invert=True)) + assert len(denied) == 1 + + assert denied[0] == IP(addr="67.98.113.0", dt=datetime(2020, 1, 1)) + + # add some non-JSON primitive data + + d.deny(key="dt", value=datetime(2020, 2, 1)) + + # test internal behavior, _deny_raw_list should have been updated, + # but _deny_map doesnt get updated by a call to .deny + # + # if we change this just update the test, is just here to ensure + # this is the behaviour + + assert len(d._deny_map) == 1 + + # write and load to update _deny_map + d.write() + d.load() + + assert len(d._deny_map) == 2 + assert len(d._deny_raw_list) == 2 + + assert d._deny_raw_list[-1] == {"dt": "2020-02-01T00:00:00"} + + filtered = list(d.filter(data())) + assert len(filtered) == 8 + + assert "59.40.113.87" not in [i.addr for i in filtered] + + with open(tf, "r") as f: + data_json = json.loads(f.read()) + + assert data_json == [ + { + "addr": "67.98.113.0", + }, + { + "dt": "2020-02-01T00:00:00", + }, + ] diff --git a/tests/location.py b/tests/location.py index 298b7ba..c47849e 100644 --- a/tests/location.py +++ b/tests/location.py @@ -1,7 +1,5 @@ from pathlib import Path -from more_itertools import one - import pytest # type: ignore @@ -20,26 +18,11 @@ def test() -> None: @pytest.fixture(autouse=True) def prepare(tmp_path: Path): - from .common import reset_modules - reset_modules() - - user_config = _prepare_google_config(tmp_path) + from 
.shared_config import temp_config + user_config = temp_config(tmp_path) import my.core.cfg as C with C.tmp_config() as config: - config.google = user_config # type: ignore + config.google = user_config.google yield - -def _prepare_google_config(tmp_path: Path): - from .common import testdata - track = one(testdata().rglob('italy-slovenia-2017-07-29.json')) - - # todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs - import zipfile - with zipfile.ZipFile(tmp_path / 'takeout.zip', 'w') as zf: - zf.writestr('Takeout/Location History/Location History.json', track.read_bytes()) - - class google_config: - takeout_path = tmp_path - return google_config diff --git a/tests/location_fallback.py b/tests/location_fallback.py new file mode 100644 index 0000000..aad33ee --- /dev/null +++ b/tests/location_fallback.py @@ -0,0 +1,125 @@ +""" +To test my.location.fallback_location.all +""" + +from typing import Iterator +from datetime import datetime, timezone, timedelta + +from more_itertools import ilen + +from my.ip.common import IP + +def data() -> Iterator[IP]: + # random IP addresses + yield IP(addr="67.98.113.0", dt=datetime(2020, 1, 1, 12, 0, 0, tzinfo=timezone.utc)) + yield IP(addr="67.98.112.0", dt=datetime(2020, 1, 15, 12, 0, 0, tzinfo=timezone.utc)) + yield IP(addr="59.40.113.87", dt=datetime(2020, 2, 1, 12, 0, 0, tzinfo=timezone.utc)) + yield IP(addr="59.40.139.87", dt=datetime(2020, 2, 1, 16, 0, 0, tzinfo=timezone.utc)) + yield IP(addr="161.235.192.228", dt=datetime(2020, 3, 1, 12, 0, 0, tzinfo=timezone.utc)) + +# redefine the my.ip.all function using data for testing +import my.ip.all as ip_module +ip_module.ips = data + +from my.location.fallback import via_ip + +# these are all tests for the bisect algorithm defined in via_ip.py +# to make sure we can correctly find IPs that are within the 'for_duration' of a given datetime + +def test_ip_fallback() -> None: + # make sure that the data override works + assert ilen(ip_module.ips()) == ilen(data()) + assert ilen(ip_module.ips()) == ilen(via_ip.fallback_locations()) + assert ilen(via_ip.fallback_locations()) == 5 + assert ilen(via_ip._sorted_fallback_locations()) == 5 + + # confirm duration from via_ip since that is used for bisect + assert via_ip.config.for_duration == timedelta(hours=24) + + # basic tests + + # try estimating slightly before the first IP + est = list(via_ip.estimate_location(datetime(2020, 1, 1, 11, 59, 59, tzinfo=timezone.utc))) + assert len(est) == 0 + + # during the duration for the first IP + est = list(via_ip.estimate_location(datetime(2020, 1, 1, 12, 30, 0, tzinfo=timezone.utc))) + assert len(est) == 1 + + # right after the 'for_duration' for an IP + est = list(via_ip.estimate_location(datetime(2020, 1, 1, 12, 0, 0, tzinfo=timezone.utc) + via_ip.config.for_duration + timedelta(seconds=1))) + assert len(est) == 0 + + # on 2/1/2020, threes one IP if before 16:30 + est = list(via_ip.estimate_location(datetime(2020, 2, 1, 12, 30, 0, tzinfo=timezone.utc))) + assert len(est) == 1 + + # and two if after 16:30 + est = list(via_ip.estimate_location(datetime(2020, 2, 1, 17, 00, 0, tzinfo=timezone.utc))) + assert len(est) == 2 + + # the 12:30 IP should 'expire' before the 16:30 IP, use 3:30PM on the next day + est = list(via_ip.estimate_location(datetime(2020, 2, 2, 15, 30, 0, tzinfo=timezone.utc))) + assert len(est) == 1 + + use_dt = datetime(2020, 3, 1, 12, 15, 0, tzinfo=timezone.utc) + + # test last IP + est = list(via_ip.estimate_location(use_dt)) + assert len(est) == 1 + + # datetime 
should be the IPs, not the passed IP (if via_home, it uses the passed dt) + assert est[0].dt != use_dt + + # test interop with other fallback estimators/all.py + # + # redefine fallback_estimators to prevent possible namespace packages the user + # may have installed from having side effects testing this + from my.location.fallback import all + from my.location.fallback import via_home + def _fe() -> Iterator[all.LocationEstimator]: + yield via_ip.estimate_location + yield via_home.estimate_location + + all.fallback_estimators = _fe + assert ilen(all.fallback_estimators()) == 2 + + # test that all.estimate_location has access to both IPs + # + # just passing via_ip should give one IP + from my.location.fallback.common import _iter_estimate_from + raw_est = list(_iter_estimate_from(use_dt, (via_ip.estimate_location,))) + assert len(raw_est) == 1 + assert raw_est[0].datasource == "via_ip" + assert raw_est[0].accuracy == 15_000 + + # passing home should give one + home_est = list(_iter_estimate_from(use_dt, (via_home.estimate_location,))) + assert len(home_est) == 1 + assert home_est[0].accuracy == 30_000 + + # make sure ip accuracy is more accurate + assert raw_est[0].accuracy < home_est[0].accuracy + + # passing both should give two + raw_est = list(_iter_estimate_from(use_dt, (via_ip.estimate_location, via_home.estimate_location))) + assert len(raw_est) == 2 + + # shouldn't raise value error + all_est = all.estimate_location(use_dt) + # should have used the IP from via_ip since it was more accurate + assert all_est.datasource == "via_ip" + + # test that a home defined in shared_config.py is used if no IP is found + loc = all.estimate_location(datetime(2021, 1, 1, 12, 30, 0, tzinfo=timezone.utc)) + assert loc.datasource == "via_home" + + # test a different home using location.fallback.all + bulgaria = all.estimate_location(datetime(2006, 1, 1, 12, 30, 0, tzinfo=timezone.utc)) + assert bulgaria.datasource == "via_home" + assert (bulgaria.lat, bulgaria.lon) == (42.697842, 23.325973) + assert (loc.lat, loc.lon) != (bulgaria.lat, bulgaria.lon) + + +# re-use prepare fixture for overriding config from shared_config.py +from .tz import prepare diff --git a/tests/shared_config.py b/tests/shared_config.py new file mode 100644 index 0000000..6b83a5a --- /dev/null +++ b/tests/shared_config.py @@ -0,0 +1,65 @@ +# Defines some shared config for tests + +from datetime import datetime, date, timezone +from pathlib import Path + +from typing import Any, NamedTuple +import my.time.tz.via_location as LTZ +from more_itertools import one + + +class SharedConfig(NamedTuple): + google: Any + location: Any + time: Any + + +def _prepare_google_config(tmp_path: Path): + from .common import testdata + try: + track = one(testdata().rglob('italy-slovenia-2017-07-29.json')) + except ValueError: + raise RuntimeError('testdata not found, setup git submodules?') + + + # todo ugh. 
unnecessary zipping, but at the moment takeout provider doesn't support plain dirs + import zipfile + with zipfile.ZipFile(tmp_path / 'takeout.zip', 'w') as zf: + zf.writestr('Takeout/Location History/Location History.json', track.read_bytes()) + + class google_config: + takeout_path = tmp_path + return google_config + + +# pass tmp_path from pytest to this helper function +# see tests/tz.py as an example +def temp_config(temp_path: Path) -> Any: + from .common import reset_modules + reset_modules() + + LTZ.config.fast = True + + class location: + home_accuracy = 30_000 + home = ( + # supports ISO strings + ('2005-12-04' , (42.697842, 23.325973)), # Bulgaria, Sofia + # supports date/datetime objects + (date(year=1980, month=2, day=15) , (40.7128 , -74.0060 )), # NY + # check tz handling.. + (datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558 , 37.6173 )), # Moscow, Russia + ) + # note: order doesn't matter, will be sorted in the data provider + class via_ip: + accuracy = 15_000 + class gpslogger: + pass + + class time: + class tz: + class via_location: + pass # just rely on the defaults... + + + return SharedConfig(google=_prepare_google_config(temp_path), location=location, time=time) diff --git a/tests/tz.py b/tests/tz.py index 0ea2b40..8f80800 100644 --- a/tests/tz.py +++ b/tests/tz.py @@ -1,4 +1,5 @@ -from datetime import datetime, timedelta, date, timezone +import sys +from datetime import datetime, timedelta from pathlib import Path import pytest # type: ignore @@ -46,8 +47,15 @@ def test_tz() -> None: tz = LTZ._get_tz(D('20201001 14:15:16')) assert tz is not None - tz = LTZ._get_tz(datetime.min) - assert tz is not None + on_windows = sys.platform == 'win32' + if not on_windows: + tz = LTZ._get_tz(datetime.min) + assert tz is not None + else: + # seems this fails because windows doesnt support same date ranges + # https://stackoverflow.com/a/41400321/ + with pytest.raises(OSError): + LTZ._get_tz(datetime.min) def test_policies() -> None: @@ -73,36 +81,15 @@ def D(dstr: str) -> datetime: return datetime.strptime(dstr, '%Y%m%d %H:%M:%S') -# TODO copy pasted from location.py, need to extract some common provider + @pytest.fixture(autouse=True) def prepare(tmp_path: Path): - from .common import reset_modules - reset_modules() - - LTZ.config.fast = True - - from .location import _prepare_google_config - google = _prepare_google_config(tmp_path) - - class location: - home = ( - # supports ISO strings - ('2005-12-04' , (42.697842, 23.325973)), # Bulgaria, Sofia - # supports date/datetime objects - (date(year=1980, month=2, day=15) , (40.7128 , -74.0060 )), # NY - # check tz handling.. - (datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558 , 37.6173 )), # Moscow, Russia - ) - # note: order doesn't matter, will be sorted in the data provider - - class time: - class tz: - class via_location: - pass # just rely on the defaults... 
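Stepping back to the `_iter_tzs` change in `my.time.tz.via_location` earlier in this series: the exact/fallback merge there is easy to get wrong, so here is a tiny self-contained model of it, with made-up days and zones. Days that have exact locations always win; fallback entries only fill in dates with no exact data:

```python
import heapq
from collections import Counter
from itertools import groupby

exact    = [('2020-01-01', 'Europe/London'), ('2020-01-03', 'Europe/Sofia')]
fallback = [('2020-01-01', 'UTC'), ('2020-01-02', 'Europe/Paris')]

exact_days = {d for d, _ in exact}
# drop fallback entries for days we already have exact data for
use_fallback = [(d, z) for d, z in fallback if d not in exact_days]

# both lists are sorted by day, so heapq.merge keeps the result sorted
merged = heapq.merge(exact, use_fallback, key=lambda p: p[0])

result = {}
for day, grp in groupby(merged, key=lambda p: p[0]):
    (zone, _), = Counter(z for _, z in grp).most_common(1)
    result[day] = zone

assert result == {
    '2020-01-01': 'Europe/London',  # exact wins, fallback for this day was dropped
    '2020-01-02': 'Europe/Paris',   # filled in from fallback
    '2020-01-03': 'Europe/Sofia',
}
```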
+ from .shared_config import temp_config + conf = temp_config(tmp_path) import my.core.cfg as C with C.tmp_config() as config: - config.google = google - config.time = time - config.location = location + config.google = conf.google + config.time = conf.time + config.location = conf.location yield diff --git a/tox.ini b/tox.ini index 6e7ca23..efe6069 100644 --- a/tox.ini +++ b/tox.ini @@ -47,7 +47,12 @@ commands = hpi module install my.location.google pip install ijson # optional dependency + # tz/location hpi module install my.time.tz.via_location + hpi module install my.ip.all + hpi module install my.location.gpslogger + hpi module install my.location.fallback.via_ip + hpi module install my.google.takeout.parser hpi module install my.calendar.holidays @@ -125,8 +130,7 @@ commands = my.rescuetime \ my.runnerup \ my.stackexchange.stexport \ - my.smscalls \ - my.tinder.android + my.smscalls {envpython} -m mypy --install-types --non-interactive \ From 435cb020f942eb4fa5a6579d96a7237c47aecad3 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Mon, 27 Feb 2023 20:54:40 -0800 Subject: [PATCH 081/302] add example for denylist, update ci --- doc/DENYLIST.md | 2 +- tox.ini | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/DENYLIST.md b/doc/DENYLIST.md index d57b8b1..440715c 100644 --- a/doc/DENYLIST.md +++ b/doc/DENYLIST.md @@ -121,7 +121,7 @@ To edit the `all.py`, you could either: - install it as editable (`python3 -m pip install --user -e ./HPI`), and then edit the file directly - or, create a namespace package, which splits the package across multiple directories. For info on that see [`MODULE_DESIGN`](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#namespace-packages), [`reorder_editable`](https://github.com/seanbreckenridge/reorder_editable), and possibly the [`HPI-template`](https://github.com/seanbreckenridge/HPI-template) to create your own HPI namespace package to create your own `all.py` file. 
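As a sketch of what such an overridden `all.py` might look like, using the `DenyList` API added in `my.core.denylist` earlier in this series (the denylist path and the upstream source module are hypothetical):

```python
# in your own namespace package, e.g. my/ip/all.py
from typing import Iterator

from my.ip.common import IP
from my.core.denylist import DenyList

deny = DenyList("~/.config/my/ip_denylist.json")  # illustrative path

def ips() -> Iterator[IP]:
    from my.ip.discord import ips as discord_ips  # hypothetical source module
    yield from deny.filter(discord_ips())
```

To deny an item interactively you could then call `deny.deny_cli(ips())`, which prompts with fzf and writes the picked key/value pair back to the JSON file.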
-TODO: link to seanbreckenridge/HPI-personal for an example of this once this is merged/settled +For a real example of this see, [seanbreckenridge/HPI-personal](https://github.com/seanbreckenridge/HPI-personal/blob/master/my/ip/all.py) Sidenote: the reason why we want to specifically override the all.py and not just create a script that filters out the items you're diff --git a/tox.ini b/tox.ini index efe6069..04c1cac 100644 --- a/tox.ini +++ b/tox.ini @@ -116,12 +116,12 @@ commands = my.fbmessenger.export \ my.goodreads \ my.google.takeout.parser \ + my.ip.common \ my.orgmode \ my.hypothesis \ my.instapaper \ my.kobo \ my.location.gpslogger \ - my.location.via_ip \ my.pdfs \ my.pinboard \ my.pocket \ @@ -130,7 +130,8 @@ commands = my.rescuetime \ my.runnerup \ my.stackexchange.stexport \ - my.smscalls + my.smscalls \ + my.time.tz.via_location {envpython} -m mypy --install-types --non-interactive \ From f36bc6144b1b8db5cb029a9ee29d9302167a7983 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Tue, 28 Feb 2023 11:10:35 -0800 Subject: [PATCH 082/302] tox: use my.ip.all, sort hpi installs --- tox.ini | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tox.ini b/tox.ini index 04c1cac..e3ff8f1 100644 --- a/tox.ini +++ b/tox.ini @@ -108,20 +108,20 @@ commands = hpi module install --parallel \ my.arbtt \ - my.coding.commits \ my.browser.export \ - my.github.ghexport \ + my.coding.commits \ my.emfit \ my.endomondo \ my.fbmessenger.export \ + my.github.ghexport \ my.goodreads \ my.google.takeout.parser \ - my.ip.common \ - my.orgmode \ my.hypothesis \ my.instapaper \ + my.ip.all \ my.kobo \ my.location.gpslogger \ + my.orgmode \ my.pdfs \ my.pinboard \ my.pocket \ @@ -129,8 +129,8 @@ commands = my.reddit.rexport \ my.rescuetime \ my.runnerup \ - my.stackexchange.stexport \ my.smscalls \ + my.stackexchange.stexport \ my.time.tz.via_location From a70118645b5f33bf79c65dd80b27603cbc033747 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Tue, 28 Feb 2023 11:10:58 -0800 Subject: [PATCH 083/302] my.ip.common: remove REQUIRES no reason to have it there since its __NOT_HPI_MODULE__, so is not discoverable anyways --- my/ip/common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/my/ip/common.py b/my/ip/common.py index b4bfc8e..244ddc5 100644 --- a/my/ip/common.py +++ b/my/ip/common.py @@ -2,8 +2,6 @@ Provides location/timezone data from IP addresses, using [[https://github.com/seanbreckenridge/ipgeocache][ipgeocache]] """ -REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] - from my.core import __NOT_HPI_MODULE__ import ipaddress From db2cd00bedbc7e7ac58f3079e8306095e56b7514 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Tue, 28 Feb 2023 11:39:44 -0800 Subject: [PATCH 084/302] try removing parallel on mac to prevent CI failure --- my/core/__main__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/my/core/__main__.py b/my/core/__main__.py index 11f32fc..76db469 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -382,7 +382,9 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) - cmds = [] # disable parallel on windows, sometimes throws a # '[WinError 32] The process cannot access the file because it is being used by another process' - if parallel and sys.platform not in ['win32', 'cygwin']: + # same on mac it seems? possible race conditions which are hard to debug? 
+ # WARNING: Error parsing requirements for sqlalchemy: [Errno 2] No such file or directory: '/Users/runner/work/HPI/HPI/.tox/mypy-misc/lib/python3.7/site-packages/SQLAlchemy-2.0.4.dist-info/METADATA' + if parallel and sys.platform not in ['win32', 'cygwin', 'darwin']: # todo not really sure if it's safe to install in parallel like this # but definitely doesn't hurt to experiment for e.g. mypy pipelines # pip has '--use-feature=fast-deps', but it doesn't really work @@ -391,6 +393,8 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) - for r in requirements: cmds.append(pre_cmd + [r]) else: + if parallel: + warning('parallel install is not supported on this platform, installing sequentially...') # install everything in one cmd cmds.append(pre_cmd + list(requirements)) From b94904f5ee6a20f15019b8d2d61daedf3c919167 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 28 Feb 2023 22:53:06 +0000 Subject: [PATCH 085/302] core.kompress: support .zst extension, seems more conventional than .zstd --- my/core/kompress.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/my/core/kompress.py b/my/core/kompress.py index 26e0bbd..9b8a40f 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -18,13 +18,14 @@ class Ext: zip = '.zip' lz4 = '.lz4' zstd = '.zstd' + zst = '.zst' targz = '.tar.gz' def is_compressed(p: Path) -> bool: # todo kinda lame way for now.. use mime ideally? # should cooperate with kompress.kopen? - return any(p.name.endswith(ext) for ext in {Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.targz}) + return any(p.name.endswith(ext) for ext in {Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz}) def _zstd_open(path: Path, *args, **kwargs) -> IO[str]: @@ -70,7 +71,7 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]: elif name.endswith(Ext.lz4): import lz4.frame # type: ignore return lz4.frame.open(str(pp), mode, *args, **kwargs) - elif name.endswith(Ext.zstd): + elif name.endswith(Ext.zstd) or name.endswith(Ext.zst): return _zstd_open(pp, mode, *args, **kwargs) elif name.endswith(Ext.targz): import tarfile From 4dfc4029c38012a0358f783b96045bfcc09fe62f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 1 Mar 2023 00:45:20 +0000 Subject: [PATCH 086/302] core.kompress: proper support for read_text/read_bytes against zstd/xz archives --- my/core/kompress.py | 50 ++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/my/core/kompress.py b/my/core/kompress.py index 9b8a40f..0274e6c 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -28,30 +28,44 @@ def is_compressed(p: Path) -> bool: return any(p.name.endswith(ext) for ext in {Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz}) -def _zstd_open(path: Path, *args, **kwargs) -> IO[str]: +def _zstd_open(path: Path, *args, **kwargs) -> IO: import zstandard as zstd # type: ignore fh = path.open('rb') dctx = zstd.ZstdDecompressor() reader = dctx.stream_reader(fh) - return io.TextIOWrapper(reader, **kwargs) # meh + + mode = kwargs.get('mode', 'rt') + if mode == 'rb': + return reader + else: + # must be text mode + kwargs.pop('mode') # TextIOWrapper doesn't like it + return io.TextIOWrapper(reader, **kwargs) # meh -# TODO returns protocol that we can call 'read' against? -# TODO use the 'dependent type' trick? -def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]: - # TODO handle mode in *rags? 
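One subtlety worth demonstrating before the `lzma` branch below: for `lzma.open` the default mode is binary, so `'r'` means `'rb'`, unlike the builtin `open` where `'r'` means `'rt'`. A quick standalone check (the file path is illustrative):

```python
import lzma
from pathlib import Path

p = Path('/tmp/example.xz')  # illustrative path
p.write_bytes(lzma.compress(b'hello'))

assert lzma.open(p, 'r').read() == b'hello'   # 'r' is binary for lzma!
assert lzma.open(p, 'rt').read() == 'hello'   # text needs an explicit 't'
```

This is exactly why the rewritten `kopen` translates `mode='r'` into `'rt'` before delegating to `lzma.open`.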
- encoding = kwargs.get('encoding', 'utf8') +# TODO use the 'dependent type' trick for return type? +def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: + # just in case, but I think this shouldn't be necessary anymore + # since when we cann .read_text, encoding is passed already + if mode in {'r', 'rt'}: + encoding = kwargs.get('encoding', 'utf8') + else: + encoding = None kwargs['encoding'] = encoding pp = Path(path) name = pp.name if name.endswith(Ext.xz): import lzma - r = lzma.open(pp, mode, *args, **kwargs) - # should only happen for binary mode? - # file:///usr/share/doc/python3/html/library/lzma.html?highlight=lzma#lzma.open - assert not isinstance(r, lzma.LZMAFile), r - return r + + # ugh. for lzma, 'r' means 'rb' + # https://github.com/python/cpython/blob/d01cf5072be5511595b6d0c35ace6c1b07716f8d/Lib/lzma.py#L97 + # whereas for regular open, 'r' means 'rt' + # https://docs.python.org/3/library/functions.html#open + if mode == 'r': + mode = 'rt' + kwargs['mode'] = mode + return lzma.open(pp, *args, **kwargs) elif name.endswith(Ext.zip): # eh. this behaviour is a bit dodgy... from zipfile import ZipFile @@ -72,7 +86,8 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]: import lz4.frame # type: ignore return lz4.frame.open(str(pp), mode, *args, **kwargs) elif name.endswith(Ext.zstd) or name.endswith(Ext.zst): - return _zstd_open(pp, mode, *args, **kwargs) + kwargs['mode'] = mode + return _zstd_open(pp, *args, **kwargs) elif name.endswith(Ext.targz): import tarfile # FIXME pass mode? @@ -104,8 +119,15 @@ class CPath(BasePath): _accessor.open has to return file descriptor, doesn't work for compressed stuff. """ def open(self, *args, **kwargs): + kopen_kwargs = {} + mode = kwargs.get('mode') + if mode is not None: + kopen_kwargs['mode'] = mode + encoding = kwargs.get('encoding') + if encoding is not None: + kopen_kwargs['encoding'] = encoding # TODO assert read only? - return kopen(str(self)) + return kopen(str(self), **kopen_kwargs) open = kopen # TODO deprecate From bee17d932bc98b3ab8f4e60e712f8c0a559b1c64 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 2 Mar 2023 23:06:45 +0000 Subject: [PATCH 087/302] fbmessenger.android: use Optional name, best to leave for the consumer to decide how to behave when it's unavailable e.g. 
using was causing issues when used as zulip contact name --- my/fbmessenger/android.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index 69555cb..b8bdbda 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -41,7 +41,7 @@ def inputs() -> Sequence[Path]: @dataclass(unsafe_hash=True) class Sender: id: str - name: str + name: Optional[str] @dataclass(unsafe_hash=True) @@ -103,7 +103,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: for r in db.execute('''SELECT * FROM thread_users'''): # for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None # but they are still referenced, so need to keep - name = r['name'] or '' + name = r['name'] user_key = r['user_key'] s = Sender( id=_normalise_user_id(user_key), @@ -135,7 +135,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: name = r['name'] # seems that it's only set for some groups if name is None: users = thread_users[thread_key] - name = ', '.join([u.name for u in users]) + name = ', '.join([u.name or u.id for u in users]) yield Thread( id=_normalise_thread_id(thread_key), name=name, From a4c713664e63dde68480d0f5acf89fa793d2a6da Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 2 Mar 2023 23:10:15 +0000 Subject: [PATCH 088/302] core.logging: sync logging helper with Promnesia, adds more goodies - print exception traceback by default when using logger.exception - COLLAPSE_DEBUG_LOGS env variable --- my/core/logging.py | 104 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 23 deletions(-) diff --git a/my/core/logging.py b/my/core/logging.py index 03484bf..6cfa12b 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -1,24 +1,21 @@ #!/usr/bin/env python3 ''' Default logger is a bit meh, see 'test'/run this file for a demo -TODO name 'klogging' to avoid possible conflict with default 'logging' module -TODO shit. too late already? maybe use fallback & deprecate ''' - def test() -> None: - from typing import Callable import logging import sys + from typing import Callable M: Callable[[str], None] = lambda s: print(s, file=sys.stderr) M(" Logging module's defaults are not great...'") l = logging.getLogger('test_logger') - # todo why is mypy unhappy about these??? l.error("For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level") M(" The reason is that you need to remember to call basicConfig() first") + logging.basicConfig() l.error("OK, this is better. 
But the default format kinda sucks, I prefer having timestamps and the file/line number") M("") @@ -32,8 +29,9 @@ def test() -> None: import logging -from typing import Union, Optional +from typing import Union, Optional, cast import os +import warnings Level = int LevelIsh = Optional[Union[Level, str]] @@ -56,42 +54,102 @@ FORMAT_COLOR = FORMAT.format(start='%(color)s', end='%(end_color)s') FORMAT_NOCOLOR = FORMAT.format(start='', end='') DATEFMT = '%Y-%m-%d %H:%M:%S' +COLLAPSE_DEBUG_LOGS = os.environ.get('COLLAPSE_DEBUG_LOGS', False) + +_init_done = 'lazylogger_init_done' def setup_logger(logger: logging.Logger, level: LevelIsh) -> None: lvl = mklevel(level) try: import logzero # type: ignore[import] - except ModuleNotFoundError: - import warnings - - warnings.warn("You might want to install 'logzero' for nice colored logs!") - logger.setLevel(lvl) - h = logging.StreamHandler() - h.setLevel(lvl) - h.setFormatter(logging.Formatter(fmt=FORMAT_NOCOLOR, datefmt=DATEFMT)) - logger.addHandler(h) - logger.propagate = False # ugh. otherwise it duplicates log messages? not sure about it.. - else: formatter = logzero.LogFormatter( fmt=FORMAT_COLOR, datefmt=DATEFMT, ) + use_logzero = True + except ModuleNotFoundError: + warnings.warn("You might want to install 'logzero' for nice colored logs!") + formatter = logging.Formatter(fmt=FORMAT_NOCOLOR, datefmt=DATEFMT) + use_logzero = False + + logger.addFilter(AddExceptionTraceback()) + if use_logzero and not COLLAPSE_DEBUG_LOGS: # all set, nothing to do + # 'simple' setup logzero.setup_logger(logger.name, level=lvl, formatter=formatter) + return + + h = CollapseDebugHandler() if COLLAPSE_DEBUG_LOGS else logging.StreamHandler() + logger.setLevel(lvl) + h.setLevel(lvl) + h.setFormatter(formatter) + logger.addHandler(h) + logger.propagate = False # ugh. otherwise it duplicates log messages? not sure about it.. class LazyLogger(logging.Logger): def __new__(cls, name: str, level: LevelIsh = 'INFO') -> 'LazyLogger': logger = logging.getLogger(name) + # this is called prior to all _log calls so makes sense to do it here? - def isEnabledFor_lazyinit(*args, logger=logger, orig=logger.isEnabledFor, **kwargs): - att = 'lazylogger_init_done' - if not getattr(logger, att, False): # init once, if necessary + def isEnabledFor_lazyinit(*args, logger=logger, orig=logger.isEnabledFor, **kwargs) -> bool: + if not getattr(logger, _init_done, False): # init once, if necessary setup_logger(logger, level=level) - setattr(logger, att, True) + setattr(logger, _init_done, True) + logger.isEnabledFor = orig # restore the callback return orig(*args, **kwargs) - logger.isEnabledFor = isEnabledFor_lazyinit # type: ignore[assignment] - return logger # type: ignore[return-value] + # oh god.. otherwise might go into an inf loop + if not hasattr(logger, _init_done): + setattr(logger, _init_done, False) # will setup on the first call + logger.isEnabledFor = isEnabledFor_lazyinit # type: ignore[assignment] + return cast(LazyLogger, logger) + + +# by default, logging.exception isn't logging traceback +# which is a bit annoying since we have to +# also see https://stackoverflow.com/questions/75121925/why-doesnt-python-logging-exception-method-log-traceback-by-default +# tod also amend by post about defensive error handling? 
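The comment above is easier to appreciate with a concrete repro; a minimal standalone demonstration (logger name is arbitrary) of why the `AddExceptionTraceback` filter below injects `exc_info`:

```python
import logging

logging.basicConfig()
log = logging.getLogger('demo')

try:
    1 / 0
except ZeroDivisionError as e:
    log.error(e)              # prints just 'division by zero', the traceback is lost
    log.error(e, exc_info=e)  # prints the full traceback as well
```

The filter below effectively automates the second form whenever an exception instance is passed as the log message.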
+class AddExceptionTraceback(logging.Filter): + def filter(self, record): + s = super().filter(record) + if s is False: + return False + if record.levelname == 'ERROR': + exc = record.msg + if isinstance(exc, BaseException): + if record.exc_info is None or record.exc_info == (None, None, None): + exc_info = (type(exc), exc, exc.__traceback__) + record.exc_info = exc_info + return s + + +# todo also save full log in a file? +class CollapseDebugHandler(logging.StreamHandler): + ''' + Collapses subsequent debug log lines and redraws on the same line. + Hopefully this gives both a sense of progress and doesn't clutter the terminal as much? + ''' + last = False + + def emit(self, record: logging.LogRecord) -> None: + try: + msg = self.format(record) + cur = record.levelno == logging.DEBUG and '\n' not in msg + if cur: + if self.last: + self.stream.write('\033[K' + '\r') # clear line + return carriage + else: + if self.last: + self.stream.write('\n') # clean up after the last debug line + self.last = cur + import os + columns, _ = os.get_terminal_size(0) + # ugh. the columns thing is meh. dunno I guess ultimately need curses for that + # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keyworkds (INFO/DEBUG/etc) + self.stream.write(msg + ' ' * max(0, columns - len(msg)) + ('' if cur else '\n')) + self.flush() + except: + self.handleError(record) if __name__ == '__main__': From 9d231a8ea9423b00b8b1be6fe5ba239d0bf38a14 Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Sat, 4 Mar 2023 10:36:10 -0800 Subject: [PATCH 089/302] google_takeout: add semantic location history (#278) * google_takeout: add semantic location history --- my/config.py | 8 +++ my/location/all.py | 12 ++++ my/location/google_takeout_semantic.py | 76 ++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 my/location/google_takeout_semantic.py diff --git a/my/config.py b/my/config.py index 7075d1d..318539c 100644 --- a/my/config.py +++ b/my/config.py @@ -89,6 +89,14 @@ class location: export_path: Paths = '' accuracy: float + class google_takeout_semantic: + # a value between 0 and 100, 100 being the most confident + # set to 0 to include all locations + # https://locationhistoryformat.com/reference/semantic/#/$defs/placeVisit/properties/locationConfidence + require_confidence: float = 40 + # default accuracy for semantic locations + accuracy: float = 100 + from my.core.compat import Literal class time: diff --git a/my/location/all.py b/my/location/all.py index 8d51a82..fd88721 100644 --- a/my/location/all.py +++ b/my/location/all.py @@ -16,6 +16,7 @@ logger = LazyLogger(__name__, level="warning") def locations() -> Iterator[Location]: # can add/comment out sources here to disable them, or use core.disabled_modules yield from _takeout_locations() + yield from _takeout_semantic_locations() yield from _gpslogger_locations() yield from _ip_locations() @@ -26,6 +27,17 @@ def _takeout_locations() -> Iterator[Location]: yield from google_takeout.locations() +@import_source(module_name="my.location.google_takeout_semantic") +def _takeout_semantic_locations() -> Iterator[Location]: + from . import google_takeout_semantic + + for event in google_takeout_semantic.locations(): + if isinstance(event, Exception): + logger.error(f"google_takeout_semantic: {event}") + continue + yield event + + @import_source(module_name="my.location.gpslogger") def _gpslogger_locations() -> Iterator[Location]: from . 
import gpslogger diff --git a/my/location/google_takeout_semantic.py b/my/location/google_takeout_semantic.py new file mode 100644 index 0000000..4d3514e --- /dev/null +++ b/my/location/google_takeout_semantic.py @@ -0,0 +1,76 @@ +""" +Extracts semantic location history using google_takeout_parser +""" + +# This is a separate module to prevent ImportError and a new config block from breaking +# previously functional my.location.google_takeout locations + +REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] + +from typing import Iterator, List + +from my.google.takeout.parser import events, _cachew_depends_on as _parser_cachew_depends_on +from google_takeout_parser.models import PlaceVisit as SemanticLocation + +from my.core import dataclass, make_config +from my.core.common import mcachew, LazyLogger, Stats +from my.core.error import Res +from .common import Location + +logger = LazyLogger(__name__) + +from my.config import location as user_config + +@dataclass +class semantic_locations_config(user_config.google_takeout_semantic): + # a value between 0 and 100, 100 being the most confident + # set to 0 to include all locations + # https://locationhistoryformat.com/reference/semantic/#/$defs/placeVisit/properties/locationConfidence + require_confidence: int = 40 + # default accuracy for semantic locations + accuracy: float = 100 + + +config = make_config(semantic_locations_config) + + +# add config to cachew dependency so it recomputes on config changes +def _cachew_depends_on() -> List[str]: + dep = _parser_cachew_depends_on() + dep.insert(0, f"require_confidence={config.require_confidence} accuracy={config.accuracy}") + return dep + + + +@mcachew( + depends_on=_cachew_depends_on, + logger=logger, +) +def locations() -> Iterator[Res[Location]]: + require_confidence = config.require_confidence + if require_confidence < 0 or require_confidence > 100: + yield ValueError("location.google_takeout.semantic_require_confidence must be between 0 and 100") + return + + for g in events(): + if isinstance(g, SemanticLocation): + if g.visitConfidence < require_confidence: + logger.debug(f"Skipping {g} due to low confidence ({g.visitConfidence}))") + continue + yield Location( + lon=g.lng, + lat=g.lat, + dt=g.dt, + # can accuracy be inferred from visitConfidence? + # there's no exact distance value in the data, its a 0-100% confidence value... 
+ accuracy=config.accuracy, + elevation=None, + datasource="google_takeout_semantic", + ) + + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} From 79eeab212812370ac9c486248e61fdd6d83443c6 Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Mon, 6 Mar 2023 13:36:36 -0800 Subject: [PATCH 090/302] cli completion doc updates, hide legacy import warning (#279) * core/cli: hide warnings when autocompleting * link to completion in setup/troubleshooting * update completion docs to make source path clear --- doc/SETUP.org | 2 ++ misc/completion/README.md | 10 ++++++---- my/core/legacy.py | 7 ++++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/SETUP.org b/doc/SETUP.org index a10c9b3..aff5158 100644 --- a/doc/SETUP.org +++ b/doc/SETUP.org @@ -193,6 +193,8 @@ If you only have a few modules set up, lots of them will error for you, which is If you're having issues with ~cachew~ or want to show logs to troubleshoot what may be happening, you can pass the debug flag (e.g., ~hpi --debug doctor my.module_name~) or set the ~HPI_LOGS~ environment variable (e.g., ~HPI_LOGS=debug hpi query my.module_name~) to print all logs, including the ~cachew~ dependencies. ~HPI_LOGS~ could also be used to silence ~info~ logs, like ~HPI_LOGS=warning hpi ...~ +If you want ~HPI~ to autocomplete the module names for you, this comes with shell completion, see [[../misc/completion/][misc/completion]] + If you have any ideas on how to improve it, please let me know! Here's a screenshot how it looks when everything is mostly good: [[https://user-images.githubusercontent.com/291333/82806066-f7dfe400-9e7c-11ea-8763-b3bee8ada308.png][link]]. diff --git a/misc/completion/README.md b/misc/completion/README.md index 699e27e..344387a 100644 --- a/misc/completion/README.md +++ b/misc/completion/README.md @@ -10,21 +10,23 @@ eval "$(_HPI_COMPLETE=fish_source hpi)" # in ~/.config/fish/config.fish That is slightly slower since its generating the completion code on the fly -- see [click docs](https://click.palletsprojects.com/en/8.0.x/shell-completion/#enabling-completion) for more info -To use the completions here: +To use the generated completion files in this repository, you need to source the file in `./bash`, `./zsh`, or `./fish` depending on your shell. + +If you don't have HPI cloned locally, after installing `HPI` you can generate the file yourself using one of the commands above. 
For example, for `bash`: `_HPI_COMPLETE=bash_source hpi > ~/.config/hpi_bash_completion`, and then source it like `source ~/.config/hpi_bash_completion` ### bash -Put `source /path/to/bash/_hpi` in your `~/.bashrc` +Put `source /path/to/hpi/repo/misc/completion/bash/_hpi` in your `~/.bashrc` ### zsh You can either source the file: -`source /path/to/zsh/_hpi` +`source /path/to/hpi/repo/misc/completion/zsh/_hpi` ..or add the directory to your `fpath` to load it lazily: -`fpath=("/path/to/zsh/" "${fpath[@]}")` (Note: the directory, not the script `_hpi`) +`fpath=("/path/to/hpi/repo/misc/completion/zsh/" "${fpath[@]}")` (Note: the directory, not the script `_hpi`) If your zsh configuration doesn't automatically run `compinit`, after modifying your `fpath` you should: diff --git a/my/core/legacy.py b/my/core/legacy.py index 21ec056..3ad121d 100644 --- a/my/core/legacy.py +++ b/my/core/legacy.py @@ -1,4 +1,5 @@ # I think 'compat' should be for python-specific compat stuff, whereas this for HPI specific backwards compatibility +import os import inspect import re from typing import List @@ -45,8 +46,12 @@ def handle_legacy_import( if re.match(rf'from\s+{parent_module_name}\s+import\s+{legacy_submodule_name}', line): importing_submodule = True + # click sets '_HPI_COMPLETE' env var when it's doing autocompletion + # otherwise, the warning will be printed every time you try to tab complete + autocompleting_module_cli = "_HPI_COMPLETE" in os.environ + is_legacy_import = not (imported_as_parent or importing_submodule) - if is_legacy_import: + if is_legacy_import and not autocompleting_module_cli: W.high(f'''\ importing {parent_module_name} is DEPRECATED! \ Instead, import from {parent_module_name}.{legacy_submodule_name} or {parent_module_name}.all \ From 9db5f318fbe5e946e97ee958762f13708e7a1554 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 16 Mar 2023 01:31:58 +0000 Subject: [PATCH 091/302] my.twitter.twint: use dict row factory instead of sqlite Row otherwise it's not json serializable --- my/twitter/twint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 54c7f91..ceb5406 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -109,13 +109,13 @@ ORDER BY T.created_at def tweets() -> Iterator[Res[Tweet]]: - with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db: + with sqlite_connection(get_db_path(), immutable=True, row_factory='dict') as db: res = db.execute(_QUERY.format(where='F.tweet_id IS NULL')) yield from map(Tweet, res) def likes() -> Iterator[Res[Tweet]]: - with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db: + with sqlite_connection(get_db_path(), immutable=True, row_factory='dict') as db: res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL')) yield from map(Tweet, res) From 457797bdfb838a59cfa5ad7b79375a9fa62aedbc Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 16 Mar 2023 20:27:46 +0000 Subject: [PATCH 092/302] my.bumble.android: better handling for missing conversation id in database --- my/bumble/android.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/my/bumble/android.py b/my/bumble/android.py index 2fa6bd8..6bc27dc 100644 --- a/my/bumble/android.py +++ b/my/bumble/android.py @@ -114,6 +114,9 @@ def _key(r: EntitiesRes): return r +_UNKNOWN_PERSON = "UNKNOWN_PERSON" + + def messages() -> Iterator[Res[Message]]: id2person: Dict[str, Person] = {} id2msg: Dict[str, Message] = {} @@ -126,8 +129,12 @@ def 
messages() -> Iterator[Res[Message]]: continue if isinstance(x, _Message): reply_to_id = x.reply_to_id + # hmm seems that sometimes there are messages with no corresponding conversation_info? + # possibly if user never clicked on conversation before.. + person = id2person.get(x.conversation_id) + if person is None: + person = Person(user_id=x.conversation_id, user_name=_UNKNOWN_PERSON) try: - person = id2person[x.conversation_id] reply_to = None if reply_to_id is None else id2msg[reply_to_id] except Exception as e: yield e From 0a05b272667b229d7adc71e3fa1200d93470b51b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 16 Mar 2023 21:49:29 +0000 Subject: [PATCH 093/302] my.fbmessenger.android: set timezone to utc --- my/fbmessenger/android.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index b8bdbda..616a6af 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -4,14 +4,14 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab from __future__ import annotations from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path import sqlite3 from typing import Iterator, Sequence, Optional, Dict, Union, List from more_itertools import unique_everseen -from my.core import get_files, Paths, datetime_naive, Res, assert_never, LazyLogger, make_config +from my.core import get_files, Paths, datetime_aware, Res, assert_never, LazyLogger, make_config from my.core.error import echain from my.core.sqlite import sqlite_connection @@ -53,8 +53,7 @@ class Thread: @dataclass class _BaseMessage: id: str - # checked against a message sent on 4 may 2022, and it does look naive - dt: datetime_naive + dt: datetime_aware text: Optional[str] @@ -151,7 +150,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: '''): yield _Message( id=r['msg_id'], - dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000), + dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000, tz=timezone.utc), # double checked against some messages in different timezone # is_incoming=False, TODO?? text=r['text'], thread_id=_normalise_thread_id(r['thread_key']), From bef832cbffd77b765fc191e77cb1584418bd50ed Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 16 Mar 2023 21:57:55 +0000 Subject: [PATCH 094/302] my.fbmessenger.export: remove legacy dump_chat_history code --- my/fbmessenger/export.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/my/fbmessenger/export.py b/my/fbmessenger/export.py index 3a9d227..201fad8 100644 --- a/my/fbmessenger/export.py +++ b/my/fbmessenger/export.py @@ -9,7 +9,6 @@ REQUIRES = [ from contextlib import ExitStack, contextmanager from dataclasses import dataclass -from pathlib import Path from typing import Iterator from my.core import PathIsh, Res, stat, Stats @@ -59,33 +58,3 @@ def messages() -> Iterator[Res[messenger.Message]]: def stats() -> Stats: return stat(messages) - - -### vvv not sure if really belongs here... - -def _dump_helper(model: messenger.DAL, tdir: Path) -> None: - for t in model.iter_threads(): - name = t.name.replace('/', '_') # meh.. - path = tdir / (name + '.txt') - with path.open('w') as fo: - for m in t.iter_messages(order_by='-timestamp'): - # TODO would be nice to have usernames perhaps.. 
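# A side note on the UTC change in PATCH 093 above (illustrative, standard
# library only): fromtimestamp without an explicit tz yields a naive local-time
# datetime, while passing tz pins the result to UTC:
#
#   from datetime import datetime, timezone
#   datetime.fromtimestamp(1650000000)                   # naive, local wall clock
#   datetime.fromtimestamp(1650000000, tz=timezone.utc)  # aware, fixed to UTC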
- dts = m.dt.strftime('%Y-%m-%d %a %H:%M') - msg = f"{dts}: {m.text}" - print(msg, file=fo) - - -def dump_chat_history(where: PathIsh) -> None: - p = Path(where) - assert not p.exists() or p.is_dir() - - from shutil import rmtree - from tempfile import TemporaryDirectory - with TemporaryDirectory() as tdir, _dal() as model: - td = Path(tdir) - _dump_helper(model, td) - - if p.exists(): - rmtree(p) - td.rename(p) - td.mkdir() # ugh, hacky way of preventing complaints from context manager From 58d2e25a428309e9ecded8e2b5359027337f0785 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 16 Mar 2023 22:19:13 +0000 Subject: [PATCH 095/302] ci: suppress some mypy issues after upgrade --- my/core/cfg.py | 2 +- my/core/freezer.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/my/core/cfg.py b/my/core/cfg.py index 3321a4c..3cddcf7 100644 --- a/my/core/cfg.py +++ b/my/core/cfg.py @@ -18,7 +18,7 @@ def make_config(cls: Type[C], migration: Callable[[Attrs], Attrs]=lambda x: x) - params = { k: v for k, v in new_props.items() - if k in {f.name for f in fields(cls)} + if k in {f.name for f in fields(cls)} # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115 } # todo maybe return type here? return cls(**params) # type: ignore[call-arg] diff --git a/my/core/freezer.py b/my/core/freezer.py index abb2973..649a2b7 100644 --- a/my/core/freezer.py +++ b/my/core/freezer.py @@ -8,7 +8,7 @@ D = TypeVar('D') def _freeze_dataclass(Orig: Type[D]): - ofields = [(f.name, f.type, f) for f in dcl.fields(Orig)] + ofields = [(f.name, f.type, f) for f in dcl.fields(Orig)] # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115 # extract properties along with their types props = list(inspect.getmembers(Orig, lambda o: isinstance(o, property))) @@ -35,7 +35,7 @@ class Freezer(Generic[D]): def freeze(self, value: D) -> D: pvalues = {name: getattr(value, name) for name, _ in self.props} - return self.Frozen(**dcl.asdict(value), **pvalues) + return self.Frozen(**dcl.asdict(value), **pvalues) # type: ignore[call-overload] # see https://github.com/python/typing_extensions/issues/115 ### tests From 347cd1ef77e17f4f8563979555ae1b4e383f2d4f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 16 Mar 2023 23:16:18 +0000 Subject: [PATCH 096/302] my.fbmessenger: add Sender protocol for consistency --- my/fbmessenger/common.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/my/fbmessenger/common.py b/my/fbmessenger/common.py index 1f82327..a498952 100644 --- a/my/fbmessenger/common.py +++ b/my/fbmessenger/common.py @@ -10,8 +10,16 @@ class Thread(Protocol): @property def id(self) -> str: ... - # todo hmm it doesn't like it because one from .export is just str, not Optional... - # name: Optional[str] + @property + def name(self) -> Optional[str]: ... + + +class Sender(Protocol): + @property + def id(self) -> str: ... + + @property + def name(self) -> Optional[str]: ... class Message(Protocol): @@ -27,6 +35,9 @@ class Message(Protocol): @property def thread(self) -> Thread: ... + @property + def sender(self) -> Sender: ... 
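# A note on the Protocol above: implementations satisfy it structurally, without
# inheriting from Sender -- mypy accepts a plain attribute where the protocol
# declares a read-only @property. A minimal sketch (hypothetical class, not from
# the patch):
#
#   from dataclasses import dataclass
#   from typing import Optional
#
#   @dataclass
#   class DbSender:
#       id: str
#       name: Optional[str]  # type-checks as a Sender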
+ from itertools import chain from more_itertools import unique_everseen From e7be680841feafb2357048e57a4f4fdc6717632c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 22 Mar 2023 00:29:21 +0000 Subject: [PATCH 097/302] my.instagram.gdpr: handle missing message content defensively --- my/instagram/common.py | 1 + my/instagram/gdpr.py | 40 +++++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/my/instagram/common.py b/my/instagram/common.py index 23cefe5..b345b8e 100644 --- a/my/instagram/common.py +++ b/my/instagram/common.py @@ -12,6 +12,7 @@ class Message(Protocol): created: datetime text: str # TODO add some sort of thread id + # ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump) @warn_if_empty diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 4c54fbf..62c9f1f 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -3,13 +3,28 @@ Instagram data (uses [[https://www.instagram.com/download/request][official GDPR """ from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Any, Sequence, Dict - -from my.config import instagram as user_config +import json +from pathlib import Path +from typing import Iterator, Sequence, Dict, Union from more_itertools import bucket -from ..core import Paths +from my.core import ( + get_files, + Paths, + datetime_naive, + Res, + assert_never, + LazyLogger, +) +from my.core.kompress import ZipPath + +from my.config import instagram as user_config + + +logger = LazyLogger(__name__, level='debug') + + @dataclass class config(user_config.gdpr): # paths[s]/glob to the exported zip archives @@ -17,8 +32,6 @@ class config(user_config.gdpr): # TODO later also support unpacked directories? -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -31,7 +44,6 @@ class User: full_name: str -from ..core import datetime_naive @dataclass class _BaseMessage: # ugh, this is insane, but does look like it's just keeping local device time??? @@ -57,11 +69,7 @@ def _decode(s: str) -> str: return s.encode('latin-1').decode('utf8') -import json -from typing import Union -from ..core import Res, assert_never def _entities() -> Iterator[Res[Union[User, _Message]]]: - from ..core.kompress import ZipPath last = ZipPath(max(inputs())) # TODO make sure it works both with plan directory # idelaly get_files should return the right thing, and we won't have to force ZipPath/match_structure here @@ -128,9 +136,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: # todo "thread_type": "Regular" ? for jm in j['messages']: - # todo defensive? try: - mtype = jm['type'] # Generic/Share? content = None if 'content' in jm: content = _decode(jm['content']) @@ -141,7 +147,12 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: cc = share or photos or videos if cc is not None: content = str(cc) - assert content is not None, jm + + if content is None: + # not sure what it means.. perhaps likes or something? + logger.warning(f'content is None: {jm}') + continue + timestamp_ms = jm['timestamp_ms'] sender_name = _decode(jm['sender_name']) @@ -153,7 +164,6 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: thread_id=fname, # meh.. but no better way? ) except Exception as e: - # TODO sometimes messages are just missing content?? 
even with Generic type yield e From 8f7d14e7c6b20405ae6dad71e8274af29ae4f024 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 22 Mar 2023 02:19:57 +0000 Subject: [PATCH 098/302] my.instagram: somewhat mad merging mechanism to correlate gdpr and android exports --- my/instagram/all.py | 6 ++++- my/instagram/common.py | 53 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/my/instagram/all.py b/my/instagram/all.py index 4be2b5b..8007399 100644 --- a/my/instagram/all.py +++ b/my/instagram/all.py @@ -22,7 +22,11 @@ def _messages_android() -> Iterator[Res[Message]]: def messages() -> Iterator[Res[Message]]: # TODO in general best to prefer android, it has more data - # but for now prefer gdpr prefix until we figure out how to correlate conversation threads + # - message ids + # - usernames are correct for Android data + # - thread ids more meaningful? + # but for now prefer gdpr prefix since it makes things a bit more consistent? + # e.g. a new batch of android exports can throw off ids if we rely on it for mapping yield from _merge_messages( _messages_gdpr(), _messages_android(), diff --git a/my/instagram/common.py b/my/instagram/common.py index b345b8e..a172ac8 100644 --- a/my/instagram/common.py +++ b/my/instagram/common.py @@ -1,22 +1,31 @@ +from dataclasses import replace from datetime import datetime from itertools import chain -from typing import Iterator +from typing import Iterator, Dict, Any from my.core import warn_if_empty, Res from my.core.compat import Protocol -from more_itertools import unique_everseen + +class User(Protocol): + id: str + username: str + full_name: str class Message(Protocol): created: datetime text: str - # TODO add some sort of thread id - # ugh. seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump) + thread_id: str + + # property because it's more mypy friendly + @property + def user(self) -> User: ... @warn_if_empty def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: + # TODO double check it works w.r.t. naive/aware timestamps? def key(r: Res[Message]): if isinstance(r, Exception): # NOTE: using str() against Exception is nice so exceptions with same args are treated the same.. @@ -28,4 +37,38 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: without_us = r.created.replace(microsecond=round_us) # using text as key is a bit crap.. but atm there are no better shared fields return (without_us, r.text) - return unique_everseen(chain(*sources), key=key) + + # ugh.
seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump) + # so the only way to correlate is to try and match messages + # we also can't use unique_everseen here, otherwise will never get a chance to unify threads + mmap: Dict[str, Message] = {} + thread_map = {} + user_map = {} + + for m in chain(*sources): + if isinstance(m, Exception): + yield m + continue + + k = key(m) + mm = mmap.get(k) + + if mm is not None: + # already emitted, we get a chance to populate mappings + if m.thread_id not in thread_map: + thread_map[m.thread_id] = mm.thread_id + if m.user.id not in user_map: + user_map[m.user.id] = mm.user + else: + # not emitted yet, need to emit + repls: Dict[str, Any] = {} + tid = thread_map.get(m.thread_id) + if tid is not None: + repls['thread_id'] = tid + user = user_map.get(m.user.id) + if user is not None: + repls['user'] = user + if len(repls) > 0: + m = replace(m, **repls) # type: ignore[type-var] # ugh mypy is confused because of Protocol? + mmap[k] = m + yield m From 9aadbb504ba6bf58ff199f35b2547889031818e4 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 24 Mar 2023 21:48:13 +0000 Subject: [PATCH 099/302] my.instagram.android: properly extract our own user --- my/config.py | 3 +++ my/instagram/android.py | 48 ++++++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/my/config.py b/my/config.py index 318539c..5102b6e 100644 --- a/my/config.py +++ b/my/config.py @@ -146,6 +146,9 @@ class tinder: class instagram: class android: export_path: Paths + username: Optional[str] + full_name: Optional[str] + class gdpr: export_path: Paths diff --git a/my/instagram/android.py b/my/instagram/android.py index 8e44ebe..709cfe0 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -5,22 +5,41 @@ from __future__ import annotations from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Sequence, Optional, Dict +import json +from pathlib import Path +from typing import Iterator, Sequence, Optional, Dict, Union from more_itertools import unique_everseen +from my.core import ( + get_files, Paths, + make_config, + LazyLogger, + datetime_naive, + Json, + Res, assert_never, +) +from my.core.sqlite import sqlite_connect_immutable, select + from my.config import instagram as user_config -from ..core import Paths +logger = LazyLogger(__name__, level='debug') + @dataclass -class config(user_config.android): +class instagram_android_config(user_config.android): # paths[s]/glob to the exported sqlite databases export_path: Paths + # sadly doesn't seem easy to extract user's own handle/name from the db... + # todo maybe makes more sense to keep in parent class? not sure... + username: Optional[str] = None + full_name: Optional[str] = None + + +config = make_config(instagram_android_config) + -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -32,7 +51,6 @@ class User: full_name: str -from ..core import datetime_naive # todo not sure about order of fields... 
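# To make the merging in PATCH 098 above concrete (hypothetical data): with the
# gdpr source listed first, a gdpr message ('hello', thread_id='thread A') is
# emitted and remembered in mmap. When the android copy of the same message
# arrives it maps to the same key, so instead of being emitted again it fills
# thread_map[<android thread id>] -> 'thread A' and user_map[<android user id>]
# -> the gdpr User; any later android-only message in that thread is then
# rewritten via replace() to carry the unified thread id and user.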
@dataclass class _BaseMessage: @@ -78,7 +96,6 @@ class MessageError(RuntimeError): return self.rest == other.rest -from ..core import Json def _parse_message(j: Json) -> Optional[_Message]: id = j['item_id'] t = j['item_type'] @@ -108,18 +125,23 @@ def _parse_message(j: Json) -> Optional[_Message]: ) -import json -from typing import Union -from ..core import Res, assert_never -import sqlite3 -from ..core.sqlite import sqlite_connect_immutable, select def _entities() -> Iterator[Res[Union[User, _Message]]]: # NOTE: definitely need to merge multiple, app seems to recycle old messages # TODO: hmm hard to guarantee timestamp ordering when we use synthetic input data... # todo use TypedDict? for f in inputs(): with sqlite_connect_immutable(f) as db: - for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db): + # TODO ugh. seems like no way to extract username? + # sometimes messages (e.g. media_share) contain it in message field + # but generally it's not present. ugh + for (self_uid,) in select(('user_id',), 'FROM session', db=db): + yield User( + id=str(self_uid), + full_name=config.full_name or 'USERS_OWN_FULL_NAME', + username=config.full_name or 'USERS_OWN_USERNAME', + ) + + for (thread_json,) in select(('thread_info',), 'FROM threads', db=db): j = json.loads(thread_json) # todo in principle should leave the thread attached to the message? # since thread is a group of users? From 919c84fb5a4dec176a666c78f739d3bab4109421 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 24 Mar 2023 22:24:26 +0000 Subject: [PATCH 100/302] my.instagram: better unification of like messages/reactions --- my/instagram/android.py | 9 ++++++--- my/instagram/gdpr.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/my/instagram/android.py b/my/instagram/android.py index 709cfe0..8e62363 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -102,7 +102,7 @@ def _parse_message(j: Json) -> Optional[_Message]: tid = j['thread_key']['thread_id'] uid = j['user_id'] created = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000) - text: str + text: Optional[str] = None if t == 'text': text = j['text'] elif t == 'reel_share': @@ -110,11 +110,14 @@ def _parse_message(j: Json) -> Optional[_Message]: # the problem is that the links are deliberately expired by instagram.. text = j['reel_share']['text'] elif t == 'action_log': - # something like "X liked message" -- hardly useful? - return None + # for likes this ends up as 'Liked a message' or reactions + # which isn't super useful by itself perhaps, but matches GDPR so lets us unify threads better + text = j['action_log']['description'] else: raise MessageError(id, f"{t} isn't handled yet") + assert text is not None, j + return _Message( id=id, created=created, diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 62c9f1f..b9f8780 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -140,6 +140,9 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: content = None if 'content' in jm: content = _decode(jm['content']) + if content.endswith(' to your message '): + # ugh. for some reason these contain an extra space and that messes up message merging.. + content = content.strip() else: share = jm.get('share') photos = jm.get('photos') @@ -149,9 +152,11 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: content = str(cc) if content is None: - # not sure what it means.. perhaps likes or something? 
- logger.warning(f'content is None: {jm}') - continue + # this happens e.g. on reel shares.. + # not sure what we can do properly, GPDR has literally no other info in this case + # on android in this case at the moment we have as content '' + # so for consistency let's do that too + content = '' timestamp_ms = jm['timestamp_ms'] sender_name = _decode(jm['sender_name']) From d2ef23fcb4a8d9938621fc39b04d853b7c5d2d78 Mon Sep 17 00:00:00 2001 From: Kian-Meng Ang Date: Sun, 26 Mar 2023 17:10:33 +0800 Subject: [PATCH 101/302] docs: fix typos found via `codespell -L copie,datas,pres,fo,tooks,noo,ue,ket,frop` --- CHANGELOG.md | 2 +- doc/DESIGN.org | 2 +- doc/MODULES.org | 2 +- doc/MODULE_DESIGN.org | 4 ++-- doc/SETUP.org | 2 +- misc/check-twitter.sh | 6 +++--- my/arbtt.py | 2 +- my/body/exercise/cross_trainer.py | 2 +- my/coding/commits.py | 2 +- my/core/__main__.py | 10 +++++----- my/core/common.py | 2 +- my/core/compat.py | 2 +- my/core/denylist.py | 2 +- my/core/discovery_pure.py | 2 +- my/core/init.py | 2 +- my/core/kompress.py | 2 +- my/core/logging.py | 2 +- my/core/pandas.py | 2 +- my/core/query.py | 10 +++++----- my/core/query_range.py | 8 ++++---- my/core/serialize.py | 6 +++--- my/core/sqlite.py | 2 +- my/core/stats.py | 2 +- my/core/util.py | 2 +- my/demo.py | 2 +- my/emfit/__init__.py | 2 +- my/fbmessenger/android.py | 2 +- my/github/ghexport.py | 2 +- my/hackernews/dogsheep.py | 2 +- my/instagram/android.py | 2 +- my/jawbone/__init__.py | 2 +- my/location/fallback/common.py | 2 +- my/location/fallback/via_ip.py | 2 +- my/location/google.py | 2 +- my/pdfs.py | 2 +- my/photos/main.py | 2 +- my/photos/utils.py | 4 ++-- my/reddit/rexport.py | 2 +- my/taplog.py | 2 +- my/time/tz/via_location.py | 6 +++--- my/tinder/android.py | 2 +- my/youtube/takeout.py | 2 +- tests/bluemaestro.py | 2 +- tests/config.py | 2 +- tests/core/test_denylist.py | 2 +- tests/core/test_kompress.py | 2 +- tests/demo.py | 2 +- tests/extra/polar.py | 2 +- tests/pdfs.py | 2 +- tests/tz.py | 2 +- 50 files changed, 69 insertions(+), 69 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index edaaf02..3dd19df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ General/my.core changes: - 746c3da0cadcba3b179688783186d8a0bd0999c5 core.pandas: allow specifying schema; add tests - 5313984d8fea2b6eef6726b7b346c1f4316acd01 add `tmp_config` context manager for test & adhoc patching - df9a7f7390aee6c69f1abf1c8d1fc7659ebb957c core.pandas: add check for 'error' column + add empty one by default -- e81dddddf083ffd81aa7e2b715bd34f59949479c proprely resolve class properties in make_config + add test +- e81dddddf083ffd81aa7e2b715bd34f59949479c properly resolve class properties in make_config + add test Modules: - some innitial work on filling **InfluxDB** with HPI data diff --git a/doc/DESIGN.org b/doc/DESIGN.org index b8d40f9..81137d2 100644 --- a/doc/DESIGN.org +++ b/doc/DESIGN.org @@ -4,7 +4,7 @@ note: this doc is in progress - interoperable - # note: this link doesnt work in org, but does for the github preview + # note: this link doesn't work in org, but does for the github preview This is the main motivation and [[file:../README.org#why][why]] I created HPI in the first place. Ideally it should be possible to hook into anything you can imagine -- regardless the database/programming language/etc. 
diff --git a/doc/MODULES.org b/doc/MODULES.org index 9e2dbcf..9f48024 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -190,7 +190,7 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http fast: bool = True # sort locations by date - # incase multiple sources provide them out of order + # in case multiple sources provide them out of order sort_locations: bool = True # if the accuracy for the location is more than 5km (this diff --git a/doc/MODULE_DESIGN.org b/doc/MODULE_DESIGN.org index 691dd1c..d57f8fb 100644 --- a/doc/MODULE_DESIGN.org +++ b/doc/MODULE_DESIGN.org @@ -113,7 +113,7 @@ Not all HPI Modules are currently at that level of complexity -- some are simple A related concern is how to structure namespace packages to allow users to easily extend them, and how this conflicts with single file modules (Keep reading below for more information on namespace packages/extension) If a module is converted from a single file module to a namespace with multiple files, it seems this is a breaking change, see [[https://github.com/karlicoss/HPI/issues/89][#89]] for an example of this. The current workaround is to leave it a regular python package with an =__init__.py= for some amount of time and send a deprecation warning, and then eventually remove the =__init__.py= file to convert it into a namespace package. For an example, see the [[https://github.com/karlicoss/HPI/blob/8422c6e420f5e274bd1da91710663be6429c666c/my/reddit/__init__.py][reddit init file]]. -Its quite a pain to have to convert a file from a single file module to a namespace module, so if theres *any* possibility that you might convert it to a namespace package, might as well just start it off as one, to avoid the pain down the road. As an example, say you were creating something to parse ~zsh~ history. Instead of creating ~my/zsh.py~, it would be better to create ~my/zsh/parser.py~. That lets users override the file using editable/namespace packages, and it also means in the future its much more trivial to extend it to something like: +Its quite a pain to have to convert a file from a single file module to a namespace module, so if there's *any* possibility that you might convert it to a namespace package, might as well just start it off as one, to avoid the pain down the road. As an example, say you were creating something to parse ~zsh~ history. Instead of creating ~my/zsh.py~, it would be better to create ~my/zsh/parser.py~. That lets users override the file using editable/namespace packages, and it also means in the future its much more trivial to extend it to something like: #+begin_src my/zsh @@ -161,7 +161,7 @@ There's no requirement to follow this entire structure when you start off, the e Note: this section covers some of the complexities and benefits with this being a namespace package and/or editable install, so it assumes some familiarity with python/imports -HPI is installed as a namespace package, which allows an additional way to add your own modules. For the details on namespace packges, see [[https://www.python.org/dev/peps/pep-0420/][PEP420]], or the [[https://packaging.python.org/guides/packaging-namespace-packages][packaging docs for a summary]], but for our use case, a sufficient description might be: Namespace packages let you split a package across multiple directories on disk. +HPI is installed as a namespace package, which allows an additional way to add your own modules. 
For the details on namespace packages, see [[https://www.python.org/dev/peps/pep-0420/][PEP420]], or the [[https://packaging.python.org/guides/packaging-namespace-packages][packaging docs for a summary]], but for our use case, a sufficient description might be: Namespace packages let you split a package across multiple directories on disk. Without adding a bulky/boilerplate-y plugin framework to HPI, as that increases the barrier to entry, [[https://packaging.python.org/guides/creating-and-discovering-plugins/#using-namespace-packages][namespace packages offers an alternative]] with little downsides. diff --git a/doc/SETUP.org b/doc/SETUP.org index aff5158..6605f66 100644 --- a/doc/SETUP.org +++ b/doc/SETUP.org @@ -452,7 +452,7 @@ connect the data with other apps and libraries! See more in [[file:../README.org::#how-do-you-use-it]["How do you use it?"]] section. -Also check out [[https://beepb00p.xyz/myinfra.html#hpi][my personal infrastructure map]] to see wher I'm using HPI. +Also check out [[https://beepb00p.xyz/myinfra.html#hpi][my personal infrastructure map]] to see where I'm using HPI. * Adding/modifying modules # TODO link to 'overlays' documentation? diff --git a/misc/check-twitter.sh b/misc/check-twitter.sh index 318ff71..1552673 100755 --- a/misc/check-twitter.sh +++ b/misc/check-twitter.sh @@ -21,7 +21,7 @@ check '2011-05-12 Thu 17:51.*set ><' # this would probs be from twint or something? check '2013-06-01 Sat 18:48.* Iterable[Entry]: if len(inps) == 0: cmds = [base] # rely on default else: - # otherise, 'merge' them + # otherwise, 'merge' them cmds = [base + ['--logfile', f] for f in inps] import ijson.backends.yajl2_cffi as ijson # type: ignore diff --git a/my/body/exercise/cross_trainer.py b/my/body/exercise/cross_trainer.py index 58c32b2..b25985c 100644 --- a/my/body/exercise/cross_trainer.py +++ b/my/body/exercise/cross_trainer.py @@ -146,7 +146,7 @@ def dataframe() -> DataFrameT: # todo careful about 'how'? we need it to preserve the errors # maybe pd.merge is better suited for this?? df = edf.join(mdf, how='outer', rsuffix='_manual') - # todo reindex? so we dont' have Nan leftovers + # todo reindex? so we don't have Nan leftovers # todo set date anyway? maybe just squeeze into the index?? noendo = df['error'] == NO_ENDOMONDO diff --git a/my/coding/commits.py b/my/coding/commits.py index 5b15db1..7786055 100644 --- a/my/coding/commits.py +++ b/my/coding/commits.py @@ -59,7 +59,7 @@ class Commit: committed_dt: datetime authored_dt: datetime message: str - repo: str # TODO put canonical name here straightaway?? + repo: str # TODO put canonical name here straight away?? sha: str ref: Optional[str] = None # TODO filter so they are authored by me diff --git a/my/core/__main__.py b/my/core/__main__.py index 76db469..05f5a2c 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -143,7 +143,7 @@ def config_ok() -> bool: else: info(f'import order: {paths}') - # first try doing as much as possible without actually imporing my.config + # first try doing as much as possible without actually importing my.config from .preinit import get_mycfg_dir cfg_path = get_mycfg_dir() # alternative is importing my.config and then getting cfg_path from its __file__/__path__ @@ -267,7 +267,7 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li # todo more specific command? 
error(f'{click.style("FAIL", fg="red")}: {m:<50} loading failed{vw}') # check that this is an import error in particular, not because - # of a ModuleNotFoundError because some dependency wasnt installed + # of a ModuleNotFoundError because some dependency wasn't installed if isinstance(e, (ImportError, AttributeError)): warn_my_config_import_error(e) if verbose: @@ -441,7 +441,7 @@ def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True) from .query import locate_qualified_function, QueryException from .stats import is_data_provider - # if not connected to a terminal, cant prompt + # if not connected to a terminal, can't prompt if not sys.stdout.isatty(): prompt = False @@ -471,7 +471,7 @@ def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True) else: choices = [f.__name__ for f in data_providers] if prompt is False: - # theres more than one possible data provider in this module, + # there's more than one possible data provider in this module, # STDOUT is not a TTY, can't prompt eprint("During fallback, more than one possible data provider, can't prompt since STDOUT is not a TTY") eprint("Specify one of:") @@ -576,7 +576,7 @@ def main(debug: bool) -> None: # acts as a contextmanager of sorts - any subcommand will then run # in something like /tmp/hpi_temp_dir # to avoid importing relative modules by accident during development - # maybe can be removed later if theres more test coverage/confidence that nothing + # maybe can be removed later if there's more test coverage/confidence that nothing # would happen? # use a particular directory instead of a random one, since diff --git a/my/core/common.py b/my/core/common.py index 6ad8146..7adfd7a 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -433,7 +433,7 @@ def warn_if_empty(f): QUICK_STATS = False -# incase user wants to use the stats functions/quick option +# in case user wants to use the stats functions/quick option # elsewhere -- can use this decorator instead of editing # the global state directly @contextmanager diff --git a/my/core/compat.py b/my/core/compat.py index dcf97cc..8bdb401 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -127,7 +127,7 @@ else: TypedDict = Dict -# bisect_left doesnt have a 'key' parameter (which we use) +# bisect_left doesn't have a 'key' parameter (which we use) # till python3.10 if sys.version_info[:2] <= (3, 9): from typing import List, TypeVar, Any, Optional, Callable diff --git a/my/core/denylist.py b/my/core/denylist.py index fcf3e2b..8c18e06 100644 --- a/my/core/denylist.py +++ b/my/core/denylist.py @@ -1,5 +1,5 @@ """ -A helper module for defining denylists for sources programatically +A helper module for defining denylists for sources programmatically (in lamens terms, this lets you remove some output from a module you don't want) For docs, see doc/DENYLIST.md diff --git a/my/core/discovery_pure.py b/my/core/discovery_pure.py index 5c9dbed..c88ef1c 100644 --- a/my/core/discovery_pure.py +++ b/my/core/discovery_pure.py @@ -119,7 +119,7 @@ def _extract_requirements(a: ast.Module) -> Requires: elif isinstance(c, ast.Str): deps.append(c.s) else: - raise RuntimeError(f"Expecting string contants only in {REQUIRES} declaration") + raise RuntimeError(f"Expecting string constants only in {REQUIRES} declaration") return tuple(deps) return None diff --git a/my/core/init.py b/my/core/init.py index 9e1fc4d..2e47e87 100644 --- a/my/core/init.py +++ b/my/core/init.py @@ -1,7 +1,7 @@ ''' A hook to insert user's config directory into Python's search 
path. -Ideally that would be in __init__.py (so it's executed without having to import explicityly) +Ideally that would be in __init__.py (so it's executed without having to import explicitly) But, with namespace packages, we can't have __init__.py in the parent subpackage (see http://python-notes.curiousefficiency.org/en/latest/python_concepts/import_traps.html#the-init-py-trap) diff --git a/my/core/kompress.py b/my/core/kompress.py index 0274e6c..a44b9d1 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -46,7 +46,7 @@ def _zstd_open(path: Path, *args, **kwargs) -> IO: # TODO use the 'dependent type' trick for return type? def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: # just in case, but I think this shouldn't be necessary anymore - # since when we cann .read_text, encoding is passed already + # since when we call .read_text, encoding is passed already if mode in {'r', 'rt'}: encoding = kwargs.get('encoding', 'utf8') else: diff --git a/my/core/logging.py b/my/core/logging.py index 6cfa12b..a948dd8 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -145,7 +145,7 @@ class CollapseDebugHandler(logging.StreamHandler): import os columns, _ = os.get_terminal_size(0) # ugh. the columns thing is meh. dunno I guess ultimately need curses for that - # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keyworkds (INFO/DEBUG/etc) + # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keywords (INFO/DEBUG/etc) self.stream.write(msg + ' ' * max(0, columns - len(msg)) + ('' if cur else '\n')) self.flush() except: diff --git a/my/core/pandas.py b/my/core/pandas.py index 370c119..8ccacd2 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -74,7 +74,7 @@ No 'error' column detected. You probably forgot to handle errors defensively, wh from typing import Any, Callable, TypeVar FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT]) -# TODO ugh. typing this is a mess... shoul I use mypy_extensions.VarArg/KwArgs?? or what?? +# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what?? from decorator import decorator @decorator def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT: diff --git a/my/core/query.py b/my/core/query.py index 43574d0..ed29649 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -26,7 +26,7 @@ ET = Res[T] U = TypeVar("U") # In a perfect world, the return value from a OrderFunc would just be U, # not Optional[U]. However, since this has to deal with so many edge -# cases, theres a possibility that the functions generated by +# cases, there's a possibility that the functions generated by # _generate_order_by_func can't find an attribute OrderFunc = Callable[[ET], Optional[U]] Where = Callable[[ET], bool] @@ -54,7 +54,7 @@ def locate_function(module_name: str, function_name: str) -> Callable[[], Iterab for (fname, func) in inspect.getmembers(mod, inspect.isfunction): if fname == function_name: return func - # incase the function is defined dynamically, + # in case the function is defined dynamically, # like with a globals().setdefault(...) or a module-level __getattr__ function func = getattr(mod, function_name, None) if func is not None and callable(func): @@ -244,7 +244,7 @@ def _drop_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> Iterator[ET]: # try getting the first value from the iterator -# similar to my.core.common.warn_if_empty? 
this doesnt go through the whole iterator though +# similar to my.core.common.warn_if_empty? this doesn't go through the whole iterator though def _peek_iter(itr: Iterator[ET]) -> Tuple[Optional[ET], Iterator[ET]]: itr = more_itertools.peekable(itr) try: @@ -290,7 +290,7 @@ def _handle_unsorted( return iter([]), itr -# handles creating an order_value functon, using a lookup for +# handles creating an order_value function, using a lookup for # different types. ***This consumes the iterator***, so # you should definitely itertoolts.tee it beforehand # as to not exhaust the values @@ -374,7 +374,7 @@ def select( by allowing you to provide custom predicates (functions) which can sort by a function, an attribute, dict key, or by the attributes values. - Since this supports mixed types, theres always a possibility + Since this supports mixed types, there's always a possibility of KeyErrors or AttributeErrors while trying to find some value to order by, so this provides multiple mechanisms to deal with that diff --git a/my/core/query_range.py b/my/core/query_range.py index ea625e5..179e4ea 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -220,7 +220,7 @@ def _create_range_filter( # inclusivity here? Is [after, before) currently, # items are included on the lower bound but not the # upper bound - # typically used for datetimes so doesnt have to + # typically used for datetimes so doesn't have to # be exact in that case def generated_predicate(obj: Any) -> bool: ov: Any = attr_func(obj) @@ -294,7 +294,7 @@ def select_range( # some operations to do before ordering/filtering if drop_exceptions or raise_exceptions or where is not None: - # doesnt wrap unsortable items, because we pass no order related kwargs + # doesn't wrap unsortable items, because we pass no order related kwargs itr = select(itr, where=where, drop_exceptions=drop_exceptions, raise_exceptions=raise_exceptions) order_by_chosen: Optional[OrderFunc] = None @@ -356,7 +356,7 @@ Specify a type or a key to order the value by""") # # this select is also run if the user didn't specify anything to # order by, and is just returning the data in the same order as - # as the srouce iterable + # as the source iterable # i.e. 
none of the range-related filtering code ran, this is just a select itr = select(itr, order_by=order_by_chosen, @@ -483,7 +483,7 @@ def test_parse_range() -> None: assert res2 == RangeTuple(after=start_date.timestamp(), before=end_date.timestamp(), within=None) - # cant specify all three + # can't specify all three with pytest.raises(QueryException, match=r"Cannot specify 'after', 'before' and 'within'"): dt_parse_range(unparsed_range=RangeTuple(str(start_date), str(end_date.timestamp()), "7d")) diff --git a/my/core/serialize.py b/my/core/serialize.py index c0cbae9..ca68fef 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -96,7 +96,7 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: # is rust-based and compiling on rarer architectures may not work # out of the box # - # unlike the builtin JSON modue which serializes NamedTuples as lists + # unlike the builtin JSON module which serializes NamedTuples as lists # (even if you provide a default function), simplejson correctly # serializes namedtuples to dictionaries @@ -157,7 +157,7 @@ def dumps( def test_serialize_fallback() -> None: import json as jsn # dont cause possible conflicts with module code - # cant use a namedtuple here, since the default json.dump serializer + # can't use a namedtuple here, since the default json.dump serializer # serializes namedtuples as tuples, which become arrays # just test with an array of mixed objects X = [5, datetime.timedelta(seconds=5.0)] @@ -216,7 +216,7 @@ def test_default_serializer() -> None: def _serialize_with_default(o: Any) -> Any: if isinstance(o, Unserializable): return {"x": o.x, "y": o.y} - raise TypeError("Couldnt serialize") + raise TypeError("Couldn't serialize") # this serializes both Unserializable, which is a custom type otherwise # not handled, and timedelta, which is handled by the '_default_encode' diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 7c02940..80dbc3f 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -94,7 +94,7 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: # NOTE hmm, so this kinda works # V = TypeVar('V', bound=Tuple[Any, ...]) -# def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]: +# def select(cols: V, rest: str, *, db: sqlite3.Connection) -> Iterator[V]: # but sadly when we pass columns (Tuple[str, ...]), it seems to bind this type to V? # and then the return type ends up as Iterator[Tuple[str, ...]], which isn't desirable :( # a bit annoying to have this copy-pasting, but hopefully not a big issue diff --git a/my/core/stats.py b/my/core/stats.py index ba32be7..8923996 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -35,7 +35,7 @@ def is_data_provider(fun: Any) -> bool: 1. returns iterable or something like that 2. takes no arguments? (otherwise not callable by stats anyway?) 3. doesn't start with an underscore (those are probably helper functions?) - 4. functions isnt the 'inputs' function (or ends with '_inputs') + 4. functions isn't the 'inputs' function (or ends with '_inputs') """ # todo maybe for 2 allow default arguments? 
not sure # one example which could benefit is my.pdfs diff --git a/my/core/util.py b/my/core/util.py index 64bf6fe..f12b578 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -246,7 +246,7 @@ def stats(): sys.path = orig_path # shouldn't crash at least assert res is None # good as far as discovery is concerned - assert xx.read_text() == 'some precious data' # make sure module wasn't evauluated + assert xx.read_text() == 'some precious data' # make sure module wasn't evaluated ### tests end diff --git a/my/demo.py b/my/demo.py index 3a9d1b3..1023795 100644 --- a/my/demo.py +++ b/my/demo.py @@ -46,7 +46,7 @@ from .core import Json, get_files @dataclass class Item: ''' - Some completely arbirary artificial stuff, just for testing + Some completely arbitrary artificial stuff, just for testing ''' username: str raw: Json diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index a081416..0a1eb73 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -38,7 +38,7 @@ def datas() -> Iterable[Res[Emfit]]: import dataclasses # data from emfit is coming in UTC. There is no way (I think?) to know the 'real' timezone, and local times matter more for sleep analysis - # TODO actully this is wrong?? check this.. + # TODO actually this is wrong?? check this.. emfit_tz = config.timezone for x in dal.sleeps(config.export_path): diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index 616a6af..38551b4 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -177,7 +177,7 @@ def messages() -> Iterator[Res[Message]]: reply_to_id = x.reply_to_id # hmm, reply_to be missing due to the synthetic nature of export, so have to be defensive reply_to = None if reply_to_id is None else msgs.get(reply_to_id) - # also would be interesting to merge together entities rather than resuling messages from different sources.. + # also would be interesting to merge together entities rather than resulting messages from different sources.. # then the merging thing could be moved to common? try: sender = senders[x.sender_id] diff --git a/my/github/ghexport.py b/my/github/ghexport.py index c9ba7ea..67042fc 100644 --- a/my/github/ghexport.py +++ b/my/github/ghexport.py @@ -128,7 +128,7 @@ def _get_summary(e) -> Tuple[str, Optional[Link], Optional[EventId], Optional[Bo rt = pl['ref_type'] ref = pl['ref'] if what == 'created': - # FIXME should handle delection?... + # FIXME should handle deletion?... eid = EventIds.repo_created(dts=dts, name=rname, ref_type=rt, ref=ref) mref = '' if ref is None else ' ' + ref # todo link to branch? only contains weird API link though diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py index 462cbc0..aac0b1a 100644 --- a/my/hackernews/dogsheep.py +++ b/my/hackernews/dogsheep.py @@ -58,7 +58,7 @@ def items() -> Iterator[Res[Item]]: type=r['type'], created=datetime.fromtimestamp(r['time']), title=r['title'], - # todo hmm maybe a method to stip off html tags would be nice + # todo hmm maybe a method to strip off html tags would be nice text_html=r['text'], url=r['url'], ) diff --git a/my/instagram/android.py b/my/instagram/android.py index 8e62363..48e8021 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -71,7 +71,7 @@ class _Message(_BaseMessage): @dataclass(unsafe_hash=True) class Message(_BaseMessage): user: User - # TODO could also extract Thread objec? not sure if useful + # TODO could also extract Thread object? 
not sure if useful
     # reply_to: Optional[Message]
diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py
index 89f104a..9f53abe 100644
--- a/my/jawbone/__init__.py
+++ b/my/jawbone/__init__.py
@@ -242,7 +242,7 @@ def plot_one(sleep: SleepEntry, fig: Figure, axes: Axes, xlims=None, showtext=Tr
 
 def predicate(sleep: SleepEntry):
     """
-    Filter for comparing similar sleep sesssions
+    Filter for comparing similar sleep sessions
     """
     start = sleep.created.time()
     end = sleep.completed.time()
diff --git a/my/location/fallback/common.py b/my/location/fallback/common.py
index fa1d4c5..fd508c6 100644
--- a/my/location/fallback/common.py
+++ b/my/location/fallback/common.py
@@ -64,7 +64,7 @@ class FallbackLocation(LocationProtocol):
         )
 
 
-# a location estimator can return multiple fallbacks, incase there are
+# a location estimator can return multiple fallbacks, in case there are
 # differing accuracies/to allow for possible matches to be computed
 # iteratively
 LocationEstimator = Callable[[DateExact], Iterator[FallbackLocation]]
diff --git a/my/location/fallback/via_ip.py b/my/location/fallback/via_ip.py
index 1da2315..303074f 100644
--- a/my/location/fallback/via_ip.py
+++ b/my/location/fallback/via_ip.py
@@ -50,7 +50,7 @@ def fallback_locations() -> Iterator[FallbackLocation]:
     )
 
 
-# for compatibility with my.location.via_ip, this shouldnt be used by other modules
+# for compatibility with my.location.via_ip, this shouldn't be used by other modules
 def locations() -> Iterator[Location]:
     medium("locations is deprecated, should use fallback_locations or estimate_location")
     yield from map(FallbackLocation.to_location, fallback_locations())
diff --git a/my/location/google.py b/my/location/google.py
index 21ba3ed..fdddd92 100644
--- a/my/location/google.py
+++ b/my/location/google.py
@@ -82,7 +82,7 @@ def _iter_via_grep(fo) -> Iterable[TsLatLon]:
 
 
 # todo could also use pool? not sure if that would really be faster...
-# earch thread could process 100K at once?
+# each thread could process 100K at once?
 # would need to find out a way to know when to stop? process in some sort of sqrt progression??
diff --git a/my/pdfs.py b/my/pdfs.py
index 1314f0e..5355d8a 100644
--- a/my/pdfs.py
+++ b/my/pdfs.py
@@ -79,7 +79,7 @@ class Annotation(NamedTuple):
 def _as_annotation(*, raw: pdfannots.Annotation, path: str) -> Annotation:
     d = vars(raw)
     pos = raw.pos
-    # make mypy happy (pos alwasy present for Annotation https://github.com/0xabu/pdfannots/blob/dbdfefa158971e1746fae2da139918e9f59439ea/pdfannots/types.py#L302)
+    # make mypy happy (pos always present for Annotation https://github.com/0xabu/pdfannots/blob/dbdfefa158971e1746fae2da139918e9f59439ea/pdfannots/types.py#L302)
     assert pos is not None
     d['page'] = pos.page.pageno
     return Annotation(
diff --git a/my/photos/main.py b/my/photos/main.py
index 69e5a46..c491ac1 100644
--- a/my/photos/main.py
+++ b/my/photos/main.py
@@ -43,7 +43,7 @@ class Photo(NamedTuple):
         if self.path.startswith(bp):
             return self.path[len(bp):]
         else:
-            raise RuntimeError(f'Weird path {self.path}, cant match against anything')
+            raise RuntimeError(f"Weird path {self.path}, can't match against anything")
 
     @property
     def name(self) -> str:
diff --git a/my/photos/utils.py b/my/photos/utils.py
index 15d7659..8c16dc5 100644
--- a/my/photos/utils.py
+++ b/my/photos/utils.py
@@ -48,7 +48,7 @@ def _get_exif_data(image) -> Exif:
 
 def to_degree(value) -> float:
     """Helper function to convert the GPS coordinates
-    stored in the EXIF to degress in float format"""
+    stored in the EXIF to degrees in float format"""
     (d, m, s) = value
     return d + (m / 60.0) + (s / 3600.0)
 
@@ -65,7 +65,7 @@ from datetime import datetime
 from typing import Optional
 
 # TODO surely there is a library that does it??
-# TODO this belogs to a private overlay or something
+# TODO this belongs to a private overlay or something
 # basically have a function that patches up dates after the files were yielded..
 _DT_REGEX = re.compile(r'\D(\d{8})\D*(\d{6})\D')
 def dt_from_path(p: Path) -> Optional[datetime]:
diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py
index 0924e55..a8ce651 100644
--- a/my/reddit/rexport.py
+++ b/my/reddit/rexport.py
@@ -197,7 +197,7 @@ def _get_events(backups: Sequence[Path], parallel: bool=True) -> Iterator[Event]
                 # eh. I guess just take max and it will always be correct?
                 assert not first
                 yield Event(
-                    dt=bdt, # TODO average wit ps.save_dt?
+                    dt=bdt, # TODO average with ps.save_dt?
                     text="unfavorited",
                     kind=ps,
                     eid=f'unf-{ps.sid}',
diff --git a/my/taplog.py b/my/taplog.py
index 6353c14..51eeb72 100644
--- a/my/taplog.py
+++ b/my/taplog.py
@@ -39,7 +39,7 @@ class Entry(NamedTuple):
     def timestamp(self) -> datetime:
         ts = self.row['timestamp']
         # already with timezone apparently
-        # TODO not sure if should stil localize though? it only kept tz offset, not real tz
+        # TODO not sure if should still localize though? it only kept tz offset, not real tz
         return datetime.fromisoformat(ts)
 
     # TODO also has gps info!
diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index e111a4a..7716be0 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -35,7 +35,7 @@ class config(user_config): fast: bool = True # sort locations by date - # incase multiple sources provide them out of order + # in case multiple sources provide them out of order sort_locations: bool = True # if the accuracy for the location is more than 5km, don't use @@ -94,7 +94,7 @@ def _locations() -> Iterator[Tuple[LatLon, datetime]]: except Exception as e: from my.core.warnings import high - logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implemetation", exc_info=e) + logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e) high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data") import my.location.google @@ -134,7 +134,7 @@ def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> I def _iter_local_dates() -> Iterator[DayWithZone]: finder = _timezone_finder(fast=config.fast) # rely on the default #pdt = None - # TODO: warnings doesnt actually warn? + # TODO: warnings doesn't actually warn? # warnings = [] locs: Iterable[Tuple[LatLon, datetime]] diff --git a/my/tinder/android.py b/my/tinder/android.py index 18b59d8..a820947 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -102,7 +102,7 @@ def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]: try: yield _parse_person(row) except Exception as e: - # todo attach error contex? + # todo attach error context? yield e for row in db.execute('SELECT * FROM match'): diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index 3d284b6..a3a2dda 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -68,7 +68,7 @@ def watched() -> Iterable[Res[Watched]]: continue if title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'): - # todo might be interesting to process somwhere? + # todo might be interesting to process somewhere? continue # all titles contain it, so pointless to include 'Watched ' diff --git a/tests/bluemaestro.py b/tests/bluemaestro.py index 283bd77..c932d73 100644 --- a/tests/bluemaestro.py +++ b/tests/bluemaestro.py @@ -32,7 +32,7 @@ def test() -> None: assert len(tp) == 1 # should be unique - # 2.5 K + 4 K datapoints, somwhat overlapping + # 2.5 K + 4 K datapoints, somewhat overlapping assert len(res2020) < 6000 diff --git a/tests/config.py b/tests/config.py index 49138c3..cef3787 100644 --- a/tests/config.py +++ b/tests/config.py @@ -8,7 +8,7 @@ def test_dynamic_configuration(notes: Path) -> None: from my.core.cfg import tmp_config with tmp_config() as C: C.orgmode = NS(paths=[notes]) - # TODO ugh. this belongs to tz provider or global config or someting + # TODO ugh. 
this belongs to tz provider or global config or something C.weight = NS(default_timezone=pytz.timezone('Europe/London')) from my.body.weight import from_orgmode diff --git a/tests/core/test_denylist.py b/tests/core/test_denylist.py index d6f4c49..4e55a1f 100644 --- a/tests/core/test_denylist.py +++ b/tests/core/test_denylist.py @@ -72,7 +72,7 @@ def test_denylist(tmp_path: Path) -> None: d.deny(key="dt", value=datetime(2020, 2, 1)) # test internal behavior, _deny_raw_list should have been updated, - # but _deny_map doesnt get updated by a call to .deny + # but _deny_map doesn't get updated by a call to .deny # # if we change this just update the test, is just here to ensure # this is the behaviour diff --git a/tests/core/test_kompress.py b/tests/core/test_kompress.py index 97539cb..0e7d71b 100644 --- a/tests/core/test_kompress.py +++ b/tests/core/test_kompress.py @@ -98,7 +98,7 @@ def test_zippath() -> None: ], rpaths - # TODO hmm this doesn't work atm, wheras Path does + # TODO hmm this doesn't work atm, whereas Path does # not sure if it should be defensive or something... # ZipPath('doesnotexist') # same for this one diff --git a/tests/demo.py b/tests/demo.py index 436bc63..6ac937c 100644 --- a/tests/demo.py +++ b/tests/demo.py @@ -19,7 +19,7 @@ def test_dynamic_config_1(tmp_path: Path) -> None: assert item1.username == 'user' -# exactly the same test, but using a different config, to test out the behavious w.r.t. import order +# exactly the same test, but using a different config, to test out the behaviour w.r.t. import order def test_dynamic_config_2(tmp_path: Path) -> None: # doesn't work without it! # because the config from test_dybamic_config_1 is cached in my.demo.demo diff --git a/tests/extra/polar.py b/tests/extra/polar.py index 0fddcf3..1091f2a 100644 --- a/tests/extra/polar.py +++ b/tests/extra/polar.py @@ -38,7 +38,7 @@ PARAMS = [ def prepare(request): dotpolar = request.param class user_config: - if dotpolar != '': # defaul + if dotpolar != '': # default polar_dir = Path(ROOT / dotpolar) defensive = False diff --git a/tests/pdfs.py b/tests/pdfs.py index ae6318d..343a209 100644 --- a/tests/pdfs.py +++ b/tests/pdfs.py @@ -8,7 +8,7 @@ from .common import testdata def test_module(with_config) -> None: - # TODO crap. if module is imported too early (on the top level, it makes it super hard to overrride config) + # TODO crap. if module is imported too early (on the top level, it makes it super hard to override config) # need to at least detect it... 
from my.pdfs import annotations, annotated_pdfs diff --git a/tests/tz.py b/tests/tz.py index 8f80800..f2498a2 100644 --- a/tests/tz.py +++ b/tests/tz.py @@ -52,7 +52,7 @@ def test_tz() -> None: tz = LTZ._get_tz(datetime.min) assert tz is not None else: - # seems this fails because windows doesnt support same date ranges + # seems this fails because windows doesn't support same date ranges # https://stackoverflow.com/a/41400321/ with pytest.raises(OSError): LTZ._get_tz(datetime.min) From 74710b339ada763d950c2ea6b6c10dbdc3305706 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 27 Mar 2023 02:03:48 +0100 Subject: [PATCH 102/302] telegram_backup: order messages by date and users/chats by id for determinism --- my/telegram/telegram_backup.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/my/telegram/telegram_backup.py b/my/telegram/telegram_backup.py index 3e2d6a7..ad7a216 100644 --- a/my/telegram/telegram_backup.py +++ b/my/telegram/telegram_backup.py @@ -62,6 +62,8 @@ class Message: Chats = Dict[str, Chat] def _message_from_row(r: sqlite3.Row, *, chats: Chats) -> Message: ts = r['time'] + # desktop export uses UTC (checked by exporting in winter time vs summer time) + # and telegram_backup timestamps seem same as in desktop export time = datetime.fromtimestamp(ts, tz=timezone.utc) chat = chats[r['source_id']] sender = chats[r['sender_id']] @@ -78,12 +80,12 @@ def messages() -> Iterator[Message]: with sqlite_connection(config.export_path, immutable=True, row_factory='row') as db: chats: Chats = {} - for r in db.execute('SELECT * FROM chats'): + for r in db.execute('SELECT * FROM chats ORDER BY id'): chat = Chat(id=r['id'], name=r['name'], handle=None) assert chat.id not in chats chats[chat.id] = chat - for r in db.execute('SELECT * FROM users'): + for r in db.execute('SELECT * FROM users ORDER BY id'): first = r["first_name"] last = r["last_name"] name: Optional[str] @@ -96,8 +98,7 @@ def messages() -> Iterator[Message]: assert chat.id not in chats chats[chat.id] = chat - # TODO order by? 
not sure - for r in db.execute('SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message")'): + for r in db.execute('SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message") ORDER BY time'): # seems like the only remaining have message_type = 'message' yield _message_from_row(r, chats=chats) From 8288032b1c185bda2ddae6b3a956e87d43314604 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 27 Mar 2023 03:13:01 +0100 Subject: [PATCH 103/302] my.telegram.telegram_backup: support optional extra_where and optional media info extraction for Promnesia --- my/telegram/telegram_backup.py | 90 ++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 5 deletions(-) diff --git a/my/telegram/telegram_backup.py b/my/telegram/telegram_backup.py index ad7a216..0617501 100644 --- a/my/telegram/telegram_backup.py +++ b/my/telegram/telegram_backup.py @@ -4,6 +4,7 @@ Telegram data via [fabianonline/telegram_backup](https://github.com/fabianonline from dataclasses import dataclass from datetime import datetime, timezone +from struct import unpack_from, calcsize import sqlite3 from typing import Dict, Iterator, Optional @@ -43,6 +44,7 @@ class Message: chat: Chat sender: User text: str + extra_media_info: Optional[str] = None @property def permalink(self) -> str: @@ -60,25 +62,41 @@ class Message: Chats = Dict[str, Chat] -def _message_from_row(r: sqlite3.Row, *, chats: Chats) -> Message: +def _message_from_row(r: sqlite3.Row, *, chats: Chats, with_extra_media_info: bool) -> Message: ts = r['time'] # desktop export uses UTC (checked by exporting in winter time vs summer time) # and telegram_backup timestamps seem same as in desktop export time = datetime.fromtimestamp(ts, tz=timezone.utc) chat = chats[r['source_id']] sender = chats[r['sender_id']] + + extra_media_info: Optional[str] = None + if with_extra_media_info and r['has_media'] == 1: + # also it's quite hacky, so at least for now it's just an optional attribute behind the flag + # defensive because it's a bit tricky to correctly parse without a proper api parser.. 
+
+        # maybe later we'll improve it
+        try:
+            extra_media_info = _extract_extra_media_info(data=r['data'])
+        except Exception as e:
+            pass
+
     return Message(
         id=r['message_id'],
         time=time,
         chat=chat,
         sender=User(id=sender.id, name=sender.name),
         text=r['text'],
+        extra_media_info=extra_media_info,
     )
 
 
-def messages() -> Iterator[Message]:
-    with sqlite_connection(config.export_path, immutable=True, row_factory='row') as db:
+def messages(*, extra_where: Optional[str]=None, with_extra_media_info: bool=False) -> Iterator[Message]:
+    messages_query = 'SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message")'
+    if extra_where is not None:
+        messages_query += ' AND ' + extra_where
+    messages_query += ' ORDER BY time'
+
+    with sqlite_connection(config.export_path, immutable=True, row_factory='row') as db:
         chats: Chats = {}
 
         for r in db.execute('SELECT * FROM chats ORDER BY id'):
             chat = Chat(id=r['id'], name=r['name'], handle=None)
             assert chat.id not in chats
             chats[chat.id] = chat
@@ -98,7 +116,69 @@ def messages() -> Iterator[Message]:
             assert chat.id not in chats
             chats[chat.id] = chat
 
-        for r in db.execute('SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message") ORDER BY time'):
+        for r in db.execute(messages_query):
             # seems like the only remaining have message_type = 'message'
-            yield _message_from_row(r, chats=chats)
+            yield _message_from_row(r, chats=chats, with_extra_media_info=with_extra_media_info)
+
+
+def _extract_extra_media_info(data: bytes) -> Optional[str]:
+    # ugh... very hacky, but it does manage to extract from 90% of messages that have media
+    pos = 0
+
+    def skip(count: int) -> None:
+        nonlocal pos
+        pos += count
+
+    def getstring() -> str:
+        # jesus
+        # https://core.telegram.org/type/string
+        if data[pos] == 254:
+            skip(1)
+            (sz1, sz2, sz3) = unpack_from('BBB', data, offset=pos)
+            skip(3)
+            sz = 256 ** 2 * sz3 + 256 * sz2 + sz1
+            short = 0
+        else:
+            (sz, ) = unpack_from('B', data, offset=pos)
+            skip(1)
+            short = 1
+        assert sz > 0, sz
+
+        padding = 0 if (sz + short) % 4 == 0 else 4 - (sz + short) % 4
+
+        (ss,) = unpack_from(f'{sz}s{padding}x', data, offset=pos)
+        skip(sz + padding)
+        try:
+            return ss.decode('utf8')
+        except UnicodeDecodeError as e:
+            raise RuntimeError(f'Failed to decode {ss}') from e
+
+    def debug(count: int=10) -> None:
+        print([hex(x) for x in data[pos: pos + count]])
+        print([chr(x) for x in data[pos: pos + count]])
+
+    header = 'H2xII8xI'
+    (flags, mid, src, ts) = unpack_from(header, data, offset=pos)
+    pos += calcsize(header)
+
+    # see https://core.telegram.org/constructor/message
+    has_media = (flags >> 9) & 1
+    if has_media == 0:
+        return None
+
+    msg_body = getstring()
+    skip(20)
+    url1 = getstring()
+    url2 = getstring()
+    ss_type = getstring()
+    # not sure if assert is really necessary here
+    # assert ss_type in {
+    #     'article',
+    #     'photo',
+    #     'app',
+    #     'video',
+    # }, ss_type
+    link_title = getstring()
+    link_title_2 = getstring()
+    link_description = getstring()
+    return link_description

From 0c5b2b4a09c55fcac8b0740d076ebcb1678999f8 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Sat, 1 Apr 2023 03:40:48 +0100
Subject: [PATCH 104/302] my.whatsapp.android: initial module

---
 my/config.py           |   6 +-
 my/whatsapp/android.py | 164 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+), 2 deletions(-)
 create mode 100644 my/whatsapp/android.py

diff --git a/my/config.py b/my/config.py
index 5102b6e..8d958f1 100644
--- a/my/config.py
+++ b/my/config.py
@@ -261,5 +261,7 @@ class roamresearch:
     username: str
 
 
-
-
+class whatsapp:
+    class android:
+        export_path: Paths
+        my_user_id: Optional[str]
diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py
new file mode 100644
index 0000000..fbccbf5
--- /dev/null
+++ b/my/whatsapp/android.py
@@ -0,0 +1,164 @@
+"""
+Whatsapp data from Android app database (in =/data/data/com.whatsapp/databases/msgstore.db=)
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+import sqlite3
+from typing import Sequence, Iterator, Optional
+
+from more_itertools import unique_everseen
+
+from my.core import get_files, Paths, datetime_aware, Res, LazyLogger, make_config
+from my.core.error import echain, notnone
+from my.core.sqlite import sqlite_connection
+
+
+from my.config import whatsapp as user_config
+
+
+logger = LazyLogger(__name__)
+
+
+@dataclass
+class Config(user_config.android):
+    # path[s]/glob to the exported sqlite databases
+    export_path: Paths
+    my_user_id: Optional[str] = None
+
+
+config = make_config(Config)
+
+
+def inputs() -> Sequence[Path]:
+    return get_files(config.export_path)
+
+
+@dataclass(unsafe_hash=True)
+class Chat:
+    id: str
+    # todo not sure how to support renames?
+    # could change Chat object itself, but this won't work well with incremental processing..
+    name: Optional[str]
+
+
+@dataclass(unsafe_hash=True)
+class Sender:
+    id: str
+    name: Optional[str]
+
+
+@dataclass(unsafe_hash=True)
+class Message:
+    chat: Chat
+    id: str
+    dt: datetime_aware
+    sender: Sender
+    text: Optional[str]
+
+
+def _process_db(db: sqlite3.Connection):
+    # TODO later, split out Chat/Sender objects separately to save on object creation, similar to other android data sources
+
+    chats = {}
+    for r in db.execute('''
+    SELECT raw_string_jid AS chat_id, subject
+    FROM chat_view
+    WHERE chat_id IS NOT NULL /* seems that it might be null for chats that are 'recycled' (the db is more like an LRU cache) */
+    '''):
+        chat_id = r['chat_id']
+        subject = r['subject']
+        chat = Chat(
+            id=chat_id,
+            name=subject,
+        )
+        chats[chat.id] = chat
+
+
+    senders = {}
+    for r in db.execute('''
+    SELECT _id, raw_string
+    FROM jid
+    '''):
+        # TODO seems that msgstore.db doesn't have contact names
+        # perhaps should extract from wa.db and match against wa_contacts.jid?
+        s = Sender(
+            id=r['raw_string'],
+            name=None,
+        )
+        senders[r['_id']] = s
+
+
+    # todo message_type? mostly 0, but seems all over, even for seemingly normal messages with text
+    for r in db.execute('''
+    SELECT C.raw_string_jid AS chat_id, M.key_id, M.timestamp, sender_jid_row_id, M.from_me, M.text_data, MM.file_path
+    FROM message AS M
+    LEFT JOIN chat_view AS C
+    ON M.chat_row_id = C._id
+    LEFT JOIN message_media AS MM
+    ON M._id = MM.message_row_id
+    WHERE M.key_id != -1 /* key_id -1 is some sort of fake message where everything is null */
+    ORDER BY M.timestamp
+    '''):
+        msg_id: str = notnone(r['key_id'])
+        ts: int = notnone(r['timestamp'])
+        dt = datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
+
+        text: Optional[str] = r['text_data']
+        media_file_path: Optional[str] = r['file_path']
+
+        if media_file_path is not None:
+            mm = f'MEDIA: {media_file_path}'
+            if text is None:
+                text = mm
+            else:
+                text = text + '\n' + mm
+
+        from_me = r['from_me'] == 1
+
+        chat_id = r['chat_id']
+        if chat_id is None:
+            # ugh, I think these might have been edited messages? unclear..
+            logger.warning(f"CHAT ID IS NONE, WTF?? {dt} {ts} {text}")
{dt} {ts} {text}") + continue + chat = chats[chat_id] + + sender_row_id = r['sender_jid_row_id'] + if sender_row_id == 0: + # seems that it's always 0 for 1-1 chats + # for group chats our onw id is still 0, but other ids are properly set + if from_me: + myself_user_id = config.my_user_id or 'MYSELF_USER_ID' + sender = Sender(id=myself_user_id, name=None) + else: + sender = Sender(id=chat.id, name=None) + else: + sender = senders[sender_row_id] + + + + m = Message( + chat=chat, + id=msg_id, + dt=dt, + sender=sender, + text=text + ) + yield m + + +def _messages() -> Iterator[Res[Message]]: + dbs = inputs() + for i, f in enumerate(dbs): + logger.debug(f'processing {f} {i}/{len(dbs)}') + with sqlite_connection(f, immutable=True, row_factory='row') as db: + try: + yield from _process_db(db) + except Exception as e: + yield echain(RuntimeError(f'While processing {f}'), cause=e) + + +def messages() -> Iterator[Res[Message]]: + yield from unique_everseen(_messages()) From d464b1e607d24aba247fe961293a534ed29e4648 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 3 Apr 2023 22:30:58 +0100 Subject: [PATCH 105/302] core: implement more methods for ZipPath and better support for get_files --- my/core/common.py | 13 +++++++++++-- my/core/kompress.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index 7adfd7a..090c564 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -161,6 +161,11 @@ from .logging import setup_logger, LazyLogger Paths = Union[Sequence[PathIsh], PathIsh] +def _is_zippath(p: Path) -> bool: + # weak type check here, don't want to depend on .kompress module in get_files + return type(p).__name__ == 'ZipPath' + + DEFAULT_GLOB = '*' def get_files( pp: Paths, @@ -183,7 +188,7 @@ def get_files( return () # early return to prevent warnings etc sources = [Path(pp)] else: - sources = [Path(p) for p in pp] + sources = [p if isinstance(p, Path) else Path(p) for p in pp] def caller() -> str: import traceback @@ -192,6 +197,10 @@ def get_files( paths: List[Path] = [] for src in sources: + if _is_zippath(src): + paths.append(src) + continue + if src.parts[0] == '~': src = src.expanduser() # note: glob handled first, because e.g. 
on Windows asterisk makes is_dir unhappy @@ -226,7 +235,7 @@ def get_files( if guess_compression: from .kompress import CPath, is_compressed - paths = [CPath(p) if is_compressed(p) else p for p in paths] + paths = [CPath(p) if is_compressed(p) and not _is_zippath(p) else p for p in paths] return tuple(paths) diff --git a/my/core/kompress.py b/my/core/kompress.py index a44b9d1..5ba32d3 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -3,6 +3,7 @@ Various helpers for compression """ from __future__ import annotations +from functools import total_ordering from datetime import datetime import pathlib from pathlib import Path @@ -155,6 +156,7 @@ else: zipfile_Path = object +@total_ordering class ZipPath(zipfile_Path): # NOTE: is_dir/is_file might not behave as expected, the base class checks it only based on the slash in path @@ -175,6 +177,9 @@ class ZipPath(zipfile_Path): def absolute(self) -> ZipPath: return ZipPath(self.filepath.absolute(), self.at) + def expanduser(self) -> ZipPath: + return ZipPath(self.filepath.expanduser(), self.at) + def exists(self) -> bool: if self.at == '': # special case, the base class returns False in this case for some reason @@ -224,6 +229,11 @@ class ZipPath(zipfile_Path): return False return (self.filepath, self.subpath) == (other.filepath, other.subpath) + def __lt__(self, other) -> bool: + if not isinstance(other, ZipPath): + return False + return (self.filepath, self.subpath) < (other.filepath, other.subpath) + def __hash__(self) -> int: return hash((self.filepath, self.subpath)) From 02c738594f2cae36ca4fab43cf9533fe6aa89396 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Tue, 11 Apr 2023 09:27:17 -0700 Subject: [PATCH 106/302] smscalls: make some fields optional, yield errors reflects the new types-lxml package https://github.com/abelcheung/types-lxml --- my/smscalls.py | 50 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/my/smscalls.py b/my/smscalls.py index 25acf4b..08a5c57 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -18,45 +18,54 @@ config = make_config(smscalls) from datetime import datetime, timezone from pathlib import Path -from typing import NamedTuple, Iterator, Set, Tuple +from typing import NamedTuple, Iterator, Set, Tuple, Optional from lxml import etree # type: ignore -from .core.common import get_files, Stats +from my.core.common import get_files, Stats +from my.core.error import Res class Call(NamedTuple): dt: datetime - dt_readable: str - duration_s: int - who: str + dt_readable: Optional[str] + duration_s: Optional[int] + who: Optional[str] @property def summary(self) -> str: return f"talked with {self.who} for {self.duration_s} secs" -def _extract_calls(path: Path) -> Iterator[Call]: +def _extract_calls(path: Path) -> Iterator[Res[Call]]: tr = etree.parse(str(path)) for cxml in tr.findall('call'): + date_str = cxml.get('date') + if date_str is None: + yield RuntimeError(f"no date in {etree.tostring(cxml).decode('utf-8')}") + continue + duration = cxml.get('duration') # TODO we've got local tz here, not sure if useful.. # ok, so readable date is local datetime, changing throughout the backup yield Call( - dt=_parse_dt_ms(cxml.get('date')), + dt=_parse_dt_ms(date_str), dt_readable=cxml.get('readable_date'), - duration_s=int(cxml.get('duration')), + duration_s=int(duration) if duration is not None else None, who=cxml.get('contact_name') # TODO number if contact is unavail?? # TODO type? 
must be missing/outgoing/incoming ) -def calls() -> Iterator[Call]: +def calls() -> Iterator[Res[Call]]: files = get_files(config.export_path, glob='calls-*.xml') # TODO always replacing with the latter is good, we get better contact names?? emitted: Set[datetime] = set() for p in files: for c in _extract_calls(p): + if isinstance(c, Exception): + yield c + continue if c.dt in emitted: continue emitted.add(c.dt) @@ -65,19 +74,22 @@ def calls() -> Iterator[Call]: class Message(NamedTuple): dt: datetime - dt_readable: str - who: str - message: str - phone_number: str + dt_readable: Optional[str] + who: Optional[str] + message: Optional[str] + phone_number: Optional[str] from_me: bool -def messages() -> Iterator[Message]: +def messages() -> Iterator[Res[Message]]: files = get_files(config.export_path, glob='sms-*.xml') - emitted: Set[Tuple[datetime, str, bool]] = set() + emitted: Set[Tuple[datetime, Optional[str], Optional[bool]]] = set() for p in files: for c in _extract_messages(p): + if isinstance(c, Exception): + yield c + continue key = (c.dt, c.who, c.from_me) if key in emitted: continue @@ -85,11 +97,15 @@ def messages() -> Iterator[Message]: yield c -def _extract_messages(path: Path) -> Iterator[Message]: +def _extract_messages(path: Path) -> Iterator[Res[Message]]: tr = etree.parse(str(path)) for mxml in tr.findall('sms'): + date_str = mxml.get('date') + if date_str is None: + yield RuntimeError(f"no date in {etree.tostring(mxml).decode('utf-8')}") + continue yield Message( - dt=_parse_dt_ms(mxml.get('date')), + dt=_parse_dt_ms(date_str), dt_readable=mxml.get('readable_date'), who=mxml.get('contact_name'), message=mxml.get('body'), From 40de162fab741df594b4d9651348ee46ee021e9b Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Fri, 14 Apr 2023 16:31:11 -0700 Subject: [PATCH 107/302] cli: add option to output locations to gpx files (#286) * cli: add option to output locations to gpx files --- my/core/__main__.py | 10 ++++++++- my/location/common.py | 47 ++++++++++++++++++++++++++++++++++++++++++- tox.ini | 1 + 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/my/core/__main__.py b/my/core/__main__.py index 05f5a2c..dce646a 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -542,6 +542,14 @@ def query_hpi_functions( pprint(item) else: pprint(list(res)) + elif output == 'gpx': + from my.location.common import locations_to_gpx + + # can ignore the mypy warning here, locations_to_gpx yields any errors + # if you didnt pass it something that matches the LocationProtocol + for exc in locations_to_gpx(res, sys.stdout): # type: ignore[arg-type] + click.echo(str(exc), err=True) + sys.stdout.flush() else: res = list(res) # type: ignore[assignment] # output == 'repl' @@ -681,7 +689,7 @@ def module_install_cmd(user: bool, parallel: bool, modules: Sequence[str]) -> No @click.option('-o', '--output', default='json', - type=click.Choice(['json', 'pprint', 'repl']), + type=click.Choice(['json', 'pprint', 'repl', 'gpx']), help='what to do with the result [default: json]') @click.option('-s', '--stream', diff --git a/my/location/common.py b/my/location/common.py index fa8bdad..5c03d5e 100644 --- a/my/location/common.py +++ b/my/location/common.py @@ -1,5 +1,5 @@ from datetime import date, datetime -from typing import Union, Tuple, Optional +from typing import Union, Tuple, Optional, Iterable, TextIO, Iterator from dataclasses import dataclass from my.core import __NOT_HPI_MODULE__ @@ -32,3 +32,48 @@ class Location(LocationProtocol): accuracy: Optional[float] elevation: 
Optional[float]
     datasource: Optional[str] = None  # which module provided this, useful for debugging
+
+
+def locations_to_gpx(locations: Iterable[LocationProtocol], buffer: TextIO) -> Iterator[Exception]:
+    """
+    Convert locations to a GPX file, printing to a buffer (an open file, io.StringIO, sys.stdout, etc)
+    """
+
+    try:
+        import gpxpy.gpx
+    except ImportError as ie:
+        from my.core.warnings import warn
+
+        warn("gpxpy not installed, cannot write to gpx. 'pip install gpxpy'")
+        raise ie
+
+    gpx = gpxpy.gpx.GPX()
+
+    # hmm -- would it be useful to allow the user to split this into tracks?, perhaps by date?
+
+    # Create first track in our GPX:
+    gpx_track = gpxpy.gpx.GPXTrack()
+    gpx.tracks.append(gpx_track)
+
+    # Create first segment in our GPX track:
+    gpx_segment = gpxpy.gpx.GPXTrackSegment()
+    gpx_track.segments.append(gpx_segment)
+
+
+    for location in locations:
+        try:
+            point = gpxpy.gpx.GPXTrackPoint(
+                latitude=location.lat,
+                longitude=location.lon,
+                elevation=location.elevation,
+                time=location.dt,
+                comment=location.datasource,
+            )
+        except AttributeError:
+            yield TypeError(
+                f"Expected a Location or Location-like object, got {type(location)} {repr(location)}"
+            )
+            continue
+        gpx_segment.points.append(point)
+
+    buffer.write(gpx.to_xml())
diff --git a/tox.ini b/tox.ini
index e3ff8f1..2809e3c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -85,6 +85,7 @@ allowlist_externals = cat
 commands =
     pip install -e .[testing,optional]
     pip install orgparse # used it core.orgmode?
+    pip install gpxpy # for hpi query --output gpx
 
     {envpython} -m mypy --install-types --non-interactive \
                 -p my.core \

From 82bc51d9fc41750da35ac38f1713a0776df0fed2 Mon Sep 17 00:00:00 2001
From: Sean Breckenridge
Date: Fri, 14 Apr 2023 22:55:14 -0700
Subject: [PATCH 108/302] smscalls: make checking for keys stricter

sort of reverts #287, but also makes some other improvements.

This allows us to remove some of the Optional's to make downstream
consumers easier to write. However, this keeps the return type as a Res
(result, with errors), so downstream consumers will have to handle those
in case the schema ever changes (highly unlikely)

also added the 'call_type/message_type' with a comment there describing
the values

I left 'who' Optional; I believe it actually should be - it's very
possible for there to be no contact name, added a check in case it's
'(Unknown)' which is what my phone sets it to
---
 my/smscalls.py | 86 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 61 insertions(+), 25 deletions(-)

diff --git a/my/smscalls.py b/my/smscalls.py
index 08a5c57..c383a36 100644
--- a/my/smscalls.py
+++ b/my/smscalls.py
@@ -3,6 +3,8 @@ Phone calls and SMS messages
 Exported using https://play.google.com/store/apps/details?id=com.riteshsahu.SMSBackupRestore&hl=en_US
 """
 
+# See: https://www.synctech.com.au/sms-backup-restore/fields-in-xml-backup-files/ for schema
+
 REQUIRES = ['lxml']
 
 from .core import Paths, dataclass
@@ -28,31 +30,51 @@ from my.core.error import Res
 
 class Call(NamedTuple):
     dt: datetime
-    dt_readable: Optional[str]
-    duration_s: Optional[int]
+    dt_readable: str
+    duration_s: int
     who: Optional[str]
+    # type - 1 = Incoming, 2 = Outgoing, 3 = Missed, 4 = Voicemail, 5 = Rejected, 6 = Refused List.
+    call_type: int
 
     @property
     def summary(self) -> str:
         return f"talked with {self.who} for {self.duration_s} secs"
 
+    @property
+    def from_me(self) -> bool:
+        return self.call_type == 2
+
+
+# From docs:
+# All the field values are read as-is from the underlying database and no conversion is done by the app in most cases.
+#
+# The '(Unknown)' is just what my android phone does, not sure if there are others
+UNKNOWN: Set[str] = {'(Unknown)'}
+
 
 def _extract_calls(path: Path) -> Iterator[Res[Call]]:
     tr = etree.parse(str(path))
     for cxml in tr.findall('call'):
-        date_str = cxml.get('date')
-        if date_str is None:
-            yield RuntimeError(f"no date in {etree.tostring(cxml).decode('utf-8')}")
-            continue
+        dt = cxml.get('date')
+        dt_readable = cxml.get('readable_date')
         duration = cxml.get('duration')
+        who = cxml.get('contact_name')
+        call_type = cxml.get('type')
+        # if name is missing, it's not None (it's some string), depends on the phone/message app
+        if who is not None and who in UNKNOWN:
+            who = None
+        if dt is None or dt_readable is None or duration is None or call_type is None:
+            call_str = etree.tostring(cxml).decode('utf-8')
+            yield RuntimeError(f"Missing one or more required attributes [date, readable_date, duration, type] in {call_str}")
+            continue
         # TODO we've got local tz here, not sure if useful..
         # ok, so readable date is local datetime, changing throughout the backup
         yield Call(
-            dt=_parse_dt_ms(date_str),
-            dt_readable=cxml.get('readable_date'),
-            duration_s=int(duration) if duration is not None else None,
-            who=cxml.get('contact_name')
-            # TODO number if contact is unavail??
-            # TODO type? must be missing/outgoing/incoming
+            dt=_parse_dt_ms(dt),
+            dt_readable=dt_readable,
+            duration_s=int(duration),
+            who=who,
+            call_type=int(call_type),
         )
 
 
@@ -74,17 +96,22 @@ def calls() -> Iterator[Res[Call]]:
 
 class Message(NamedTuple):
     dt: datetime
-    dt_readable: Optional[str]
+    dt_readable: str
     who: Optional[str]
-    message: Optional[str]
-    phone_number: Optional[str]
-    from_me: bool
+    message: str
+    phone_number: str
+    # type - 1 = Received, 2 = Sent, 3 = Draft, 4 = Outbox, 5 = Failed, 6 = Queued
+    message_type: int
+
+    @property
+    def from_me(self) -> bool:
+        return self.message_type == 2
 
 
 def messages() -> Iterator[Res[Message]]:
     files = get_files(config.export_path, glob='sms-*.xml')
 
-    emitted: Set[Tuple[datetime, Optional[str], Optional[bool]]] = set()
+    emitted: Set[Tuple[datetime, Optional[str], bool]] = set()
     for p in files:
         for c in _extract_messages(p):
             if isinstance(c, Exception):
                 yield c
                 continue
             key = (c.dt, c.who, c.from_me)
             if key in emitted:
                 continue
@@ -100,17 +127,26 @@ def _extract_messages(path: Path) -> Iterator[Res[Message]]:
     tr = etree.parse(str(path))
     for mxml in tr.findall('sms'):
-        date_str = mxml.get('date')
-        if date_str is None:
-            yield RuntimeError(f"no date in {etree.tostring(mxml).decode('utf-8')}")
+        dt = mxml.get('date')
+        dt_readable = mxml.get('readable_date')
+        who = mxml.get('contact_name')
+        if who is not None and who in UNKNOWN:
+            who = None
+        message = mxml.get('body')
+        phone_number = mxml.get('address')
+        message_type = mxml.get('type')
+
+        if dt is None or dt_readable is None or message is None or phone_number is None or message_type is None:
+            msg_str = etree.tostring(mxml).decode('utf-8')
+            yield RuntimeError(f"Missing one or more required attributes [date, readable_date, body, address, type] in {msg_str}")
             continue
         yield Message(
-            dt=_parse_dt_ms(date_str),
-            dt_readable=mxml.get('readable_date'),
-            who=mxml.get('contact_name'),
-            message=mxml.get('body'),
-            phone_number=mxml.get('address'),
-            from_me=mxml.get('type') == '2',  # 1 is received message, 2 is sent message
+            dt=_parse_dt_ms(dt),
+            dt_readable=dt_readable,
+            who=who,
+            message=message,
+            phone_number=phone_number,
+            message_type=int(message_type),
         )

From 7a32302d663c633fd6b929efe8836f316b9ee50b Mon Sep 17 00:00:00 2001
From: seanbreckenridge
Date: Mon, 17 Apr 2023 16:15:35 -0700
Subject: [PATCH 109/302] query: add --warn-exceptions, dateparser, docs (#290)

* query: add --warn-exceptions, dateparser, docs

added --warn-exceptions (like --raise-exceptions/--drop-exceptions, but lets
you pass a warn_func if you want to customize how the exceptions are handled).
By default this creates a logger in main and logs the exception

added dateparser as a fallback if it's installed (it's not a strong dependency,
but I mentioned in the docs that it's useful for parsing dates/times)

added docs for query, and a few examples

--output gpx respects the --{drop,warn,raise}-exceptions flags, have an
example of that in the docs as well
---
 .gitignore             |   1 +
 README.org             |   4 +-
 doc/QUERY.md           | 304 +++++++++++++++++++++++++++++++++++++++++
 my/core/__main__.py    |  32 ++++-
 my/core/error.py       |  33 ++++-
 my/core/query.py       |  28 ++--
 my/core/query_range.py |  29 +++-
 7 files changed, 407 insertions(+), 24 deletions(-)
 create mode 100644 doc/QUERY.md

diff --git a/.gitignore b/.gitignore
index 888867a..19c3380 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ auto-save-list
 tramp
 .\#*
 
+*.gpx
 
 # Org-mode
 .org-id-locations
diff --git a/README.org b/README.org
index 4843a9f..c065a0c 100644
--- a/README.org
+++ b/README.org
@@ -531,7 +531,7 @@ If you like the shell or just want to quickly convert/grab some information from
 #+begin_src bash
 $ hpi query my.coding.commits.commits --stream # stream JSON objects as they're read
     --order-type datetime # find the 'datetime' attribute and order by that
-    --after '2020-01-01 00:00:00' --before '2020-12-31 23:59:59' # in 2020
+    --after '2020-01-01' --before '2021-01-01' # in 2020
 | jq '.committed_dt' -r # extract the datetime
 # mangle the output a bit to group by month and graph it
 | cut -d'-' -f-2 | sort | uniq -c | awk '{print $2,$1}' | sort -n | termgraph
@@ -552,6 +552,8 @@ If you like the shell or just want to quickly convert/grab some information from
   2020-12: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 383.00
 #+end_src
 
+See [[https://github.com/karlicoss/HPI/blob/master/doc/QUERY.md][query docs]]
+for more examples
 
 ** Querying Roam Research database
 :PROPERTIES:
diff --git a/doc/QUERY.md b/doc/QUERY.md
new file mode 100644
index 0000000..b672dff
--- /dev/null
+++ b/doc/QUERY.md
@@ -0,0 +1,304 @@
+`hpi query` is a command line tool for querying the output of any `hpi` function.
+
+```
+Usage: hpi query [OPTIONS] FUNCTION_NAME...
+
+  This allows you to query the results from one or more functions in HPI
+
+  By default this runs with '-o json', converting the results to JSON and
+  printing them to STDOUT
+
+  You can specify '-o pprint' to just print the objects using their repr, or
+  '-o repl' to drop into a ipython shell with access to the results
+
+  While filtering using --order-key datetime, the --after, --before and
+  --within flags parse the input to their datetime and timedelta equivalents.
+  datetimes can be epoch time, the string 'now', or an date formatted in the
+  ISO format. timedelta (durations) are parsed from a similar format to the
+  GNU 'sleep' command, e.g.
1w2d8h5m20s -> 1 week, 2 days, 8 hours, 5 minutes, + 20 seconds + + As an example, to query reddit comments I've made in the last month + + hpi query --order-type datetime --before now --within 4w my.reddit.all.comments + or... + hpi query --recent 4w my.reddit.all.comments + + Can also query within a range. To filter comments between 2016 and 2018: + hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments + +Options: + -o, --output [json|pprint|repl|gpx] + what to do with the result [default: json] + -s, --stream stream objects from the data source instead + of printing a list at the end + -k, --order-key TEXT order by an object attribute or dict key on + the individual objects returned by the HPI + function + -t, --order-type [datetime|date|int|float] + order by searching for some type on the + iterable + -a, --after TEXT while ordering, filter items for the key or + type larger than or equal to this + -b, --before TEXT while ordering, filter items for the key or + type smaller than this + -w, --within TEXT a range 'after' or 'before' to filter items + by. see above for further explanation + -r, --recent TEXT a shorthand for '--order-type datetime + --reverse --before now --within'. e.g. + --recent 5d + --reverse / --no-reverse reverse the results returned from the + functions + -l, --limit INTEGER limit the number of items returned from the + (functions) + --drop-unsorted if the order of an item can't be determined + while ordering, drop those items from the + results + --wrap-unsorted if the order of an item can't be determined + while ordering, wrap them into an + 'Unsortable' object + --warn-exceptions if any errors are returned, print them as + errors on STDERR + --raise-exceptions if any errors are returned (as objects, not + raised) from the functions, raise them + --drop-exceptions ignore any errors returned as objects from + the functions + --help Show this message and exit. +``` + +This works with any function which returns an iterable, for example `my.coding.commits`, which searches for `git commit`s on your computer: + +```bash +hpi query my.coding.commits +``` + +When run with a module, this does some analysis of the functions in that module and tries to find ones that look like data sources. If it can't figure out which, it prompts you like: + +``` +Which function should be used from 'my.coding.commits'? + + 1. commits + 2. repos +``` + +You select the one you want by clicking `1` or `2` on your keyboard. Otherwise, you can provide a fully qualified path, like: + +``` +hpi query my.coding.commits.repos +``` + +The corresponding `repos` function this queries is defined in [`my/coding/commits.py`](../my/coding/commits.py) + +### Ordering/Filtering/Streaming + +By default, this just returns the items in the order they were returned by the function. This allows you to filter by specifying a `--order-key`, or `--order-type`. For example, to get the 10 most recent commits. `--order-type datetime` will try to automatically figure out which attribute to use. If it chooses the wrong one (since `Commit`s have both a `committed_dt` and `authored_dt`), you could tell it which to use. 
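+For instance, something like this should print the 10 most recent commits (a sketch, assuming `my.coding.commits` is configured, combining the flags described above):
+
+```
+hpi query my.coding.commits.commits --order-type datetime --reverse --limit 10
+```
+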
+For example, to scan my computer and find the most recent commit I made:
+
+```
+hpi query my.coding.commits.commits --order-key committed_dt --limit 1 --reverse --output pprint --stream
+Commit(committed_dt=datetime.datetime(2023, 4, 14, 23, 9, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))),
+       authored_dt=datetime.datetime(2023, 4, 14, 23, 4, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))),
+       message='sources.smscalls: propogate errors if there are breaking '
+               'schema changes',
+       repo='/home/sean/Repos/promnesia-fork',
+       sha='22a434fca9a28df9b0915ccf16368df129d2c9ce',
+       ref='refs/heads/smscalls-handle-result')
+```
+
+To instead limit in some range, you can use `--before` and `--within` to filter by a range. For example, to get all the commits I committed in the last day:
+
+```
+hpi query my.coding.commits.commits --order-type datetime --before now --within 1d
+```
+
+That prints a list of `Commit` as JSON objects. You could also use `--output pprint` to pretty-print the objects or `--output repl` to drop into a REPL.
+
+To process the JSON, you can pipe it to [`jq`](https://github.com/stedolan/jq). I often use `jq length` to get the count of some output:
+
+```
+hpi query my.coding.commits.commits --order-type datetime --before now --within 1d | jq length
+6
+```
+
+Because grabbing data `--before now` is such a common use case, the `--recent` flag is a shorthand for `--order-type datetime --reverse --before now --within`. The same as above, to get the commits from the last day:
+
+```
+hpi query my.coding.commits.commits --recent 1d | jq length
+6
+```
+
+To select a range of commits, you can use `--after` and `--before`, passing ISO or epoch timestamps. Those can be full `datetimes` (`2021-01-01T00:05:30`) or just dates (`2021-01-01`). For example, to get all the commits I made on January 1st, 2021:
+
+```
+hpi query my.coding.commits.commits --order-type datetime --after 2021-01-01 --before 2021-01-02 | jq length
+1
+```
+
+If you have [`dateparser`](https://github.com/scrapinghub/dateparser#how-to-use) installed, this supports dozens more natural language formats:
+
+```
+hpi query my.coding.commits.commits --order-type datetime --after 'last week' --before 'day before yesterday' | jq length
+28
+```
+
+If you're having issues ordering because there are exceptions in your results or not all data is sortable (may have `None` for some attributes), you can use `--drop-unsorted` to drop those items from the results, or `--drop-exceptions` to remove the exceptions
+
+You can also stream the results, which is useful for functions that take a while to process or have a lot of data.
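+With `--stream`, each object is printed as soon as it is processed, instead of being collected into one final list. As a rough sketch, to peek at the first few objects:
+
+```
+hpi query my.coding.commits.commits --recent 30d --stream | head -n 3
+```
+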
+For example, if you wanted to pick a sha hash from a particular repo, you could use `jq` to `select` and pick that attribute from the JSON:
+
+```
+hpi query my.coding.commits.commits --recent 30d --stream | jq 'select(.repo | contains("HPI"))' | jq '.sha' -r
+4afa899c8b365b3c10e468f6279c02e316d3b650
+40de162fab741df594b4d9651348ee46ee021e9b
+e1cb229913482074dc5523e57ef0acf6e9ec2bb2
+87c13defd131e39292b93dcea661d3191222dace
+02c738594f2cae36ca4fab43cf9533fe6aa89396
+0b3a2a6ef3a9e4992771aaea0252fb28217b814a
+84817ce72d208038b66f634d4ceb6e3a4c7ec5e9
+47992b8e046d27fc5141839179f06f925c159510
+425615614bd508e28ccceb56f43c692240e429ab
+eed8f949460d768fb1f1c4801e9abab58a5f9021
+d26ad7d9ce6a4718f96346b994c3c1cd0d74380c
+aec517e53c6ac022f2b4cc91261daab5651cebf0
+44b75a88fdfc7af132f61905232877031ce32fcb
+b0ff6f29dd2846e97f8aa85a2ca73736b03254a8
+```
+
+`jq`'s `select` function acts on a stream of JSON objects, not a list, so it filters the output of `hpi query` as the objects are generated (the goal here is to conserve memory as items which aren't needed are filtered). The alternative would be to print the entire JSON list at the end, like:
+
+`hpi query my.coding.commits.commits --recent 30d | jq '.[] | select(.repo | contains("Repos/HPI"))' | jq '.sha' -r`, using `jq '.[]'` to convert the JSON list into a stream of JSON objects.
+
+## Usage on non-HPI code
+
+The command can accept any qualified function name, so this could for example be used to check the output of [`promnesia`](https://github.com/karlicoss/promnesia) sources:
+
+```
+hpi query promnesia.sources.smscalls | jq length
+371
+```
+
+This can be used on any function that produces an `Iterator`/`Generator` like output, as long as it can be called with no arguments.
+
+## GPX
+
+The `hpi query` command can also be used with the `--output gpx` flag to generate gpx files from a list of locations, like the ones defined in the `my.location` package. This could be used to extract some date range and create a `gpx` file which can then be visualized by a GUI application.
+
+This prints the contents for the `gpx` file to STDOUT, and prints warnings for any objects it could not convert to locations to STDERR, so pipe STDOUT to an output file, like `>out.gpx`
+
+```
+hpi query my.location.all --after '2021-07-01T00:00:00' --before '2021-07-05T00:00:00' --order-type datetime --output gpx >out.gpx
+```
+
+If you want to ignore any errors, you can use `--drop-exceptions`.
+
+To preview, you can use something like [`qgis`](https://qgis.org/en/site/) or, for something easier/more lightweight, [`gpxsee`](https://github.com/tumic0/GPXSee):
+
+`gpxsee out.gpx`:
+
+(image: chicago trip)
+
+(Sidenote: this is [`@seanbreckenridge`](https://github.com/seanbreckenridge/)'s locations, on a trip to Chicago)
+
+## Python reference
+
+The `hpi query` command is a CLI wrapper around the code in [`query.py`](../my/core/query.py) and [`query_range.py`](../my/core/query_range.py). The `select` function is the core of this, and `select_range` lets you specify dates, timedelta, start-end ranges, and other CLI-specific code.
+
+`my.core.query.select`:
+
+```
+    A function to query, order, sort and filter items from one or more sources
+    This supports iterables and lists of mixed types (including handling errors),
+    by allowing you to provide custom predicates (functions) which can sort
+    by a function, an attribute, dict key, or by the attributes values.
+ + Since this supports mixed types, there's always a possibility + of KeyErrors or AttributeErrors while trying to find some value to order by, + so this provides multiple mechanisms to deal with that + + 'where' lets you filter items before ordering, to remove possible errors + or filter the iterator by some condition + + There are multiple ways to instruct select on how to order items. The most + flexible is to provide an 'order_by' function, which takes an item in the + iterator, does any custom checks you may want and then returns the value to sort by + + 'order_key' is best used on items which have a similar structure, or have + the same attribute name for every item in the iterator. If you have a + iterator of objects whose datetime is accessed by the 'timestamp' attribute, + supplying order_key='timestamp' would sort by that (dictionary or attribute) key + + 'order_value' is the most confusing, but often the most useful. Instead of + testing against the keys of an item, this allows you to write a predicate + (function) to test against its values (dictionary, NamedTuple, dataclass, object). + If you had an iterator of mixed types and wanted to sort by the datetime, + but the attribute to access the datetime is different on each type, you can + provide `order_value=lambda v: isinstance(v, datetime)`, and this will + try to find that value for each type in the iterator, to sort it by + the value which is received when the predicate is true + + 'order_value' is often used in the 'hpi query' interface, because of its brevity. + Just given the input function, this can typically sort it by timestamp with + no human intervention. It can sort of be thought as an educated guess, + but it can always be improved by providing a more complete guess function + + Note that 'order_value' is also the most computationally expensive, as it has + to copy the iterator in memory (using itertools.tee) to determine how to order it + in memory + + The 'drop_exceptions', 'raise_exceptions', 'warn_exceptions' let you ignore or raise + when the src contains exceptions. The 'warn_func' lets you provide a custom function + to call when an exception is encountered instead of using the 'warnings' module + + src: an iterable of mixed types, or a function to be called, + as the input to this function + + where: a predicate which filters the results before sorting + + order_by: a function which when given an item in the src, + returns the value to sort by. Similar to the 'key' value + typically passed directly to 'sorted' + + order_key: a string which represents a dict key or attribute name + to use as they key to sort by + + order_value: predicate which determines which attribute on an ADT-like item to sort by, + when given its value. lambda o: isinstance(o, datetime) is commonly passed to sort + by datetime, without knowing the attributes or interface for the items in the src + + default: while ordering, if the order for an object cannot be determined, + use this as the default value + + reverse: reverse the order of the resulting iterable + + limit: limit the results to this many items + + drop_unsorted: before ordering, drop any items from the iterable for which a + order could not be determined. False by default + + wrap_unsorted: before ordering, wrap any items into an 'Unsortable' object. Place + them at the front of the list. 
True by default
+
+    drop_exceptions: ignore any exceptions from the src
+
+    raise_exceptions: raise exceptions when received from the input src
+```
+
+`my.core.query_range.select_range`:
+
+```
+    A specialized select function which offers generating functions
+    to filter/query ranges from an iterable
+
+    order_key and order_value are used in the same way they are in select
+
+    If you specify order_by_value_type, it tries to search for an attribute
+    on each object/type which has that type, ordering the iterable by that value
+
+    unparsed_range is a tuple of length 3, specifying 'after', 'before', 'duration',
+    i.e. some start point to allow the computed value we're ordering by, some
+    end point and a duration (can use the RangeTuple NamedTuple to construct one)
+
+    (this is typically parsed/created in my.core.__main__, from CLI flags
+
+    If you specify a range, drop_unsorted is forced to be True
+```
+
+Those can be imported and accept any sort of iterator, `hpi query` just defaults to the output of functions here. As an example, see [`listens`](https://github.com/seanbreckenridge/HPI-personal/blob/master/scripts/listens) which just passes a generator (iterator) as the first argument to `query_range`
diff --git a/my/core/__main__.py b/my/core/__main__.py
index dce646a..620cb5f 100644
--- a/my/core/__main__.py
+++ b/my/core/__main__.py
@@ -485,6 +485,13 @@ def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True)
         yield data_providers[chosen_index]
 
 
+def _warn_exceptions(exc: Exception) -> None:
+    from my.core.common import LazyLogger
+    logger = LazyLogger('CLI', level='warning')
+
+    logger.exception(f'hpi query: {exc}')
+
+
 # handle the 'hpi query' call
 # can raise a QueryException, caught in the click command
 def query_hpi_functions(
@@ -501,10 +508,12 @@ def query_hpi_functions(
         limit: Optional[int],
         drop_unsorted: bool,
         wrap_unsorted: bool,
+        warn_exceptions: bool,
         raise_exceptions: bool,
         drop_exceptions: bool,
 ) -> None:
     from .query_range import select_range, RangeTuple
+    import my.core.error as err
 
     # chain list of functions from user, in the order they wrote them on the CLI
     input_src = chain(*(f() for f in _locate_functions_or_prompt(qualified_names)))
@@ -518,6 +527,8 @@ def query_hpi_functions(
         limit=limit,
         drop_unsorted=drop_unsorted,
         wrap_unsorted=wrap_unsorted,
+        warn_exceptions=warn_exceptions,
+        warn_func=_warn_exceptions,
         raise_exceptions=raise_exceptions,
         drop_exceptions=drop_exceptions)
 
@@ -545,10 +556,21 @@ def query_hpi_functions(
     elif output == 'gpx':
         from my.location.common import locations_to_gpx
 
+        # if user didn't specify to ignore exceptions, warn if locations_to_gpx
+        # cannot process the output of the command.
This can be silenced by + # passing --drop-exceptions + if not raise_exceptions and not drop_exceptions: + warn_exceptions = True + # can ignore the mypy warning here, locations_to_gpx yields any errors # if you didnt pass it something that matches the LocationProtocol for exc in locations_to_gpx(res, sys.stdout): # type: ignore[arg-type] - click.echo(str(exc), err=True) + if warn_exceptions: + _warn_exceptions(exc) + elif raise_exceptions: + raise exc + elif drop_exceptions: + pass sys.stdout.flush() else: res = list(res) # type: ignore[assignment] @@ -742,6 +764,10 @@ def module_install_cmd(user: bool, parallel: bool, modules: Sequence[str]) -> No default=False, is_flag=True, help="if the order of an item can't be determined while ordering, wrap them into an 'Unsortable' object") +@click.option('--warn-exceptions', + default=False, + is_flag=True, + help="if any errors are returned, print them as errors on STDERR") @click.option('--raise-exceptions', default=False, is_flag=True, @@ -765,6 +791,7 @@ def query_cmd( limit: Optional[int], drop_unsorted: bool, wrap_unsorted: bool, + warn_exceptions: bool, raise_exceptions: bool, drop_exceptions: bool, ) -> None: @@ -792,7 +819,7 @@ def query_cmd( \b Can also query within a range. To filter comments between 2016 and 2018: - hpi query --order-type datetime --after '2016-01-01 00:00:00' --before '2019-01-01 00:00:00' my.reddit.all.comments + hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments ''' from datetime import datetime, date @@ -831,6 +858,7 @@ def query_cmd( limit=limit, drop_unsorted=drop_unsorted, wrap_unsorted=wrap_unsorted, + warn_exceptions=warn_exceptions, raise_exceptions=raise_exceptions, drop_exceptions=drop_exceptions) except QueryException as qe: diff --git a/my/core/error.py b/my/core/error.py index e6f76cd..09c1733 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -4,7 +4,7 @@ See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail """ from itertools import tee -from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast +from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast, Iterator from .compat import Literal @@ -29,6 +29,37 @@ def unwrap(res: Res[T]) -> T: else: return res +def drop_exceptions(itr: Iterator[Res[T]]) -> Iterator[T]: + """Return non-errors from the iterable""" + for o in itr: + if isinstance(o, Exception): + continue + yield o + + +def raise_exceptions(itr: Iterable[Res[T]]) -> Iterator[T]: + """Raise errors from the iterable, stops the select function""" + for o in itr: + if isinstance(o, Exception): + raise o + yield o + + +def warn_exceptions(itr: Iterable[Res[T]], warn_func: Optional[Callable[[Exception], None]] = None) -> Iterator[T]: + # if not provided, use the 'warnings' module + if warn_func is None: + from my.core.warnings import medium + def _warn_func(e: Exception) -> None: + # TODO: print traceback? 
but user could always --raise-exceptions as well + medium(str(e)) + warn_func = _warn_func + + for o in itr: + if isinstance(o, Exception): + warn_func(o) + continue + yield o + def echain(ex: E, cause: Exception) -> E: ex.__cause__ = cause diff --git a/my/core/query.py b/my/core/query.py index ed29649..8a497db 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -14,6 +14,7 @@ from typing import TypeVar, Tuple, Optional, Union, Callable, Iterable, Iterator import more_itertools +import my.core.error as err from .common import is_namedtuple from .error import Res, unwrap from .warnings import low @@ -205,20 +206,6 @@ pass 'drop_exceptions' to ignore exceptions""") return None # couldn't compute a OrderFunc for this class/instance -def _drop_exceptions(itr: Iterator[ET]) -> Iterator[T]: - """Return non-errors from the iterable""" - for o in itr: - if isinstance(o, Exception): - continue - yield o - - -def _raise_exceptions(itr: Iterable[ET]) -> Iterator[T]: - """Raise errors from the iterable, stops the select function""" - for o in itr: - if isinstance(o, Exception): - raise o - yield o # currently using the 'key set' as a proxy for 'this is the same type of thing' @@ -365,6 +352,8 @@ def select( limit: Optional[int] = None, drop_unsorted: bool = False, wrap_unsorted: bool = True, + warn_exceptions: bool = False, + warn_func: Optional[Callable[[Exception], None]] = None, drop_exceptions: bool = False, raise_exceptions: bool = False, ) -> Iterator[ET]: @@ -408,7 +397,9 @@ def select( to copy the iterator in memory (using itertools.tee) to determine how to order it in memory - The 'drop_exceptions' and 'raise_exceptions' let you ignore or raise when the src contains exceptions + The 'drop_exceptions', 'raise_exceptions', 'warn_exceptions' let you ignore or raise + when the src contains exceptions. 
The 'warn_func' lets you provide a custom function + to call when an exception is encountered instead of using the 'warnings' module src: an iterable of mixed types, or a function to be called, as the input to this function @@ -469,10 +460,13 @@ Will attempt to call iter() on the value""") # if both drop_exceptions and drop_exceptions are provided for some reason, # should raise exceptions before dropping them if raise_exceptions: - itr = _raise_exceptions(itr) + itr = err.raise_exceptions(itr) if drop_exceptions: - itr = _drop_exceptions(itr) + itr = err.drop_exceptions(itr) + + if warn_exceptions: + itr = err.warn_exceptions(itr, warn_func=warn_func) if where is not None: itr = filter(where, itr) diff --git a/my/core/query_range.py b/my/core/query_range.py index 179e4ea..33eb03c 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -73,13 +73,28 @@ def parse_datetime_float(date_str: str) -> float: return ds_float try: # isoformat - default format when you call str() on datetime + # this also parses dates like '2020-01-01' return datetime.fromisoformat(ds).timestamp() except ValueError: pass try: return isoparse(ds).timestamp() except (AssertionError, ValueError): - raise QueryException(f"Was not able to parse {ds} into a datetime") + pass + + try: + import dateparser # type: ignore[import] + except ImportError: + pass + else: + # dateparser is a bit more lenient than the above, lets you type + # all sorts of dates as inputs + # https://github.com/scrapinghub/dateparser#how-to-use + res: Optional[datetime] = dateparser.parse(ds, settings={"DATE_ORDER": "YMD"}) + if res is not None: + return res.timestamp() + + raise QueryException(f"Was not able to parse {ds} into a datetime") # probably DateLike input? but a user could specify an order_key @@ -267,6 +282,8 @@ def select_range( limit: Optional[int] = None, drop_unsorted: bool = False, wrap_unsorted: bool = False, + warn_exceptions: bool = False, + warn_func: Optional[Callable[[Exception], None]] = None, drop_exceptions: bool = False, raise_exceptions: bool = False, ) -> Iterator[ET]: @@ -293,9 +310,15 @@ def select_range( unparsed_range = None # some operations to do before ordering/filtering - if drop_exceptions or raise_exceptions or where is not None: + if drop_exceptions or raise_exceptions or where is not None or warn_exceptions: # doesn't wrap unsortable items, because we pass no order related kwargs - itr = select(itr, where=where, drop_exceptions=drop_exceptions, raise_exceptions=raise_exceptions) + itr = select( + itr, + where=where, + drop_exceptions=drop_exceptions, + raise_exceptions=raise_exceptions, + warn_exceptions=warn_exceptions, + warn_func=warn_func) order_by_chosen: Optional[OrderFunc] = None From a445d2cbfec361746e4602756e0d518d70f3cef2 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 15 May 2023 21:56:23 +0100 Subject: [PATCH 110/302] general: python3.7 will reach EOL soon, remove its support --- .github/workflows/main.yml | 13 ++++++++----- setup.py | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8b23921..47f84cb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,12 +17,15 @@ jobs: strategy: matrix: platform: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10', '3.11'] exclude: [ - # windows runners are pretty scarce, so let's only run one of them.. 
-          {platform: windows-latest, python-version: '3.7' },
-          {platform: windows-latest, python-version: '3.9' },
+          # windows runners are pretty scarce, so let's only run lowest and highest python version
+          {platform: windows-latest, python-version: '3.9'},
           {platform: windows-latest, python-version: '3.10'},
+
+          # same, macos is a bit too slow and ubuntu covers python quirks well
+          {platform: macos-latest  , python-version: '3.9' },
+          {platform: macos-latest  , python-version: '3.10' },
       ]
 
     runs-on: ${{ matrix.platform }}
@@ -34,7 +37,7 @@
     # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
     - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
 
-    - uses: actions/setup-python@v3
+    - uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
 
diff --git a/setup.py b/setup.py
index b0f4ab6..f3f8511 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,7 @@ def main() -> None:
         author_email='karlicoss@gmail.com',
         description='A Python interface to my life',
 
-        python_requires='>=3.7',
+        python_requires='>=3.8',
         install_requires=INSTALL_REQUIRES,
         extras_require={
             'testing': [

From c34656e8fbc372226e1983a4d35eb89b007b7687 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Mon, 15 May 2023 21:53:33 +0100
Subject: [PATCH 111/302] general: update mypy config, seems that lots of type: ignore aren't necessary anymore

---
 my/bluemaestro.py                 |  2 +-
 my/body/blood.py                  |  2 +-
 my/body/exercise/all.py           |  2 +-
 my/body/exercise/cross_trainer.py |  4 +--
 my/body/sleep/common.py           |  2 +-
 my/body/weight.py                 |  2 +-
 my/coding/commits.py              |  4 +--
 my/core/__main__.py               |  4 +--
 my/core/cachew.py                 |  2 +-
 my/core/cfg.py                    |  2 +-
 my/core/common.py                 | 51 ++++++++++++++++++-------------
 my/core/compat.py                 | 25 ++++++++++++---
 my/core/core_config.py            |  6 ++--
 my/core/discovery_pure.py         |  2 +-
 my/core/error.py                  |  8 ++---
 my/core/kompress.py               |  6 ++--
 my/core/konsume.py                |  6 ++--
 my/core/logging.py                | 13 ++++----
 my/core/pandas.py                 |  4 +--
 my/core/preinit.py                |  2 +-
 my/core/query.py                  |  2 +-
 my/core/query_range.py            |  4 +--
 my/core/serialize.py              |  4 +--
 my/core/sqlite.py                 |  2 +-
 my/core/util.py                   |  2 +-
 my/emfit/__init__.py              |  2 +-
 my/endomondo.py                   |  2 +-
 my/github/gdpr.py                 | 12 ++++----
 my/location/fallback/via_ip.py    |  2 +-
 my/location/gpslogger.py          |  2 +-
 my/orgmode.py                     |  2 +-
 my/photos/utils.py                |  4 +--
 my/reddit/rexport.py              |  4 +--
 my/rss/feedbin.py                 |  2 +-
 my/runnerup.py                    |  2 +-
 my/smscalls.py                    |  2 +-
 my/time/tz/via_location.py        |  4 +--
 my/twitter/archive.py             |  2 +-
 mypy.ini                          | 12 +++++++-
 tests/calendar.py                 |  2 +-
 tests/config.py                   |  2 +-
 tests/conftest.py                 |  2 +-
 tests/core/test_kompress.py       |  2 +-
 tests/demo.py                     |  2 +-
 tests/extra/polar.py              |  4 +--
 tests/get_files.py                |  2 +-
 tests/location.py                 |  2 +-
 tests/pdfs.py                     |  2 +-
 tests/reddit.py                   |  2 +-
 tests/takeout.py                  |  2 +-
 tests/test_tmp_config.py          |  2 +-
 tests/tz.py                       |  4 +--
 52 files changed, 142 insertions(+), 105 deletions(-)

diff --git a/my/bluemaestro.py b/my/bluemaestro.py
index ee85f21..b50c77c 100644
--- a/my/bluemaestro.py
+++ b/my/bluemaestro.py
@@ -40,7 +40,7 @@ class Measurement:
 
 # fixme: later, rely on the timezone provider
 # NOTE: the timezone should be set with respect to the export date!!!
-import pytz # type: ignore +import pytz tz = pytz.timezone('Europe/London') # TODO when I change tz, check the diff diff --git a/my/body/blood.py b/my/body/blood.py index e282068..fb035eb 100644 --- a/my/body/blood.py +++ b/my/body/blood.py @@ -9,7 +9,7 @@ from ..core.error import Res from ..core.orgmode import parse_org_datetime, one_table -import pandas as pd # type: ignore +import pandas as pd import orgparse diff --git a/my/body/exercise/all.py b/my/body/exercise/all.py index 4fee9d3..e86a5af 100644 --- a/my/body/exercise/all.py +++ b/my/body/exercise/all.py @@ -10,7 +10,7 @@ def dataframe() -> DataFrameT: from ...endomondo import dataframe as EDF from ...runnerup import dataframe as RDF - import pandas as pd # type: ignore + import pandas as pd return pd.concat([ EDF(), RDF(), diff --git a/my/body/exercise/cross_trainer.py b/my/body/exercise/cross_trainer.py index b25985c..d073f43 100644 --- a/my/body/exercise/cross_trainer.py +++ b/my/body/exercise/cross_trainer.py @@ -78,7 +78,7 @@ def cross_trainer_manual_dataframe() -> DataFrameT: ''' Only manual org-mode entries ''' - import pandas as pd # type: ignore[import] + import pandas as pd df = pd.DataFrame(cross_trainer_data()) return df @@ -91,7 +91,7 @@ def dataframe() -> DataFrameT: ''' Attaches manually logged data (which Endomondo can't capture) and attaches it to Endomondo ''' - import pandas as pd # type: ignore[import] + import pandas as pd from ...endomondo import dataframe as EDF edf = EDF() diff --git a/my/body/sleep/common.py b/my/body/sleep/common.py index a07b3fa..7bc1021 100644 --- a/my/body/sleep/common.py +++ b/my/body/sleep/common.py @@ -8,7 +8,7 @@ class Combine: @cdf def dataframe(self, with_temperature: bool=True) -> DataFrameT: - import pandas as pd # type: ignore + import pandas as pd # todo include 'source'? df = pd.concat([m.dataframe() for m in self.modules]) diff --git a/my/body/weight.py b/my/body/weight.py index 659b759..def3e87 100644 --- a/my/body/weight.py +++ b/my/body/weight.py @@ -56,7 +56,7 @@ def from_orgmode() -> Iterator[Result]: def make_dataframe(data: Iterator[Result]): - import pandas as pd # type: ignore + import pandas as pd def it(): for e in data: if isinstance(e, Exception): diff --git a/my/coding/commits.py b/my/coding/commits.py index 7786055..67ee77d 100644 --- a/my/coding/commits.py +++ b/my/coding/commits.py @@ -38,8 +38,8 @@ def config() -> commits_cfg: ########################## -import git # type: ignore -from git.repo.fun import is_git_dir # type: ignore +import git +from git.repo.fun import is_git_dir log = LazyLogger(__name__, level='info') diff --git a/my/core/__main__.py b/my/core/__main__.py index 620cb5f..feb83bb 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -135,7 +135,7 @@ def config_ok() -> bool: # at this point 'my' should already be imported, so doesn't hurt to extract paths from it import my try: - paths: List[str] = list(my.__path__) # type: ignore[attr-defined] + paths: List[str] = list(my.__path__) except Exception as e: errors.append(e) error('failed to determine module import path') @@ -152,7 +152,7 @@ def config_ok() -> bool: ## check we're not using stub config import my.core try: - core_pkg_path = str(Path(my.core.__path__[0]).parent) # type: ignore[attr-defined] + core_pkg_path = str(Path(my.core.__path__[0]).parent) if str(cfg_path).startswith(core_pkg_path): error(f''' Seems that the stub config is used ({cfg_path}). This is likely not going to work. 
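Most of the deletions in this patch are `type: ignore` comments that current mypy releases no longer need; the `warn_unused_ignores = True` option added to mypy.ini further down is what surfaces them, since it turns each now-redundant ignore into an error of its own. A minimal sketch of that behaviour (hypothetical file, not part of this patch):

```python
# demo.py -- assumes warn_unused_ignores = True is set in mypy.ini
import sys  # type: ignore
# sys is fully typed, so the ignore suppresses nothing and mypy reports:
#   error: Unused "type: ignore" comment
print(sys.version_info)
```
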
diff --git a/my/core/cachew.py b/my/core/cachew.py index 9959120..dbc4d49 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -30,7 +30,7 @@ def disabled_cachew() -> Iterator[None]: def _appdirs_cache_dir() -> Path: - import appdirs # type: ignore + import appdirs cd = Path(appdirs.user_cache_dir('my')) cd.mkdir(exist_ok=True, parents=True) return cd diff --git a/my/core/cfg.py b/my/core/cfg.py index 3cddcf7..f298e7f 100644 --- a/my/core/cfg.py +++ b/my/core/cfg.py @@ -21,7 +21,7 @@ def make_config(cls: Type[C], migration: Callable[[Attrs], Attrs]=lambda x: x) - if k in {f.name for f in fields(cls)} # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115 } # todo maybe return type here? - return cls(**params) # type: ignore[call-arg] + return cls(**params) F = TypeVar('F') diff --git a/my/core/common.py b/my/core/common.py index 090c564..0b3dc1e 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -3,6 +3,7 @@ from pathlib import Path from datetime import datetime import functools from contextlib import contextmanager +import sys import types from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple, TYPE_CHECKING, NoReturn import warnings @@ -21,13 +22,12 @@ def import_file(p: PathIsh, name: Optional[str] = None) -> types.ModuleType: assert spec is not None, f"Fatal error; Could not create module spec from {name} {p}" foo = importlib.util.module_from_spec(spec) loader = spec.loader; assert loader is not None - loader.exec_module(foo) # type: ignore[attr-defined] + loader.exec_module(foo) return foo def import_from(path: PathIsh, name: str) -> types.ModuleType: path = str(path) - import sys try: sys.path.append(path) import importlib @@ -94,7 +94,7 @@ def ensure_unique( def test_ensure_unique() -> None: - import pytest # type: ignore + import pytest assert list(ensure_unique([1, 2, 3], key=lambda i: i)) == [1, 2, 3] dups = [1, 2, 1, 4] @@ -432,7 +432,7 @@ def warn_if_empty(f): def wrapped(*args, **kwargs): res = f(*args, **kwargs) return _warn_iterable(res, f=f) - return wrapped # type: ignore + return wrapped # global state that turns on/off quick stats @@ -620,6 +620,10 @@ def assert_subpackage(name: str) -> None: assert name == '__main__' or 'my.core' in name, f'Expected module __name__ ({name}) to be __main__ or start with my.core' +from .compat import ParamSpec +_P = ParamSpec('_P') +_T = TypeVar('_T') + # https://stackoverflow.com/a/10436851/706389 from concurrent.futures import Future, Executor class DummyExecutor(Executor): @@ -627,26 +631,31 @@ class DummyExecutor(Executor): self._shutdown = False self._max_workers = max_workers - # TODO: once support for 3.7 is dropped, - # can make 'fn' a positional only parameter, - # which fixes the mypy error this throws without the type: ignore - def submit(self, fn, *args, **kwargs) -> Future: # type: ignore[override] - if self._shutdown: - raise RuntimeError('cannot schedule new futures after shutdown') - - f: Future[Any] = Future() - try: - result = fn(*args, **kwargs) - except KeyboardInterrupt: - raise - except BaseException as e: - f.set_exception(e) + if TYPE_CHECKING: + if sys.version_info[:2] <= (3, 8): + # 3.8 doesn't support ParamSpec as Callable arg :( + # and any attempt to type results in incompatible supertype.. so whatever + def submit(self, fn, *args, **kwargs): ... else: - f.set_result(result) + def submit(self, fn: Callable[_P, _T], /, *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]: ... 
+ else: + def submit(self, fn, *args, **kwargs): + if self._shutdown: + raise RuntimeError('cannot schedule new futures after shutdown') - return f + f: Future[Any] = Future() + try: + result = fn(*args, **kwargs) + except KeyboardInterrupt: + raise + except BaseException as e: + f.set_exception(e) + else: + f.set_result(result) - def shutdown(self, wait: bool=True) -> None: # type: ignore[override] + return f + + def shutdown(self, wait: bool=True, **kwargs) -> None: self._shutdown = True diff --git a/my/core/compat.py b/my/core/compat.py index 8bdb401..0b47bdd 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -86,7 +86,7 @@ else: def cached_property(f: Callable[[Cl], R]) -> R: import functools - return property(functools.lru_cache(maxsize=1)(f)) # type: ignore + return property(functools.lru_cache(maxsize=1)(f)) del Cl del R @@ -111,7 +111,7 @@ if sys.version_info[:2] >= (3, 8): from typing import Protocol else: if TYPE_CHECKING: - from typing_extensions import Protocol # type: ignore[misc] + from typing_extensions import Protocol else: # todo could also use NamedTuple? Protocol = object @@ -121,12 +121,29 @@ if sys.version_info[:2] >= (3, 8): from typing import TypedDict else: if TYPE_CHECKING: - from typing_extensions import TypedDict # type: ignore[misc] + from typing_extensions import TypedDict else: from typing import Dict TypedDict = Dict +if sys.version_info[:2] >= (3, 10): + from typing import ParamSpec +else: + if TYPE_CHECKING: + from typing_extensions import ParamSpec + else: + from typing import NamedTuple, Any + # erm.. I guess as long as it's not crashing, whatever... + class _ParamSpec: + def __call__(self, args): + class _res: + args = None + kwargs = None + return _res + ParamSpec = _ParamSpec() + + # bisect_left doesn't have a 'key' parameter (which we use) # till python3.10 if sys.version_info[:2] <= (3, 9): @@ -156,4 +173,4 @@ if sys.version_info[:2] <= (3, 9): hi = mid return lo else: - from bisect import bisect_left # type: ignore[misc] + from bisect import bisect_left diff --git a/my/core/core_config.py b/my/core/core_config.py index f87a1ba..5c696ce 100644 --- a/my/core/core_config.py +++ b/my/core/core_config.py @@ -7,16 +7,16 @@ from typing import Sequence, Optional from . import warnings, PathIsh, Path try: - from my.config import core as user_config # type: ignore[attr-defined] + from my.config import core as user_config # type: ignore[attr-defined] except Exception as e: try: - from my.config import common as user_config # type: ignore[attr-defined, assignment, misc] + from my.config import common as user_config # type: ignore[attr-defined] warnings.high("'common' config section is deprecated. Please rename it to 'core'.") except Exception as e2: # make it defensive, because it's pretty commonly used and would be annoying if it breaks hpi doctor etc. # this way it'll at least use the defaults # todo actually not sure if needs a warning? 
Perhaps it's okay without it, because the defaults are reasonable enough - user_config = object # type: ignore[assignment, misc] + user_config = object _HPI_CACHE_DIR_DEFAULT = '' diff --git a/my/core/discovery_pure.py b/my/core/discovery_pure.py index c88ef1c..85b75ab 100644 --- a/my/core/discovery_pure.py +++ b/my/core/discovery_pure.py @@ -144,7 +144,7 @@ def all_modules() -> Iterable[HPIModule]: def _iter_my_roots() -> Iterable[Path]: import my # doesn't import any code, because of namespace package - paths: List[str] = list(my.__path__) # type: ignore[attr-defined] + paths: List[str] = list(my.__path__) if len(paths) == 0: # should probably never happen?, if this code is running, it was imported # because something was added to __path__ to match this name diff --git a/my/core/error.py b/my/core/error.py index 09c1733..236bd30 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -125,7 +125,7 @@ def test_sort_res_by() -> None: 1, Exc('last'), ] - results = sort_res_by(ress, lambda x: int(x)) # type: ignore + results = sort_res_by(ress, lambda x: int(x)) assert results == [ 1, 'bad', @@ -137,11 +137,11 @@ def test_sort_res_by() -> None: Exc('last'), ] - results2 = sort_res_by(ress + [0], lambda x: int(x)) # type: ignore + results2 = sort_res_by(ress + [0], lambda x: int(x)) assert results2 == [Exc('last'), 0] + results[:-1] assert sort_res_by(['caba', 'a', 'aba', 'daba'], key=lambda x: len(x)) == ['a', 'aba', 'caba', 'daba'] - assert sort_res_by([], key=lambda x: x) == [] # type: ignore + assert sort_res_by([], key=lambda x: x) == [] # helpers to associate timestamps with the errors (so something meaningful could be displayed on the plots, for example) @@ -215,7 +215,7 @@ See {help_url}\ if hasattr(err, 'obj') and hasattr(err, "name"): config_obj = cast(object, getattr(err, 'obj')) # the object that caused the attribute error # e.g. active_browser for my.browser - nested_block_name = err.name # type: ignore[attr-defined] + nested_block_name = err.name if config_obj.__module__ == 'my.config': click.secho(f"""You're likely missing the nested config block for '{getattr(config_obj, '__name__', str(config_obj))}.{nested_block_name}'. See {help_url} or check the corresponding module.py file for an example\ diff --git a/my/core/kompress.py b/my/core/kompress.py index 5ba32d3..8ee1cfa 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -82,7 +82,7 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: ifile.read1 = ifile.read # type: ignore # TODO pass all kwargs here?? # todo 'expected "BinaryIO"'?? - return io.TextIOWrapper(ifile, encoding=encoding) # type: ignore[arg-type] + return io.TextIOWrapper(ifile, encoding=encoding) elif name.endswith(Ext.lz4): import lz4.frame # type: ignore return lz4.frame.open(str(pp), mode, *args, **kwargs) @@ -95,7 +95,7 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: tf = tarfile.open(pp) # TODO pass encoding? 
x = tf.extractfile(*args); assert x is not None - return x # type: ignore[return-value] + return x else: return pp.open(mode, *args, **kwargs) @@ -209,7 +209,7 @@ class ZipPath(zipfile_Path): def __truediv__(self, key) -> ZipPath: # need to implement it so the return type is not zipfile.Path tmp = zipfile_Path(self.root) / self.at / key - return ZipPath(self.root, tmp.at) # type: ignore[attr-defined] + return ZipPath(self.root, tmp.at) def iterdir(self) -> Iterator[ZipPath]: for s in self._as_dir().iterdir(): diff --git a/my/core/konsume.py b/my/core/konsume.py index b4cf7b6..588bfe1 100644 --- a/my/core/konsume.py +++ b/my/core/konsume.py @@ -19,7 +19,7 @@ def zoom(w, *keys): # TODO need to support lists class Zoomable: def __init__(self, parent, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) # type: ignore + super().__init__(*args, **kwargs) self.parent = parent # TODO not sure, maybe do it via del?? @@ -147,7 +147,7 @@ Expected {c} to be fully consumed by the parser. from typing import cast def test_unconsumed() -> None: - import pytest # type: ignore + import pytest with pytest.raises(UnconsumedError): with wrap({'a': 1234}) as w: w = cast(Wdict, w) @@ -200,7 +200,7 @@ def test_consume_few() -> None: def test_zoom() -> None: - import pytest # type: ignore + import pytest with wrap({'aaa': 'whatever'}) as w: w = cast(Wdict, w) with pytest.raises(KeyError): diff --git a/my/core/logging.py b/my/core/logging.py index a948dd8..e7061fa 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -62,20 +62,21 @@ def setup_logger(logger: logging.Logger, level: LevelIsh) -> None: lvl = mklevel(level) try: import logzero # type: ignore[import] + except ModuleNotFoundError: + warnings.warn("You might want to install 'logzero' for nice colored logs!") + formatter = logging.Formatter(fmt=FORMAT_NOCOLOR, datefmt=DATEFMT) + use_logzero = False + else: formatter = logzero.LogFormatter( fmt=FORMAT_COLOR, datefmt=DATEFMT, ) use_logzero = True - except ModuleNotFoundError: - warnings.warn("You might want to install 'logzero' for nice colored logs!") - formatter = logging.Formatter(fmt=FORMAT_NOCOLOR, datefmt=DATEFMT) - use_logzero = False logger.addFilter(AddExceptionTraceback()) if use_logzero and not COLLAPSE_DEBUG_LOGS: # all set, nothing to do # 'simple' setup - logzero.setup_logger(logger.name, level=lvl, formatter=formatter) + logzero.setup_logger(logger.name, level=lvl, formatter=formatter) # type: ignore[possibly-undefined] return h = CollapseDebugHandler() if COLLAPSE_DEBUG_LOGS else logging.StreamHandler() @@ -101,7 +102,7 @@ class LazyLogger(logging.Logger): # oh god.. otherwise might go into an inf loop if not hasattr(logger, _init_done): setattr(logger, _init_done, False) # will setup on the first call - logger.isEnabledFor = isEnabledFor_lazyinit # type: ignore[assignment] + logger.isEnabledFor = isEnabledFor_lazyinit # type: ignore[method-assign] return cast(LazyLogger, logger) diff --git a/my/core/pandas.py b/my/core/pandas.py index 8ccacd2..ee4bcff 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -15,7 +15,7 @@ logger = LazyLogger(__name__) if TYPE_CHECKING: # this is kinda pointless at the moment, but handy to annotate DF returning methods now # later will be unignored when they implement type annotations - import pandas as pd # type: ignore + import pandas as pd # DataFrameT = pd.DataFrame # TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions.. 
# ideally want to use a type that's like Any but doesn't allow arbitrary coercions?? @@ -26,7 +26,7 @@ else: def check_dateish(s) -> Iterable[str]: - import pandas as pd # type: ignore # noqa: F811 not actually a redefinition + import pandas as pd # noqa: F811 not actually a redefinition ctype = s.dtype if str(ctype).startswith('datetime64'): return diff --git a/my/core/preinit.py b/my/core/preinit.py index c05ee40..9d6b374 100644 --- a/my/core/preinit.py +++ b/my/core/preinit.py @@ -1,7 +1,7 @@ from pathlib import Path def get_mycfg_dir() -> Path: - import appdirs # type: ignore[import] + import appdirs import os # not sure if that's necessary, i.e. could rely on PYTHONPATH instead # on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path? diff --git a/my/core/query.py b/my/core/query.py index 8a497db..f78a1f7 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -495,7 +495,7 @@ Will attempt to call iter() on the value""") unsortable, itr = _handle_unsorted(itr, order_by_chosen, drop_unsorted, wrap_unsorted) # run the sort, with the computed order by function - itr = iter(sorted(itr, key=order_by_chosen, reverse=reverse)) # type: ignore[arg-type, type-var] + itr = iter(sorted(itr, key=order_by_chosen, reverse=reverse)) # type: ignore[arg-type] # re-attach unsortable values to the front/back of the list if reverse: diff --git a/my/core/query_range.py b/my/core/query_range.py index 33eb03c..3fdc12e 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -40,7 +40,7 @@ def parse_timedelta_string(timedelta_str: str) -> timedelta: if parts is None: raise ValueError(f"Could not parse time duration from {timedelta_str}.\nValid examples: '8h', '1w2d8h5m20s', '2m4s'") time_params = {name: float(param) for name, param in parts.groupdict().items() if param} - return timedelta(**time_params) # type: ignore[arg-type] + return timedelta(**time_params) def parse_timedelta_float(timedelta_str: str) -> float: @@ -83,7 +83,7 @@ def parse_datetime_float(date_str: str) -> float: pass try: - import dateparser # type: ignore[import] + import dateparser except ImportError: pass else: diff --git a/my/core/serialize.py b/my/core/serialize.py index ca68fef..1ef7bc0 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -188,8 +188,8 @@ def test_nt_serialize() -> None: # test orjson option kwarg data = {datetime.date(year=1970, month=1, day=1): 5} - res = jsn.loads(dumps(data, option=orjson.OPT_NON_STR_KEYS)) - assert res == {'1970-01-01': 5} + res2 = jsn.loads(dumps(data, option=orjson.OPT_NON_STR_KEYS)) + assert res2 == {'1970-01-01': 5} def test_default_serializer() -> None: diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 80dbc3f..e712a77 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -22,7 +22,7 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None: with sqlite3.connect(db) as conn: conn.execute('CREATE TABLE testtable (col)') - import pytest # type: ignore + import pytest with pytest.raises(sqlite3.OperationalError, match='readonly database'): with sqlite_connect_immutable(db) as conn: conn.execute('DROP TABLE testtable') diff --git a/my/core/util.py b/my/core/util.py index f12b578..1ca2de1 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -62,7 +62,7 @@ def _iter_all_importables(pkg: ModuleType) -> Iterable[HPIModule]: _discover_path_importables(Path(p), pkg.__name__) # todo might need to handle __path__ for individual modules too? # not sure why __path__ was duplicated, but it did happen.. 
- for p in set(pkg.__path__) # type: ignore[attr-defined] + for p in set(pkg.__path__) ) diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index 0a1eb73..acaa303 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -133,7 +133,7 @@ def dataframe() -> DataFrameT: dicts.append(d) - import pandas # type: ignore + import pandas return pandas.DataFrame(dicts) diff --git a/my/endomondo.py b/my/endomondo.py index 0fa396f..d314e97 100644 --- a/my/endomondo.py +++ b/my/endomondo.py @@ -66,7 +66,7 @@ def dataframe(defensive: bool=True) -> DataFrameT: # todo check for 'defensive' d = {'error': f'{e} {w}'} yield d - import pandas as pd # type: ignore + import pandas as pd df = pd.DataFrame(it()) # pandas guesses integer, which is pointless for this field (might get coerced to float too) df['id'] = df['id'].astype(str) diff --git a/my/github/gdpr.py b/my/github/gdpr.py index c41fb6c..6f7efe4 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -133,7 +133,7 @@ def _parse_repository(d: Dict) -> Event: rt = d['type'] assert url.startswith(pref); name = url[len(pref):] eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None) - return Event( # type: ignore[misc] + return Event( **_parse_common(d), summary='created ' + name, eid=eid, @@ -143,7 +143,7 @@ def _parse_repository(d: Dict) -> Event: def _parse_issue_comment(d: Dict) -> Event: url = d['url'] is_bot = "[bot]" in d["user"] - return Event( # type: ignore[misc] + return Event( **_parse_common(d), summary=f'commented on issue {url}', eid='issue_comment_' + url, @@ -155,7 +155,7 @@ def _parse_issue(d: Dict) -> Event: url = d['url'] title = d['title'] is_bot = "[bot]" in d["user"] - return Event( # type: ignore[misc] + return Event( **_parse_common(d), summary=f'opened issue {title}', eid='issue_comment_' + url, @@ -168,7 +168,7 @@ def _parse_pull_request(d: Dict) -> Event: url = d['url'] title = d['title'] is_bot = "[bot]" in d["user"] - return Event( # type: ignore[misc] + return Event( **_parse_common(d), # TODO distinguish incoming/outgoing? # TODO action? opened/closed?? 
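The `[misc]` ignores being dropped in this file all guard the same pattern: unpacking the `Dict` returned by `_parse_common` into the `Event` constructor alongside explicit keyword arguments, a call the previous mypy setup flagged with `[misc]`. A reduced sketch of the pattern (simplified `Event` with hypothetical fields, not the real definition):

```python
from typing import Any, Dict, NamedTuple

class Event(NamedTuple):
    dts: str
    summary: str
    eid: str

def _parse_common(d: Dict[str, Any]) -> Dict[str, Any]:
    # fields shared by every event type in the export
    return {'dts': d['created_at']}

def _parse_release(d: Dict[str, Any]) -> Event:
    tag = d['tag_name']
    # previously needed '# type: ignore[misc]'; type-checks cleanly under the updated config
    return Event(**_parse_common(d), summary=f'released {tag}', eid='release_' + tag)
```
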
@@ -195,7 +195,7 @@ def _parse_project(d: Dict) -> Event: def _parse_release(d: Dict) -> Event: tag = d['tag_name'] - return Event( # type: ignore[misc] + return Event( **_parse_common(d), summary=f'released {tag}', eid='release_' + tag, @@ -204,7 +204,7 @@ def _parse_release(d: Dict) -> Event: def _parse_commit_comment(d: Dict) -> Event: url = d['url'] - return Event( # type: ignore[misc] + return Event( **_parse_common(d), summary=f'commented on {url}', eid='commit_comment_' + url, diff --git a/my/location/fallback/via_ip.py b/my/location/fallback/via_ip.py index 303074f..f637552 100644 --- a/my/location/fallback/via_ip.py +++ b/my/location/fallback/via_ip.py @@ -71,7 +71,7 @@ def estimate_location(dt: DateExact) -> Iterator[FallbackLocation]: # search to find the first possible location which contains dt (something that started up to # config.for_duration ago, and ends after dt) - idx = bisect_left(fl, dt_ts - config.for_duration.total_seconds(), key=lambda l: l.dt.timestamp()) # type: ignore[operator,call-arg,type-var] + idx = bisect_left(fl, dt_ts - config.for_duration.total_seconds(), key=lambda l: l.dt.timestamp()) # all items are before the given dt if idx == len(fl): diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py index 46fc381..17f828f 100644 --- a/my/location/gpslogger.py +++ b/my/location/gpslogger.py @@ -22,7 +22,7 @@ from datetime import datetime, timezone from pathlib import Path from typing import Iterator, Sequence, List -import gpxpy # type: ignore[import] +import gpxpy from more_itertools import unique_everseen from my.core import Stats, LazyLogger diff --git a/my/orgmode.py b/my/orgmode.py index d6d31d2..bb186d1 100644 --- a/my/orgmode.py +++ b/my/orgmode.py @@ -36,7 +36,7 @@ _rgx = re.compile(orgparse.date.gene_timestamp_regex(brtype='inactive'), re.VERB def _created(n: orgparse.OrgNode) -> Tuple[Optional[datetime], str]: heading = n.heading # meh.. support in orgparse? - pp = {} if n.is_root() else n.properties # type: ignore + pp = {} if n.is_root() else n.properties createds = pp.get('CREATED', None) if createds is None: # try to guess from heading diff --git a/my/photos/utils.py b/my/photos/utils.py index 8c16dc5..c614c4a 100644 --- a/my/photos/utils.py +++ b/my/photos/utils.py @@ -1,8 +1,8 @@ from pathlib import Path from typing import Dict -import PIL.Image # type: ignore -from PIL.ExifTags import TAGS, GPSTAGS # type: ignore +import PIL.Image +from PIL.ExifTags import TAGS, GPSTAGS Exif = Dict diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index a8ce651..b1f9e3b 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -224,9 +224,9 @@ def events(*args, **kwargs) -> List[Event]: inp = inputs() # 2.2s for 300 files without cachew # 0.2s for 300 files with cachew - evit = _get_events(inp, *args, **kwargs) # type: ignore[call-arg] + evit = _get_events(inp, *args, **kwargs) # todo mypy is confused here and thinks it's iterable of Path? perhaps something to do with mcachew? - return list(sorted(evit, key=lambda e: e.cmp_key)) # type: ignore[attr-defined,arg-type] + return list(sorted(evit, key=lambda e: e.cmp_key)) def stats() -> Stats: diff --git a/my/rss/feedbin.py b/my/rss/feedbin.py index 4cd1b8d..8ba25b8 100644 --- a/my/rss/feedbin.py +++ b/my/rss/feedbin.py @@ -33,7 +33,7 @@ from typing import Iterable from .common import SubscriptionState def states() -> Iterable[SubscriptionState]: # meh - from dateutil.parser import isoparse # type: ignore + from dateutil.parser import isoparse for f in inputs(): # TODO ugh. 
depends on my naming. not sure if useful? dts = f.stem.split('_')[-1] diff --git a/my/runnerup.py b/my/runnerup.py index 6140236..1f20525 100644 --- a/my/runnerup.py +++ b/my/runnerup.py @@ -78,7 +78,7 @@ def dataframe() -> DataFrameT: yield error_to_row(w) else: yield w - import pandas as pd # type: ignore + import pandas as pd df = pd.DataFrame(it()) if 'error' not in df: df['error'] = None diff --git a/my/smscalls.py b/my/smscalls.py index c383a36..dbcf8b2 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -22,7 +22,7 @@ from datetime import datetime, timezone from pathlib import Path from typing import NamedTuple, Iterator, Set, Tuple, Optional -from lxml import etree # type: ignore +from lxml import etree from my.core.common import get_files, Stats from my.core.error import Res diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 7716be0..1ed1ba7 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -65,7 +65,7 @@ logger = LazyLogger(__name__, level='warning') def _timezone_finder(fast: bool) -> Any: if fast: # less precise, but faster - from timezonefinder import TimezoneFinderL as Finder # type: ignore + from timezonefinder import TimezoneFinderL as Finder else: from timezonefinder import TimezoneFinder as Finder # type: ignore return Finder(in_memory=True) @@ -158,7 +158,7 @@ def _iter_local_dates_fallback() -> Iterator[DayWithZone]: def most_common(lst: List[DayWithZone]) -> DayWithZone: - res, _ = Counter(lst).most_common(1)[0] # type: ignore[var-annotated] + res, _ = Counter(lst).most_common(1)[0] return res diff --git a/my/twitter/archive.py b/my/twitter/archive.py index bdd1497..3f56fa0 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -12,7 +12,7 @@ except ImportError as ie: # must be caused by something else raise ie try: - from my.config import twitter as user_config # type: ignore[misc,assignment] + from my.config import twitter as user_config # type: ignore[assignment] except ImportError: raise ie # raise the original exception.. must be something else else: diff --git a/mypy.ini b/mypy.ini index bc85b74..ebc81a5 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,9 +1,19 @@ [mypy] +namespace_packages = True pretty = True show_error_context = True show_error_codes = True +show_column_numbers = True +show_error_end = True +warn_unused_ignores = True check_untyped_defs = True -namespace_packages = True +enable_error_code = possibly-undefined +strict_equality = True + +# a bit annoying, it has optional ipython import which should be ignored in mypy-core configuration.. +[mypy-my.core.__main__] +warn_unused_ignores = False + # todo ok, maybe it wasn't such a good idea.. 
# mainly because then tox picks it up and running against the user config, not the repository config # mypy_path=~/.config/my diff --git a/tests/calendar.py b/tests/calendar.py index f897efe..3435da3 100644 --- a/tests/calendar.py +++ b/tests/calendar.py @@ -1,6 +1,6 @@ from pathlib import Path -import pytest # type: ignore +import pytest from my.calendar.holidays import is_holiday diff --git a/tests/config.py b/tests/config.py index cef3787..e69f726 100644 --- a/tests/config.py +++ b/tests/config.py @@ -23,7 +23,7 @@ def test_dynamic_configuration(notes: Path) -> None: 0.0, ] -import pytest # type: ignore +import pytest def test_environment_variable(tmp_path: Path) -> None: diff --git a/tests/conftest.py b/tests/conftest.py index 334cd19..4e67f71 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,4 @@ -import pytest # type: ignore +import pytest # I guess makes sense by default @pytest.fixture(autouse=True) diff --git a/tests/core/test_kompress.py b/tests/core/test_kompress.py index 0e7d71b..1f37c34 100644 --- a/tests/core/test_kompress.py +++ b/tests/core/test_kompress.py @@ -6,7 +6,7 @@ import zipfile from my.core.kompress import kopen, kexists, CPath -import pytest # type: ignore +import pytest structure_data: Path = Path(__file__).parent / "structure_data" diff --git a/tests/demo.py b/tests/demo.py index 6ac937c..73a6c65 100644 --- a/tests/demo.py +++ b/tests/demo.py @@ -38,7 +38,7 @@ def test_dynamic_config_2(tmp_path: Path) -> None: assert item1.username == 'user2' -import pytest # type: ignore +import pytest @pytest.mark.skip(reason="won't work at the moment because of inheritance") def test_dynamic_config_simplenamespace(tmp_path: Path) -> None: diff --git a/tests/extra/polar.py b/tests/extra/polar.py index 1091f2a..b2bc562 100644 --- a/tests/extra/polar.py +++ b/tests/extra/polar.py @@ -7,7 +7,7 @@ ROOT = Path(__file__).parent.absolute() OUTPUTS = ROOT / 'outputs' -import pytest # type: ignore +import pytest def test_hpi(prepare: str) -> None: @@ -19,7 +19,7 @@ def test_orger(prepare: str, tmp_path: Path) -> None: om = import_file(ROOT / 'orger/modules/polar.py') # reload(om) - pv = om.PolarView() # type: ignore + pv = om.PolarView() # TODO hmm. worth making public? 
OUTPUTS.mkdir(exist_ok=True) out = OUTPUTS / (get_valid_filename(prepare) + '.org') diff --git a/tests/get_files.py b/tests/get_files.py index a81b34f..daeef71 100644 --- a/tests/get_files.py +++ b/tests/get_files.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING from my.core.compat import windows from my.core.common import get_files -import pytest # type: ignore +import pytest # hack to replace all /tmp with 'real' tmp dir diff --git a/tests/location.py b/tests/location.py index c47849e..2597d5e 100644 --- a/tests/location.py +++ b/tests/location.py @@ -1,6 +1,6 @@ from pathlib import Path -import pytest # type: ignore +import pytest def test() -> None: diff --git a/tests/pdfs.py b/tests/pdfs.py index 343a209..1c5eab8 100644 --- a/tests/pdfs.py +++ b/tests/pdfs.py @@ -49,7 +49,7 @@ def with_config(): import my.core.cfg as C with C.tmp_config() as config: - config.pdfs = user_config # type: ignore + config.pdfs = user_config try: yield finally: diff --git a/tests/reddit.py b/tests/reddit.py index d18b18d..6e3e65e 100644 --- a/tests/reddit.py +++ b/tests/reddit.py @@ -64,7 +64,7 @@ def test_preserves_extra_attr() -> None: assert isinstance(getattr(config, 'please_keep_me'), str) -import pytest # type: ignore +import pytest @pytest.fixture(autouse=True, scope='module') def prepare(): from .common import testdata diff --git a/tests/takeout.py b/tests/takeout.py index 7cc2164..a40e218 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -18,7 +18,7 @@ def test_location_perf() -> None: # in theory should support any HTML takeout file? # although IIRC bookmarks and search-history.html weren't working -import pytest # type: ignore +import pytest @pytest.mark.parametrize( 'path', [ 'YouTube/history/watch-history.html', diff --git a/tests/test_tmp_config.py b/tests/test_tmp_config.py index 197d3f7..b22f9cf 100644 --- a/tests/test_tmp_config.py +++ b/tests/test_tmp_config.py @@ -10,7 +10,7 @@ def _init_default_config() -> None: import my.config class default_config: count = 5 - my.config.simple = default_config # type: ignore[attr-defined,assignment,misc] + my.config.simple = default_config # type: ignore[assignment,misc] def test_tmp_config() -> None: diff --git a/tests/tz.py b/tests/tz.py index f2498a2..d86c5cb 100644 --- a/tests/tz.py +++ b/tests/tz.py @@ -2,8 +2,8 @@ import sys from datetime import datetime, timedelta from pathlib import Path -import pytest # type: ignore -import pytz # type: ignore +import pytest +import pytz from my.core.error import notnone From fe88380499e16b6412b08e96a426a8d3c4c52adb Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 16 May 2023 01:01:48 +0100 Subject: [PATCH 112/302] general: switch to using native 3.8 versions for cached_property/Literal/Protocol instead of compat --- my/coding/codeforces.py | 5 +-- my/coding/topcoder.py | 5 +-- my/config.py | 2 +- my/core/common.py | 10 ++--- my/core/compat.py | 68 ++++---------------------------- my/core/dataset.py | 3 +- my/core/error.py | 4 +- my/core/pandas.py | 4 +- my/core/sqlite.py | 6 +-- my/fbmessenger/common.py | 3 +- my/instagram/common.py | 3 +- my/location/common.py | 3 +- my/location/fallback/via_home.py | 2 +- my/reddit/common.py | 3 +- my/rtm.py | 4 +- my/twitter/archive.py | 2 +- 16 files changed, 29 insertions(+), 98 deletions(-) diff --git a/my/coding/codeforces.py b/my/coding/codeforces.py index a4c7de2..a8b0f65 100644 --- a/my/coding/codeforces.py +++ b/my/coding/codeforces.py @@ -3,13 +3,12 @@ from my.config import codeforces as config # type: ignore[attr-defined] from datetime import 
datetime, timezone -from typing import NamedTuple +from functools import cached_property import json -from typing import Dict, Iterator +from typing import NamedTuple, Dict, Iterator from ..core import get_files, Res, unwrap -from ..core.compat import cached_property from ..core.konsume import ignore, wrap diff --git a/my/coding/topcoder.py b/my/coding/topcoder.py index 32a9ff8..96bcdf7 100644 --- a/my/coding/topcoder.py +++ b/my/coding/topcoder.py @@ -3,13 +3,12 @@ from my.config import topcoder as config # type: ignore[attr-defined] from datetime import datetime -from typing import NamedTuple +from functools import cached_property import json -from typing import Dict, Iterator +from typing import NamedTuple, Dict, Iterator from ..core import get_files, Res, unwrap, Json -from ..core.compat import cached_property from ..core.error import Res, unwrap from ..core.konsume import zoom, wrap, ignore diff --git a/my/config.py b/my/config.py index 8d958f1..a59eadd 100644 --- a/my/config.py +++ b/my/config.py @@ -98,7 +98,7 @@ class location: accuracy: float = 100 -from my.core.compat import Literal +from typing import Literal class time: class tz: policy: Literal['keep', 'convert', 'throw'] diff --git a/my/core/common.py b/my/core/common.py index 0b3dc1e..359f451 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -366,10 +366,6 @@ def isoparse(s: str) -> tzdatetime: return datetime.fromisoformat(s) -# legacy import -- we should use compat directly instead -from .compat import Literal - - import re # https://stackoverflow.com/a/295466/706389 def get_valid_filename(s: str) -> str: @@ -664,5 +660,7 @@ def assert_never(value: NoReturn) -> NoReturn: assert False, f'Unhandled value: {value} ({type(value).__name__})' -# legacy deprecated import -from .compat import cached_property as cproperty +## legacy imports, keeping them here for backwards compatibility +from functools import cached_property as cproperty +from typing import Literal +## \ No newline at end of file diff --git a/my/core/compat.py b/my/core/compat.py index 0b47bdd..d7937f9 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -3,6 +3,7 @@ Some backwards compatibility stuff/deprecation helpers ''' import sys from types import ModuleType +from typing import TYPE_CHECKING from . 
import warnings from .common import LazyLogger @@ -53,20 +54,10 @@ import os windows = os.name == 'nt' +# keeping just for backwards compatibility, used to have compat implementation for 3.6 import sqlite3 def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwargs) -> None: - if sys.version_info[:2] >= (3, 7): - source.backup(dest, **kwargs) - else: - # https://stackoverflow.com/a/10856450/706389 - import io - tempfile = io.StringIO() - for line in source.iterdump(): - tempfile.write('%s\n' % line) - tempfile.seek(0) - - dest.cursor().executescript(tempfile.read()) - dest.commit() + source.backup(dest, **kwargs) # can remove after python3.9 @@ -76,55 +67,10 @@ def removeprefix(text: str, prefix: str) -> str: return text -# can remove after python3.8 -if sys.version_info[:2] >= (3, 8): - from functools import cached_property -else: - from typing import TypeVar, Callable - Cl = TypeVar('Cl') - R = TypeVar('R') - - def cached_property(f: Callable[[Cl], R]) -> R: - import functools - return property(functools.lru_cache(maxsize=1)(f)) - del Cl - del R - - -from typing import TYPE_CHECKING - - -if sys.version_info[:2] >= (3, 8): - from typing import Literal -else: - if TYPE_CHECKING: - from typing_extensions import Literal - else: - # erm.. I guess as long as it's not crashing, whatever... - class _Literal: - def __getitem__(self, args): - pass - Literal = _Literal() - - -if sys.version_info[:2] >= (3, 8): - from typing import Protocol -else: - if TYPE_CHECKING: - from typing_extensions import Protocol - else: - # todo could also use NamedTuple? - Protocol = object - - -if sys.version_info[:2] >= (3, 8): - from typing import TypedDict -else: - if TYPE_CHECKING: - from typing_extensions import TypedDict - else: - from typing import Dict - TypedDict = Dict +## used to have compat function before 3.8 for these +from functools import cached_property +from typing import Literal, Protocol, TypedDict +## if sys.version_info[:2] >= (3, 10): diff --git a/my/core/dataset.py b/my/core/dataset.py index 070b9b3..31de4f4 100644 --- a/my/core/dataset.py +++ b/my/core/dataset.py @@ -2,11 +2,10 @@ from __future__ import annotations from .common import assert_subpackage; assert_subpackage(__name__) from .common import PathIsh -from .compat import Protocol from .sqlite import sqlite_connect_immutable ## sadly dataset doesn't have any type definitions -from typing import Iterable, Iterator, Dict, Optional, Any +from typing import Iterable, Iterator, Dict, Optional, Any, Protocol from contextlib import AbstractContextManager diff --git a/my/core/error.py b/my/core/error.py index 236bd30..e1737c1 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -4,9 +4,7 @@ See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail """ from itertools import tee -from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast, Iterator - -from .compat import Literal +from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast, Iterator, Literal T = TypeVar('T') diff --git a/my/core/pandas.py b/my/core/pandas.py index ee4bcff..4ce62fe 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -5,7 +5,7 @@ Various pandas helpers and convenience functions # NOTE: this file is meant to be importable without Pandas installed from datetime import datetime from pprint import pformat -from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, Dict +from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, Dict, Literal from 
. import warnings, Res from .common import LazyLogger, Json, asdict @@ -45,8 +45,6 @@ def check_dateish(s) -> Iterable[str]: '''.strip() -from .compat import Literal - ErrorColPolicy = Literal[ 'add_if_missing', # add error column if it's missing 'warn' , # warn, but do not modify diff --git a/my/core/sqlite.py b/my/core/sqlite.py index e712a77..e04f6fc 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -6,11 +6,10 @@ from pathlib import Path import shutil import sqlite3 from tempfile import TemporaryDirectory -from typing import Tuple, Any, Iterator, Callable, Optional, Union +from typing import Tuple, Any, Iterator, Callable, Optional, Union, Literal from .common import PathIsh, assert_never -from .compat import Literal def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection: @@ -86,8 +85,7 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: for p in tocopy: shutil.copy(p, tdir / p.name) with sqlite3.connect(str(tdir / dp.name)) as conn: - from .compat import sqlite_backup - sqlite_backup(source=conn, dest=dest) + conn.backup(target=dest) conn.close() return dest diff --git a/my/fbmessenger/common.py b/my/fbmessenger/common.py index a498952..33d1b20 100644 --- a/my/fbmessenger/common.py +++ b/my/fbmessenger/common.py @@ -1,8 +1,7 @@ from my.core import __NOT_HPI_MODULE__ -from typing import Iterator, Optional +from typing import Iterator, Optional, Protocol -from my.core.compat import Protocol from my.core import datetime_aware diff --git a/my/instagram/common.py b/my/instagram/common.py index a172ac8..4df07a1 100644 --- a/my/instagram/common.py +++ b/my/instagram/common.py @@ -1,10 +1,9 @@ from dataclasses import replace from datetime import datetime from itertools import chain -from typing import Iterator, Dict, Any +from typing import Iterator, Dict, Any, Protocol from my.core import warn_if_empty, Res -from my.core.compat import Protocol class User(Protocol): diff --git a/my/location/common.py b/my/location/common.py index 5c03d5e..7824bef 100644 --- a/my/location/common.py +++ b/my/location/common.py @@ -1,9 +1,8 @@ from datetime import date, datetime -from typing import Union, Tuple, Optional, Iterable, TextIO, Iterator +from typing import Union, Tuple, Optional, Iterable, TextIO, Iterator, Protocol from dataclasses import dataclass from my.core import __NOT_HPI_MODULE__ -from my.core.compat import Protocol DateIsh = Union[datetime, date, str] diff --git a/my/location/fallback/via_home.py b/my/location/fallback/via_home.py index 240da84..590c028 100644 --- a/my/location/fallback/via_home.py +++ b/my/location/fallback/via_home.py @@ -73,7 +73,7 @@ def get_location(dt: datetime) -> LatLon: return loc[0].lat, loc[0].lon -# TODO: in python3.9, use functools.cached_property instead? +# TODO: in python3.8, use functools.cached_property instead? 
@lru_cache(maxsize=None) def homes_cached() -> List[Tuple[datetime, LatLon]]: return list(config._history) diff --git a/my/reddit/common.py b/my/reddit/common.py index d33e02b..c01258b 100644 --- a/my/reddit/common.py +++ b/my/reddit/common.py @@ -4,10 +4,9 @@ type of shared models have a standardized interface """ from my.core import __NOT_HPI_MODULE__ -from typing import Set, Iterator +from typing import Set, Iterator, Protocol from itertools import chain -from my.core.compat import Protocol from my.core import datetime_aware, Json diff --git a/my/rtm.py b/my/rtm.py index b4fc7a9..8d41e7a 100644 --- a/my/rtm.py +++ b/my/rtm.py @@ -6,12 +6,12 @@ REQUIRES = [ 'icalendar', ] +from datetime import datetime +from functools import cached_property import re from typing import Dict, List, Iterator -from datetime import datetime from .core.common import LazyLogger, get_files, group_by_key, make_dict -from .core.compat import cached_property from my.config import rtm as config diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 3f56fa0..d9ba562 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -22,9 +22,9 @@ except ImportError as ie: from dataclasses import dataclass +from functools import cached_property import html from ..core.common import Paths, datetime_aware -from ..core.compat import cached_property from ..core.error import Res from ..core.kompress import ZipPath From a98bc6dacafd6df3e23aa0ae3147f167bac75a3f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 18 May 2023 02:17:59 +0100 Subject: [PATCH 113/302] my.core.pandas: rely on typing annotations from types-pandas --- my/core/pandas.py | 91 +++++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/my/core/pandas.py b/my/core/pandas.py index 4ce62fe..efd8c48 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -1,32 +1,46 @@ ''' Various pandas helpers and convenience functions ''' +from __future__ import annotations + # todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential # NOTE: this file is meant to be importable without Pandas installed -from datetime import datetime +import dataclasses +from datetime import datetime, timezone from pprint import pformat -from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, Dict, Literal +from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, TypeVar + +from decorator import decorator + from . import warnings, Res from .common import LazyLogger, Json, asdict +from .error import error_to_json, extract_error_datetime + logger = LazyLogger(__name__) if TYPE_CHECKING: - # this is kinda pointless at the moment, but handy to annotate DF returning methods now - # later will be unignored when they implement type annotations import pandas as pd - # DataFrameT = pd.DataFrame - # TODO ugh. pretty annoying, having any is not very useful since it would allow arbitrary coercions.. - # ideally want to use a type that's like Any but doesn't allow arbitrary coercions?? - DataFrameT = Any + + DataFrameT = pd.DataFrame + SeriesT = pd.Series + from pandas._typing import S1 # meh + + FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT]) + # huh interesting -- with from __future__ import annotations don't even need else clause here? + # but still if other modules import these we do need some fake runtime types here.. 
else: - # in runtime, make it defensive so it works without pandas + from typing import Optional + DataFrameT = Any + SeriesT = Optional # just some type with one argument + S1 = Any -def check_dateish(s) -> Iterable[str]: +def check_dateish(s: SeriesT[S1]) -> Iterable[str]: import pandas as pd # noqa: F811 not actually a redefinition + ctype = s.dtype if str(ctype).startswith('datetime64'): return @@ -35,7 +49,7 @@ def check_dateish(s) -> Iterable[str]: return all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all() if not all_timestamps: - return # not sure why it would happen, but ok + return # not sure why it would happen, but ok tzs = s.map(lambda x: x.tzinfo).drop_duplicates() examples = s[tzs.index] # todo not so sure this warning is that useful... except for stuff without tz @@ -45,11 +59,22 @@ def check_dateish(s) -> Iterable[str]: '''.strip() +def test_check_dateish() -> None: + import pandas as pd + + # todo just a dummy test to check it doesn't crash, need something meaningful + s1 = pd.Series([1, 2, 3]) + list(check_dateish(s1)) + + +# fmt: off ErrorColPolicy = Literal[ - 'add_if_missing', # add error column if it's missing - 'warn' , # warn, but do not modify - 'ignore' , # no warnings + 'add_if_missing', # add error column if it's missing + 'warn' , # warn, but do not modify + 'ignore' , # no warnings ] +# fmt: on + def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]: if 'error' in df: @@ -69,18 +94,14 @@ No 'error' column detected. You probably forgot to handle errors defensively, wh yield wmsg -from typing import Any, Callable, TypeVar -FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT]) - -# TODO ugh. typing this is a mess... should I use mypy_extensions.VarArg/KwArgs?? or what?? -from decorator import decorator +# TODO ugh. typing this is a mess... perhaps should use .compat.ParamSpec? @decorator -def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', *args, **kwargs) -> DataFrameT: - df = f(*args, **kwargs) +def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing', *args, **kwargs) -> DataFrameT: + df: DataFrameT = f(*args, **kwargs) tag = '{f.__module__}:{f.__name__}' # makes sense to keep super defensive try: - for col, data in df.reset_index().iteritems(): + for col, data in df.reset_index().items(): for w in check_dateish(data): warnings.low(f"{tag}, column '{col}': {w}") except Exception as e: @@ -92,11 +113,11 @@ def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy='add_if_missing', logger.exception(e) return df + # todo doctor: could have a suggesion to wrap dataframes with it?? discover by return type? -def error_to_row(e: Exception, *, dt_col: str='dt', tz=None) -> Json: - from .error import error_to_json, extract_error_datetime +def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None) -> Json: edt = extract_error_datetime(e) if edt is not None and edt.tzinfo is None and tz is not None: edt = edt.replace(tzinfo=tz) @@ -118,11 +139,11 @@ def to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]: # no type for dataclass? Schema = Any + def _as_columns(s: Schema) -> Dict[str, Type]: # todo would be nice to extract properties; add tests for this as well - import dataclasses as D - if D.is_dataclass(s): - return {f.name: f.type for f in D.fields(s)} + if dataclasses.is_dataclass(s): + return {f.name: f.type for f in dataclasses.fields(s)} # else must be NamedTuple?? # todo assert my.core.common.is_namedtuple? 
return getattr(s, '_field_types') @@ -130,7 +151,7 @@ def _as_columns(s: Schema) -> Dict[str, Type]: # todo add proper types @check_dataframe -def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataFrameT: +def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFrameT: # todo warn if schema isn't specified? # ok nice supports dataframe/NT natively # https://github.com/pandas-dev/pandas/pull/27999 @@ -138,27 +159,27 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataF # https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562 # same for NamedTuple -- seems that it takes whatever schema the first NT has # so we need to convert each individually... sigh - import pandas as pd # noqa: F811 not actually a redefinition + import pandas as pd # noqa: F811 not actually a redefinition + columns = None if schema is None else list(_as_columns(schema).keys()) return pd.DataFrame(to_jsons(it), columns=columns) def test_as_dataframe() -> None: import pytest + it = (dict(i=i, s=f'str{i}') for i in range(10)) with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings: # noqa: F841 - df = as_dataframe(it) + df: DataFrameT = as_dataframe(it) # todo test other error col policies assert list(df.columns) == ['i', 's', 'error'] assert len(as_dataframe([])) == 0 - from dataclasses import dataclass - - @dataclass + @dataclasses.dataclass class X: x: int # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable - df = as_dataframe([], schema=X) - assert list(df.columns) == ['x', 'error'] + df2: DataFrameT = as_dataframe([], schema=X) + assert list(df2.columns) == ['x', 'error'] From 04d976f93771c617241d19ee70d9a0bf9a1ce932 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 24 May 2023 22:13:00 +0100 Subject: [PATCH 114/302] my/core/pandas tests: fix weird pytest error when constructing dataclass inside a def can quickly reproduce by running pytest tests/tz.py tests/core/test_pandas.py possibly will be resolved after fix in pytest? see https://github.com/pytest-dev/pytest/issues/7856 --- my/core/pandas.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/my/core/pandas.py b/my/core/pandas.py index efd8c48..1b7a644 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -165,6 +165,16 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFr return pd.DataFrame(to_jsons(it), columns=columns) +# ugh. in principle this could be inside the test +# might be due to use of from __future__ import annotations +# can quickly reproduce by running pytest tests/tz.py tests/core/test_pandas.py +# possibly will be resolved after fix in pytest? 
+# see https://github.com/pytest-dev/pytest/issues/7856 +@dataclasses.dataclass +class _X: + x: int + + def test_as_dataframe() -> None: import pytest @@ -176,10 +186,6 @@ def test_as_dataframe() -> None: assert len(as_dataframe([])) == 0 - @dataclasses.dataclass - class X: - x: int - # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable - df2: DataFrameT = as_dataframe([], schema=X) + df2: DataFrameT = as_dataframe([], schema=_X) assert list(df2.columns) == ['x', 'error'] From 9594caa1cde1be0ed8fcaeab6fa63f7a15d63b4f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 24 May 2023 23:39:21 +0100 Subject: [PATCH 115/302] general: move most core tests inside my.core.tests package - distributes tests alongside the package, might be convenient for package users - removes some weird indirection (e.g. dummy test files importing tests from modules) - makes the command line for tests cleaner (e.g. no need to remember to manually add files to tox.ini) - tests automatically covered by mypy (so makes mypy runs cleaner and ultimately better coverage) The (vague) convention is - tests/somemodule.py -- testing my.core.somemodule, contains tests directly related to it - tests/test_something.py -- testing a specific feature, e.g. test_get_files.py tests get_files method only --- my/core/kompress.py | 4 +- my/core/tests/__init__.py | 3 ++ .../core/tests/denylist.py | 8 ++-- .../core/tests/kompress.py | 27 ++++++------ {tests => my/core/tests}/sqlite.py | 9 ++-- .../core/tests/structure.py | 19 +++------ .../tests}/structure_data/gdpr_export.zip | Bin .../broken_export/comments/comments.json | 0 .../broken_export/messages/index.csv | 0 .../gdpr_export/comments/comments.json | 0 .../gdpr_export/messages/index.csv | 0 .../gdpr_export/profile/settings.json | 0 tests/cli.py => my/core/tests/test_cli.py | 4 +- .../core/tests/test_get_files.py | 40 +++++++++--------- {tests => my/core/tests}/test_tmp_config.py | 10 ++--- tests/core.py | 25 ----------- tests/core/test_pandas.py | 1 - tox.ini | 29 ++++++++----- 18 files changed, 77 insertions(+), 102 deletions(-) create mode 100644 my/core/tests/__init__.py rename tests/core/test_denylist.py => my/core/tests/denylist.py (98%) rename tests/core/test_kompress.py => my/core/tests/kompress.py (91%) rename {tests => my/core/tests}/sqlite.py (91%) rename tests/core/test_structure.py => my/core/tests/structure.py (71%) rename {tests/core => my/core/tests}/structure_data/gdpr_export.zip (100%) rename {tests/core => my/core/tests}/structure_data/gdpr_subdirs/broken_export/comments/comments.json (100%) rename {tests/core => my/core/tests}/structure_data/gdpr_subdirs/broken_export/messages/index.csv (100%) rename {tests/core => my/core/tests}/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json (100%) rename {tests/core => my/core/tests}/structure_data/gdpr_subdirs/gdpr_export/messages/index.csv (100%) rename {tests/core => my/core/tests}/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json (100%) rename tests/cli.py => my/core/tests/test_cli.py (85%) rename tests/get_files.py => my/core/tests/test_get_files.py (85%) rename {tests => my/core/tests}/test_tmp_config.py (88%) delete mode 100644 tests/core.py delete mode 100644 tests/core/test_pandas.py diff --git a/my/core/kompress.py b/my/core/kompress.py index 8ee1cfa..1f00013 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -3,13 +3,13 @@ Various helpers for compression """ from __future__ import annotations -from functools import total_ordering from 
datetime import datetime +from functools import total_ordering +import io import pathlib from pathlib import Path import sys from typing import Union, IO, Sequence, Any, Iterator -import io PathIsh = Union[Path, str] diff --git a/my/core/tests/__init__.py b/my/core/tests/__init__.py new file mode 100644 index 0000000..9d38c26 --- /dev/null +++ b/my/core/tests/__init__.py @@ -0,0 +1,3 @@ +# hmm, sadly pytest --import-mode importlib --pyargs my.core.tests doesn't work properly without __init__.py +# although it works if you run either my.core or my.core.tests.sqlite (for example) directly +# so if it gets in the way could get rid of this later? diff --git a/tests/core/test_denylist.py b/my/core/tests/denylist.py similarity index 98% rename from tests/core/test_denylist.py rename to my/core/tests/denylist.py index 4e55a1f..cca757d 100644 --- a/tests/core/test_denylist.py +++ b/my/core/tests/denylist.py @@ -1,11 +1,10 @@ -import warnings - +from datetime import datetime import json from pathlib import Path -from datetime import datetime from typing import NamedTuple, Iterator +import warnings -from my.core.denylist import DenyList +from ..denylist import DenyList class IP(NamedTuple): @@ -30,7 +29,6 @@ def data() -> Iterator[IP]: def test_denylist(tmp_path: Path) -> None: tf = (tmp_path / "denylist.json").absolute() with warnings.catch_warnings(record=True): - # create empty denylist (though file does not have to exist for denylist to work) tf.write_text("[]") diff --git a/tests/core/test_kompress.py b/my/core/tests/kompress.py similarity index 91% rename from tests/core/test_kompress.py rename to my/core/tests/kompress.py index 1f37c34..19c4e82 100644 --- a/tests/core/test_kompress.py +++ b/my/core/tests/kompress.py @@ -1,10 +1,9 @@ -from datetime import datetime -import lzma from pathlib import Path +import lzma import sys import zipfile -from my.core.kompress import kopen, kexists, CPath +from ..kompress import kopen, kexists, CPath, ZipPath import pytest @@ -14,27 +13,31 @@ structure_data: Path = Path(__file__).parent / "structure_data" def test_kopen(tmp_path: Path) -> None: "Plaintext handled transparently" + # fmt: off assert kopen(tmp_path / 'file' ).read() == 'just plaintext' assert kopen(tmp_path / 'file.xz').read() == 'compressed text' + # fmt: on "For zips behaviour is a bit different (not sure about all this, tbh...)" assert kopen(tmp_path / 'file.zip', 'path/in/archive').read() == 'data in zip' -# TODO here? def test_kexists(tmp_path: Path) -> None: # TODO also test top level? + # fmt: off assert kexists(str(tmp_path / 'file.zip'), 'path/in/archive') assert not kexists(str(tmp_path / 'file.zip'), 'path/notin/archive') + # fmt: on # TODO not sure about this? assert not kexists(tmp_path / 'nosuchzip.zip', 'path/in/archive') def test_cpath(tmp_path: Path) -> None: + # fmt: off CPath(str(tmp_path / 'file' )).read_text() == 'just plaintext' CPath( tmp_path / 'file.xz').read_text() == 'compressed text' - # TODO not sure about zip files?? 
+ # fmt: on @pytest.fixture(autouse=True) @@ -51,12 +54,7 @@ def prepare(tmp_path: Path): pass -@pytest.mark.skipif( - sys.version_info[:2] < (3, 8), - reason=f"ZipFile.Path is only available since 3.8", -) def test_zippath() -> None: - from my.core.kompress import ZipPath target = structure_data / 'gdpr_export.zip' assert target.exists(), target # precondition @@ -87,6 +85,7 @@ def test_zippath() -> None: rpaths = [p.relative_to(zp) for p in matched] gdpr_export = Path('gdpr_export') + # fmt: off assert rpaths == [ gdpr_export, gdpr_export / 'comments', @@ -96,7 +95,7 @@ def test_zippath() -> None: gdpr_export / 'messages', gdpr_export / 'messages' / 'index.csv', ], rpaths - + # fmt: on # TODO hmm this doesn't work atm, whereas Path does # not sure if it should be defensive or something... @@ -107,10 +106,12 @@ def test_zippath() -> None: assert (ZipPath(target) / 'gdpr_export' / 'comments').exists() jsons = [p.relative_to(zp / 'gdpr_export') for p in zp.rglob('*.json')] + # fmt: off assert jsons == [ - Path('comments','comments.json'), - Path('profile','settings.json'), + Path('comments', 'comments.json'), + Path('profile' , 'settings.json'), ] + # fmt: on # NOTE: hmm interesting, seems that ZipPath is happy with forward slash regardless OS? assert list(zp.rglob('mes*')) == [ZipPath(target, 'gdpr_export/messages')] diff --git a/tests/sqlite.py b/my/core/tests/sqlite.py similarity index 91% rename from tests/sqlite.py rename to my/core/tests/sqlite.py index f80636e..b3ecffe 100644 --- a/tests/sqlite.py +++ b/my/core/tests/sqlite.py @@ -1,10 +1,10 @@ +from concurrent.futures import ProcessPoolExecutor from pathlib import Path import shutil import sqlite3 from tempfile import TemporaryDirectory - -from my.core.sqlite import sqlite_connect_immutable, sqlite_copy_and_open +from ..sqlite import sqlite_connect_immutable, sqlite_copy_and_open def test_sqlite_read_with_wal(tmp_path: Path) -> None: @@ -27,13 +27,14 @@ def test_sqlite_read_with_wal(tmp_path: Path) -> None: assert len(wals) == 1 ## now run the tests in separate process to ensure there is no potential for reusing sqlite connections or something - from concurrent.futures import ProcessPoolExecutor as Pool - with Pool(1) as pool: + with ProcessPoolExecutor(1) as pool: # merely using it for ctx manager.. 
+ # fmt: off pool.submit(_test_do_copy , db).result() pool.submit(_test_do_immutable , db).result() pool.submit(_test_do_copy_and_open, db).result() pool.submit(_test_open_asis , db).result() + # fmt: on def _test_do_copy(db: Path) -> None: diff --git a/tests/core/test_structure.py b/my/core/tests/structure.py similarity index 71% rename from tests/core/test_structure.py rename to my/core/tests/structure.py index 1ad46fe..beb8e7f 100644 --- a/tests/core/test_structure.py +++ b/my/core/tests/structure.py @@ -1,8 +1,8 @@ -import pytest - from pathlib import Path -from my.core.structure import match_structure +from ..structure import match_structure + +import pytest structure_data: Path = Path(__file__).parent / "structure_data" @@ -16,10 +16,7 @@ def test_gdpr_structure_exists() -> None: def test_gdpr_unzip() -> None: - - with match_structure( - structure_data / "gdpr_export.zip", expected=gdpr_expected - ) as results: + with match_structure(structure_data / "gdpr_export.zip", expected=gdpr_expected) as results: assert len(results) == 1 extracted = results[0] index_file = extracted / "messages" / "index.csv" @@ -31,15 +28,11 @@ def test_gdpr_unzip() -> None: def test_match_partial() -> None: # a partial match should match both the 'broken' and 'gdpr_export' directories - with match_structure( - structure_data / "gdpr_subdirs", expected=gdpr_expected, partial=True - ) as results: + with match_structure(structure_data / "gdpr_subdirs", expected=gdpr_expected, partial=True) as results: assert len(results) == 2 def test_not_directory() -> None: with pytest.raises(NotADirectoryError, match=r"Expected either a zipfile or a directory"): - with match_structure( - structure_data / "messages/index.csv", expected=gdpr_expected - ): + with match_structure(structure_data / "messages/index.csv", expected=gdpr_expected): pass diff --git a/tests/core/structure_data/gdpr_export.zip b/my/core/tests/structure_data/gdpr_export.zip similarity index 100% rename from tests/core/structure_data/gdpr_export.zip rename to my/core/tests/structure_data/gdpr_export.zip diff --git a/tests/core/structure_data/gdpr_subdirs/broken_export/comments/comments.json b/my/core/tests/structure_data/gdpr_subdirs/broken_export/comments/comments.json similarity index 100% rename from tests/core/structure_data/gdpr_subdirs/broken_export/comments/comments.json rename to my/core/tests/structure_data/gdpr_subdirs/broken_export/comments/comments.json diff --git a/tests/core/structure_data/gdpr_subdirs/broken_export/messages/index.csv b/my/core/tests/structure_data/gdpr_subdirs/broken_export/messages/index.csv similarity index 100% rename from tests/core/structure_data/gdpr_subdirs/broken_export/messages/index.csv rename to my/core/tests/structure_data/gdpr_subdirs/broken_export/messages/index.csv diff --git a/tests/core/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json b/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json similarity index 100% rename from tests/core/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json rename to my/core/tests/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json diff --git a/tests/core/structure_data/gdpr_subdirs/gdpr_export/messages/index.csv b/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/messages/index.csv similarity index 100% rename from tests/core/structure_data/gdpr_subdirs/gdpr_export/messages/index.csv rename to my/core/tests/structure_data/gdpr_subdirs/gdpr_export/messages/index.csv diff --git 
a/tests/core/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json b/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json similarity index 100% rename from tests/core/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json rename to my/core/tests/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json diff --git a/tests/cli.py b/my/core/tests/test_cli.py similarity index 85% rename from tests/cli.py rename to my/core/tests/test_cli.py index fce53b7..4d847ae 100644 --- a/tests/cli.py +++ b/my/core/tests/test_cli.py @@ -1,5 +1,7 @@ import os from subprocess import check_call +import sys + def test_lists_modules() -> None: # hack PYTHONUTF8 for windows @@ -11,4 +13,4 @@ def test_lists_modules() -> None: **os.environ, 'PYTHONUTF8': '1', } - check_call(['hpi', 'modules'], env=env) + check_call([sys.executable, '-m', 'my.core', 'modules'], env=env) diff --git a/tests/get_files.py b/my/core/tests/test_get_files.py similarity index 85% rename from tests/get_files.py rename to my/core/tests/test_get_files.py index daeef71..fdec5c0 100644 --- a/tests/get_files.py +++ b/my/core/tests/test_get_files.py @@ -1,30 +1,33 @@ import os from pathlib import Path +import shutil +import tempfile from typing import TYPE_CHECKING -from my.core.compat import windows -from my.core.common import get_files +from ..compat import windows +from ..common import get_files import pytest - # hack to replace all /tmp with 'real' tmp dir - # not ideal, but makes tests more concise +# hack to replace all /tmp with 'real' tmp dir +# not ideal, but makes tests more concise def _get_files(x, *args, **kwargs): - import my.core.common as C + from ..common import get_files as get_files_orig + def repl(x): if isinstance(x, str): return x.replace('/tmp', TMP) elif isinstance(x, Path): - assert x.parts[:2] == (os.sep, 'tmp') # meh + assert x.parts[:2] == (os.sep, 'tmp') # meh return Path(TMP) / Path(*x.parts[2:]) else: # iterable? return [repl(i) for i in x] x = repl(x) - res = C.get_files(x, *args, **kwargs) - return tuple(Path(str(i).replace(TMP, '/tmp')) for i in res) # hack back for asserts.. + res = get_files_orig(x, *args, **kwargs) + return tuple(Path(str(i).replace(TMP, '/tmp')) for i in res) # hack back for asserts.. if not TYPE_CHECKING: @@ -40,7 +43,6 @@ def test_single_file() -> None: with pytest.raises(Exception): get_files('/tmp/hpi_test/file.ext') - create('/tmp/hpi_test/file.ext') ''' @@ -48,16 +50,11 @@ def test_single_file() -> None: 1. Return type is a tuple, it's friendlier for hashing/caching 2. It always return pathlib.Path instead of plain strings ''' - assert get_files('/tmp/hpi_test/file.ext') == ( - Path('/tmp/hpi_test/file.ext'), - ) - + assert get_files('/tmp/hpi_test/file.ext') == (Path('/tmp/hpi_test/file.ext'),) "if the path starts with ~, we expand it" - if not windows: # windows dowsn't have bashrc.. ugh - assert get_files('~/.bashrc') == ( - Path('~').expanduser() / '.bashrc', - ) + if not windows: # windows doesn't have bashrc.. 
ugh + assert get_files('~/.bashrc') == (Path('~').expanduser() / '.bashrc',) def test_multiple_files() -> None: @@ -74,6 +71,7 @@ def test_multiple_files() -> None: create('/tmp/hpi_test/dir3/') create('/tmp/hpi_test/dir3/ttt') + # fmt: off assert get_files([ Path('/tmp/hpi_test/dir3'), # it takes in Path as well as str '/tmp/hpi_test/dir1', @@ -83,6 +81,7 @@ def test_multiple_files() -> None: Path('/tmp/hpi_test/dir1/zzz'), Path('/tmp/hpi_test/dir3/ttt'), ) + # fmt: on def test_explicit_glob() -> None: @@ -130,27 +129,26 @@ def test_no_files() -> None: ''' Test for empty matches. They work, but should result in warning ''' - assert get_files('') == () + assert get_files('') == () # todo test these for warnings? - assert get_files([]) == () + assert get_files([]) == () assert get_files('bad*glob') == () # TODO not sure if should uniquify if the filenames end up same? # TODO not sure about the symlinks? and hidden files? -import tempfile TMP = tempfile.gettempdir() test_path = Path(TMP) / 'hpi_test' + def setup(): teardown() test_path.mkdir() def teardown(): - import shutil if test_path.is_dir(): shutil.rmtree(test_path) diff --git a/tests/test_tmp_config.py b/my/core/tests/test_tmp_config.py similarity index 88% rename from tests/test_tmp_config.py rename to my/core/tests/test_tmp_config.py index b22f9cf..e5a24cc 100644 --- a/tests/test_tmp_config.py +++ b/my/core/tests/test_tmp_config.py @@ -1,15 +1,12 @@ -from pathlib import Path -import tempfile - -from my.core.cfg import tmp_config - -import pytest +from ..cfg import tmp_config def _init_default_config() -> None: import my.config + class default_config: count = 5 + my.config.simple = default_config # type: ignore[assignment,misc] @@ -19,7 +16,6 @@ def test_tmp_config() -> None: ## later would be nice to be a bit more careful about them _init_default_config() from my.simple import items - ## assert len(list(items())) == 5 diff --git a/tests/core.py b/tests/core.py deleted file mode 100644 index 339f786..0000000 --- a/tests/core.py +++ /dev/null @@ -1,25 +0,0 @@ -''' -NOTE: Sigh. it's nice to be able to define the tests next to the source code (so it serves as documentation). -However, if you run 'pytest --pyargs my.core', it detects 'core' package name (because there is no my/__init__.py) -(see https://docs.pytest.org/en/latest/goodpractices.html#tests-as-part-of-application-code) - -This results in relative imports failing (e.g. from ..core import...). - -By using this helper file, pytest can detect the package name properly. A bit meh, but perhaps later, -we can run against the tests in my.core directly. - -''' - -from my.core.cfg import * -from my.core.common import * -from my.core.core_config import * -from my.core.error import * -from my.core.util import * -from my.core.discovery_pure import * -from my.core.freezer import * -from my.core.stats import * -from my.core.query import * -from my.core.query_range import * -from my.core.serialize import test_serialize_fallback -from my.core.sqlite import * -from my.core.__main__ import * diff --git a/tests/core/test_pandas.py b/tests/core/test_pandas.py deleted file mode 100644 index bedab26..0000000 --- a/tests/core/test_pandas.py +++ /dev/null @@ -1 +0,0 @@ -from my.core.pandas import * diff --git a/tox.ini b/tox.ini index 2809e3c..3124618 100644 --- a/tox.ini +++ b/tox.ini @@ -19,12 +19,26 @@ passenv = [testenv:tests-core] commands = pip install -e .[testing] + + # seems that denylist tests rely on it? 
ideally we should get rid of this in tests-core + pip install orjson + {envpython} -m pytest \ - tests/core.py \ - tests/sqlite.py \ - tests/get_files.py \ - tests/test_tmp_config.py \ - {posargs} + # importlib is the new suggested import-mode + # without it test package names end up as core.tests.* instead of my.core.tests.* + --import-mode=importlib \ + --pyargs my.core \ + # ignore orgmode because it imports orgparse + # tbh not sure if it even belongs to core, maybe move somewhere else.. + # same with pandas? + --ignore my/core/orgmode.py \ + # causes error during test collection on 3.8 + # dataset is deprecated anyway so whatever + --ignore my/core/dataset.py \ + # this test uses orjson which is an optional dependency + # it would be covered by tests-all + -k 'not test_nt_serialize' \ + {posargs} # todo maybe also have core tests and misc tests? since ideally want them without dependencies @@ -94,11 +108,6 @@ commands = {posargs} cat .coverage.mypy-core/index.txt - # todo hmm might be better to move modules test in a separate subpackage? - {envpython} -m mypy --install-types --non-interactive \ - tests \ - --exclude 'tests/(bluemaestro|emfit|takeout|pdfs|jawbone).py' - # specific modules that are known to be mypy compliant (to avoid false negatives) # todo maybe split into separate jobs? need to add comment how to run From fcfc423a754f76dd62619f760e0c3f44dd8d0a0c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 25 May 2023 22:14:49 +0100 Subject: [PATCH 116/302] move some tests into the main HPI package --- my/tests/__init__.py | 8 ++++++++ {tests => my/tests}/commits.py | 24 ++++++++++++++---------- 2 files changed, 22 insertions(+), 10 deletions(-) create mode 100644 my/tests/__init__.py rename {tests => my/tests}/commits.py (66%) diff --git a/my/tests/__init__.py b/my/tests/__init__.py new file mode 100644 index 0000000..4ad5bba --- /dev/null +++ b/my/tests/__init__.py @@ -0,0 +1,8 @@ +# hmm, sadly pytest --import-mode importlib --pyargs my.core.tests doesn't work properly without __init__.py +# although it works if you run either my.core or my.core.tests.sqlite (for example) directly +# so if it gets in the way could get rid of this later? + +# this particularly sucks here, because otherwise would be nice if people could also just put tests for their my. packages into their tests/ directory +# maybe some sort of hack could be used later similar to handle_legacy_import? + +from my.core import __NOT_HPI_MODULE__ diff --git a/tests/commits.py b/my/tests/commits.py similarity index 66% rename from tests/commits.py rename to my/tests/commits.py index 1aa7aa0..c967027 100644 --- a/tests/commits.py +++ b/my/tests/commits.py @@ -1,9 +1,15 @@ +import os from pathlib import Path from more_itertools import bucket import pytest -import os + +from my.core.cfg import tmp_config + +from my.coding.commits import commits + + pytestmark = pytest.mark.skipif( os.name == 'nt', reason='TODO figure out how to install fd-find on Windows', @@ -11,7 +17,6 @@ pytestmark = pytest.mark.skipif( def test() -> None: - from my.coding.commits import commits all_commits = list(commits()) assert len(all_commits) > 100 @@ -27,15 +32,14 @@ def prepare(tmp_path: Path): # - bare repos # - canonical name # - caching? 
- hpi_repo_root = Path(__file__).absolute().parent.parent + hpi_repo_root = Path(__file__).absolute().parent.parent.parent assert (hpi_repo_root / '.git').exists(), hpi_repo_root - class commits: - emails = {'karlicoss@gmail.com'} - names = {'Dima'} - roots = [hpi_repo_root] + class config: + class commits: + emails = {'karlicoss@gmail.com'} + names = {'Dima'} + roots = [hpi_repo_root] - from my.core.cfg import tmp_config - with tmp_config() as config: - config.commits = commits + with tmp_config(modules='my.coding.commits', config=config): yield From f8cd31044ead61069fb2131b9040e5843ac5fe43 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 26 May 2023 00:34:24 +0100 Subject: [PATCH 117/302] general: move reddit tests into my/tests + tweak my.core.cfg to be more reliable --- my/core/cfg.py | 17 ++++++++---- {tests => my/tests}/common.py | 10 ++++--- {tests => my/tests}/reddit.py | 52 ++++++++++++++++++++--------------- tests/bluemaestro.py | 2 +- tests/config.py | 2 +- tests/emfit.py | 4 +-- tests/foursquare.py | 2 +- tests/github.py | 2 +- tests/goodreads.py | 2 +- tests/hypothesis.py | 2 +- tests/instapaper.py | 3 +- tests/jawbone.py | 2 +- tests/lastfm.py | 2 +- tests/orgmode.py | 4 +-- tests/pdfs.py | 4 +-- tests/rtm.py | 2 +- tests/shared_config.py | 4 +-- tests/smscalls.py | 2 +- tests/tweets.py | 2 +- tests/youtube.py | 4 +-- tox.ini | 7 +++++ 21 files changed, 77 insertions(+), 54 deletions(-) rename {tests => my/tests}/common.py (69%) rename {tests => my/tests}/reddit.py (66%) diff --git a/my/core/cfg.py b/my/core/cfg.py index f298e7f..0b59537 100644 --- a/my/core/cfg.py +++ b/my/core/cfg.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import TypeVar, Type, Callable, Dict, Any Attrs = Dict[str, Any] @@ -46,24 +48,29 @@ def _override_config(config: F) -> Iterator[F]: import importlib import sys -from typing import Optional, Set +from typing import Optional ModuleRegex = str @contextmanager def _reload_modules(modules: ModuleRegex) -> Iterator[None]: - def loaded_modules() -> Set[str]: - return {name for name in sys.modules if re.fullmatch(modules, name)} + # need to use list here, otherwise reordering with set might mess things up + def loaded_modules() -> list[str]: + return [name for name in sys.modules if re.fullmatch(modules, name)] modules_before = loaded_modules() - for m in modules_before: + # uhh... seems that reversed might make more sense -- not 100% sure why, but this works for tests/reddit.py + for m in reversed(modules_before): + # ugh... seems that reload works whereas pop doesn't work in some cases (e.g. on tests/reddit.py) + # sys.modules.pop(m, None) importlib.reload(sys.modules[m]) try: yield finally: modules_after = loaded_modules() + modules_before_set = set(modules_before) for m in modules_after: - if m in modules_before: + if m in modules_before_set: # was previously loaded, so need to reload to pick up old config importlib.reload(sys.modules[m]) else: diff --git a/tests/common.py b/my/tests/common.py similarity index 69% rename from tests/common.py rename to my/tests/common.py index 47c2991..c8d88ff 100644 --- a/tests/common.py +++ b/my/tests/common.py @@ -1,27 +1,29 @@ import os from pathlib import Path +import re +import sys import pytest V = 'HPI_TESTS_KARLICOSS' skip_if_not_karlicoss = pytest.mark.skipif( - V not in os.environ, reason=f'test only works on @karlicoss data for now. Set evn variable {V}=true to override.', + V not in os.environ, + reason=f'test only works on @karlicoss data for now. 
Set env variable {V}=true to override.', ) + def reset_modules() -> None: ''' A hack to 'unload' HPI modules, otherwise some modules might cache the config TODO: a bit crap, need a better way.. ''' - import sys - import re to_unload = [m for m in sys.modules if re.match(r'my[.]?', m)] for m in to_unload: del sys.modules[m] def testdata() -> Path: - d = Path(__file__).absolute().parent.parent / 'testdata' + d = Path(__file__).absolute().parent.parent.parent / 'testdata' assert d.exists(), d return d diff --git a/tests/reddit.py b/my/tests/reddit.py similarity index 66% rename from tests/reddit.py rename to my/tests/reddit.py index 6e3e65e..0871041 100644 --- a/tests/reddit.py +++ b/my/tests/reddit.py @@ -1,5 +1,16 @@ -from datetime import datetime -import pytz +from datetime import datetime, timezone + +from my.core.cfg import tmp_config +from my.core.common import make_dict + +# todo ugh, it's discovered as a test??? +from .common import testdata + +import pytest + +# deliberately use mixed style imports on the top level and inside the methods to test tmp_config stuff +import my.reddit.rexport as my_reddit_rexport +import my.reddit.all as my_reddit_all def test_basic() -> None: @@ -7,17 +18,18 @@ def test_basic() -> None: # would ensure reasonable stat implementation as well and less duplication # note: deliberately use old module (instead of my.reddit.all) to test bwd compatibility from my.reddit import saved, events + assert len(list(events())) > 0 assert len(list(saved())) > 0 def test_comments() -> None: - from my.reddit.all import comments - assert len(list(comments())) > 0 + assert len(list(my_reddit_all.comments())) > 0 def test_unfav() -> None: - from my.reddit import events, saved + from my.reddit import events + ev = events() url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/' uev = [e for e in ev if e.url == url] @@ -31,52 +43,48 @@ def test_unfav() -> None: def test_saves() -> None: from my.reddit.all import saved + saves = list(saved()) assert len(saves) > 0 # just check that they are unique (makedict will throw) - from my.core.common import make_dict make_dict(saves, key=lambda s: s.sid) def test_disappearing() -> None: - from my.reddit.rexport import events # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason # but I guess it was just a short glitch... 
so whatever - saves = events() - favs = [s.kind for s in saves if s.text == 'favorited'] + evs = my_reddit_rexport.events() + favs = [s.kind for s in evs if s.text == 'favorited'] [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"'] - assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc) + assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=timezone.utc) def test_unfavorite() -> None: - from my.reddit.rexport import events - evs = events() + evs = my_reddit_rexport.events() unfavs = [s for s in evs if s.text == 'unfavorited'] [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop'] - assert xxx.dt == datetime(2019, 1, 29, 10, 10, 20, tzinfo=pytz.utc) + assert xxx.dt == datetime(2019, 1, 29, 10, 10, 20, tzinfo=timezone.utc) def test_preserves_extra_attr() -> None: # doesn't strictly belong here (not specific to reddit) - # but my.reddit does a fair bit of dyunamic hacking, so perhaps a good place to check nothing is lost + # but my.reddit does a fair bit of dynamic hacking, so perhaps a good place to check nothing is lost from my.reddit import config + assert isinstance(getattr(config, 'please_keep_me'), str) -import pytest @pytest.fixture(autouse=True, scope='module') def prepare(): - from .common import testdata data = testdata() / 'hpi-testdata' / 'reddit' assert data.exists(), data # note: deliberately using old config schema so we can test migrations - class test_config: - export_dir = data - please_keep_me = 'whatever' + class config: + class reddit: + export_dir = data + please_keep_me = 'whatever' - from my.core.cfg import tmp_config - with tmp_config() as config: - config.reddit = test_config + with tmp_config(modules='my.reddit.*', config=config): yield diff --git a/tests/bluemaestro.py b/tests/bluemaestro.py index c932d73..84d3eb0 100644 --- a/tests/bluemaestro.py +++ b/tests/bluemaestro.py @@ -50,7 +50,7 @@ def test_old_db() -> None: @pytest.fixture(autouse=True) def prepare(): - from .common import testdata + from my.tests.common import testdata bmdata = testdata() / 'hpi-testdata' / 'bluemaestro' assert bmdata.exists(), bmdata diff --git a/tests/config.py b/tests/config.py index e69f726..101f7df 100644 --- a/tests/config.py +++ b/tests/config.py @@ -121,6 +121,6 @@ Some misc stuff @pytest.fixture(autouse=True) def prepare(): - from .common import reset_modules + from my.tests.common import reset_modules reset_modules() yield diff --git a/tests/emfit.py b/tests/emfit.py index 8a779e4..b316017 100644 --- a/tests/emfit.py +++ b/tests/emfit.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark def test() -> None: @@ -13,8 +13,6 @@ def test() -> None: assert d.sleep_end.tzinfo is not None -from .common import skip_if_not_karlicoss -@skip_if_not_karlicoss def test_tz() -> None: from my.emfit import datas # TODO check errors too? 
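The `prepare` fixtures above settle on a single idiom for tests: a nested `config` class handed to `tmp_config(modules=..., config=config)`, which reloads the matching modules under the temporary config on entry and restores them on exit. A minimal self-contained sketch of that idiom -- the `my.lastfm` module, its `export_path` field, and its `scrobbles()` entry point are only assumed here for illustration, they are not part of this diff:

    from my.core.cfg import tmp_config

    class config:
        class lastfm:  # attribute name must match the config section the module reads
            export_path = '/tmp/lastfm-test-data'  # assumed field, purely illustrative

    # modules matching the 'modules' regex are reloaded on entry so they pick up
    # the temporary config, and reloaded once more on exit to restore the real one
    with tmp_config(modules='my.lastfm', config=config):
        import my.lastfm
        print(len(list(my.lastfm.scrobbles())))
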
diff --git a/tests/foursquare.py b/tests/foursquare.py index a9169ff..a75190f 100644 --- a/tests/foursquare.py +++ b/tests/foursquare.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark def test_checkins() -> None: from my.foursquare import get_checkins diff --git a/tests/github.py b/tests/github.py index 5fb5fb9..6b7df23 100644 --- a/tests/github.py +++ b/tests/github.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark from more_itertools import ilen # todo test against stats? not sure.. maybe both diff --git a/tests/goodreads.py b/tests/goodreads.py index 9acab5c..79e638a 100644 --- a/tests/goodreads.py +++ b/tests/goodreads.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark from more_itertools import ilen diff --git a/tests/hypothesis.py b/tests/hypothesis.py index f5ee99e..8ca76dc 100644 --- a/tests/hypothesis.py +++ b/tests/hypothesis.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark def test() -> None: from my.hypothesis import pages, highlights diff --git a/tests/instapaper.py b/tests/instapaper.py index 153a716..862654d 100644 --- a/tests/instapaper.py +++ b/tests/instapaper.py @@ -1,4 +1,5 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark + def test_pages() -> None: # TODO ugh. need lazy import to simplify testing? diff --git a/tests/jawbone.py b/tests/jawbone.py index 776ac50..0a05e9c 100644 --- a/tests/jawbone.py +++ b/tests/jawbone.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark from datetime import date, time diff --git a/tests/lastfm.py b/tests/lastfm.py index 43e8f41..b9e8887 100644 --- a/tests/lastfm.py +++ b/tests/lastfm.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark # todo maybe belongs to common from more_itertools import ilen diff --git a/tests/orgmode.py b/tests/orgmode.py index d213a5e..37d783e 100644 --- a/tests/orgmode.py +++ b/tests/orgmode.py @@ -1,9 +1,9 @@ +from my.tests.common import skip_if_not_karlicoss as pytestmark + from my import orgmode from my.core.orgmode import collect -from .common import skip_if_not_karlicoss -@skip_if_not_karlicoss def test() -> None: # meh results = list(orgmode.query().collect_all(lambda n: [n] if 'python' in n.tags else [])) diff --git a/tests/pdfs.py b/tests/pdfs.py index 1c5eab8..63b1319 100644 --- a/tests/pdfs.py +++ b/tests/pdfs.py @@ -4,7 +4,7 @@ from more_itertools import ilen import pytest -from .common import testdata +from my.tests.common import testdata def test_module(with_config) -> None: @@ -35,7 +35,7 @@ def test_with_error(with_config, tmp_path: Path) -> None: @pytest.fixture def with_config(): - from .common import reset_modules + from my.tests.common import reset_modules reset_modules() # todo ugh.. getting boilerplaty.. need to make it a bit more automatic.. 
# extra_data = Path(__file__).absolute().parent / 'extra/data/polar' diff --git a/tests/rtm.py b/tests/rtm.py index 93378b6..621e471 100644 --- a/tests/rtm.py +++ b/tests/rtm.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark def test() -> None: diff --git a/tests/shared_config.py b/tests/shared_config.py index 6b83a5a..c2f6973 100644 --- a/tests/shared_config.py +++ b/tests/shared_config.py @@ -15,7 +15,7 @@ class SharedConfig(NamedTuple): def _prepare_google_config(tmp_path: Path): - from .common import testdata + from my.tests.common import testdata try: track = one(testdata().rglob('italy-slovenia-2017-07-29.json')) except ValueError: @@ -35,7 +35,7 @@ def _prepare_google_config(tmp_path: Path): # pass tmp_path from pytest to this helper function # see tests/tz.py as an example def temp_config(temp_path: Path) -> Any: - from .common import reset_modules + from my.tests.common import reset_modules reset_modules() LTZ.config.fast = True diff --git a/tests/smscalls.py b/tests/smscalls.py index 51150f0..d063de1 100644 --- a/tests/smscalls.py +++ b/tests/smscalls.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark # TODO maybe instead detect if it has any data at all # if none, then skip the test, say that user doesn't have any data? diff --git a/tests/tweets.py b/tests/tweets.py index fefc24e..763fcef 100644 --- a/tests/tweets.py +++ b/tests/tweets.py @@ -1,4 +1,4 @@ -from .common import skip_if_not_karlicoss as pytestmark +from my.tests.common import skip_if_not_karlicoss as pytestmark # todo current test doesn't depend on data, in principle... # should make lazy loading the default.. diff --git a/tests/youtube.py b/tests/youtube.py index 4864ee9..f37493b 100644 --- a/tests/youtube.py +++ b/tests/youtube.py @@ -1,3 +1,5 @@ +from my.tests.common import skip_if_not_karlicoss as pytestmark + # TODO move elsewhere? # these tests would only make sense with some existing data? although some of them would work for everyone.. # not sure what's a good way of handling this.. @@ -6,8 +8,6 @@ import pytz from more_itertools import bucket -from .common import skip_if_not_karlicoss as pytestmark - # TODO ugh. if i uncomment this here (on top level), then this test vvv fails # from my.media.youtube import get_watched, Watched # HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py diff --git a/tox.ini b/tox.ini index 3124618..9ec80f1 100644 --- a/tox.ini +++ b/tox.ini @@ -79,6 +79,13 @@ commands = hpi module install my.reddit.rexport + {envpython} -m pytest \ + # importlib is the new suggested import-mode + # without it test package names end up as core.tests.* instead of my.core.tests.* + --import-mode=importlib \ + --pyargs my.tests \ + {posargs} + {envpython} -m pytest tests \ # ignore some tests which might take a while to run on ci.. 
--ignore tests/takeout.py \ From 5fe21240b40425bfba3a8018df447b6a1b5c320d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 7 Jun 2023 22:06:29 +0100 Subject: [PATCH 118/302] core: move mcachew into my.core.cachew; use better typing annotations (copied from cachew) --- my/bluemaestro.py | 2 +- my/core/cachew.py | 98 ++++++++++++++++++++++++++++++++++++++++--- my/core/common.py | 67 +---------------------------- my/emfit/__init__.py | 6 ++- my/github/ghexport.py | 2 +- my/orgmode.py | 12 +++++- my/rescuetime.py | 3 +- 7 files changed, 113 insertions(+), 77 deletions(-) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index b50c77c..b49e9e0 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -53,7 +53,7 @@ def is_bad_table(name: str) -> bool: from my.core.cachew import cache_dir from my.core.common import mcachew -@mcachew(depends_on=lambda: inputs(), cache_path=cache_dir('bluemaestro')) +@mcachew(depends_on=inputs, cache_path=cache_dir('bluemaestro')) def measurements() -> Iterable[Res[Measurement]]: # todo ideally this would be via arguments... but needs to be lazy dbs = inputs() diff --git a/my/core/cachew.py b/my/core/cachew.py index dbc4d49..7dd62d2 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -1,8 +1,16 @@ from .common import assert_subpackage; assert_subpackage(__name__) from contextlib import contextmanager +import logging from pathlib import Path -from typing import Optional +import sys +from typing import Optional, Iterator, cast, TYPE_CHECKING, TypeVar, Callable, overload, Union, Any, Type +import warnings + +import appdirs + +PathIsh = Union[str, Path] # avoid circular import from .common + def disable_cachew() -> None: try: @@ -12,10 +20,10 @@ def disable_cachew() -> None: return from cachew import settings + settings.ENABLE = False -from typing import Iterator @contextmanager def disabled_cachew() -> Iterator[None]: try: @@ -25,20 +33,23 @@ def disabled_cachew() -> Iterator[None]: yield return from cachew.extra import disabled_cachew + with disabled_cachew(): yield def _appdirs_cache_dir() -> Path: - import appdirs cd = Path(appdirs.user_cache_dir('my')) cd.mkdir(exist_ok=True, parents=True) return cd -from . import PathIsh +_CACHE_DIR_NONE_HACK = Path('/tmp/hpi/cachew_none_hack') + + def cache_dir(suffix: Optional[PathIsh] = None) -> Path: from . import core_config as CC + cdir_ = CC.config.get_cache_dir() sp: Optional[Path] = None @@ -55,9 +66,86 @@ def cache_dir(suffix: Optional[PathIsh] = None) -> Path: # this logic is tested via test_cachew_dir_none if cdir_ is None: - from .common import _CACHE_DIR_NONE_HACK cdir = _CACHE_DIR_NONE_HACK else: cdir = cdir_ return cdir if sp is None else cdir / sp + + +"""See core.cachew.cache_dir for the explanation""" + + +_cache_path_dflt = cast(str, object()) + + +# TODO I don't really like 'mcachew', just 'cache' would be better... maybe? +# todo ugh. I think it needs @doublewrap, otherwise @mcachew without args doesn't work +# but it's a bit problematic.. doublewrap works by detecting if the first arg is callable +# but here cache_path can also be a callable (for lazy/dynamic path)... so unclear how to detect this +def _mcachew_impl(cache_path=_cache_path_dflt, **kwargs): + """ + Stands for 'Maybe cachew'. + Defensive wrapper around @cachew to make it an optional dependency. + """ + if cache_path is _cache_path_dflt: + # wasn't specified... 
so we need to use cache_dir + cache_path = cache_dir() + + if isinstance(cache_path, (str, Path)): + try: + # check that it starts with 'hack' path + Path(cache_path).relative_to(_CACHE_DIR_NONE_HACK) + except: # noqa: E722 bare except + pass # no action needed, doesn't start with 'hack' string + else: + # todo show warning? tbh unclear how to detect when user stopped using 'old' way and using suffix instead? + # if it does, means that user wanted to disable cache + cache_path = None + try: + import cachew + except ModuleNotFoundError: + warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew') + return lambda orig_func: orig_func + else: + kwargs['cache_path'] = cache_path + return cachew.cachew(**kwargs) + + +if TYPE_CHECKING: + R = TypeVar('R') + if sys.version_info[:2] >= (3, 10): + from typing import ParamSpec + else: + from typing_extensions import ParamSpec + P = ParamSpec('P') + CC = Callable[P, R] # need to give it a name, if inlined into bound=, mypy runs into a bug + PathProvider = Union[PathIsh, Callable[P, PathIsh]] + # NOTE: in cachew, HashFunction type returns str + # however in practice, cachew always calls str for its result + # so perhaps better to switch it to Any in cachew as well + HashFunction = Callable[P, Any] + + F = TypeVar('F', bound=Callable) + + # we need two versions due to @doublewrap + # this is when we just annotate as @cachew without any args + @overload # type: ignore[no-overload-impl] + def mcachew(fun: F) -> F: + ... + + @overload + def mcachew( + cache_path: Optional[PathProvider] = ..., + *, + force_file: bool = ..., + cls: Optional[Type] = ..., + depends_on: HashFunction = ..., + logger: Optional[logging.Logger] = ..., + chunk_by: int = ..., + synthetic_key: Optional[str] = ..., + ) -> Callable[[F], F]: + ... + +else: + mcachew = _mcachew_impl diff --git a/my/core/common.py b/my/core/common.py index 359f451..8c670fa 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -239,70 +239,6 @@ def get_files( return tuple(paths) -# TODO annotate it, perhaps use 'dependent' type (for @doublewrap stuff) -if TYPE_CHECKING: - from typing import Callable, TypeVar - from typing_extensions import Protocol - # TODO reuse types from cachew? although not sure if we want hard dependency on it in typecheck time.. - # I guess, later just define pass through once this is fixed: https://github.com/python/typing/issues/270 - # ok, that's actually a super nice 'pattern' - F = TypeVar('F') - - class McachewType(Protocol): - def __call__( - self, - cache_path: Any=None, - *, - hashf: Any=None, # todo deprecate - depends_on: Any=None, - force_file: bool=False, - chunk_by: int=0, - logger: Any=None, - ) -> Callable[[F], F]: - ... - - mcachew: McachewType - - -_CACHE_DIR_NONE_HACK = Path('/tmp/hpi/cachew_none_hack') -"""See core.cachew.cache_dir for the explanation""" - - -_cache_path_dflt = cast(str, object()) -# TODO I don't really like 'mcachew', just 'cache' would be better... maybe? -# todo ugh. I think it needs @doublewrap, otherwise @mcachew without args doesn't work -# but it's a bit problematic.. doublewrap works by defecting if the first arg is callable -# but here cache_path can also be a callable (for lazy/dynamic path)... so unclear how to detect this -def mcachew(cache_path=_cache_path_dflt, **kwargs): # type: ignore[no-redef] - """ - Stands for 'Maybe cachew'. - Defensive wrapper around @cachew to make it an optional dependency. 
- """ - if cache_path is _cache_path_dflt: - # wasn't specified... so we need to use cache_dir - from .cachew import cache_dir - cache_path = cache_dir() - - if isinstance(cache_path, (str, Path)): - try: - # check that it starts with 'hack' path - Path(cache_path).relative_to(_CACHE_DIR_NONE_HACK) - except: # noqa: E722 bare except - pass # no action needed, doesn't start with 'hack' string - else: - # todo show warning? tbh unclear how to detect when user stopped using 'old' way and using suffix instead? - # if it does, means that user wanted to disable cache - cache_path = None - try: - import cachew - except ModuleNotFoundError: - warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew') - return lambda orig_func: orig_func - else: - kwargs['cache_path'] = cache_path - return cachew.cachew(**kwargs) - - @functools.lru_cache(1) def _magic(): import magic # type: ignore @@ -663,4 +599,5 @@ def assert_never(value: NoReturn) -> NoReturn: ## legacy imports, keeping them here for backwards compatibility from functools import cached_property as cproperty from typing import Literal -## \ No newline at end of file +from .cachew import mcachew +## diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index acaa303..cde6ddc 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -32,8 +32,12 @@ def dir_hash(path: Path): return mtimes +def _cachew_depends_on(): + return dir_hash(config.export_path) + + # TODO take __file__ into account somehow? -@mcachew(cache_path=cache_dir() / 'emfit.cache', hashf=lambda: dir_hash(config.export_path)) +@mcachew(cache_path=cache_dir() / 'emfit.cache', depends_on=_cachew_depends_on) def datas() -> Iterable[Res[Emfit]]: import dataclasses diff --git a/my/github/ghexport.py b/my/github/ghexport.py index 67042fc..9eebbf0 100644 --- a/my/github/ghexport.py +++ b/my/github/ghexport.py @@ -61,7 +61,7 @@ def _dal() -> dal.DAL: return dal.DAL(sources) -@mcachew(depends_on=lambda: inputs()) +@mcachew(depends_on=inputs) def events() -> Results: from my.core.common import ensure_unique key = lambda e: object() if isinstance(e, Exception) else e.eid diff --git a/my/orgmode.py b/my/orgmode.py index bb186d1..8293b74 100644 --- a/my/orgmode.py +++ b/my/orgmode.py @@ -78,14 +78,22 @@ def _sanitize(p: Path) -> str: return re.sub(r'\W', '_', str(p)) +def _cachew_cache_path(_self, f: Path) -> Path: + return cache_dir() / 'orgmode' / _sanitize(f) + + +def _cachew_depends_on(_self, f: Path): + return (f, f.stat().st_mtime) + + class Query: def __init__(self, files: Sequence[Path]) -> None: self.files = files # TODO yield errors? 
@mcachew( - cache_path=lambda _, f: cache_dir() / 'orgmode' / _sanitize(f), force_file=True, - depends_on=lambda _, f: (f, f.stat().st_mtime), + cache_path=_cachew_cache_path, force_file=True, + depends_on=_cachew_depends_on, ) def _iterate(self, f: Path) -> Iterable[OrgNote]: o = orgparse.load(f) diff --git a/my/rescuetime.py b/my/rescuetime.py index 40aa6b7..c986d89 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -28,10 +28,9 @@ DAL = dal.DAL Entry = dal.Entry -@mcachew(depends_on=lambda: inputs()) +@mcachew(depends_on=inputs) def entries() -> Iterable[Res[Entry]]: dal = DAL(inputs()) - it = dal.entries() yield from dal.entries() From c91534b966f96b56bee9dc62fc014fa8d78a0152 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 7 Jun 2023 21:16:37 +0100 Subject: [PATCH 119/302] set json files to empty dicts so they are at least valid jsons (promnesia was stumbling over these, seems like the easiest fix :) ) --- .../gdpr_subdirs/broken_export/comments/comments.json | 1 + .../gdpr_subdirs/gdpr_export/comments/comments.json | 1 + .../gdpr_subdirs/gdpr_export/profile/settings.json | 1 + 3 files changed, 3 insertions(+) diff --git a/my/core/tests/structure_data/gdpr_subdirs/broken_export/comments/comments.json b/my/core/tests/structure_data/gdpr_subdirs/broken_export/comments/comments.json index e69de29..0967ef4 100644 --- a/my/core/tests/structure_data/gdpr_subdirs/broken_export/comments/comments.json +++ b/my/core/tests/structure_data/gdpr_subdirs/broken_export/comments/comments.json @@ -0,0 +1 @@ +{} diff --git a/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json b/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json index e69de29..0967ef4 100644 --- a/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json +++ b/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/comments/comments.json @@ -0,0 +1 @@ +{} diff --git a/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json b/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json index e69de29..0967ef4 100644 --- a/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json +++ b/my/core/tests/structure_data/gdpr_subdirs/gdpr_export/profile/settings.json @@ -0,0 +1 @@ +{} From c12224af7427ec8eb25f673eb3a4729243c95577 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 9 Jun 2023 03:04:54 +0100 Subject: [PATCH 120/302] misc: replace uses of pytz.utc with timezone.utc where it makes sense --- my/demo.py | 5 ++--- my/github/common.py | 6 ++---- my/google/takeout/html.py | 5 ++--- my/lastfm.py | 6 ++---- my/reddit/rexport.py | 5 ++--- my/roamresearch.py | 12 +++++------- my/rss/feedly.py | 17 +++++------------ tests/takeout.py | 4 ++-- tests/tweets.py | 6 ++---- 9 files changed, 24 insertions(+), 42 deletions(-) diff --git a/my/demo.py b/my/demo.py index 1023795..75954d6 100644 --- a/my/demo.py +++ b/my/demo.py @@ -5,8 +5,7 @@ Just a demo module for testing and documentation purposes from .core import Paths, PathIsh from typing import Optional -from datetime import tzinfo -import pytz +from datetime import tzinfo, timezone from my.config import demo as user_config from dataclasses import dataclass @@ -16,7 +15,7 @@ from dataclasses import dataclass class demo(user_config): data_path: Paths username: str - timezone: tzinfo = pytz.utc + timezone: tzinfo = timezone.utc external: Optional[PathIsh] = None diff --git a/my/github/common.py b/my/github/common.py index 6114045..e54bc4d 100644 --- 
a/my/github/common.py +++ b/my/github/common.py @@ -4,11 +4,9 @@ Github events and their metadata: comments/issues/pull requests from ..core import __NOT_HPI_MODULE__ -from datetime import datetime +from datetime import datetime, timezone from typing import Optional, NamedTuple, Iterable, Set, Tuple -import pytz - from ..core import warn_if_empty, LazyLogger from ..core.error import Res @@ -48,7 +46,7 @@ def merge_events(*sources: Results) -> Results: def parse_dt(s: str) -> datetime: # TODO isoformat? - return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ')) + return datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc) # experimental way of supportint event ids... not sure diff --git a/my/google/takeout/html.py b/my/google/takeout/html.py index d4d6830..c01788d 100644 --- a/my/google/takeout/html.py +++ b/my/google/takeout/html.py @@ -5,12 +5,11 @@ Google Takeout exports: browsing history, search/youtube/google play activity from enum import Enum import re from pathlib import Path -from datetime import datetime +from datetime import datetime, timezone from html.parser import HTMLParser from typing import List, Optional, Any, Callable, Iterable, Tuple from collections import OrderedDict from urllib.parse import unquote -import pytz from ...core.time import abbr_to_timezone @@ -30,7 +29,7 @@ def parse_dt(s: str) -> datetime: # old takeouts didn't have timezone # hopefully it was utc? Legacy, so no that much of an issue anymore.. # todo although maybe worth adding timezone from location provider? - tz = pytz.utc + tz = timezone.utc else: s, tzabbr = s.rsplit(maxsplit=1) tz = abbr_to_timezone(tzabbr) diff --git a/my/lastfm.py b/my/lastfm.py index ffec05c..97c112c 100644 --- a/my/lastfm.py +++ b/my/lastfm.py @@ -17,13 +17,11 @@ from .core.cfg import make_config config = make_config(lastfm) -from datetime import datetime +from datetime import datetime, timezone import json from pathlib import Path from typing import NamedTuple, Sequence, Iterable -import pytz - from .core.common import mcachew, Json, get_files @@ -44,7 +42,7 @@ class Scrobble(NamedTuple): @property def dt(self) -> datetime: ts = int(self.raw['date']) - return datetime.fromtimestamp(ts, tz=pytz.utc) + return datetime.fromtimestamp(ts, tz=timezone.utc) @property def artist(self) -> str: diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index b1f9e3b..2d2b9a3 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -112,9 +112,8 @@ def upvoted() -> Iterator[Upvote]: from typing import Dict, Iterable, Iterator, NamedTuple from functools import lru_cache -import pytz import re -from datetime import datetime +from datetime import datetime, timezone from multiprocessing import Pool # TODO hmm. apparently decompressing takes quite a bit of time... 
@@ -151,7 +150,7 @@ def _get_bdate(bfile: Path) -> datetime: stem = stem.replace('T', '').replace('Z', '') # adapt for arctee match = RE.search(stem) assert match is not None - bdt = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S")) + bdt = datetime.strptime(match.group(1), "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc) return bdt diff --git a/my/roamresearch.py b/my/roamresearch.py index 0c1192f..2fe06d4 100644 --- a/my/roamresearch.py +++ b/my/roamresearch.py @@ -1,14 +1,12 @@ """ [[https://roamresearch.com][Roam]] data """ -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from itertools import chain import re from typing import NamedTuple, Iterator, List, Optional -import pytz - from .core import get_files, LazyLogger, Json from my.config import roamresearch as config @@ -38,7 +36,7 @@ class Node(NamedTuple): def created(self) -> datetime: ct = self.raw.get(Keys.CREATED) if ct is not None: - return datetime.fromtimestamp(ct / 1000, tz=pytz.utc) + return datetime.fromtimestamp(ct / 1000, tz=timezone.utc) # ugh. daily notes don't have create time for some reason??? title = self.title @@ -50,13 +48,13 @@ class Node(NamedTuple): return self.edited # fallback TODO log? # strip off 'th'/'rd' crap dts = m.group(1) + ' ' + m.group(2) + ' ' + m.group(3) - dt = datetime.strptime(dts, '%B %d %Y') - return pytz.utc.localize(dt) + dt = datetime.strptime(dts, '%B %d %Y').replace(tzinfo=timezone.utc) + return dt @property def edited(self) -> datetime: rt = self.raw[Keys.EDITED] - return datetime.fromtimestamp(rt / 1000, tz=pytz.utc) + return datetime.fromtimestamp(rt / 1000, tz=timezone.utc) @property def title(self) -> Optional[str]: diff --git a/my/rss/feedly.py b/my/rss/feedly.py index df38435..4611ced 100644 --- a/my/rss/feedly.py +++ b/my/rss/feedly.py @@ -1,23 +1,21 @@ """ Feedly RSS reader """ - from my.config import feedly as config +from datetime import datetime, timezone +import json from pathlib import Path -from typing import Sequence +from typing import Iterable, Sequence from ..core.common import listify, get_files -from .common import Subscription +from .common import Subscription, SubscriptionState def inputs() -> Sequence[Path]: return get_files(config.export_path) -import json - - @listify def parse_file(f: Path): raw = json.loads(f.read_text()) @@ -33,14 +31,9 @@ def parse_file(f: Path): ) -from datetime import datetime -from typing import Iterable -from .common import SubscriptionState def states() -> Iterable[SubscriptionState]: - import pytz for f in inputs(): dts = f.stem.split('_')[-1] - dt = datetime.strptime(dts, '%Y%m%d%H%M%S') - dt = pytz.utc.localize(dt) + dt = datetime.strptime(dts, '%Y%m%d%H%M%S').replace(tzinfo=timezone.utc) subs = parse_file(f) yield dt, subs diff --git a/tests/takeout.py b/tests/takeout.py index a40e218..cddc684 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from datetime import datetime +from datetime import datetime, timezone from itertools import islice import pytz @@ -43,7 +43,7 @@ def test_myactivity_search() -> None: results = list(read_html(tpath, path)) res = ( - datetime(year=2018, month=12, day=17, hour=8, minute=16, second=18, tzinfo=pytz.utc), + datetime(year=2018, month=12, day=17, hour=8, minute=16, second=18, tzinfo=timezone.utc), 'https://en.wikipedia.org/wiki/Emmy_Noether&usg=AFQjCNGrSW-iDnVA2OTcLsG3I80H_a6y_Q', 'Emmy Noether - Wikipedia', ) diff --git a/tests/tweets.py b/tests/tweets.py index 763fcef..3545296 100644 --- 
a/tests/tweets.py +++ b/tests/tweets.py @@ -3,11 +3,9 @@ from my.tests.common import skip_if_not_karlicoss as pytestmark # should make lazy loading the default.. -from datetime import datetime +from datetime import datetime, timezone import json -import pytz - def test_tweet() -> None: from my.twitter.archive import Tweet @@ -45,7 +43,7 @@ def test_tweet() -> None: """ t = Tweet(json.loads(raw), screen_name='whatever') assert t.permalink is not None - assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc) + assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=timezone.utc) assert t.text == 'this is a test tweet' assert t.tid == '2328934829084' assert t.entities is not None From ab7135d42f5b3cbdf5dedbd3a50f7aceb539c4d3 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 21 Jun 2023 02:58:17 +0100 Subject: [PATCH 121/302] core: experimental import of my._init_hook to configure logging/warnings/env variables --- my/core/__init__.py | 10 ++++++++++ my/core/init.py | 1 + 2 files changed, 11 insertions(+) diff --git a/my/core/__init__.py b/my/core/__init__.py index 78e20e7..a7af46c 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -37,3 +37,13 @@ __all__ = [ 'dataclass', 'Path', ] + + +## experimental for now +# you could put _init_hook.py next to your private my/config +# that way you can configure logging/warnings/env variables on every HPI import +try: + import my._init_hook # type: ignore[import] +except: + pass +## diff --git a/my/core/init.py b/my/core/init.py index 2e47e87..6bf766e 100644 --- a/my/core/init.py +++ b/my/core/init.py @@ -1,5 +1,6 @@ ''' A hook to insert user's config directory into Python's search path. +Note that this file is imported only if we don't have custom user config (under my.config namespace) in PYTHONPATH Ideally that would be in __init__.py (so it's executed without having to import explicitly) But, with namespace packages, we can't have __init__.py in the parent subpackage From 6aa3d4225e0804fa49e40fc58bfd9ca97b03a2a5 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 21 Jun 2023 03:13:58 +0100 Subject: [PATCH 122/302] sort out mypy after its update --- my/core/query.py | 4 ++-- my/core/serialize.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/my/core/query.py b/my/core/query.py index f78a1f7..4e00569 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -52,9 +52,9 @@ def locate_function(module_name: str, function_name: str) -> Callable[[], Iterab """ try: mod = importlib.import_module(module_name) - for (fname, func) in inspect.getmembers(mod, inspect.isfunction): + for (fname, f) in inspect.getmembers(mod, inspect.isfunction): if fname == function_name: - return func + return f # in case the function is defined dynamically, # like with a globals().setdefault(...) or a module-level __getattr__ function func = getattr(mod, function_name, None) diff --git a/my/core/serialize.py b/my/core/serialize.py index 1ef7bc0..c5f4cba 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -62,10 +62,9 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: if _additional_default is not None and callable(_additional_default): def wrapped_default(obj: Any) -> Any: + assert _additional_default is not None try: - # hmm... shouldn't mypy know that _additional_default is not None here? 
- # assert _additional_default is not None - return _additional_default(obj) # type: ignore[misc] + return _additional_default(obj) except TypeError: # expected TypeError, signifies couldn't be encoded by custom # serializer function. Try _default_encode from here From 661714f1d9d56f8c422fb2a12578ee04ccc96329 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 21 Jun 2023 16:43:20 +0100 Subject: [PATCH 123/302] core/logging: overhaul and many improvements -- mainly to deprecate abandoned logzero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - generally saner/cleaner logger initialization In particular now it doesn't override logging level specified by the user code prior to instantiating the logger. Also remove the `LazyLogger` hack, doesn't seem like it's necessary when the above is implemented. - get rid of `logzero` which is archived and abandoned now, use `colorlog` for coloured logging formatter - allow configuring log level via shell via `LOGGING_LEVEL_module_name=` E.g. `LOGGING_LEVEL_rescuexport_dal=WARNING LOGGING_LEVEL_my_rescuetime=debug ./script.py` - port `AddExceptionTraceback` from HPI/promnesia - port `CollapseLogsHandler` from HPI/promnesia Also allow configuring from the shell, e.g. `LOGGING_COLLAPSE=` - add support for `enlighten` progress bar, so it can be shared between different projects See https://github.com/Rockhopper-Technologies/enlighten#readme This allows nice CLI progressbars, e.g. for parallel processing of different files from HPI: ghexport.dal[111] 29%|████████████████████████████████████████████████████████████████▏ | 29/100 [00:03<00:07, 10.03 files/s] rexport.dal[comments] 17%|████████████████████████████████████▋ | 115/682 [00:03<00:14, 39.15 files/s] my.instagram.android 0%|▎ | 3/2631 [00:02<34:50, 1.26 files/s] Currently off by default, and hidden behind an env variable (`ENLIGHTEN_ENABLE=true`) --- doc/SETUP.org | 5 +- my/core/__init__.py | 9 +- my/core/logging.py | 237 +++++++++++++++++++++++++++++--------------- setup.py | 3 +- 4 files changed, 166 insertions(+), 88 deletions(-) diff --git a/doc/SETUP.org b/doc/SETUP.org index 6605f66..904331f 100644 --- a/doc/SETUP.org +++ b/doc/SETUP.org @@ -105,10 +105,11 @@ You can also install some optional packages They aren't necessary, but will improve your experience. At the moment these are: -- [[https://github.com/karlicoss/cachew][cachew]]: automatic caching library, which can greatly speedup data access -- [[https://github.com/metachris/logzero][logzero]]: a nice logging library, supporting colors - [[https://github.com/ijl/orjson][orjson]]: a library for serializing data to JSON, used in ~my.core.serialize~ and the ~hpi query~ interface +- [[https://github.com/karlicoss/cachew][cachew]]: automatic caching library, which can greatly speedup data access - [[https://github.com/python/mypy][mypy]]: mypy is used for checking configs and troubleshooting +- [[https://github.com/borntyping/python-colorlog][colorlog]]: colored formatter for ~logging~ module +- [[https://github.com/Rockhopper-Technologies/enlighten]]: console progress bar library * Setting up modules This is an *optional step* as few modules work without extra setup. 
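the env variable knobs from this patch compose; e.g. something like this should work as a one-off (a hypothetical invocation: my.rescuetime just stands in for whichever module you're debugging, and note the underscore syntax since shells don't allow dots in variable names):

    LOGGING_LEVEL_my_rescuetime=debug LOGGING_COLLAPSE=debug ENLIGHTEN_ENABLE=true hpi query my.rescuetime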
diff --git a/my/core/__init__.py b/my/core/__init__.py index a7af46c..0f09eef 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -1,17 +1,15 @@ # this file only keeps the most common & critical types/utility functions from .common import get_files, PathIsh, Paths from .common import Json -from .common import LazyLogger from .common import warn_if_empty from .common import stat, Stats from .common import datetime_naive, datetime_aware from .common import assert_never from .cfg import make_config - -from .util import __NOT_HPI_MODULE__ - from .error import Res, unwrap +from .logging import make_logger, LazyLogger +from .util import __NOT_HPI_MODULE__ # just for brevity in modules @@ -23,7 +21,8 @@ from pathlib import Path __all__ = [ 'get_files', 'PathIsh', 'Paths', 'Json', - 'LazyLogger', + 'make_logger', + 'LazyLogger', # legacy import 'warn_if_empty', 'stat', 'Stats', 'datetime_aware', 'datetime_naive', diff --git a/my/core/logging.py b/my/core/logging.py index e7061fa..47cd135 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -1,47 +1,61 @@ -#!/usr/bin/env python3 -''' -Default logger is a bit meh, see 'test'/run this file for a demo -''' +from __future__ import annotations + +from functools import lru_cache +import logging +import os +from typing import Union +import warnings + def test() -> None: - import logging import sys from typing import Callable M: Callable[[str], None] = lambda s: print(s, file=sys.stderr) - M(" Logging module's defaults are not great...'") - l = logging.getLogger('test_logger') + ## prepare exception for later + try: + None.whatever # type: ignore[attr-defined] + except Exception as e: + ex = e + ## + + M(" Logging module's defaults are not great:") + l = logging.getLogger('default_logger') l.error("For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level") - M(" The reason is that you need to remember to call basicConfig() first") + M("\n The reason is that you need to remember to call basicConfig() first. Let's do it now:") logging.basicConfig() l.error("OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number") - M("") - M(" With LazyLogger you get a reasonable logging format, colours and other neat things") + M("\n Also exception logging is kinda lame, doesn't print traceback by default unless you remember to pass exc_info:") + l.exception(ex) # type: ignore[possibly-undefined] - ll = LazyLogger('test') # No need for basicConfig! + M("\n\n With make_logger you get a reasonable logging format, colours (via colorlog library) and other neat things:") + + ll = make_logger('test') # No need for basicConfig! ll.info("default level is INFO") - ll.debug(".. so this shouldn't be displayed") + ll.debug("... so this shouldn't be displayed") ll.warning("warnings are easy to spot!") - ll.exception(RuntimeError("exceptions as well")) + + M("\n Exceptions print traceback by default now:") + ll.exception(ex) + + M("\n You can (and should) use it via regular logging.getLogger after that, e.g. let's set logging level to DEBUG now") + logging.getLogger('test').setLevel(logging.DEBUG) + ll.debug("... 
now debug messages are also displayed") -import logging -from typing import Union, Optional, cast -import os -import warnings +DEFAULT_LEVEL = 'INFO' +FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)-4d]{end} %(message)s' +FORMAT_NOCOLOR = FORMAT.format(start='', end='') + Level = int -LevelIsh = Optional[Union[Level, str]] +LevelIsh = Union[Level, str, None] def mklevel(level: LevelIsh) -> Level: - # todo put in some global file, like envvars.py - glevel = os.environ.get('HPI_LOGS', None) - if glevel is not None: - level = glevel if level is None: return logging.NOTSET if isinstance(level, int): @@ -49,69 +63,85 @@ def mklevel(level: LevelIsh) -> Level: return getattr(logging, level.upper()) -FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)d]{end} %(message)s' -FORMAT_COLOR = FORMAT.format(start='%(color)s', end='%(end_color)s') -FORMAT_NOCOLOR = FORMAT.format(start='', end='') -DATEFMT = '%Y-%m-%d %H:%M:%S' +def get_collapse_level() -> Level | None: + # TODO not sure if should be specific to logger name? + cl = os.environ.get('LOGGING_COLLAPSE', None) + if cl is not None: + return mklevel(cl) + # legacy name, maybe deprecate? + cl = os.environ.get('COLLAPSE_DEBUG_LOGS', None) + if cl is not None: + return logging.DEBUG + return None -COLLAPSE_DEBUG_LOGS = os.environ.get('COLLAPSE_DEBUG_LOGS', False) -_init_done = 'lazylogger_init_done' +def get_env_level(name: str) -> Level | None: + PREFIX = 'LOGGING_LEVEL_' # e.g. LOGGING_LEVEL_my_hypothesis=debug + # shell doesn't allow using dots in var names without escaping, so also support underscore syntax + lvl = os.environ.get(PREFIX + name, None) or os.environ.get(PREFIX + name.replace('.', '_'), None) + if lvl is not None: + return mklevel(lvl) + return None -def setup_logger(logger: logging.Logger, level: LevelIsh) -> None: - lvl = mklevel(level) - try: - import logzero # type: ignore[import] - except ModuleNotFoundError: - warnings.warn("You might want to install 'logzero' for nice colored logs!") - formatter = logging.Formatter(fmt=FORMAT_NOCOLOR, datefmt=DATEFMT) - use_logzero = False + +def setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> None: + """ + Wrapper to simplify logging setup. + """ + if isinstance(logger, str): + logger = logging.getLogger(logger) + + if level is None: + level = DEFAULT_LEVEL + + # env level always takes precedence + env_level = get_env_level(logger.name) + if env_level is not None: + lvl = env_level else: - formatter = logzero.LogFormatter( - fmt=FORMAT_COLOR, - datefmt=DATEFMT, - ) - use_logzero = True + lvl = mklevel(level) + + if logger.level == logging.NOTSET: + # if it's already set, the user requested a different logging level, let's respect that + logger.setLevel(lvl) logger.addFilter(AddExceptionTraceback()) - if use_logzero and not COLLAPSE_DEBUG_LOGS: # all set, nothing to do - # 'simple' setup - logzero.setup_logger(logger.name, level=lvl, formatter=formatter) # type: ignore[possibly-undefined] - return - h = CollapseDebugHandler() if COLLAPSE_DEBUG_LOGS else logging.StreamHandler() - logger.setLevel(lvl) - h.setLevel(lvl) - h.setFormatter(formatter) - logger.addHandler(h) - logger.propagate = False # ugh. otherwise it duplicates log messages? not sure about it.. 
+ ch = logging.StreamHandler() + collapse_level = get_collapse_level() + ch = logging.StreamHandler() if collapse_level is None else CollapseLogsHandler(maxlevel=collapse_level) + + # default level for handler is NOTSET, which will make it process all messages + # we rely on the logger to actually accept/reject log msgs + logger.addHandler(ch) + + # this attribute is set to True by default, which causes log entries to be passed to root logger (e.g. if you call basicConfig beforehand) + # even if log entry is handled by this logger ... not sure what's the point of this behaviour?? + logger.propagate = False + + try: + # try colorlog first, so user gets nice colored logs + import colorlog + except ModuleNotFoundError: + warnings.warn("You might want to 'pip install colorlog' for nice colored logs") + formatter = logging.Formatter(FORMAT_NOCOLOR) + else: + # log_color/reset are specific to colorlog + FORMAT_COLOR = FORMAT.format(start='%(log_color)s', end='%(reset)s') + fmt = FORMAT_COLOR if ch.stream.isatty() else FORMAT_NOCOLOR + # colorlog should detect tty in principle, but doesn't handle everything for some reason + # see https://github.com/borntyping/python-colorlog/issues/71 + formatter = colorlog.ColoredFormatter(fmt) + + ch.setFormatter(formatter) -class LazyLogger(logging.Logger): - def __new__(cls, name: str, level: LevelIsh = 'INFO') -> 'LazyLogger': - logger = logging.getLogger(name) - - # this is called prior to all _log calls so makes sense to do it here? - def isEnabledFor_lazyinit(*args, logger=logger, orig=logger.isEnabledFor, **kwargs) -> bool: - if not getattr(logger, _init_done, False): # init once, if necessary - setup_logger(logger, level=level) - setattr(logger, _init_done, True) - logger.isEnabledFor = orig # restore the callback - return orig(*args, **kwargs) - - # oh god.. otherwise might go into an inf loop - if not hasattr(logger, _init_done): - setattr(logger, _init_done, False) # will setup on the first call - logger.isEnabledFor = isEnabledFor_lazyinit # type: ignore[method-assign] - return cast(LazyLogger, logger) - - -# by default, logging.exception isn't logging traceback -# which is a bit annoying since we have to +# by default, logging.exception isn't logging traceback unless called inside of the exception handler +# which is a bit annoying since we have to pass exc_info explicitly # also see https://stackoverflow.com/questions/75121925/why-doesnt-python-logging-exception-method-log-traceback-by-default -# tod also amend by post about defensive error handling? +# todo also amend by post about defensive error handling? class AddExceptionTraceback(logging.Filter): - def filter(self, record): + def filter(self, record: logging.LogRecord) -> bool: s = super().filter(record) if s is False: return False @@ -125,25 +155,31 @@ class AddExceptionTraceback(logging.Filter): # todo also save full log in a file? -class CollapseDebugHandler(logging.StreamHandler): +class CollapseLogsHandler(logging.StreamHandler): ''' Collapses subsequent debug log lines and redraws on the same line. Hopefully this gives both a sense of progress and doesn't clutter the terminal as much? 
''' - last = False + + last: bool = False + + maxlevel: Level = logging.DEBUG # everything with less or equal level will be collapsed + + def __init__(self, *args, maxlevel: Level, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.maxlevel = maxlevel def emit(self, record: logging.LogRecord) -> None: try: msg = self.format(record) - cur = record.levelno == logging.DEBUG and '\n' not in msg + cur = record.levelno <= self.maxlevel and '\n' not in msg if cur: if self.last: - self.stream.write('\033[K' + '\r') # clear line + return carriage + self.stream.write('\033[K' + '\r') # clear line + return carriage else: if self.last: - self.stream.write('\n') # clean up after the last debug line + self.stream.write('\n') # clean up after the last line self.last = cur - import os columns, _ = os.get_terminal_size(0) # ugh. the columns thing is meh. dunno I guess ultimately need curses for that # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keywords (INFO/DEBUG/etc) @@ -153,5 +189,46 @@ class CollapseDebugHandler(logging.StreamHandler): self.handleError(record) +@lru_cache(None) # cache so it's only initialized once +def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger: + logger = logging.getLogger(name) + setup_logger(logger, level=level) + return logger + + +# ughh. hacky way to have a single enlighten instance per interpreter, so it can be shared between modules +# not sure about this. I guess this should definitely be behind some flag +# OK, when stdout is not a tty, enlighten doesn't log anything, good +def get_enlighten(): + # TODO could add env variable to disable enlighten for a module? + from unittest.mock import Mock + # Mock to return stub so clients don't have to think about it + + # for now hidden behind the flag since it's a little experimental + if os.environ.get('ENLIGHTEN_ENABLE', None) is None: + return Mock() + + try: + import enlighten # type: ignore[import] + except ModuleNotFoundError: + warnings.warn("You might want to 'pip install enlighten' for a nice progress bar") + + return Mock() + + # dirty, but otherwise a bit unclear how to share enlighten manager between packages that call each other + instance = getattr(enlighten, 'INSTANCE', None) + if instance is not None: + return instance + instance = enlighten.get_manager() + setattr(enlighten, 'INSTANCE', instance) + return instance + + if __name__ == '__main__': test() + + +## legacy/deprecated methods for backwards compatibility +LazyLogger = make_logger +logger = make_logger +## diff --git a/setup.py b/setup.py index f3f8511..5a4f75b 100644 --- a/setup.py +++ b/setup.py @@ -55,11 +55,12 @@ def main() -> None: ], 'optional': [ # todo document these?
- 'logzero', 'orjson', # for my.core.serialize 'pyfzf_iter', # for my.core.denylist 'cachew>=0.8.0', 'mypy', # used for config checks + 'colorlog', # for colored logs + 'enlighten', # for CLI progress bars ], }, entry_points={'console_scripts': ['hpi=my.core.__main__:main']}, From dff31455f17da85bcb58ff1e36efe083221ee844 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 21 Jun 2023 18:01:51 +0100 Subject: [PATCH 124/302] general: switch to make_logger in a few modules, use a bit more consistent logging, rely on default INFO level --- my/bluemaestro.py | 8 +++----- my/github/gdpr.py | 5 +++-- my/instagram/android.py | 8 +++++--- my/instagram/gdpr.py | 5 +++-- my/reddit/rexport.py | 7 ++++--- my/rescuetime.py | 8 ++++---- my/twitter/archive.py | 6 +++--- my/zotero.py | 4 ++-- 8 files changed, 27 insertions(+), 24 deletions(-) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index b49e9e0..6b58d05 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -10,15 +10,13 @@ import re import sqlite3 from typing import Iterable, Sequence, Set, Optional -from my.core import get_files, LazyLogger, dataclass, Res +from my.core import get_files, make_logger, dataclass, Res from my.core.sqlite import sqlite_connect_immutable from my.config import bluemaestro as config -# todo control level via env variable? -# i.e. HPI_LOGGING_MY_BLUEMAESTRO_LEVEL=debug -logger = LazyLogger(__name__, level='debug') +logger = make_logger(__name__) def inputs() -> Sequence[Path]: @@ -63,7 +61,7 @@ def measurements() -> Iterable[Res[Measurement]]: # tables are immutable, so can save on processing.. processed_tables: Set[str] = set() for f in dbs: - logger.debug('processing %s', f) + logger.info('processing %s', f) tot = 0 new = 0 # todo assert increasing timestamp? diff --git a/my/github/gdpr.py b/my/github/gdpr.py index 6f7efe4..3d23565 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -25,8 +25,8 @@ class github(user_config): ### -from ..core import LazyLogger -logger = LazyLogger(__name__) +from ..core import make_logger +logger = make_logger(__name__) from ..core.cfg import make_config @@ -85,6 +85,7 @@ def events() -> Iterable[Res[Event]]: 'repository_files_': None, # repository artifacts, probs not very useful } for f in files: + logger.info(f'{f} : processing...') handler: Any for prefix, h in handler_map.items(): if not f.name.startswith(prefix): diff --git a/my/instagram/android.py b/my/instagram/android.py index 48e8021..e1db55a 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -14,7 +14,7 @@ from more_itertools import unique_everseen from my.core import ( get_files, Paths, make_config, - LazyLogger, + make_logger, datetime_naive, Json, Res, assert_never, @@ -24,7 +24,7 @@ from my.core.sqlite import sqlite_connect_immutable, select from my.config import instagram as user_config -logger = LazyLogger(__name__, level='debug') +logger = make_logger(__name__) @dataclass class instagram_android_config(user_config.android): @@ -132,7 +132,9 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: # NOTE: definitely need to merge multiple, app seems to recycle old messages # TODO: hmm hard to guarantee timestamp ordering when we use synthetic input data... # todo use TypedDict? - for f in inputs(): + dbs = inputs() + for f in dbs: + logger.info(f'{f} : processing...') with sqlite_connect_immutable(f) as db: # TODO ugh. seems like no way to extract username? # sometimes messages (e.g. 
media_share) contain it in message field diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index b9f8780..348d69d 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -15,14 +15,14 @@ from my.core import ( datetime_naive, Res, assert_never, - LazyLogger, + make_logger, ) from my.core.kompress import ZipPath from my.config import instagram as user_config -logger = LazyLogger(__name__, level='debug') +logger = make_logger(__name__) @dataclass @@ -113,6 +113,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: for fname, ffiles in file_map.items(): for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])): + logger.info(f'{ffile} : processing...') j = json.loads(ffile.read_text()) id_len = 10 diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index 2d2b9a3..f20d00e 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -64,10 +64,11 @@ except ModuleNotFoundError as e: ############################ from typing import List, Sequence, Mapping, Iterator, Any -from my.core.common import mcachew, get_files, LazyLogger, make_dict, Stats +from my.core import make_logger +from my.core.common import mcachew, get_files, make_dict, Stats -logger = LazyLogger(__name__, level='info') +logger = make_logger(__name__) from pathlib import Path @@ -85,8 +86,8 @@ Upvote = dal.Upvote def _dal() -> dal.DAL: inp = list(inputs()) return dal.DAL(inp) -cache = mcachew(depends_on=inputs, logger=logger) # depends on inputs only +cache = mcachew(depends_on=inputs) @cache def saved() -> Iterator[Save]: diff --git a/my/rescuetime.py b/my/rescuetime.py index c986d89..75684d9 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -9,14 +9,14 @@ from pathlib import Path from datetime import timedelta from typing import Sequence, Iterable -from .core import get_files, LazyLogger -from .core.common import mcachew -from .core.error import Res, split_errors +from my.core import get_files, make_logger +from my.core.common import mcachew +from my.core.error import Res, split_errors from my.config import rescuetime as config -log = LazyLogger(__name__, level='info') +logger = make_logger(__name__) def inputs() -> Sequence[Path]: diff --git a/my/twitter/archive.py b/my/twitter/archive.py index d9ba562..44ebc5f 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -44,11 +44,11 @@ from typing import List, Optional, NamedTuple, Sequence, Iterator from pathlib import Path import json -from ..core.common import get_files, LazyLogger, Json +from my.core import get_files, make_logger, Json -logger = LazyLogger(__name__, level="warning") +logger = make_logger(__name__) def inputs() -> Sequence[Path]: @@ -175,7 +175,7 @@ class ZipExport: self.old_format = True def raw(self, what: str, *, fname: Optional[str]=None) -> Iterator[Json]: - logger.info('processing: %s %s', self.zpath, what) + logger.info(f'{self.zpath} : processing {what}') path = fname or what if not self.old_format: diff --git a/my/zotero.py b/my/zotero.py index 3afc512..4440aae 100644 --- a/my/zotero.py +++ b/my/zotero.py @@ -5,11 +5,11 @@ from typing import Iterator, Optional, Dict, Any, Sequence from pathlib import Path import sqlite3 -from my.core import LazyLogger, Res, datetime_aware +from my.core import make_logger, Res, datetime_aware from my.core.sqlite import sqlite_copy_and_open -logger = LazyLogger(__name__, level='debug') +logger = make_logger(__name__) def inputs() -> Sequence[Path]: From 6f6be5c78e1984cdbffd9c833362a9e1b56c61e5 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 21 Jun 2023 
19:44:26 +0100 Subject: [PATCH 125/302] my.hackernews.materialistic: process and merge all db exports + minor cleanup --- my/hackernews/materialistic.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/my/hackernews/materialistic.py b/my/hackernews/materialistic.py index e0d634a..eddf053 100644 --- a/my/hackernews/materialistic.py +++ b/my/hackernews/materialistic.py @@ -5,11 +5,14 @@ from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, Iterator, NamedTuple, Sequence -from my.core import get_files +from more_itertools import unique_everseen + +from my.core import get_files, datetime_aware from my.core.sqlite import sqlite_connection -from my.config import materialistic as config -# todo migrate config to my.hackernews.materialistic +from my.config import materialistic as config # todo migrate config to my.hackernews.materialistic + +from .common import hackernews_link def inputs() -> Sequence[Path]: @@ -17,13 +20,16 @@ def inputs() -> Sequence[Path]: Row = Dict[str, Any] -from .common import hackernews_link + class Saved(NamedTuple): row: Row + # NOTE: seems like it's the time item was saved (not created originally??) + # https://github.com/hidroh/materialistic/blob/b631d5111b7487d2328f463bd95e8507c74c3566/app/src/main/java/io/github/hidroh/materialistic/data/MaterialisticDatabase.java#L224 + # but not 100% sure. @property - def when(self) -> datetime: + def when(self) -> datetime_aware: ts = int(self.row['time']) / 1000 return datetime.fromtimestamp(ts, tz=timezone.utc) @@ -44,11 +50,14 @@ class Saved(NamedTuple): return hackernews_link(self.uid) +def _all_raw() -> Iterator[Row]: + for db in inputs(): + with sqlite_connection(db, immutable=True, row_factory='dict') as conn: + yield from conn.execute('SELECT * FROM saved ORDER BY time') + + def raw() -> Iterator[Row]: - last = max(inputs()) - with sqlite_connection(last, immutable=True, row_factory='dict') as conn: - yield from conn.execute('SELECT * FROM saved ORDER BY time') - # TODO wonder if it's 'save time' or creation time? 
+ yield from unique_everseen(_all_raw(), key=lambda r: r['itemid']) def saves() -> Iterator[Saved]: From c25ab516642febdf3edf165282e1e946cfa81c32 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 21 Jun 2023 20:10:40 +0100 Subject: [PATCH 126/302] core: some tweaks for better colour handling when we're redirecting stdout/stderr --- my/core/logging.py | 6 ++++-- my/core/preinit.py | 3 +++ my/core/warnings.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/my/core/logging.py b/my/core/logging.py index 47cd135..43f0767 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -128,10 +128,12 @@ def setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> Non else: # log_color/reset are specific to colorlog FORMAT_COLOR = FORMAT.format(start='%(log_color)s', end='%(reset)s') - fmt = FORMAT_COLOR if ch.stream.isatty() else FORMAT_NOCOLOR # colorlog should detect tty in principle, but doesn't handle everything for some reason # see https://github.com/borntyping/python-colorlog/issues/71 - formatter = colorlog.ColoredFormatter(fmt) + if ch.stream.isatty(): + formatter = colorlog.ColoredFormatter(FORMAT_COLOR) + else: + formatter = logging.Formatter(FORMAT_NOCOLOR) ch.setFormatter(formatter) diff --git a/my/core/preinit.py b/my/core/preinit.py index 9d6b374..88bcb27 100644 --- a/my/core/preinit.py +++ b/my/core/preinit.py @@ -1,5 +1,8 @@ from pathlib import Path +# todo preinit isn't really a good name? it's only in a separate file because +# - it's imported from my.core.init (so we want to keep this file as small/reliable as possible, hence not common or something) +# - we still need this function in __main__, so has to be separate from my/core/init.py def get_mycfg_dir() -> Path: import appdirs import os diff --git a/my/core/warnings.py b/my/core/warnings.py index b5c1a9b..7051f34 100644 --- a/my/core/warnings.py +++ b/my/core/warnings.py @@ -16,7 +16,7 @@ def _colorize(x: str, color: Optional[str]=None) -> str: if color is None: return x - if not sys.stdout.isatty(): + if not sys.stderr.isatty(): return x # click handles importing/initializing colorama if necessary # on windows it installs it if necessary From 88a3aa8d67bd067000795c2e76e09296def6c405 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 21 Jun 2023 20:10:40 +0100 Subject: [PATCH 127/302] my.bluemaestro: minor cleanup --- my/bluemaestro.py | 50 ++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index 6b58d05..1586426 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -4,13 +4,26 @@ """ # todo most of it belongs to DAL...
but considering so few people use it I didn't bother for now +from dataclasses import dataclass from datetime import datetime, timedelta from pathlib import Path import re import sqlite3 from typing import Iterable, Sequence, Set, Optional -from my.core import get_files, make_logger, dataclass, Res +import pytz + +from my.core import ( + get_files, + make_logger, + Res, + stat, + Stats, + influxdb, +) +from my.core.common import mcachew +from my.core.error import unwrap +from my.core.pandas import DataFrameT, as_dataframe from my.core.sqlite import sqlite_connect_immutable from my.config import bluemaestro as config @@ -25,12 +38,13 @@ def inputs() -> Sequence[Path]: Celsius = float Percent = float -mBar = float +mBar = float + @dataclass class Measurement: - dt: datetime # todo aware/naive - temp : Celsius + dt: datetime # todo aware/naive + temp: Celsius humidity: Percent pressure: mBar dewpoint: Celsius @@ -38,7 +52,6 @@ class Measurement: # fixme: later, rely on the timezone provider # NOTE: the timezone should be set with respect to the export date!!! -import pytz tz = pytz.timezone('Europe/London') # TODO when I change tz, check the diff @@ -49,9 +62,7 @@ def is_bad_table(name: str) -> bool: return False if delegate is None else delegate(name) -from my.core.cachew import cache_dir -from my.core.common import mcachew -@mcachew(depends_on=inputs, cache_path=cache_dir('bluemaestro')) +@mcachew(depends_on=inputs) def measurements() -> Iterable[Res[Measurement]]: # todo ideally this would be via arguments... but needs to be lazy dbs = inputs() @@ -68,14 +79,16 @@ def measurements() -> Iterable[Res[Measurement]]: with sqlite_connect_immutable(f) as db: db_dt: Optional[datetime] = None try: - datas = db.execute(f'SELECT "{f.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index') + datas = db.execute( + f'SELECT "{f.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index' + ) oldfmt = True db_dts = list(db.execute('SELECT last_download FROM info'))[0][0] if db_dts == 'N/A': # ??? happens for 20180923-20180928 continue if db_dts.endswith(':'): - db_dts += '00' # wtf.. happens on some day + db_dts += '00' # wtf.. happens on some day db_dt = tz.localize(datetime.strptime(db_dts, '%Y-%m-%d %H:%M:%S')) except sqlite3.OperationalError: # Right, this looks really bad. @@ -113,7 +126,7 @@ def measurements() -> Iterable[Res[Measurement]]: f'SELECT "{t}" AS name, unix, tempReadings / 10.0, humiReadings / 10.0, pressReadings / 10.0, dewpReadings / 10.0 FROM {t}' for t in log_tables ) - if len(log_tables) > 0: # ugh. otherwise end up with syntax error.. + if len(log_tables) > 0: # ugh. otherwise end up with syntax error.. query = f'SELECT * FROM ({query}) ORDER BY name, unix' datas = db.execute(query) oldfmt = False @@ -139,8 +152,8 @@ def measurements() -> Iterable[Res[Measurement]]: ## sanity checks (todo make defensive/configurable?) # not sure how that happens.. but basically they'd better be excluded - lower = timedelta(days=6000 / 24) # ugh some time ago I only did it once in an hour.. in theory can detect from meta? - upper = timedelta(days=10) # kinda arbitrary + lower = timedelta(days=6000 / 24) # ugh some time ago I only did it once in an hour.. in theory can detect from meta? + upper = timedelta(days=10) # kinda arbitrary if not (db_dt - lower < dt < db_dt + timedelta(days=10)): # todo could be more defensive??
yield RuntimeError('timestamp too far out', f, name, db_dt, dt) @@ -178,12 +191,11 @@ def measurements() -> Iterable[Res[Measurement]]: # for k, v in merged.items(): # yield Point(dt=k, temp=v) # meh? -from my.core import stat, Stats + def stats() -> Stats: return stat(measurements) -from my.core.pandas import DataFrameT, as_dataframe def dataframe() -> DataFrameT: """ %matplotlib gtk @@ -197,7 +209,6 @@ def dataframe() -> DataFrameT: def fill_influxdb() -> None: - from my.core import influxdb influxdb.fill(measurements(), measurement=__name__) @@ -205,7 +216,6 @@ def check() -> None: temps = list(measurements()) latest = temps[:-2] - from my.core.error import unwrap prev = unwrap(latest[-2]).dt last = unwrap(latest[-1]).dt @@ -215,12 +225,12 @@ def check() -> None: # # TODO also needs to be filtered out on processing, should be rejected on the basis of export date? - POINTS_STORED = 6000 # on device? + POINTS_STORED = 6000 # on device? FREQ_SEC = 60 SECS_STORED = POINTS_STORED * FREQ_SEC - HOURS_STORED = POINTS_STORED / (60 * 60 / FREQ_SEC) # around 4 days + HOURS_STORED = POINTS_STORED / (60 * 60 / FREQ_SEC) # around 4 days NOW = datetime.now() assert NOW - last < timedelta(hours=HOURS_STORED / 2), f'old backup! {last}' - assert last - prev < timedelta(minutes=3), f'bad interval! {last - prev}' + assert last - prev < timedelta(minutes=3), f'bad interval! {last - prev}' single = (last - prev).seconds From d6af4dec11b3d81a19641a7bf5f330bc92dd0792 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 21 Jun 2023 20:15:03 +0100 Subject: [PATCH 128/302] my.instagram.android: minor cleanup + cachew --- my/instagram/android.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/my/instagram/android.py b/my/instagram/android.py index e1db55a..8ebbf9f 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -12,13 +12,16 @@ from typing import Iterator, Sequence, Optional, Dict, Union from more_itertools import unique_everseen from my.core import ( - get_files, Paths, + get_files, + Paths, make_config, make_logger, datetime_naive, Json, - Res, assert_never, + Res, + assert_never, ) +from my.core.cachew import mcachew from my.core.sqlite import sqlite_connect_immutable, select from my.config import instagram as user_config @@ -26,6 +29,7 @@ from my.config import instagram as user_config logger = make_logger(__name__) + @dataclass class instagram_android_config(user_config.android): # paths[s]/glob to the exported sqlite databases @@ -156,7 +160,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: uid = r.get('id') or r.get('pk_id') assert uid is not None yield User( - id=str(uid), # for some reason it's int in the db + id=str(uid), # for some reason it's int in the db full_name=r['full_name'], username=r['username'], ) @@ -172,6 +176,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: yield e +@mcachew(depends_on=inputs) def messages() -> Iterator[Res[Message]]: id2user: Dict[str, User] = {} for x in unique_everseen(_entities()): From fcaa7c1561c85f6db4121e4e7b33e1603fa0eae1 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Wed, 9 Aug 2023 16:40:37 -0700 Subject: [PATCH 129/302] core/cli: allow user to bypass PEP 668 when installing dependencies with 'hpi module install', this now lets a user pass '--break-system-packages' (or '-B'), which passes the same option down to pip, to allow the user to bypass PEP 668 and install packages that could possibly conflict with system packages. 
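for illustration, on a distro where pip refuses to touch the system python (PEP 668 'externally-managed-environment' error), the invocation would look something like this (hypothetical module name; both spellings are equivalent, and the flag is simply forwarded to pip):

    hpi module install --break-system-packages my.reddit.rexport
    hpi module install -B my.reddit.rexport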
--- my/core/__main__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/my/core/__main__.py b/my/core/__main__.py index feb83bb..ad7a454 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -362,7 +362,7 @@ def module_requires(*, module: Sequence[str]) -> None: click.echo(x) -def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) -> None: +def module_install(*, user: bool, module: Sequence[str], parallel: bool=False, break_system_packages: bool=False) -> None: if isinstance(module, str): # legacy behavior, used to take a since argument module = [module] @@ -377,6 +377,7 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) - sys.executable, '-m', 'pip', 'install', *(['--user'] if user else []), # todo maybe instead, forward all the remaining args to pip? + *(['--break-system-packages'] if break_system_packages else []), # https://peps.python.org/pep-0668/ ] cmds = [] @@ -696,15 +697,19 @@ def module_requires_cmd(modules: Sequence[str]) -> None: @module_grp.command(name='install', short_help='install module deps') @click.option('--user', is_flag=True, help='same as pip --user') @click.option('--parallel', is_flag=True, help='EXPERIMENTAL. Install dependencies in parallel.') +@click.option('-B', + '--break-system-packages', + is_flag=True, + help='Bypass PEP 668 and install dependencies into the system-wide python package directory.') @click.argument('MODULES', shell_complete=_module_autocomplete, nargs=-1, required=True) -def module_install_cmd(user: bool, parallel: bool, modules: Sequence[str]) -> None: +def module_install_cmd(user: bool, parallel: bool, break_system_packages: bool, modules: Sequence[str]) -> None: ''' Install dependencies for modules using pip MODULES is one or more specific module names (e.g. my.reddit.rexport) ''' # todo could add functions to check specific module etc.. 
- module_install(user=user, module=modules, parallel=parallel) + module_install(user=user, module=modules, parallel=parallel, break_system_packages=break_system_packages) @main.command(name='query', short_help='query the results of a HPI function') From 7ec894807f682d290510da885e9cd5fc2b2b4b14 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 6 Aug 2023 20:08:20 +0100 Subject: [PATCH 130/302] my.bumble.android: handle more msg types --- my/bumble/android.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/my/bumble/android.py b/my/bumble/android.py index 6bc27dc..86c9d1e 100644 --- a/my/bumble/android.py +++ b/my/bumble/android.py @@ -89,7 +89,7 @@ def _handle_db(db: sqlite3.Connection) -> Iterator[EntitiesRes]: db=db ): try: - key = {'TEXT': 'text', 'QUESTION_GAME': 'text', 'IMAGE': 'url', 'GIF': 'url'}[payload_type] + key = {'TEXT': 'text', 'QUESTION_GAME': 'text', 'IMAGE': 'url', 'GIF': 'url', 'AUDIO': 'url', 'VIDEO': 'url'}[payload_type] text = json.loads(payload)[key] yield _Message( id=id, From 642e3b14d5529ad63b7c2c9c83aeb821305b9a30 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 24 Aug 2023 23:04:36 +0100 Subject: [PATCH 131/302] my.github.gdpr: some minor enhancements - better error context - handle some unknown files - handle user=None in some cases - cleanup imports --- my/github/gdpr.py | 68 ++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/my/github/gdpr.py b/my/github/gdpr.py index 3d23565..1ff0f93 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -1,38 +1,33 @@ """ Github data (uses [[https://github.com/settings/admin][official GDPR export]]) """ - +from dataclasses import dataclass import json from pathlib import Path import tarfile -from typing import Iterable, Dict, Any, Sequence +from typing import Iterable, Any, Sequence, Dict, Optional -from ..core import get_files, Res -from ..core.error import notnone +from my.core import get_files, Res, PathIsh, stat, Stats, make_logger +from my.core.cfg import make_config +from my.core.error import notnone, echain from .common import Event, parse_dt, EventIds # TODO later, use a separate user config? (github_gdpr) from my.config import github as user_config -from dataclasses import dataclass -from ..core import PathIsh @dataclass class github(user_config): gdpr_dir: PathIsh # path to unpacked GDPR archive -### - -from ..core import make_logger -logger = make_logger(__name__) - - -from ..core.cfg import make_config config = make_config(github) +logger = make_logger(__name__) + + def inputs() -> Sequence[Path]: gdir = config.gdpr_dir res = get_files(gdir) @@ -54,22 +49,22 @@ def events() -> Iterable[Res[Event]]: # a bit naughty and ad-hoc, but we will generify reading from tar.gz. 
once we have more examples # another one is zulip archive if last.is_dir(): - files = list(sorted(last.glob('*.json'))) # looks like all files are in the root + files = list(sorted(last.glob('*.json'))) # looks like all files are in the root open_file = lambda f: f.open() else: # treat as .tar.gz tfile = tarfile.open(last) files = list(sorted(map(Path, tfile.getnames()))) files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json'] - open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./ - + open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./ + # fmt: off handler_map = { 'schema' : None, - 'issue_events_': None, # eh, doesn't seem to have any useful bodies - 'attachments_' : None, # not sure if useful - 'users' : None, # just contains random users - 'bots' : None, # just contains random bots + 'issue_events_': None, # eh, doesn't seem to have any useful bodies + 'attachments_' : None, # not sure if useful + 'users' : None, # just contains random users + 'bots' : None, # just contains random bots 'repositories_' : _parse_repository, 'issue_comments_': _parse_issue_comment, 'issues_' : _parse_issue, @@ -82,8 +77,11 @@ def events() -> Iterable[Res[Event]]: 'pull_request_review_threads_': None, 'pull_request_reviews_': None, ## - 'repository_files_': None, # repository artifacts, probs not very useful + 'repository_files_': None, # repository artifacts, probs not very useful + 'discussion_categories_': None, # doesn't seem to contain any useful info, just some repo metadata + 'organizations_': None, # no useful info, just some org metadata } + # fmt: on for f in files: logger.info(f'{f} : processing...') handler: Any @@ -106,11 +104,10 @@ def events() -> Iterable[Res[Event]]: try: yield handler(r) except Exception as e: - yield e + yield echain(RuntimeError(f'While processing file: {f}'), e) -def stats(): - from ..core import stat +def stats() -> Stats: return { **stat(events), } @@ -121,7 +118,7 @@ def _parse_common(d: Dict) -> Dict: url = d['url'] body = d.get('body') return { - 'dt' : parse_dt(d['created_at']), + 'dt': parse_dt(d['created_at']), 'link': url, 'body': body, } @@ -131,8 +128,9 @@ def _parse_repository(d: Dict) -> Event: pref = 'https://github.com/' url = d['url'] dts = d['created_at'] - rt = d['type'] - assert url.startswith(pref); name = url[len(pref):] + rt = d['type'] + assert url.startswith(pref) + name = url[len(pref) :] eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None) return Event( **_parse_common(d), @@ -141,26 +139,31 @@ def _parse_repository(d: Dict) -> Event: ) +# user may be None if the user was deleted +def _is_bot(user: Optional[str]) -> bool: + if user is None: + return False + return "[bot]" in user + + def _parse_issue_comment(d: Dict) -> Event: url = d['url'] - is_bot = "[bot]" in d["user"] return Event( **_parse_common(d), summary=f'commented on issue {url}', eid='issue_comment_' + url, - is_bot=is_bot, + is_bot=_is_bot(d['user']), ) def _parse_issue(d: Dict) -> Event: url = d['url'] title = d['title'] - is_bot = "[bot]" in d["user"] return Event( **_parse_common(d), summary=f'opened issue {title}', eid='issue_comment_' + url, - is_bot=is_bot, + is_bot=_is_bot(d['user']), ) @@ -168,14 +171,13 @@ def _parse_pull_request(d: Dict) -> Event: dts = d['created_at'] url = d['url'] title = d['title'] - is_bot = "[bot]" in d["user"] return Event( **_parse_common(d), # TODO distinguish incoming/outgoing? # TODO action? opened/closed??
summary=f'opened PR {title}', eid=EventIds.pr(dts=dts, action='opened', url=url), - is_bot=is_bot, + is_bot=_is_bot(d['user']), ) From c283e542e3457ecd778fb09e54e725d67104a49a Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 24 Aug 2023 23:29:14 +0100 Subject: [PATCH 132/302] general: fix some issues after mypy update --- my/core/query_range.py | 1 + my/instagram/common.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/my/core/query_range.py b/my/core/query_range.py index 3fdc12e..a1cfaed 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -155,6 +155,7 @@ def _parse_range( return parsed_range err_msg = error_message or RangeTuple.__doc__ + assert err_msg is not None # make mypy happy after, before, within = None, None, None none_count = more_itertools.ilen(filter(lambda o: o is None, list(unparsed_range))) diff --git a/my/instagram/common.py b/my/instagram/common.py index 4df07a1..36c6b83 100644 --- a/my/instagram/common.py +++ b/my/instagram/common.py @@ -68,6 +68,6 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: if user is not None: repls['user'] = user if len(repls) > 0: - m = replace(m, **repls) # type: ignore[type-var] # ugh mypy is confused because of Protocol? + m = replace(m, **repls) # type: ignore[type-var, misc] # ugh mypy is confused because of Protocol? mmap[k] = m yield m From ff84d8fc8825c80445dee052120df7dafdfa0074 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 26 Aug 2023 09:55:30 -0700 Subject: [PATCH 133/302] core/cli: update vendored completion files update required click version to 8.1 so we dont regenerate the vendored completions wrong in the future --- misc/completion/fish/hpi.fish | 6 +----- misc/completion/zsh/_hpi | 8 +++++++- setup.py | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/misc/completion/fish/hpi.fish b/misc/completion/fish/hpi.fish index e8a8e56..23abca9 100644 --- a/misc/completion/fish/hpi.fish +++ b/misc/completion/fish/hpi.fish @@ -1,9 +1,5 @@ function _hpi_completion; - set -l response; - - for value in (env _HPI_COMPLETE=fish_complete COMP_WORDS=(commandline -cp) COMP_CWORD=(commandline -t) hpi); - set response $response $value; - end; + set -l response (env _HPI_COMPLETE=fish_complete COMP_WORDS=(commandline -cp) COMP_CWORD=(commandline -t) hpi); for completion in $response; set -l metadata (string split "," $completion); diff --git a/misc/completion/zsh/_hpi b/misc/completion/zsh/_hpi index 95190b0..805f564 100644 --- a/misc/completion/zsh/_hpi +++ b/misc/completion/zsh/_hpi @@ -31,5 +31,11 @@ _hpi_completion() { fi } -compdef _hpi_completion hpi; +if [[ $zsh_eval_context[-1] == loadautofunc ]]; then + # autoload from fpath, call function directly + _hpi_completion "$@" +else + # eval/source/. command, register function for later + compdef _hpi_completion hpi +fi diff --git a/setup.py b/setup.py index 5a4f75b..ee4c2f0 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ INSTALL_REQUIRES = [ 'appdirs', # very common, and makes it portable 'more-itertools', # it's just too useful and very common anyway 'decorator' , # less pain in writing correct decorators. 
very mature and stable, so worth keeping in core - 'click>=8.0' , # for the CLI, printing colors, decorator-based - may allow extensions to CLI + 'click>=8.1' , # for the CLI, printing colors, decorator-based - may allow extensions to CLI ] From 2a46341ce2733be15e26863aa8a356180df2b0d0 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Wed, 6 Sep 2023 16:35:26 -0700 Subject: [PATCH 134/302] my.core.logging: compatibility with HPI_LOGS re-adds a removed check for HPI_LOGS, add some docs fix the checks for browserexport/takeout logs to use the computed level from my.core.logging --- doc/MODULE_DESIGN.org | 14 ++++++++++++++ doc/SETUP.org | 6 +++++- my/browser/active_browser.py | 3 +-- my/browser/common.py | 15 ++++++++------- my/browser/export.py | 3 +-- my/core/__main__.py | 4 ++-- my/core/logging.py | 13 +++++++++++++ my/google/takeout/parser.py | 9 +++------ 8 files changed, 47 insertions(+), 20 deletions(-) diff --git a/doc/MODULE_DESIGN.org b/doc/MODULE_DESIGN.org index d57f8fb..c0ab4f6 100644 --- a/doc/MODULE_DESIGN.org +++ b/doc/MODULE_DESIGN.org @@ -233,3 +233,17 @@ The main goals are: It could be argued that namespace packages and editable installs are a bit complex for a new user to get the hang of, and this is true. But fortunately ~import_source~ means any user just using HPI only needs to follow the instructions when a warning is printed, or peruse the docs here a bit -- there's no need to clone or create your own override to just use the ~all.py~ file. There's no requirement to use this for individual modules, it just seems to be the best solution we've arrived at so far + +* Logging + +The ~my.core~ module exports a ~make_logger~ function which works nicely with +~cachew~ and gives you colored logs. You can use it like this: + +#+begin_src python + from my.core import make_logger + + logger = make_logger(__name__) + + # or to set a custom level + logger = make_logger(__name__, level='warning') +#+end_src diff --git a/doc/SETUP.org b/doc/SETUP.org index 904331f..0fced62 100644 --- a/doc/SETUP.org +++ b/doc/SETUP.org @@ -192,7 +192,11 @@ HPI comes with a command line tool that can help you detect potential issues. Ru If you only have a few modules set up, lots of them will error for you, which is expected, so check the ones you expect to work. -If you're having issues with ~cachew~ or want to show logs to troubleshoot what may be happening, you can pass the debug flag (e.g., ~hpi --debug doctor my.module_name~) or set the ~HPI_LOGS~ environment variable (e.g., ~HPI_LOGS=debug hpi query my.module_name~) to print all logs, including the ~cachew~ dependencies. ~HPI_LOGS~ could also be used to silence ~info~ logs, like ~HPI_LOGS=warning hpi ...~ +If you're having issues with ~cachew~ or want to show logs to troubleshoot what may be happening, you can pass the debug flag (e.g., ~hpi --debug doctor my.module_name~) or set the ~LOGGING_LEVEL_HPI~ environment variable (e.g., ~LOGGING_LEVEL_HPI=debug hpi query my.module_name~) to print all logs, including the ~cachew~ dependencies. 
~LOGGING_LEVEL_HPI~ could also be used to silence ~info~ logs, like ~LOGGING_LEVEL_HPI=warning hpi ...~ + +If you want to enable logs for a particular module, you can use the +~LOGGING_LEVEL_~ prefix and then the module name with underscores, like +~LOGGING_LEVEL_my_hypothesis=debug hpi query my.hypothesis~ If you want ~HPI~ to autocomplete the module names for you, this comes with shell completion, see [[../misc/completion/][misc/completion]] diff --git a/my/browser/active_browser.py b/my/browser/active_browser.py index 4dc52e4..c25c64d 100644 --- a/my/browser/active_browser.py +++ b/my/browser/active_browser.py @@ -26,8 +26,7 @@ from browserexport.merge import read_visits, Visit from sqlite_backup import sqlite_backup from .common import _patch_browserexport_logs - -_patch_browserexport_logs() +_patch_browserexport_logs(__name__) def inputs() -> Sequence[Path]: diff --git a/my/browser/common.py b/my/browser/common.py index 9427f61..2c85ced 100644 --- a/my/browser/common.py +++ b/my/browser/common.py @@ -1,11 +1,12 @@ -import os +from my.core import make_logger from my.core.util import __NOT_HPI_MODULE__ -def _patch_browserexport_logs(): - # patch browserexport logs if HPI_LOGS is present - if "HPI_LOGS" in os.environ: - from browserexport.log import setup as setup_browserexport_logger - from my.core.logging import mklevel +def _patch_browserexport_logs(module_name: str): + # get the logger for the module this is being called from + module_logger = make_logger(module_name) - setup_browserexport_logger(mklevel(os.environ["HPI_LOGS"])) + # grab the computed level (respects LOGGING_LEVEL_ prefixes) and set it on the browserexport logger + from browserexport.log import setup as setup_browserexport_logger + + setup_browserexport_logger(module_logger.level) diff --git a/my/browser/export.py b/my/browser/export.py index 3185d53..e9d6252 100644 --- a/my/browser/export.py +++ b/my/browser/export.py @@ -26,8 +26,7 @@ from .common import _patch_browserexport_logs logger = LazyLogger(__name__, level="warning") - -_patch_browserexport_logs() +_patch_browserexport_logs(__name__) # all of my backed up databases diff --git a/my/core/__main__.py b/my/core/__main__.py index ad7a454..643df50 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -596,9 +596,9 @@ def main(debug: bool) -> None: Tool for HPI Work in progress, will be used for config management, troubleshooting & introspection ''' - # should overwrite anything else in HPI_LOGS + # should overwrite anything else in LOGGING_LEVEL_HPI if debug: - os.environ["HPI_LOGS"] = "debug" + os.environ['LOGGING_LEVEL_HPI'] = 'debug' # for potential future reference, if shared state needs to be added to groups # https://click.palletsprojects.com/en/7.x/commands/#group-invocation-without-command diff --git a/my/core/logging.py b/my/core/logging.py index 43f0767..7914093 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -81,6 +81,19 @@ def get_env_level(name: str) -> Level | None: lvl = os.environ.get(PREFIX + name, None) or os.environ.get(PREFIX + name.replace('.', '_'), None) if lvl is not None: return mklevel(lvl) + # if LOGGING_LEVEL_HPI is set, use that. 
This should override anything the module may set as its default + # this is also set when the user passes the --debug flag in the CLI + # + # check after LOGGING_LEVEL_ prefix since that is more specific + if 'LOGGING_LEVEL_HPI' in os.environ: + return mklevel(os.environ['LOGGING_LEVEL_HPI']) + # legacy name, for backwards compatibility + if 'HPI_LOGS' in os.environ: + from my.core.warnings import medium + + medium('The HPI_LOGS environment variable is deprecated, use LOGGING_LEVEL_HPI instead') + + return mklevel(os.environ['HPI_LOGS']) return None diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 09cbe57..9a90c8f 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -52,12 +52,9 @@ config = make_config(google) logger = LazyLogger(__name__, level="warning") -# patch TAKEOUT_LOGS to match HPI_LOGS -if "HPI_LOGS" in os.environ: - from google_takeout_parser.log import setup as setup_takeout_logger - from my.core.logging import mklevel - - setup_takeout_logger(mklevel(os.environ["HPI_LOGS"])) +# patch the takeout parser logger to match the computed loglevel +from google_takeout_parser.log import setup as setup_takeout_logger +setup_takeout_logger(logger.level) DISABLE_TAKEOUT_CACHE = "DISABLE_TAKEOUT_CACHE" in os.environ From be81466871b33bcca6bf19476bb68d4e24711a26 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Thu, 14 Sep 2023 17:39:21 -0700 Subject: [PATCH 135/302] browser: fix duplicate logs when fetching loglevel --- my/browser/active_browser.py | 6 ++++-- my/browser/common.py | 8 ++------ my/browser/export.py | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/my/browser/active_browser.py b/my/browser/active_browser.py index c25c64d..601182a 100644 --- a/my/browser/active_browser.py +++ b/my/browser/active_browser.py @@ -21,12 +21,14 @@ class config(user_config.active_browser): from pathlib import Path from typing import Sequence, Iterator -from my.core import get_files, Stats +from my.core import get_files, Stats, make_logger from browserexport.merge import read_visits, Visit from sqlite_backup import sqlite_backup +logger = make_logger(__name__) + from .common import _patch_browserexport_logs -_patch_browserexport_logs(__name__) +_patch_browserexport_logs(logger.level) def inputs() -> Sequence[Path]: diff --git a/my/browser/common.py b/my/browser/common.py index 2c85ced..058c134 100644 --- a/my/browser/common.py +++ b/my/browser/common.py @@ -1,12 +1,8 @@ -from my.core import make_logger from my.core.util import __NOT_HPI_MODULE__ -def _patch_browserexport_logs(module_name: str): - # get the logger for the module this is being called from - module_logger = make_logger(module_name) - +def _patch_browserexport_logs(level: int): # grab the computed level (respects LOGGING_LEVEL_ prefixes) and set it on the browserexport logger from browserexport.log import setup as setup_browserexport_logger - setup_browserexport_logger(module_logger.level) + setup_browserexport_logger(level) diff --git a/my/browser/export.py b/my/browser/export.py index e9d6252..46a4217 100644 --- a/my/browser/export.py +++ b/my/browser/export.py @@ -26,7 +26,7 @@ from .common import _patch_browserexport_logs logger = LazyLogger(__name__, level="warning") -_patch_browserexport_logs(__name__) +_patch_browserexport_logs(logger.level) # all of my backed up databases From 01480ec8eb1e2d423afc18ce460f36406f1753f7 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 19 Sep 2023 21:21:42 +0100 Subject: [PATCH 136/302] core/logging: fix issue with 
logger setup being called multiple times with different levels

should resolve https://github.com/karlicoss/HPI/issues/308 --- my/core/logging.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/my/core/logging.py b/my/core/logging.py index 7914093..5580168 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -118,6 +118,14 @@ def setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> Non # if it's already set, the user requested a different logging level, let's respect that logger.setLevel(lvl) + _setup_handlers_and_formatters(name=logger.name) + + +# cached since this should only be done once per logger instance +@lru_cache(None) +def _setup_handlers_and_formatters(name: str) -> None: + logger = logging.getLogger(name) + logger.addFilter(AddExceptionTraceback()) ch = logging.StreamHandler() @@ -204,7 +212,6 @@ class CollapseLogsHandler(logging.StreamHandler): self.handleError(record) -@lru_cache(None) # cache so it's only initialized once def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger: logger = logging.getLogger(name) setup_logger(logger, level=level) @@ -216,8 +223,7 @@ # OK, when stdout is not a tty, enlighten doesn't log anything, good def get_enlighten(): # TODO could add env variable to disable enlighten for a module? - from unittest.mock import Mock - # Mock to return stub so cients don't have to think about it + from unittest.mock import Mock # Mock to return stub so clients don't have to think about it # for now hidden behind the flag since it's a little experimental if os.environ.get('ENLIGHTEN_ENABLE', None) is None: From 8addd2d58ab16bc13e2d822c2297181e20aafe78 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 25 Sep 2023 10:41:36 +0100 Subject: [PATCH 137/302] new module: Harmonic app for Hackernews --- my/config.py | 4 ++ my/hackernews/common.py | 18 ++++++ my/hackernews/harmonic.py | 115 ++++++++++++++++++++++++++++++++++++++ tox.ini | 1 + 4 files changed, 138 insertions(+) create mode 100644 my/hackernews/harmonic.py diff --git a/my/config.py b/my/config.py index a59eadd..9cc9c11 100644 --- a/my/config.py +++ b/my/config.py @@ -265,3 +265,7 @@ class whatsapp: class android: export_path: Paths my_user_id: Optional[str] + + +class harmonic: + export_path: Paths diff --git a/my/hackernews/common.py b/my/hackernews/common.py index 8c7dd1e..0c5ff9b 100644 --- a/my/hackernews/common.py +++ b/my/hackernews/common.py @@ -1,2 +1,20 @@ +from typing import Protocol + +from my.core import datetime_aware, Json + + def hackernews_link(id: str) -> str: return f'https://news.ycombinator.com/item?id={id}' + + +class SavedBase(Protocol): + @property + def when(self) -> datetime_aware: ... + @property + def uid(self) -> str: ... + @property + def url(self) -> str: ... + @property + def title(self) -> str: ... + @property + def hackernews_link(self) -> str: ...
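The SavedBase protocol above relies on structural typing: a provider's item class satisfies it just by exposing properties with these names and types, without inheriting from anything. A minimal sketch of what that buys (the Example class and describe function are invented for illustration and are not part of this patch; plain datetime stands in for my.core's datetime_aware alias):

    from datetime import datetime, timezone
    from typing import Protocol

    class SavedBase(Protocol):
        @property
        def when(self) -> datetime: ...
        @property
        def uid(self) -> str: ...
        @property
        def url(self) -> str: ...
        @property
        def title(self) -> str: ...
        @property
        def hackernews_link(self) -> str: ...

    class Example:
        # conforms to SavedBase purely by shape -- the class never mentions it
        @property
        def when(self) -> datetime: return datetime(2023, 9, 25, tzinfo=timezone.utc)
        @property
        def uid(self) -> str: return '123'
        @property
        def url(self) -> str: return 'https://example.com'
        @property
        def title(self) -> str: return 'example'
        @property
        def hackernews_link(self) -> str: return f'https://news.ycombinator.com/item?id={self.uid}'

    def describe(s: SavedBase) -> str:
        # mypy accepts any structurally conforming object here, so each
        # hackernews provider can define its own Saved type independently
        return f'{s.when:%Y-%m-%d} {s.title} ({s.hackernews_link})'

    print(describe(Example()))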
diff --git a/my/hackernews/harmonic.py b/my/hackernews/harmonic.py new file mode 100644 index 0000000..a4eb28e --- /dev/null +++ b/my/hackernews/harmonic.py @@ -0,0 +1,115 @@ +""" +[[https://play.google.com/store/apps/details?id=com.simon.harmonichackernews][Harmonic]] app for Hackernews +""" +REQUIRES = ['lxml'] + +from dataclasses import dataclass +from datetime import datetime, timezone +import json +import html +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional, Sequence, TypedDict, cast + +from lxml import etree +from more_itertools import unique_everseen, one + +from my.core import ( + Paths, + Res, + Stats, + datetime_aware, + get_files, + stat, +) +from .common import hackernews_link, SavedBase + +from my.config import harmonic as user_config + + +@dataclass +class harmonic(user_config): + export_path: Paths + + +def inputs() -> Sequence[Path]: + return get_files(harmonic.export_path) + + +class Cached(TypedDict): + author: str + created_at_i: int + id: str + points: int + test: Optional[str] + title: str + type: str # TODO Literal['story', 'comment']? comments are only in 'children' field tho + url: str + # TODO also has children with comments, but not sure I need it? + + +# TODO reuse savedbase in materialistic? +@dataclass +class Saved(SavedBase): + raw: Cached + + @property + def when(self) -> datetime_aware: + ts = self.raw['created_at_i'] + return datetime.fromtimestamp(ts, tz=timezone.utc) + + @property + def uid(self) -> str: + return self.raw['id'] + + @property + def url(self) -> str: + return self.raw['url'] + + @property + def title(self) -> str: + return self.raw['title'] + + @property + def hackernews_link(self) -> str: + return hackernews_link(self.uid) + + +_PREFIX = 'com.simon.harmonichackernews.KEY_SHARED_PREFERENCES' + + +def _saved() -> Iterator[Res[Saved]]: + for p in inputs(): + # TODO defensive for each item! + tr = etree.parse(p) + + res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORIES_STRINGS"]'))) + cached_ids = [x.text.split('-')[0] for x in res] + + cached: Dict[str, Cached] = {} + for sid in cached_ids: + res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORY{sid}"]'))) + j = json.loads(html.unescape(res.text)) + cached[sid] = j + + res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_BOOKMARKS"]'))) + for x in res.text.split('-'): + ids, item_timestamp = x.split('q') + # not sure if timestamp is any useful? + + cc = cached.get(ids, None) + if cc is None: + # TODO warn or error? 
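+            # (since this generator is typed Iterator[Res[Saved]], an alternative would be to yield the miss as an exception, surfacing uncached bookmarks to the caller instead of skipping them silently)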
+ continue + + yield Saved(cc) + + +def saved() -> Iterator[Res[Saved]]: + yield from unique_everseen(_saved()) + + +def stats() -> Stats: + return { + **stat(inputs), + **stat(saved), + } diff --git a/tox.ini b/tox.ini index 9ec80f1..9487ae3 100644 --- a/tox.ini +++ b/tox.ini @@ -133,6 +133,7 @@ commands = my.github.ghexport \ my.goodreads \ my.google.takeout.parser \ + my.hackernews.harmonic \ my.hypothesis \ my.instapaper \ my.ip.all \ From f3507613f00c8a994a09b466ff1ec5a9ae3a5d74 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Sep 2023 14:23:56 -0700 Subject: [PATCH 138/302] location: make accuracy default config floats previously they were ints which could possibly break caching with cachew --- my/location/fallback/via_home.py | 2 +- my/location/google_takeout_semantic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/my/location/fallback/via_home.py b/my/location/fallback/via_home.py index 590c028..259dcaa 100644 --- a/my/location/fallback/via_home.py +++ b/my/location/fallback/via_home.py @@ -25,7 +25,7 @@ class Config(user_config): # default ~30km accuracy # this is called 'home_accuracy' since it lives on the base location.config object, # to differentiate it from accuracy for other providers - home_accuracy: float = 30_000 + home_accuracy: float = 30_000.0 # TODO could make current Optional and somehow determine from system settings? @property diff --git a/my/location/google_takeout_semantic.py b/my/location/google_takeout_semantic.py index 4d3514e..fcf7f01 100644 --- a/my/location/google_takeout_semantic.py +++ b/my/location/google_takeout_semantic.py @@ -28,7 +28,7 @@ class semantic_locations_config(user_config.google_takeout_semantic): # https://locationhistoryformat.com/reference/semantic/#/$defs/placeVisit/properties/locationConfidence require_confidence: int = 40 # default accuracy for semantic locations - accuracy: float = 100 + accuracy: float = 100.0 config = make_config(semantic_locations_config) From 8cd74a9fc4f61cfc893f8663bf005d519440818e Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 1 Oct 2023 23:20:22 +0100 Subject: [PATCH 139/302] ci: attempt to use --parallel flag in tox --- scripts/ci/run | 2 +- tox.ini | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/ci/run b/scripts/ci/run index 47014ec..fe2719e 100755 --- a/scripts/ci/run +++ b/scripts/ci/run @@ -38,4 +38,4 @@ if ! command -v python3 &> /dev/null; then fi "$PY_BIN" -m pip install --user tox -"$PY_BIN" -m tox +"$PY_BIN" -m tox --parallel --parallel-live "$@" diff --git a/tox.ini b/tox.ini index 9487ae3..860641f 100644 --- a/tox.ini +++ b/tox.ini @@ -15,10 +15,14 @@ passenv = PYTHONPYCACHEPREFIX +# note: --use-pep517 below is necessary for tox --parallel flag to work properly +# otherwise it seems that it tries to modify .eggs dir in parallel and it fails + + # just the very core tests with minimal dependencies [testenv:tests-core] commands = - pip install -e .[testing] + pip install --use-pep517 -e .[testing] # seems that denylist tests rely on it? ideally we should get rid of this in tests-core pip install orjson @@ -47,7 +51,7 @@ commands = # TODO not sure if need it? 
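# (presumably: MY_CONFIG normally points at the user's real config package, so aiming it at a nonexistent path keeps these tests from accidentally importing it)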
setenv = MY_CONFIG = nonexistent commands = - pip install -e .[testing] + pip install --use-pep517 -e .[testing] # installed to test my.core.serialize while using simplejson and not orjson pip install simplejson @@ -104,7 +108,7 @@ commands = [testenv:mypy-core] allowlist_externals = cat commands = - pip install -e .[testing,optional] + pip install --use-pep517 -e .[testing,optional] pip install orgparse # used it core.orgmode? pip install gpxpy # for hpi query --output gpx @@ -121,7 +125,7 @@ commands = [testenv:mypy-misc] allowlist_externals = cat commands = - pip install -e .[testing,optional] + pip install --use-pep517 -e .[testing,optional] hpi module install --parallel \ my.arbtt \ From fabcbab7510b7dd47c9036ad81994aaf562c2f0b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 1 Oct 2023 23:30:50 +0100 Subject: [PATCH 140/302] fix mypy errors after version update --- demo.py | 11 ++++++++--- my/jawbone/__init__.py | 7 ++++--- my/jawbone/plots.py | 6 +++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/demo.py b/demo.py index 3c08cce..080bc4c 100755 --- a/demo.py +++ b/demo.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 from subprocess import check_call, DEVNULL -from shutil import copy, copytree +from shutil import copytree, ignore_patterns import os from os.path import abspath from sys import executable as python @@ -9,12 +9,17 @@ from pathlib import Path my_repo = Path(__file__).absolute().parent -def run(): +def run() -> None: # uses fixed paths; worth it for the sake of demonstration # assumes we're in /tmp/my_demo now # 1. clone git@github.com:karlicoss/my.git - copytree(my_repo, 'my_repo', symlinks=True) + copytree( + my_repo, + 'my_repo', + symlinks=True, + ignore=ignore_patterns('.tox*'), # tox dir might have broken symlinks while tests are running in parallel + ) # 2. prepare repositories you'd be using. For this demo we only set up Hypothesis tox = 'TOX' in os.environ diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 9f53abe..4b41242 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -115,14 +115,15 @@ def pre_dataframe() -> Iterable[Res[SleepEntry]]: yield group[0] else: err = RuntimeError(f'Multiple sleeps per night, not supported yet: {group}') - set_error_datetime(err, dt=dd) + set_error_datetime(err, dt=dd) # type: ignore[arg-type] logger.exception(err) yield err def dataframe(): - dicts: List[Dict] = [] + dicts: List[Dict[str, Any]] = [] for s in pre_dataframe(): + d: Dict[str, Any] if isinstance(s, Exception): dt = extract_error_datetime(s) d = { @@ -141,7 +142,7 @@ def dataframe(): } dicts.append(d) - import pandas as pd # type: ignore + import pandas as pd return pd.DataFrame(dicts) # TODO tz is in sleeps json diff --git a/my/jawbone/plots.py b/my/jawbone/plots.py index 5332fe6..5dcb63d 100755 --- a/my/jawbone/plots.py +++ b/my/jawbone/plots.py @@ -15,7 +15,7 @@ from typing import Dict, Any, NamedTuple # print(line) import matplotlib.pyplot as plt # type: ignore -from numpy import genfromtxt # type: ignore +from numpy import genfromtxt import matplotlib.pylab as pylab # type: ignore pylab.rcParams['figure.figsize'] = (32.0, 24.0) @@ -109,7 +109,7 @@ dates = [parse_date(u.date, yearfirst=True, dayfirst=False) for u in useful] # TODO don't need this anymore? 
it's gonna be in dashboards package from kython.plotting import plot_timestamped # type: ignore -for attr, lims, mavg, fig in [ # type: ignore +for attr, lims, mavg, fig in [ ('light', (0, 400), 5, None), ('deep', (0, 600), 5, None), ('total', (200, 600), 5, None), @@ -128,7 +128,7 @@ for attr, lims, mavg, fig in [ # type: ignore if mavg is not None: mavgs.append((mavg, 'green')) fig = plot_timestamped( - dts, # type: ignore + dts, [getattr(u, attr) for u in useful], marker='.', ratio=(16, 4), From 05124882416efdebfa5a3241903cf2d7375f6d00 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 6 Oct 2023 02:02:43 +0100 Subject: [PATCH 141/302] ci: sync configs to pymplate - add python3.12 - add ruff --- {scripts => .ci}/release | 6 +- {scripts/ci => .ci}/run | 0 .github/workflows/main.yml | 31 ++++++---- my/core/core_config.py | 6 +- my/fbmessenger/all.py | 4 +- my/twitter/all.py | 4 +- ruff.toml | 25 ++++++++ setup.py | 1 + tox.ini | 114 ++++++++++++++++++++----------------- 9 files changed, 117 insertions(+), 74 deletions(-) rename {scripts => .ci}/release (92%) rename {scripts/ci => .ci}/run (100%) create mode 100644 ruff.toml diff --git a/scripts/release b/.ci/release similarity index 92% rename from scripts/release rename to .ci/release index 0ec687f..6cff663 100755 --- a/scripts/release +++ b/.ci/release @@ -21,7 +21,7 @@ import shutil is_ci = os.environ.get('CI') is not None -def main(): +def main() -> None: import argparse p = argparse.ArgumentParser() p.add_argument('--test', action='store_true', help='use test pypi') @@ -29,7 +29,7 @@ def main(): extra = [] if args.test: - extra.extend(['--repository-url', 'https://test.pypi.org/legacy/']) + extra.extend(['--repository', 'testpypi']) root = Path(__file__).absolute().parent.parent os.chdir(root) # just in case @@ -42,7 +42,7 @@ def main(): if dist.exists(): shutil.rmtree(dist) - check_call('python3 setup.py sdist bdist_wheel', shell=True) + check_call(['python3', '-m', 'build']) TP = 'TWINE_PASSWORD' password = os.environ.get(TP) diff --git a/scripts/ci/run b/.ci/run similarity index 100% rename from scripts/ci/run rename to .ci/run diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 47f84cb..cf85155 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -5,11 +5,17 @@ on: push: branches: '*' tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi - # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". + # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug: pull_request: # needed to trigger on others' PRs # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". workflow_dispatch: # needed to trigger workflows manually - # todo cron? + # todo cron? 
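+ # the inputs block below surfaces a checkbox in the manual 'Run workflow' UI; it is read by the tmate step further down via inputs.debug_enabled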
+ inputs: + debug_enabled: + type: boolean + description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)' + required: false + default: false jobs: @@ -17,15 +23,17 @@ jobs: strategy: matrix: platform: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] exclude: [ # windows runners are pretty scarce, so let's only run lowest and highest python version - {platform: windows-latest, python-version: '3.9'}, + {platform: windows-latest, python-version: '3.9' }, {platform: windows-latest, python-version: '3.10'}, + {platform: windows-latest, python-version: '3.11'}, # same, macos is a bit too slow and ubuntu covers python quirks well - {platform: macos-latest , python-version: '3.9' }, + {platform: macos-latest , python-version: '3.9' }, {platform: macos-latest , python-version: '3.10' }, + {platform: macos-latest , python-version: '3.11' }, ] runs-on: ${{ matrix.platform }} @@ -46,11 +54,11 @@ jobs: submodules: recursive fetch-depth: 0 # nicer to have all git history when debugging/for tests - # uncomment for SSH debugging - # - uses: mxschmitt/action-tmate@v3 + - uses: mxschmitt/action-tmate@v3 + if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }} # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd... - - run: bash scripts/ci/run + - run: bash .ci/run - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms uses: actions/upload-artifact@v3 @@ -71,7 +79,7 @@ jobs: # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation - run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 with: python-version: '3.8' @@ -84,8 +92,7 @@ jobs: if: github.event_name != 'pull_request' && github.event.ref == 'refs/heads/master' env: TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD_TEST }} - run: pip3 install --user wheel twine && scripts/release --test - # TODO run pip install just to test? 
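+ # (with --test, .ci/release uploads via twine's 'testpypi' repository alias, per the .ci/release change earlier in this patch)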
+ run: pip3 install --user --upgrade build twine && .ci/release --test - name: 'release to pypi' # always deploy tags to release pypi @@ -93,4 +100,4 @@ jobs: if: github.event_name != 'pull_request' && startsWith(github.event.ref, 'refs/tags') env: TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} - run: pip3 install --user wheel twine && scripts/release + run: pip3 install --user --upgrade build twine && .ci/release --test diff --git a/my/core/core_config.py b/my/core/core_config.py index 5c696ce..e70dc05 100644 --- a/my/core/core_config.py +++ b/my/core/core_config.py @@ -144,9 +144,9 @@ def test_active_modules() -> None: with reset() as cc: cc.enabled_modules = ['my.whatever'] cc.disabled_modules = ['my.body.*'] - assert cc._is_module_active('my.whatever' ) is True - assert cc._is_module_active('my.core' ) is None - assert not cc._is_module_active('my.body.exercise') is True + assert cc._is_module_active('my.whatever' ) is True + assert cc._is_module_active('my.core' ) is None + assert cc._is_module_active('my.body.exercise') is False with reset() as cc: # if both are set, enable all diff --git a/my/fbmessenger/all.py b/my/fbmessenger/all.py index f98b5f3..13689db 100644 --- a/my/fbmessenger/all.py +++ b/my/fbmessenger/all.py @@ -5,8 +5,8 @@ from my.core.source import import_source from .common import Message, _merge_messages -src_export = import_source(module_name=f'my.fbmessenger.export') -src_android = import_source(module_name=f'my.fbmessenger.android') +src_export = import_source(module_name='my.fbmessenger.export') +src_android = import_source(module_name='my.fbmessenger.android') @src_export diff --git a/my/twitter/all.py b/my/twitter/all.py index b203511..4714021 100644 --- a/my/twitter/all.py +++ b/my/twitter/all.py @@ -8,8 +8,8 @@ from .common import merge_tweets, Tweet # NOTE: you can comment out the sources you don't need -src_twint = import_source(module_name=f'my.twitter.twint') -src_archive = import_source(module_name=f'my.twitter.archive') +src_twint = import_source(module_name='my.twitter.twint') +src_archive = import_source(module_name='my.twitter.archive') @src_twint diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..0be93e0 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,25 @@ +ignore = [ +### too opinionated style checks + "E501", # too long lines + "E702", # Multiple statements on one line (semicolon) + "E731", # assigning lambda instead of using def + "E741", # Ambiguous variable name: `l` + "E742", # Ambiguous class name: `O + "E401", # Multiple imports on one line + "F403", # import *` used; unable to detect undefined names +### + +### + "E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing.. + "F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew) + +## might be nice .. 
but later and I don't wanna make it strict + "E402", # Module level import not at top of file + +### maybe consider these soon +# sometimes it's useful to give a variable a name even if we don't use it as a documentation +# on the other hand, often is a sign of error + "F841", # Local variable `count` is assigned to but never used + "F401", # imported but unused +### +] diff --git a/setup.py b/setup.py index ee4c2f0..5fa988e 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ def main() -> None: extras_require={ 'testing': [ 'pytest', + 'ruff', 'mypy', 'lxml', # for mypy coverage diff --git a/tox.ini b/tox.ini index 860641f..ac0a68d 100644 --- a/tox.ini +++ b/tox.ini @@ -1,31 +1,41 @@ [tox] -minversion = 3.5 -envlist = tests-core,tests-all,demo,mypy-core,mypy-misc +minversion = 3.21 +# relies on the correct version of Python installed +envlist = ruff,tests-core,tests-all,demo,mypy-core,mypy-misc # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333 # hack to prevent .tox from crapping to the project directory -toxworkdir={env:TOXWORKDIR_BASE:}{toxinidir}/.tox +toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox [testenv] +# TODO how to get package name from setuptools? +package_name = "my" passenv = # useful for tests to know they are running under ci - CI - CI_* + CI + CI_* # respect user's cache dirs to prevent tox from crapping into project dir - MYPY_CACHE_DIR - PYTHONPYCACHEPREFIX + PYTHONPYCACHEPREFIX + MYPY_CACHE_DIR + RUFF_CACHE_DIR # note: --use-pep517 below is necessary for tox --parallel flag to work properly # otherwise it seems that it tries to modify .eggs dir in parallel and it fails +[testenv:ruff] +commands = + {envpython} -m pip install --use-pep517 -e .[testing] + {envpython} -m ruff my/ + + # just the very core tests with minimal dependencies [testenv:tests-core] commands = - pip install --use-pep517 -e .[testing] + {envpython} -m pip install --use-pep517 -e .[testing] # seems that denylist tests rely on it? ideally we should get rid of this in tests-core - pip install orjson + {envpython} -m pip install orjson {envpython} -m pytest \ # importlib is the new suggested import-mode @@ -51,72 +61,72 @@ commands = # TODO not sure if need it? 
setenv = MY_CONFIG = nonexistent commands = - pip install --use-pep517 -e .[testing] + {envpython} -m pip install --use-pep517 -e .[testing] # installed to test my.core.serialize while using simplejson and not orjson - pip install simplejson + {envpython} -m pip install simplejson {envpython} -m pytest \ - tests/serialize_simplejson.py \ - {posargs} + tests/serialize_simplejson.py \ + {posargs} - pip install cachew - pip install orjson + {envpython} -m pip install cachew + {envpython} -m pip install orjson - hpi module install my.location.google - pip install ijson # optional dependency + {envpython} -m my.core module install my.location.google + {envpython} -m pip install ijson # optional dependency # tz/location - hpi module install my.time.tz.via_location - hpi module install my.ip.all - hpi module install my.location.gpslogger - hpi module install my.location.fallback.via_ip - hpi module install my.google.takeout.parser + {envpython} -m my.core module install my.time.tz.via_location + {envpython} -m my.core module install my.ip.all + {envpython} -m my.core module install my.location.gpslogger + {envpython} -m my.core module install my.location.fallback.via_ip + {envpython} -m my.core module install my.google.takeout.parser - hpi module install my.calendar.holidays + {envpython} -m my.core module install my.calendar.holidays # my.body.weight dep - hpi module install my.orgmode + {envpython} -m my.core module install my.orgmode - hpi module install my.coding.commits + {envpython} -m my.core module install my.coding.commits - hpi module install my.pdfs + {envpython} -m my.core module install my.pdfs - hpi module install my.reddit.rexport + {envpython} -m my.core module install my.reddit.rexport {envpython} -m pytest \ # importlib is the new suggested import-mode # without it test package names end up as core.tests.* instead of my.core.tests.* --import-mode=importlib \ - --pyargs my.tests \ + --pyargs {[testenv]package_name}.tests \ {posargs} {envpython} -m pytest tests \ - # ignore some tests which might take a while to run on ci.. - --ignore tests/takeout.py \ - --ignore tests/extra/polar.py \ - # dont run simplejson compatibility test since orjson is now installed - --ignore tests/serialize_simplejson.py \ - {posargs} + # ignore some tests which might take a while to run on ci.. + --ignore tests/takeout.py \ + --ignore tests/extra/polar.py \ + # dont run simplejson compatibility test since orjson is now installed + --ignore tests/serialize_simplejson.py \ + {posargs} [testenv:demo] commands = - pip install git+https://github.com/karlicoss/hypexport + {envpython} -m pip install git+https://github.com/karlicoss/hypexport {envpython} ./demo.py [testenv:mypy-core] allowlist_externals = cat commands = - pip install --use-pep517 -e .[testing,optional] - pip install orgparse # used it core.orgmode? - pip install gpxpy # for hpi query --output gpx + {envpython} -m pip install --use-pep517 -e .[testing,optional] + {envpython} -m pip install orgparse # used it core.orgmode? 
+ {envpython} -m pip install gpxpy # for hpi query --output gpx {envpython} -m mypy --install-types --non-interactive \ - -p my.core \ - --txt-report .coverage.mypy-core \ - --html-report .coverage.mypy-core \ - {posargs} + -p {[testenv]package_name}.core \ + --txt-report .coverage.mypy-core \ + --html-report .coverage.mypy-core \ + {posargs} cat .coverage.mypy-core/index.txt @@ -125,9 +135,9 @@ commands = [testenv:mypy-misc] allowlist_externals = cat commands = - pip install --use-pep517 -e .[testing,optional] + {envpython} -m pip install --use-pep517 -e .[testing,optional] - hpi module install --parallel \ + {envpython} -m my.core module install --parallel \ my.arbtt \ my.browser.export \ my.coding.commits \ @@ -157,18 +167,18 @@ commands = {envpython} -m mypy --install-types --non-interactive \ - -p my \ - --exclude 'my/coding/codeforces.py' \ - --exclude 'my/coding/topcoder.py' \ - --exclude 'my/jawbone/.*' \ - --txt-report .coverage.mypy-misc \ - --html-report .coverage.mypy-misc \ - {posargs} + -p {[testenv]package_name} \ + --exclude 'my/coding/codeforces.py' \ + --exclude 'my/coding/topcoder.py' \ + --exclude 'my/jawbone/.*' \ + --txt-report .coverage.mypy-misc \ + --html-report .coverage.mypy-misc \ + {posargs} # txt report is a bit more convenient to view on CI cat .coverage.mypy-misc/index.txt {envpython} -m mypy --install-types --non-interactive \ - tests + tests # note: this comment doesn't seem relevant anymore, but keeping it in case the issue happens again # > ugh ... need to reset HOME, otherwise user's site-packages are somehow leaking into mypy's path... From 68289c1be39ed9189116ba711a6785c484f00a42 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Thu, 12 Oct 2023 21:42:32 +0100 Subject: [PATCH 142/302] general: fix ignores after mypy version update --- my/core/__init__.py | 2 +- my/core/common.py | 2 +- my/core/logging.py | 2 +- my/runnerup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/my/core/__init__.py b/my/core/__init__.py index 0f09eef..d753760 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -42,7 +42,7 @@ __all__ = [ # you could put _init_hook.py next to your private my/config # that way you can configure logging/warnings/env variables on every HPI import try: - import my._init_hook # type: ignore[import] + import my._init_hook # type: ignore[import-not-found] except: pass ## diff --git a/my/core/common.py b/my/core/common.py index 8c670fa..738f6f1 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -86,7 +86,7 @@ def ensure_unique( for i in it: k = key(i) v = value(i) - pv = key2value.get(k, None) # type: ignore + pv = key2value.get(k, None) if pv is not None: raise RuntimeError(f"Duplicate key: {k}. 
Previous value: {pv}, new value: {v}") key2value[k] = v diff --git a/my/core/logging.py b/my/core/logging.py index 5580168..11567f1 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -230,7 +230,7 @@ def get_enlighten(): return Mock() try: - import enlighten # type: ignore[import] + import enlighten # type: ignore[import-untyped] except ModuleNotFoundError: warnings.warn("You might want to 'pip install enlighten' for a nice progress bar") diff --git a/my/runnerup.py b/my/runnerup.py index 1f20525..f12d9b3 100644 --- a/my/runnerup.py +++ b/my/runnerup.py @@ -13,7 +13,7 @@ from typing import Iterable from .core import Res, get_files from .core.common import isoparse, Json -import tcxparser # type: ignore[import] +import tcxparser # type: ignore[import-untyped] from my.config import runnerup as config From bb478f369d896c1ac4fac90a99307b828c65e084 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Thu, 12 Oct 2023 22:29:14 +0100 Subject: [PATCH 143/302] core/logging: no need for super call in Filter --- my/core/logging.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/my/core/logging.py b/my/core/logging.py index 11567f1..accfc2e 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -165,16 +165,13 @@ def _setup_handlers_and_formatters(name: str) -> None: # todo also amend by post about defensive error handling? class AddExceptionTraceback(logging.Filter): def filter(self, record: logging.LogRecord) -> bool: - s = super().filter(record) - if s is False: - return False if record.levelname == 'ERROR': exc = record.msg if isinstance(exc, BaseException): if record.exc_info is None or record.exc_info == (None, None, None): exc_info = (type(exc), exc, exc.__traceback__) record.exc_info = exc_info - return s + return True # todo also save full log in a file? From fe26efaea849e9c2b0fb57a4cc75878a45c3f8bf Mon Sep 17 00:00:00 2001 From: karlicoss Date: Thu, 12 Oct 2023 23:00:00 +0100 Subject: [PATCH 144/302] core/kompress: move vendorized to _deprecated, use kompress library directly --- my/core/_deprecated/kompress.py | 260 +++++++++++++++++++++++++++++++ my/core/common.py | 6 +- my/core/kompress.py | 266 ++------------------------------ my/core/tests/kompress.py | 128 --------------- my/kython/kompress.py | 7 +- setup.py | 1 + 6 files changed, 283 insertions(+), 385 deletions(-) create mode 100644 my/core/_deprecated/kompress.py delete mode 100644 my/core/tests/kompress.py mode change 120000 => 100644 my/kython/kompress.py diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py new file mode 100644 index 0000000..e4840f6 --- /dev/null +++ b/my/core/_deprecated/kompress.py @@ -0,0 +1,260 @@ +""" +Various helpers for compression +""" +# fmt: off +from __future__ import annotations + +from datetime import datetime +from functools import total_ordering +import io +import pathlib +from pathlib import Path +import sys +from typing import Union, IO, Sequence, Any, Iterator + +PathIsh = Union[Path, str] + + +class Ext: + xz = '.xz' + zip = '.zip' + lz4 = '.lz4' + zstd = '.zstd' + zst = '.zst' + targz = '.tar.gz' + + +def is_compressed(p: Path) -> bool: + # todo kinda lame way for now.. use mime ideally? + # should cooperate with kompress.kopen? 
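+    # (note: checking name.endswith means a multi-part suffix like Ext.targz, i.e. '.tar.gz', matches as a whole, which a plain Path.suffix comparison would miss)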
+ return any(p.name.endswith(ext) for ext in {Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz}) + + +def _zstd_open(path: Path, *args, **kwargs) -> IO: + import zstandard as zstd # type: ignore + fh = path.open('rb') + dctx = zstd.ZstdDecompressor() + reader = dctx.stream_reader(fh) + + mode = kwargs.get('mode', 'rt') + if mode == 'rb': + return reader + else: + # must be text mode + kwargs.pop('mode') # TextIOWrapper doesn't like it + return io.TextIOWrapper(reader, **kwargs) # meh + + +# TODO use the 'dependent type' trick for return type? +def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: + # just in case, but I think this shouldn't be necessary anymore + # since when we call .read_text, encoding is passed already + if mode in {'r', 'rt'}: + encoding = kwargs.get('encoding', 'utf8') + else: + encoding = None + kwargs['encoding'] = encoding + + pp = Path(path) + name = pp.name + if name.endswith(Ext.xz): + import lzma + + # ugh. for lzma, 'r' means 'rb' + # https://github.com/python/cpython/blob/d01cf5072be5511595b6d0c35ace6c1b07716f8d/Lib/lzma.py#L97 + # whereas for regular open, 'r' means 'rt' + # https://docs.python.org/3/library/functions.html#open + if mode == 'r': + mode = 'rt' + kwargs['mode'] = mode + return lzma.open(pp, *args, **kwargs) + elif name.endswith(Ext.zip): + # eh. this behaviour is a bit dodgy... + from zipfile import ZipFile + zfile = ZipFile(pp) + + [subpath] = args # meh? + + ## oh god... https://stackoverflow.com/a/5639960/706389 + ifile = zfile.open(subpath, mode='r') + ifile.readable = lambda: True # type: ignore + ifile.writable = lambda: False # type: ignore + ifile.seekable = lambda: False # type: ignore + ifile.read1 = ifile.read # type: ignore + # TODO pass all kwargs here?? + # todo 'expected "BinaryIO"'?? + return io.TextIOWrapper(ifile, encoding=encoding) + elif name.endswith(Ext.lz4): + import lz4.frame # type: ignore + return lz4.frame.open(str(pp), mode, *args, **kwargs) + elif name.endswith(Ext.zstd) or name.endswith(Ext.zst): + kwargs['mode'] = mode + return _zstd_open(pp, *args, **kwargs) + elif name.endswith(Ext.targz): + import tarfile + # FIXME pass mode? + tf = tarfile.open(pp) + # TODO pass encoding? + x = tf.extractfile(*args); assert x is not None + return x + else: + return pp.open(mode, *args, **kwargs) + + +import typing +import os + +if typing.TYPE_CHECKING: + # otherwise mypy can't figure out that BasePath is a type alias.. + BasePath = pathlib.Path +else: + BasePath = pathlib.WindowsPath if os.name == 'nt' else pathlib.PosixPath + + +class CPath(BasePath): + """ + Hacky way to support compressed files. + If you can think of a better way to do this, please let me know! https://github.com/karlicoss/HPI/issues/20 + + Ugh. So, can't override Path because of some _flavour thing. + Path only has _accessor and _closed slots, so can't directly set .open method + _accessor.open has to return file descriptor, doesn't work for compressed stuff. + """ + def open(self, *args, **kwargs): + kopen_kwargs = {} + mode = kwargs.get('mode') + if mode is not None: + kopen_kwargs['mode'] = mode + encoding = kwargs.get('encoding') + if encoding is not None: + kopen_kwargs['encoding'] = encoding + # TODO assert read only? + return kopen(str(self), **kopen_kwargs) + + +open = kopen # TODO deprecate + + +# meh +# TODO ideally switch to ZipPath or smth similar? 
+# nothing else supports subpath properly anyway +def kexists(path: PathIsh, subpath: str) -> bool: + try: + kopen(path, subpath) + return True + except Exception: + return False + + +import zipfile +if sys.version_info[:2] >= (3, 8): + # meh... zipfile.Path is not available on 3.7 + zipfile_Path = zipfile.Path +else: + if typing.TYPE_CHECKING: + zipfile_Path = Any + else: + zipfile_Path = object + + +@total_ordering +class ZipPath(zipfile_Path): + # NOTE: is_dir/is_file might not behave as expected, the base class checks it only based on the slash in path + + # seems that root/at are not exposed in the docs, so might be an implementation detail + root: zipfile.ZipFile + at: str + + @property + def filepath(self) -> Path: + res = self.root.filename + assert res is not None # make mypy happy + return Path(res) + + @property + def subpath(self) -> Path: + return Path(self.at) + + def absolute(self) -> ZipPath: + return ZipPath(self.filepath.absolute(), self.at) + + def expanduser(self) -> ZipPath: + return ZipPath(self.filepath.expanduser(), self.at) + + def exists(self) -> bool: + if self.at == '': + # special case, the base class returns False in this case for some reason + return self.filepath.exists() + return super().exists() or self._as_dir().exists() + + def _as_dir(self) -> zipfile_Path: + # note: seems that zip always uses forward slash, regardless OS? + return zipfile_Path(self.root, self.at + '/') + + def rglob(self, glob: str) -> Sequence[ZipPath]: + # note: not 100% sure about the correctness, but seem fine? + # Path.match() matches from the right, so need to + rpaths = [p for p in self.root.namelist() if p.startswith(self.at)] + rpaths = [p for p in rpaths if Path(p).match(glob)] + return [ZipPath(self.root, p) for p in rpaths] + + def relative_to(self, other: ZipPath) -> Path: + assert self.filepath == other.filepath, (self.filepath, other.filepath) + return self.subpath.relative_to(other.subpath) + + @property + def parts(self) -> Sequence[str]: + # messy, but might be ok.. + return self.filepath.parts + self.subpath.parts + + def __truediv__(self, key) -> ZipPath: + # need to implement it so the return type is not zipfile.Path + tmp = zipfile_Path(self.root) / self.at / key + return ZipPath(self.root, tmp.at) + + def iterdir(self) -> Iterator[ZipPath]: + for s in self._as_dir().iterdir(): + yield ZipPath(s.root, s.at) # type: ignore[attr-defined] + + @property + def stem(self) -> str: + return self.subpath.stem + + @property # type: ignore[misc] + def __class__(self): + return Path + + def __eq__(self, other) -> bool: + # hmm, super class doesn't seem to treat as equals unless they are the same object + if not isinstance(other, ZipPath): + return False + return (self.filepath, self.subpath) == (other.filepath, other.subpath) + + def __lt__(self, other) -> bool: + if not isinstance(other, ZipPath): + return False + return (self.filepath, self.subpath) < (other.filepath, other.subpath) + + def __hash__(self) -> int: + return hash((self.filepath, self.subpath)) + + def stat(self) -> os.stat_result: + # NOTE: zip datetimes have no notion of time zone, usually they just keep local time? + # see https://en.wikipedia.org/wiki/ZIP_(file_format)#Structure + dt = datetime(*self.root.getinfo(self.at).date_time) + ts = int(dt.timestamp()) + params = dict( + st_mode=0, + st_ino=0, + st_dev=0, + st_nlink=1, + st_uid=1000, + st_gid=1000, + st_size=0, # todo compute it properly? 
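+            # (if it ever needs computing, the uncompressed size is available as self.root.getinfo(self.at).file_size)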
+ st_atime=ts, + st_mtime=ts, + st_ctime=ts, + ) + return os.stat_result(tuple(params.values())) + +# fmt: on diff --git a/my/core/common.py b/my/core/common.py index 738f6f1..cd6de49 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -162,7 +162,7 @@ Paths = Union[Sequence[PathIsh], PathIsh] def _is_zippath(p: Path) -> bool: - # weak type check here, don't want to depend on .kompress module in get_files + # weak type check here, don't want to depend on kompress library in get_files return type(p).__name__ == 'ZipPath' @@ -234,8 +234,8 @@ def get_files( traceback.print_stack() if guess_compression: - from .kompress import CPath, is_compressed - paths = [CPath(p) if is_compressed(p) and not _is_zippath(p) else p for p in paths] + from kompress import CPath, is_compressed + paths = [CPath(p) if is_compressed(p) and not _is_zippath(p) else p for p in paths] # TODO fwtf is going on here?... make sure it's tested return tuple(paths) diff --git a/my/core/kompress.py b/my/core/kompress.py index 1f00013..25dba8c 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -1,257 +1,17 @@ -""" -Various helpers for compression -""" -from __future__ import annotations +from .common import assert_subpackage; assert_subpackage(__name__) +from . import warnings -from datetime import datetime -from functools import total_ordering -import io -import pathlib -from pathlib import Path -import sys -from typing import Union, IO, Sequence, Any, Iterator +# do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath) +# warnings.high('my.core.kompress is deprecated, please use "kompress" library directly. See https://github.com/karlicoss/kompress') -PathIsh = Union[Path, str] - - -class Ext: - xz = '.xz' - zip = '.zip' - lz4 = '.lz4' - zstd = '.zstd' - zst = '.zst' - targz = '.tar.gz' - - -def is_compressed(p: Path) -> bool: - # todo kinda lame way for now.. use mime ideally? - # should cooperate with kompress.kopen? - return any(p.name.endswith(ext) for ext in {Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz}) - - -def _zstd_open(path: Path, *args, **kwargs) -> IO: - import zstandard as zstd # type: ignore - fh = path.open('rb') - dctx = zstd.ZstdDecompressor() - reader = dctx.stream_reader(fh) - - mode = kwargs.get('mode', 'rt') - if mode == 'rb': - return reader +try: + from kompress import * +except ModuleNotFoundError as e: + if e.name == 'kompress': + warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.') + from ._deprecated.kompress import * # type: ignore[assignment] else: - # must be text mode - kwargs.pop('mode') # TextIOWrapper doesn't like it - return io.TextIOWrapper(reader, **kwargs) # meh + raise e - -# TODO use the 'dependent type' trick for return type? -def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: - # just in case, but I think this shouldn't be necessary anymore - # since when we call .read_text, encoding is passed already - if mode in {'r', 'rt'}: - encoding = kwargs.get('encoding', 'utf8') - else: - encoding = None - kwargs['encoding'] = encoding - - pp = Path(path) - name = pp.name - if name.endswith(Ext.xz): - import lzma - - # ugh. 
for lzma, 'r' means 'rb' - # https://github.com/python/cpython/blob/d01cf5072be5511595b6d0c35ace6c1b07716f8d/Lib/lzma.py#L97 - # whereas for regular open, 'r' means 'rt' - # https://docs.python.org/3/library/functions.html#open - if mode == 'r': - mode = 'rt' - kwargs['mode'] = mode - return lzma.open(pp, *args, **kwargs) - elif name.endswith(Ext.zip): - # eh. this behaviour is a bit dodgy... - from zipfile import ZipFile - zfile = ZipFile(pp) - - [subpath] = args # meh? - - ## oh god... https://stackoverflow.com/a/5639960/706389 - ifile = zfile.open(subpath, mode='r') - ifile.readable = lambda: True # type: ignore - ifile.writable = lambda: False # type: ignore - ifile.seekable = lambda: False # type: ignore - ifile.read1 = ifile.read # type: ignore - # TODO pass all kwargs here?? - # todo 'expected "BinaryIO"'?? - return io.TextIOWrapper(ifile, encoding=encoding) - elif name.endswith(Ext.lz4): - import lz4.frame # type: ignore - return lz4.frame.open(str(pp), mode, *args, **kwargs) - elif name.endswith(Ext.zstd) or name.endswith(Ext.zst): - kwargs['mode'] = mode - return _zstd_open(pp, *args, **kwargs) - elif name.endswith(Ext.targz): - import tarfile - # FIXME pass mode? - tf = tarfile.open(pp) - # TODO pass encoding? - x = tf.extractfile(*args); assert x is not None - return x - else: - return pp.open(mode, *args, **kwargs) - - -import typing -import os - -if typing.TYPE_CHECKING: - # otherwise mypy can't figure out that BasePath is a type alias.. - BasePath = pathlib.Path -else: - BasePath = pathlib.WindowsPath if os.name == 'nt' else pathlib.PosixPath - - -class CPath(BasePath): - """ - Hacky way to support compressed files. - If you can think of a better way to do this, please let me know! https://github.com/karlicoss/HPI/issues/20 - - Ugh. So, can't override Path because of some _flavour thing. - Path only has _accessor and _closed slots, so can't directly set .open method - _accessor.open has to return file descriptor, doesn't work for compressed stuff. - """ - def open(self, *args, **kwargs): - kopen_kwargs = {} - mode = kwargs.get('mode') - if mode is not None: - kopen_kwargs['mode'] = mode - encoding = kwargs.get('encoding') - if encoding is not None: - kopen_kwargs['encoding'] = encoding - # TODO assert read only? - return kopen(str(self), **kopen_kwargs) - - -open = kopen # TODO deprecate - - -# meh -# TODO ideally switch to ZipPath or smth similar? -# nothing else supports subpath properly anyway -def kexists(path: PathIsh, subpath: str) -> bool: - try: - kopen(path, subpath) - return True - except Exception: - return False - - -import zipfile -if sys.version_info[:2] >= (3, 8): - # meh... 
zipfile.Path is not available on 3.7 - zipfile_Path = zipfile.Path -else: - if typing.TYPE_CHECKING: - zipfile_Path = Any - else: - zipfile_Path = object - - -@total_ordering -class ZipPath(zipfile_Path): - # NOTE: is_dir/is_file might not behave as expected, the base class checks it only based on the slash in path - - # seems that root/at are not exposed in the docs, so might be an implementation detail - root: zipfile.ZipFile - at: str - - @property - def filepath(self) -> Path: - res = self.root.filename - assert res is not None # make mypy happy - return Path(res) - - @property - def subpath(self) -> Path: - return Path(self.at) - - def absolute(self) -> ZipPath: - return ZipPath(self.filepath.absolute(), self.at) - - def expanduser(self) -> ZipPath: - return ZipPath(self.filepath.expanduser(), self.at) - - def exists(self) -> bool: - if self.at == '': - # special case, the base class returns False in this case for some reason - return self.filepath.exists() - return super().exists() or self._as_dir().exists() - - def _as_dir(self) -> zipfile_Path: - # note: seems that zip always uses forward slash, regardless OS? - return zipfile_Path(self.root, self.at + '/') - - def rglob(self, glob: str) -> Sequence[ZipPath]: - # note: not 100% sure about the correctness, but seem fine? - # Path.match() matches from the right, so need to - rpaths = [p for p in self.root.namelist() if p.startswith(self.at)] - rpaths = [p for p in rpaths if Path(p).match(glob)] - return [ZipPath(self.root, p) for p in rpaths] - - def relative_to(self, other: ZipPath) -> Path: - assert self.filepath == other.filepath, (self.filepath, other.filepath) - return self.subpath.relative_to(other.subpath) - - @property - def parts(self) -> Sequence[str]: - # messy, but might be ok.. - return self.filepath.parts + self.subpath.parts - - def __truediv__(self, key) -> ZipPath: - # need to implement it so the return type is not zipfile.Path - tmp = zipfile_Path(self.root) / self.at / key - return ZipPath(self.root, tmp.at) - - def iterdir(self) -> Iterator[ZipPath]: - for s in self._as_dir().iterdir(): - yield ZipPath(s.root, s.at) # type: ignore[attr-defined] - - @property - def stem(self) -> str: - return self.subpath.stem - - @property # type: ignore[misc] - def __class__(self): - return Path - - def __eq__(self, other) -> bool: - # hmm, super class doesn't seem to treat as equals unless they are the same object - if not isinstance(other, ZipPath): - return False - return (self.filepath, self.subpath) == (other.filepath, other.subpath) - - def __lt__(self, other) -> bool: - if not isinstance(other, ZipPath): - return False - return (self.filepath, self.subpath) < (other.filepath, other.subpath) - - def __hash__(self) -> int: - return hash((self.filepath, self.subpath)) - - def stat(self) -> os.stat_result: - # NOTE: zip datetimes have no notion of time zone, usually they just keep local time? - # see https://en.wikipedia.org/wiki/ZIP_(file_format)#Structure - dt = datetime(*self.root.getinfo(self.at).date_time) - ts = int(dt.timestamp()) - params = dict( - st_mode=0, - st_ino=0, - st_dev=0, - st_nlink=1, - st_uid=1000, - st_gid=1000, - st_size=0, # todo compute it properly? 
- st_atime=ts, - st_mtime=ts, - st_ctime=ts, - ) - return os.stat_result(tuple(params.values())) +# this is deprecated in compress, keep here for backwards compatibility +open = kopen # noqa: F405 diff --git a/my/core/tests/kompress.py b/my/core/tests/kompress.py deleted file mode 100644 index 19c4e82..0000000 --- a/my/core/tests/kompress.py +++ /dev/null @@ -1,128 +0,0 @@ -from pathlib import Path -import lzma -import sys -import zipfile - -from ..kompress import kopen, kexists, CPath, ZipPath - -import pytest - - -structure_data: Path = Path(__file__).parent / "structure_data" - - -def test_kopen(tmp_path: Path) -> None: - "Plaintext handled transparently" - # fmt: off - assert kopen(tmp_path / 'file' ).read() == 'just plaintext' - assert kopen(tmp_path / 'file.xz').read() == 'compressed text' - # fmt: on - - "For zips behaviour is a bit different (not sure about all this, tbh...)" - assert kopen(tmp_path / 'file.zip', 'path/in/archive').read() == 'data in zip' - - -def test_kexists(tmp_path: Path) -> None: - # TODO also test top level? - # fmt: off - assert kexists(str(tmp_path / 'file.zip'), 'path/in/archive') - assert not kexists(str(tmp_path / 'file.zip'), 'path/notin/archive') - # fmt: on - - # TODO not sure about this? - assert not kexists(tmp_path / 'nosuchzip.zip', 'path/in/archive') - - -def test_cpath(tmp_path: Path) -> None: - # fmt: off - CPath(str(tmp_path / 'file' )).read_text() == 'just plaintext' - CPath( tmp_path / 'file.xz').read_text() == 'compressed text' - # fmt: on - - -@pytest.fixture(autouse=True) -def prepare(tmp_path: Path): - (tmp_path / 'file').write_text('just plaintext') - with (tmp_path / 'file.xz').open('wb') as f: - with lzma.open(f, 'w') as lzf: - lzf.write(b'compressed text') - with zipfile.ZipFile(tmp_path / 'file.zip', 'w') as zf: - zf.writestr('path/in/archive', 'data in zip') - try: - yield None - finally: - pass - - -def test_zippath() -> None: - target = structure_data / 'gdpr_export.zip' - assert target.exists(), target # precondition - - zp = ZipPath(target) - - # magic! convenient to make third party libraries agnostic of ZipPath - assert isinstance(zp, Path) - assert isinstance(zp, ZipPath) - assert isinstance(zp / 'subpath', Path) - # TODO maybe change __str__/__repr__? since it's a bit misleading: - # Path('/code/hpi/tests/core/structure_data/gdpr_export.zip', 'gdpr_export/') - - assert ZipPath(target) == ZipPath(target) - assert zp.absolute() == zp - - # shouldn't crash - hash(zp) - - assert zp.exists() - assert (zp / 'gdpr_export' / 'comments').exists() - # check str constructor just in case - assert (ZipPath(str(target)) / 'gdpr_export' / 'comments').exists() - assert not (ZipPath(str(target)) / 'whatever').exists() - - matched = list(zp.rglob('*')) - assert len(matched) > 0 - assert all(p.filepath == target for p in matched), matched - - rpaths = [p.relative_to(zp) for p in matched] - gdpr_export = Path('gdpr_export') - # fmt: off - assert rpaths == [ - gdpr_export, - gdpr_export / 'comments', - gdpr_export / 'comments' / 'comments.json', - gdpr_export / 'profile', - gdpr_export / 'profile' / 'settings.json', - gdpr_export / 'messages', - gdpr_export / 'messages' / 'index.csv', - ], rpaths - # fmt: on - - # TODO hmm this doesn't work atm, whereas Path does - # not sure if it should be defensive or something... 
- # ZipPath('doesnotexist') - # same for this one - # assert ZipPath(Path('test'), 'whatever').absolute() == ZipPath(Path('test').absolute(), 'whatever') - - assert (ZipPath(target) / 'gdpr_export' / 'comments').exists() - - jsons = [p.relative_to(zp / 'gdpr_export') for p in zp.rglob('*.json')] - # fmt: off - assert jsons == [ - Path('comments', 'comments.json'), - Path('profile' , 'settings.json'), - ] - # fmt: on - - # NOTE: hmm interesting, seems that ZipPath is happy with forward slash regardless OS? - assert list(zp.rglob('mes*')) == [ZipPath(target, 'gdpr_export/messages')] - - iterdir_res = list((zp / 'gdpr_export').iterdir()) - assert len(iterdir_res) == 3 - assert all(isinstance(p, Path) for p in iterdir_res) - - # date recorded in the zip archive - assert (zp / 'gdpr_export' / 'comments' / 'comments.json').stat().st_mtime > 1625000000 - # TODO ugh. - # unzip -l shows the date as 2021-07-01 09:43 - # however, python reads it as 2021-07-01 01:43 ?? - # don't really feel like dealing with this for now, it's not tz aware anyway diff --git a/my/kython/kompress.py b/my/kython/kompress.py deleted file mode 120000 index 59edcd1..0000000 --- a/my/kython/kompress.py +++ /dev/null @@ -1 +0,0 @@ -../core/kompress.py \ No newline at end of file diff --git a/my/kython/kompress.py b/my/kython/kompress.py new file mode 100644 index 0000000..01e24e4 --- /dev/null +++ b/my/kython/kompress.py @@ -0,0 +1,6 @@ +from my.core import __NOT_HPI_MODULE__ +from my.core import warnings + +warnings.high('my.kython.kompress is deprecated, please use "kompress" library directly. See https://github.com/karlicoss/kompress') + +from my.core.kompress import * diff --git a/setup.py b/setup.py index 5fa988e..42ffeaa 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,7 @@ INSTALL_REQUIRES = [ 'more-itertools', # it's just too useful and very common anyway 'decorator' , # less pain in writing correct decorators. very mature and stable, so worth keeping in core 'click>=8.1' , # for the CLI, printing colors, decorator-based - may allow extensions to CLI + 'kompress' , # for transparent access to compressed files via pathlib.Path ] From 28d2450a214ac927b00209b56d841723ee18e8b6 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 19 Oct 2023 01:08:50 +0100 Subject: [PATCH 145/302] reddit.rexport: some cleanup, move get_events stuff into personal overlay --- my/reddit/rexport.py | 171 +++++++------------------------------------ my/tests/reddit.py | 43 ++--------- 2 files changed, 36 insertions(+), 178 deletions(-) diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index f20d00e..dadfb5a 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -5,13 +5,24 @@ REQUIRES = [ 'git+https://github.com/karlicoss/rexport', ] -from pathlib import Path -from my.core.common import Paths from dataclasses import dataclass -from typing import Any +from pathlib import Path +from typing import Iterator, Sequence + +from my.core import ( + get_files, + make_logger, + stat, + Paths, + Stats, +) +from my.core.cfg import make_config, Attrs +from my.core.common import mcachew from my.config import reddit as uconfig +logger = make_logger(__name__) + @dataclass class reddit(uconfig): @@ -23,7 +34,6 @@ class reddit(uconfig): export_path: Paths -from my.core.cfg import make_config, Attrs # hmm, also nice thing about this is that migration is possible to test without the rest of the config? 
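# (make_config below passes the user's config attrs through migration() before constructing the dataclass, which is how the deprecated export_dir spelling keeps working)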
def migration(attrs: Attrs) -> Attrs: # new structure, take top-level config and extract 'rexport' class @@ -33,6 +43,7 @@ def migration(attrs: Attrs) -> Attrs: attrs['export_path'] = ex.export_path else: from my.core.warnings import high + high("""DEPRECATED! Please modify your reddit config to look like: class reddit: @@ -45,15 +56,15 @@ class reddit: high(f'"{export_dir}" is deprecated! Please use "export_path" instead."') return attrs + config = make_config(reddit, migration=migration) ### -# TODO not sure about the laziness... - try: from rexport import dal except ModuleNotFoundError as e: from my.core.compat import pre_pip_dal_handler + dal = pre_pip_dal_handler('rexport', e, config, requires=REQUIRES) # TODO ugh. this would import too early # but on the other hand we do want to bring the objects into the scope for easier imports, etc. ugh! @@ -61,34 +72,28 @@ except ModuleNotFoundError as e: # maybe, the config could dynamically detect change and reimport itself? dunno. ### -############################ -from typing import List, Sequence, Mapping, Iterator, Any -from my.core import make_logger -from my.core.common import mcachew, get_files, make_dict, Stats - - -logger = make_logger(__name__) - - -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) -Uid = dal.Sid # str -Save = dal.Save -Comment = dal.Comment -Submission = dal.Submission -Upvote = dal.Upvote +# fmt: off +Uid = dal.Sid # str +Save = dal.Save +Comment = dal.Comment +Submission = dal.Submission +Upvote = dal.Upvote +# fmt: on def _dal() -> dal.DAL: inp = list(inputs()) return dal.DAL(inp) + cache = mcachew(depends_on=inputs) + @cache def saved() -> Iterator[Save]: return _dal().saved() @@ -109,132 +114,12 @@ def upvoted() -> Iterator[Upvote]: return _dal().upvoted() -### the rest of the file is some elaborate attempt of restoring favorite/unfavorite times - -from typing import Dict, Iterable, Iterator, NamedTuple -from functools import lru_cache -import re -from datetime import datetime, timezone -from multiprocessing import Pool - -# TODO hmm. apparently decompressing takes quite a bit of time... - -class SaveWithDt(NamedTuple): - save: Save - backup_dt: datetime - - def __getattr__(self, x): - return getattr(self.save, x) - -# TODO for future events? -EventKind = SaveWithDt - - -class Event(NamedTuple): - dt: datetime - text: str - kind: EventKind - eid: str - title: str - url: str - - @property - def cmp_key(self): - return (self.dt, (1 if 'unfavorited' in self.text else 0)) - - -Url = str - -def _get_bdate(bfile: Path) -> datetime: - RE = re.compile(r'reddit.(\d{14})') - stem = bfile.stem - stem = stem.replace('T', '').replace('Z', '') # adapt for arctee - match = RE.search(stem) - assert match is not None - bdt = datetime.strptime(match.group(1), "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc) - return bdt - - -def _get_state(bfile: Path) -> Dict[Uid, SaveWithDt]: - logger.debug('handling %s', bfile) - - bdt = _get_bdate(bfile) - - saves = [SaveWithDt(save, bdt) for save in dal.DAL([bfile]).saved()] - return make_dict( - sorted(saves, key=lambda p: p.save.created), - key=lambda s: s.save.sid, - ) - -# TODO hmm. think about it.. if we set default backups=inputs() -# it's called early so it ends up as a global variable that we can't monkey patch easily -@mcachew(lambda backups: backups) -def _get_events(backups: Sequence[Path], parallel: bool=True) -> Iterator[Event]: - # todo cachew: let it transform return type? so you don't have to write a wrapper for lists? 
- - prev_saves: Mapping[Uid, SaveWithDt] = {} - # TODO suppress first batch?? - # TODO for initial batch, treat event time as creation time - - states: Iterable[Mapping[Uid, SaveWithDt]] - if parallel: - with Pool() as p: - states = p.map(_get_state, backups) - else: - # also make it lazy... - states = map(_get_state, backups) - # TODO mm, need to make that iterative too? - - for i, (bfile, saves) in enumerate(zip(backups, states)): - bdt = _get_bdate(bfile) - - first = i == 0 - - for key in set(prev_saves.keys()).symmetric_difference(set(saves.keys())): - ps = prev_saves.get(key, None) - if ps is not None: - # TODO use backup date, that is more precise... - # eh. I guess just take max and it will always be correct? - assert not first - yield Event( - dt=bdt, # TODO average with ps.save_dt? - text="unfavorited", - kind=ps, - eid=f'unf-{ps.sid}', - url=ps.url, - title=ps.title, - ) - else: # already in saves - s = saves[key] - last_saved = s.backup_dt - yield Event( - dt=s.created if first else last_saved, - text=f"favorited{' [initial]' if first else ''}", - kind=s, - eid=f'fav-{s.sid}', - url=s.url, - title=s.title, - ) - prev_saves = saves - - # TODO a bit awkward, favorited should compare lower than unfavorited? - -@lru_cache(1) -def events(*args, **kwargs) -> List[Event]: - inp = inputs() - # 2.2s for 300 files without cachew - # 0.2s for 300 files with cachew - evit = _get_events(inp, *args, **kwargs) - # todo mypy is confused here and thinks it's iterable of Path? perhaps something to do with mcachew? - return list(sorted(evit, key=lambda e: e.cmp_key)) - - def stats() -> Stats: - from my.core import stat return { + # fmt: off **stat(saved ), **stat(comments ), **stat(submissions), **stat(upvoted ), + # fmt: on } - diff --git a/my/tests/reddit.py b/my/tests/reddit.py index 0871041..4af95ae 100644 --- a/my/tests/reddit.py +++ b/my/tests/reddit.py @@ -1,5 +1,3 @@ -from datetime import datetime, timezone - from my.core.cfg import tmp_config from my.core.common import make_dict @@ -13,34 +11,25 @@ import my.reddit.rexport as my_reddit_rexport import my.reddit.all as my_reddit_all -def test_basic() -> None: +def test_basic_1() -> None: # todo maybe this should call stat or something instead? # would ensure reasonable stat implementation as well and less duplication # note: deliberately use old module (instead of my.reddit.all) to test bwd compatibility - from my.reddit import saved, events + from my.reddit import saved - assert len(list(events())) > 0 assert len(list(saved())) > 0 +def test_basic_2() -> None: + # deliberately check call from a different style of import to make sure tmp_config works + saves = list(my_reddit_rexport.saved()) + assert len(saves) > 0 + + def test_comments() -> None: assert len(list(my_reddit_all.comments())) > 0 -def test_unfav() -> None: - from my.reddit import events - - ev = events() - url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/' - uev = [e for e in ev if e.url == url] - assert len(uev) == 2 - ff = uev[0] - # TODO could recover these from takeout perhaps? - assert ff.text == 'favorited [initial]' - uf = uev[1] - assert uf.text == 'unfavorited' - - def test_saves() -> None: from my.reddit.all import saved @@ -51,22 +40,6 @@ def test_saves() -> None: make_dict(saves, key=lambda s: s.sid) -def test_disappearing() -> None: - # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason - # but I guess it was just a short glitch... 
so whatever - evs = my_reddit_rexport.events() - favs = [s.kind for s in evs if s.text == 'favorited'] - [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"'] - assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=timezone.utc) - - -def test_unfavorite() -> None: - evs = my_reddit_rexport.events() - unfavs = [s for s in evs if s.text == 'unfavorited'] - [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop'] - assert xxx.dt == datetime(2019, 1, 29, 10, 10, 20, tzinfo=timezone.utc) - - def test_preserves_extra_attr() -> None: # doesn't strictly belong here (not specific to reddit) # but my.reddit does a fair bit of dynamic hacking, so perhaps a good place to check nothing is lost From 29832a9f75416ac60f47ee1af5473f2c932e6f8d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 19 Oct 2023 01:52:57 +0100 Subject: [PATCH 146/302] core: fix test_get_files after updating kompress --- my/core/tests/test_get_files.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/my/core/tests/test_get_files.py b/my/core/tests/test_get_files.py index fdec5c0..80d9e8f 100644 --- a/my/core/tests/test_get_files.py +++ b/my/core/tests/test_get_files.py @@ -89,20 +89,20 @@ def test_explicit_glob() -> None: You can pass a glob to restrict the extensions ''' - create('/tmp/hpi_test/file_3.zip') - create('/tmp/hpi_test/file_2.zip') + create('/tmp/hpi_test/file_3.gz') + create('/tmp/hpi_test/file_2.gz') create('/tmp/hpi_test/ignoreme') - create('/tmp/hpi_test/file.zip') + create('/tmp/hpi_test/file.gz') # todo walrus operator would be great here... expected = ( - Path('/tmp/hpi_test/file_2.zip'), - Path('/tmp/hpi_test/file_3.zip'), + Path('/tmp/hpi_test/file_2.gz'), + Path('/tmp/hpi_test/file_3.gz'), ) - assert get_files('/tmp/hpi_test', 'file_*.zip') == expected + assert get_files('/tmp/hpi_test', 'file_*.gz') == expected "named argument should work too" - assert get_files('/tmp/hpi_test', glob='file_*.zip') == expected + assert get_files('/tmp/hpi_test', glob='file_*.gz') == expected def test_implicit_glob() -> None: @@ -114,14 +114,14 @@ def test_implicit_glob() -> None: create('/tmp/hpi_test/123/') create('/tmp/hpi_test/123/dummy') - create('/tmp/hpi_test/123/file.zip') + create('/tmp/hpi_test/123/file.gz') create('/tmp/hpi_test/456/') create('/tmp/hpi_test/456/dummy') - create('/tmp/hpi_test/456/file.zip') + create('/tmp/hpi_test/456/file.gz') - assert get_files(['/tmp/hpi_test/*/*.zip']) == ( - Path('/tmp/hpi_test/123/file.zip'), - Path('/tmp/hpi_test/456/file.zip'), + assert get_files(['/tmp/hpi_test/*/*.gz']) == ( + Path('/tmp/hpi_test/123/file.gz'), + Path('/tmp/hpi_test/456/file.gz'), ) From 9ffce1b696dcd0cfa8d1b22b1eddbc1b069e22a1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 19 Oct 2023 02:05:43 +0100 Subject: [PATCH 147/302] reddit.rexport: add accessors for subreddits, multireddits and profile --- my/reddit/rexport.py | 56 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index dadfb5a..1f72133 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -1,17 +1,20 @@ """ Reddit data: saved items/comments/upvotes/etc. 
""" +from __future__ import annotations + REQUIRES = [ 'git+https://github.com/karlicoss/rexport', ] from dataclasses import dataclass from pathlib import Path -from typing import Iterator, Sequence +from typing import TYPE_CHECKING, Iterator, Sequence from my.core import ( get_files, make_logger, + warnings, stat, Paths, Stats, @@ -42,9 +45,7 @@ def migration(attrs: Attrs) -> Attrs: ex: uconfig.rexport = attrs['rexport'] attrs['export_path'] = ex.export_path else: - from my.core.warnings import high - - high("""DEPRECATED! Please modify your reddit config to look like: + warnings.high("""DEPRECATED! Please modify your reddit config to look like: class reddit: class rexport: @@ -53,7 +54,7 @@ class reddit: export_dir = 'export_dir' if export_dir in attrs: # legacy name attrs['export_path'] = attrs[export_dir] - high(f'"{export_dir}" is deprecated! Please use "export_path" instead."') + warnings.high(f'"{export_dir}" is deprecated! Please use "export_path" instead."') return attrs @@ -77,6 +78,11 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) +# TODO hmm so maybe these import here are not so great +# the issue is when the dal is updated (e.g. more types added) +# then user's state can be inconsistent if they update HPI, but don't update the dal +# maybe best to keep things begind the DAL after all + # fmt: off Uid = dal.Sid # str Save = dal.Save @@ -114,6 +120,46 @@ def upvoted() -> Iterator[Upvote]: return _dal().upvoted() +# uhh.. so with from __future__ import annotations, in principle we don't need updated export +# (with new entity types for function definitions below) +# however, cachew (as of 0.14.20231004) will crash during to get_type_hints call with these +# so we need to make cachew decorating defensive here +# will need to keep this for some time for backwards compatibility till cachew fix catches up +if not TYPE_CHECKING: + # in runtime need to be defensive + try: + # here we just check that types are available, we don't actually want to import them + # fmt: off + dal.Subreddit + dal.Profile + dal.Multireddit + # fmt: on + except AttributeError as ae: + warnings.high(f'{ae} : please update "rexport" installation') + _cache = lambda f: f + _USING_NEW_REXPORT = False + else: + _cache = cache + _USING_NEW_REXPORT = True +else: + _cache = cache + + +@_cache +def subreddits() -> Iterator[dal.Subreddit]: + return _dal().subreddits() + + +@_cache +def multireddits() -> Iterator[dal.Multireddit]: + return _dal().multireddits() + + +@_cache +def profile() -> dal.Profile: + return _dal().profile() + + def stats() -> Stats: return { # fmt: off From c63e80ce944580530d9f8ac175c9a24fbd75e3cf Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 14 Oct 2023 19:21:58 +0100 Subject: [PATCH 148/302] core: more consistent handling of zip archives in get_files + tests --- my/core/common.py | 41 ++++++++++++++++++++------------- my/core/tests/test_get_files.py | 36 +++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index cd6de49..1284565 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -3,6 +3,7 @@ from pathlib import Path from datetime import datetime import functools from contextlib import contextmanager +import os import sys import types from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple, TYPE_CHECKING, NoReturn @@ -161,11 +162,6 @@ from .logging import setup_logger, LazyLogger Paths = Union[Sequence[PathIsh], PathIsh] -def 
_is_zippath(p: Path) -> bool: - # weak type check here, don't want to depend on kompress library in get_files - return type(p).__name__ == 'ZipPath' - - DEFAULT_GLOB = '*' def get_files( pp: Paths, @@ -185,22 +181,19 @@ def get_files( elif isinstance(pp, str): if pp == '': # special case -- makes sense for optional data sources, etc - return () # early return to prevent warnings etc + return () # early return to prevent warnings etc sources = [Path(pp)] else: sources = [p if isinstance(p, Path) else Path(p) for p in pp] def caller() -> str: import traceback + # TODO ugh. very flaky... -3 because [, get_files(), ] return traceback.extract_stack()[-3].filename paths: List[Path] = [] for src in sources: - if _is_zippath(src): - paths.append(src) - continue - if src.parts[0] == '~': src = src.expanduser() # note: glob handled first, because e.g. on Windows asterisk makes is_dir unhappy @@ -209,15 +202,18 @@ def get_files( if glob != DEFAULT_GLOB: warnings.warn(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!") paths.extend(map(Path, do_glob(gs))) - elif src.is_dir(): + elif os.path.isdir(str(src)): + # NOTE: we're using os.path here on purpose instead of src.is_dir + # the reason is is_dir for archives might return True and then + # this clause would try globbing insize the archives + # this is generally undesirable (since modules handle archives themselves) + # todo not sure if should be recursive? # note: glob='**/*.ext' works without any changes.. so perhaps it's ok as it is gp: Iterable[Path] = src.glob(glob) paths.extend(gp) else: - if not src.is_file(): - # todo not sure, might be race condition? - raise RuntimeError(f"Expected '{src}' to exist") + assert src.exists(), src # todo assert matches glob?? paths.append(src) @@ -231,11 +227,24 @@ def get_files( '''.strip()) # traceback is useful to figure out what config caused it? import traceback + traceback.print_stack() if guess_compression: - from kompress import CPath, is_compressed - paths = [CPath(p) if is_compressed(p) and not _is_zippath(p) else p for p in paths] # TODO fwtf is going on here?... make sure it's tested + from .kompress import CPath, is_compressed, ZipPath + + # NOTE: wrap is just for backwards compat with vendorized kompress + # with kompress library, only is_compressed check and Cpath should be enough + def wrap(p: Path) -> Path: + if isinstance(p, ZipPath): + return p + if p.suffix == '.zip': + return ZipPath(p) # type: ignore[return-value] + if is_compressed(p): + return CPath(p) + return p + + paths = [wrap(p) for p in paths] return tuple(paths) diff --git a/my/core/tests/test_get_files.py b/my/core/tests/test_get_files.py index 80d9e8f..2bdc903 100644 --- a/my/core/tests/test_get_files.py +++ b/my/core/tests/test_get_files.py @@ -3,15 +3,18 @@ from pathlib import Path import shutil import tempfile from typing import TYPE_CHECKING +import zipfile -from ..compat import windows from ..common import get_files +from ..compat import windows +from ..kompress import CPath, ZipPath import pytest # hack to replace all /tmp with 'real' tmp dir # not ideal, but makes tests more concise +# TODO get rid of this, it's super confusing.. def _get_files(x, *args, **kwargs): from ..common import get_files as get_files_orig @@ -27,9 +30,10 @@ def _get_files(x, *args, **kwargs): x = repl(x) res = get_files_orig(x, *args, **kwargs) - return tuple(Path(str(i).replace(TMP, '/tmp')) for i in res) # hack back for asserts.. 
+ return tuple(type(i)(str(i).replace(TMP, '/tmp')) for i in res) # hack back for asserts.. +get_files_orig = get_files if not TYPE_CHECKING: get_files = _get_files @@ -136,6 +140,34 @@ def test_no_files() -> None: assert get_files('bad*glob') == () +def test_compressed(tmp_path: Path) -> None: + file1 = tmp_path / 'file_1.zstd' + file2 = tmp_path / 'file_2.zip' + file3 = tmp_path / 'file_3.csv' + + file1.touch() + with zipfile.ZipFile(file2, 'w') as zf: + zf.writestr('path/in/archive', 'data in zip') + file3.touch() + + results = get_files_orig(tmp_path) + [res1, res2, res3] = results + assert isinstance(res1, CPath) + assert isinstance(res2, ZipPath) # NOTE this didn't work on vendorized kompress, but it's fine, was never used? + assert not isinstance(res3, CPath) + + results = get_files_orig( + [CPath(file1), ZipPath(file2), file3], + # sorting a mixture of ZipPath/Path was broken in old kompress + # it almost never happened though (usually it's only a bunch of ZipPath, so not a huge issue) + sort=False, + ) + [res1, res2, res3] = results + assert isinstance(res1, CPath) + assert isinstance(res2, ZipPath) + assert not isinstance(res3, CPath) + + # TODO not sure if should uniquify if the filenames end up same? # TODO not sure about the symlinks? and hidden files? From 8c2d1c9463f76507a62db75eb9c4c111c48270a0 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 14 Oct 2023 23:18:01 +0100 Subject: [PATCH 149/302] general: use less explicit kompress boilerplate in modules now get_files/kompress library can handle it transparently --- my/core/_deprecated/kompress.py | 4 ++++ my/core/structure.py | 3 ++- my/google/takeout/html.py | 3 +-- my/google/takeout/parser.py | 3 +-- my/google/takeout/paths.py | 4 +--- my/instagram/gdpr.py | 3 +-- my/location/google.py | 3 +-- my/stackexchange/gdpr.py | 5 ++--- my/twitter/archive.py | 5 +---- 9 files changed, 14 insertions(+), 19 deletions(-) diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index e4840f6..cd1bd9d 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -257,4 +257,8 @@ class ZipPath(zipfile_Path): ) return os.stat_result(tuple(params.values())) + @property + def suffix(self) -> str: + return Path(self.parts[-1]).suffix + # fmt: on diff --git a/my/core/structure.py b/my/core/structure.py index 88b75b8..7a0c2a2 100644 --- a/my/core/structure.py +++ b/my/core/structure.py @@ -123,7 +123,8 @@ def match_structure( searchdir = Path(tempfile.mkdtemp(dir=tdir)) - zf = zipfile.ZipFile(base) + # base might already be a ZipPath, and str(base) would end with / + zf = zipfile.ZipFile(str(base).rstrip('/')) zf.extractall(path=str(searchdir)) else: diff --git a/my/google/takeout/html.py b/my/google/takeout/html.py index c01788d..5d65a86 100644 --- a/my/google/takeout/html.py +++ b/my/google/takeout/html.py @@ -146,12 +146,11 @@ class TakeoutHTMLParser(HTMLParser): def read_html(tpath: Path, file: str) -> Iterable[Parsed]: - from ...core.kompress import kopen results: List[Parsed] = [] def cb(dt: datetime, url: Url, title: Title) -> None: results.append((dt, url, title)) parser = TakeoutHTMLParser(callback=cb) - with kopen(tpath, file) as fo: + with (tpath / file).open() as fo: data = fo.read() parser.feed(data) return results diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 9a90c8f..96acfff 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -94,10 +94,9 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: for 
path in reversed(inputs()): with ExitStack() as exit_stack: if config._use_zippath: - from my.core.kompress import ZipPath # for later takeouts it's just 'Takeout' dir, # but for older (pre 2015) it contains email/date in the subdir name - results = tuple(cast(Sequence[Path], ZipPath(path).iterdir())) + results = tuple(cast(Sequence[Path], path.iterdir())) else: results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True)) for m in results: diff --git a/my/google/takeout/paths.py b/my/google/takeout/paths.py index ee3e1e7..5b53149 100644 --- a/my/google/takeout/paths.py +++ b/my/google/takeout/paths.py @@ -23,8 +23,6 @@ config = make_config(google) from pathlib import Path from typing import Optional, Iterable -from ...core.kompress import kexists - def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]: """ @@ -33,7 +31,7 @@ def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]: # TODO FIXME zip is not great.. # allow a lambda expression? that way the user could restrict it for takeout in get_files(config.takeout_path, glob='*.zip'): - if path is None or kexists(takeout, path): + if path is None or (takeout / path).exists(): yield takeout diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 348d69d..afa4c96 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -17,7 +17,6 @@ from my.core import ( assert_never, make_logger, ) -from my.core.kompress import ZipPath from my.config import instagram as user_config @@ -70,7 +69,7 @@ def _decode(s: str) -> str: def _entities() -> Iterator[Res[Union[User, _Message]]]: - last = ZipPath(max(inputs())) + last = max(inputs()) # TODO make sure it works both with plan directory # idelaly get_files should return the right thing, and we won't have to force ZipPath/match_structure here # e.g. possible options are: diff --git a/my/location/google.py b/my/location/google.py index fdddd92..ed37231 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -21,7 +21,6 @@ import geopy # type: ignore from ..core.common import LazyLogger, mcachew from ..core.cachew import cache_dir -from ..core import kompress from my.core.warnings import high @@ -135,7 +134,7 @@ def _iter_locations(path: Path, start=0, stop=None) -> Iterable[Location]: ctx = path.open('r') else: # must be a takeout archive # todo CPath? although not sure if it can be iterative? - ctx = kompress.open(path, _LOCATION_JSON) + ctx = (path / _LOCATION_JSON).open() if USE_GREP: unzip = f'unzip -p "{path}" "{_LOCATION_JSON}"' diff --git a/my/stackexchange/gdpr.py b/my/stackexchange/gdpr.py index 4a3182b..18b2b4d 100644 --- a/my/stackexchange/gdpr.py +++ b/my/stackexchange/gdpr.py @@ -6,7 +6,7 @@ Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][officia ### config from my.config import stackexchange as user_config -from ..core import dataclass, PathIsh, make_config +from ..core import dataclass, PathIsh, make_config, get_files @dataclass class stackexchange(user_config): gdpr_path: PathIsh # path to GDPR zip file @@ -61,12 +61,11 @@ class Vote(NamedTuple): # todo expose vote type? import json -from ..core.kompress import ZipPath from ..core.error import Res def votes() -> Iterable[Res[Vote]]: # TODO there is also some site specific stuff in qa/ directory.. not sure if its' more detailed # todo should be defensive? 
not sure if present when user has no votes
-    path = ZipPath(config.gdpr_path)
+    path = max(get_files(config.gdpr_path))
     votes_path = path / 'analytics' / 'qa\\vote.submit.json' # yes, it does contain a backslash...
     j = json.loads(votes_path.read_text(encoding='utf-8-sig')) # not sure why, but this encoding seems necessary
     for r in reversed(j): # they seem to be in decreasing order by default
diff --git a/my/twitter/archive.py b/my/twitter/archive.py
index 44ebc5f..22014df 100644
--- a/my/twitter/archive.py
+++ b/my/twitter/archive.py
@@ -26,7 +26,6 @@ from functools import cached_property
 import html
 from ..core.common import Paths, datetime_aware
 from ..core.error import Res
-from ..core.kompress import ZipPath
 
 @dataclass
 class twitter_archive(user_config):
@@ -164,9 +163,7 @@ class Like(NamedTuple):
 class ZipExport:
     def __init__(self, archive_path: Path) -> None:
-        # todo maybe this should be insude get_files instead, perhps covered with a flag?
-        self.zpath = ZipPath(archive_path)
-
+        self.zpath = archive_path
         if (self.zpath / 'tweets.csv').exists():
             from ..core.warnings import high
             high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
From 37bb33cdbc1eae6a5c3cfc35eb472461b006287f Mon Sep 17 00:00:00 2001
From: karlicoss
Date: Sat, 21 Oct 2023 22:25:16 +0100
Subject: [PATCH 150/302] experimental: add a hacky helper to import
 "original/shadowed" modules from within overlays

---
 my/core/experimental.py  | 64 ++++++++++++++++++++++++++++++++++++++++
 my/util/hpi_heartbeat.py | 54 +++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 my/core/experimental.py
 create mode 100644 my/util/hpi_heartbeat.py

diff --git a/my/core/experimental.py b/my/core/experimental.py
new file mode 100644
index 0000000..c10ba71
--- /dev/null
+++ b/my/core/experimental.py
@@ -0,0 +1,64 @@
+import sys
+from typing import Any, Dict, Optional
+import types
+
+
+# The idea behind this one is to support accessing "overlaid/shadowed" modules from namespace packages
+# See usage examples here:
+# - https://github.com/karlicoss/hpi-personal-overlay/blob/master/src/my/util/hpi_heartbeat.py
+# - https://github.com/karlicoss/hpi-personal-overlay/blob/master/src/my/twitter/all.py
+# Suppose you want to use my.twitter.talon, which isn't in the default all.py
+# You could just copy all.py to your personal overlay, but that would mean duplicating
+# all the code and possible upstream changes.
+# Alternatively, you could import the "original" my.twitter.all module from "overlay" my.twitter.all
+# _ORIG = import_original_module(__name__, __file__)
+# this would magically take care of package import path etc,
+# and should import the "original" my.twitter.all as _ORIG
+# After that you can call its methods, extend etc.
+def import_original_module(
+    module_name: str,
+    file: str,
+    *,
+    star: bool = False,
+    globals: Optional[Dict[str, Any]] = None,
+) -> types.ModuleType:
+    module_to_restore = sys.modules[module_name]
+
+    # NOTE: we really wanna hack the actual package of the module
+    # rather than just top level my.
+    # since that would be a bit less disruptive
+    module_pkg = module_to_restore.__package__
+    assert module_pkg is not None
+    parent = sys.modules[module_pkg]
+
+    my_path = parent.__path__._path  # type: ignore[attr-defined]
+    my_path_orig = list(my_path)
+
+    def fixup_path() -> None:
+        for i, p in enumerate(my_path_orig):
+            starts = file.startswith(p)
+            if i == 0:
+                # not sure about this.. but I guess it'll always be 0th element?
+                assert starts, (my_path_orig, file)
+            if starts:
+                my_path.remove(p)
+    # should remove exactly one item
+    assert len(my_path) + 1 == len(my_path_orig), (my_path_orig, file)
+
+    try:
+        fixup_path()
+        try:
+            del sys.modules[module_name]
+            # NOTE: we're using __import__ instead of importlib.import_module
+            # since it's closer to the actual normal import (e.g. imports subpackages etc properly)
+            # fromlist=[None] forces it to return rightmost child
+            # (otherwise would just return 'my' package)
+            res = __import__(module_name, fromlist=[None])  # type: ignore[list-item]
+            if star:
+                assert globals is not None
+                globals.update({k: v for k, v in vars(res).items() if not k.startswith('_')})
+            return res
+        finally:
+            sys.modules[module_name] = module_to_restore
+    finally:
+        my_path[:] = my_path_orig
diff --git a/my/util/hpi_heartbeat.py b/my/util/hpi_heartbeat.py
new file mode 100644
index 0000000..84790a4
--- /dev/null
+++ b/my/util/hpi_heartbeat.py
@@ -0,0 +1,54 @@
+"""
+Just a helper module for testing HPI overlays
+In particular the behaviour of the import_original_module function
+
+The idea of testing is that overlays extend this module, and add their own
+items to items(), and the checker asserts all overlays have contributed.
+"""
+from my.core import __NOT_HPI_MODULE__
+
+from dataclasses import dataclass
+from datetime import datetime
+import sys
+from typing import Iterator, List
+
+NOW = datetime.now()
+
+
+@dataclass
+class Item:
+    dt: datetime
+    message: str
+    path: List[str]
+
+
+def get_pkg_path() -> List[str]:
+    pkg = sys.modules[__package__]
+    return list(pkg.__path__)
+
+
+# NOTE: since we're hacking path for my.util
+# imports from my. should work as expected
+# (even though my.config is in the private config)
+from my.config import demo
+
+assert demo.username == 'todo'
+
+# however, this won't work while the module is imported
+# from my.util import extra
+# assert extra.message == 'EXTRA'
+# but it will work when we actually call the function (see below)
+
+
+def items() -> Iterator[Item]:
+    from my.config import demo
+
+    assert demo.username == 'todo'
+
+    # here the import works as expected, since by the time the function is called,
+    # all overlays were already processed and paths/sys.modules restored
+    from my.util import extra  # type: ignore[attr-defined]
+
+    assert extra.message == 'EXTRA'
+
+    yield Item(dt=NOW, message='hpi main', path=get_pkg_path())
From 872053a3c3b57510e61bbf9caa54134a8fc844d4 Mon Sep 17 00:00:00 2001
From: karlicoss
Date: Sat, 21 Oct 2023 23:02:40 +0100
Subject: [PATCH 151/302] my.hackernews.harmonic: fix issue with crashing due
 to html escaping

also add proper logging
---
 my/hackernews/harmonic.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/my/hackernews/harmonic.py b/my/hackernews/harmonic.py
index a4eb28e..f78c3ef 100644
--- a/my/hackernews/harmonic.py
+++ b/my/hackernews/harmonic.py
@@ -6,7 +6,6 @@ REQUIRES = ['lxml']
 from dataclasses import dataclass
 from datetime import datetime, timezone
 import json
-import html
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Sequence, TypedDict, cast
 
@@ -19,12 +18,15 @@ from my.core import (
     Stats,
     datetime_aware,
     get_files,
+    make_logger,
     stat,
 )
 from .common import hackernews_link, SavedBase
 
 from my.config import harmonic as user_config
 
+logger = make_logger(__name__)
+
 
 @dataclass
 class harmonic(user_config):
@@ -47,7 +49,8 @@ class Cached(TypedDict):
     # TODO also has children with comments, but not sure I need it?
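# for context, the shared preferences XML that the xpath queries below run against
# presumably looks something like this -- an assumption inferred from the queries
# themselves, not verified against a real Harmonic export:
#
#   <map>
#       <string name="com.simon.harmonichackernews.KEY_SHARED_PREFERENCES_CACHED_STORY12345">{"id": 12345, ...}</string>
#       <string name="com.simon.harmonichackernews.KEY_SHARED_PREFERENCES_BOOKMARKS">...</string>
#   </map>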
-# TODO reuse savedbase in materialistic? +# TODO if we ever add use .text property, need to html.unescape it first +# TODO reuse SavedBase in materialistic? @dataclass class Saved(SavedBase): raw: Cached @@ -79,6 +82,7 @@ _PREFIX = 'com.simon.harmonichackernews.KEY_SHARED_PREFERENCES' def _saved() -> Iterator[Res[Saved]]: for p in inputs(): + logger.info(f'processing: {p}') # TODO defensive for each item! tr = etree.parse(p) @@ -88,7 +92,7 @@ def _saved() -> Iterator[Res[Saved]]: cached: Dict[str, Cached] = {} for sid in cached_ids: res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORY{sid}"]'))) - j = json.loads(html.unescape(res.text)) + j = json.loads(res.text) cached[sid] = j res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_BOOKMARKS"]'))) From c5fe2e94125deda779b0d0088d2bb44732fcbbe1 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 21 Oct 2023 23:08:40 +0100 Subject: [PATCH 152/302] core.stats: fix is_data_provider when from __future__ import annotations is used --- my/core/stats.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/my/core/stats.py b/my/core/stats.py index 8923996..42e8cd9 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -4,7 +4,6 @@ Helpers for hpi doctor/stats functionality. import collections import importlib import inspect -import sys import typing from typing import Optional, Callable, Any, Iterator, Sequence, Dict, List @@ -59,7 +58,14 @@ def is_data_provider(fun: Any) -> bool: if fun.__name__ == 'inputs' or fun.__name__.endswith('_inputs'): return False - return_type = sig.return_annotation + # inspect.signature might return str instead of a proper type object + # if from __future__ import annotations is used + # so best to rely on get_type_hints (which evals the annotations) + type_hints = typing.get_type_hints(fun) + return_type = type_hints.get('return') + if return_type is None: + return False + return type_is_iterable(return_type) @@ -123,9 +129,6 @@ def test_sig_required_params() -> None: def type_is_iterable(type_spec) -> bool: - if sys.version_info[1] < 8: - # there is no get_origin before 3.8, and retrofitting gonna be a lot of pain - return any(x in str(type_spec) for x in ['List', 'Sequence', 'Iterable', 'Iterator']) origin = typing.get_origin(type_spec) if origin is None: return False From a60d69fb30bad7aca812e20c31083ae926fbce15 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 21 Oct 2023 23:50:35 +0100 Subject: [PATCH 153/302] core/stats: get rid of duplicated keys for 'auto stats' previously: ``` {'iter_data': {'iter_data': {'count': 9, 'last': datetime.datetime(2020, 1, 3, 1, 1, 1)}}} ``` after ``` {'iter_data': {'count': 9, 'last': datetime.datetime(2020, 1, 3, 1, 1, 1)}} ``` --- my/core/common.py | 17 ++++++++++++----- my/core/stats.py | 23 +++++++++++++++++++++-- my/core/tests/auto_stats.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 7 deletions(-) create mode 100644 my/core/tests/auto_stats.py diff --git a/my/core/common.py b/my/core/common.py index 1284565..b692730 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -401,7 +401,12 @@ C = TypeVar('C') Stats = Dict[str, Any] StatsFun = Callable[[], Stats] # todo not sure about return type... 
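# a hypothetical usage sketch for the name= override introduced just below:
# guess_stats can now call e.g.
#
#   stat(provider, quick=quick, name='iter_data')
#
# and merge the results, which avoids the doubled-up {'iter_data': {'iter_data': {...}}}
# shape shown in the commit message above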
-def stat(func: Union[Callable[[], Iterable[C]], Iterable[C]], quick: bool=False) -> Stats: +def stat( + func: Union[Callable[[], Iterable[C]], Iterable[C]], + *, + quick: bool = False, + name: Optional[str] = None, +) -> Stats: if callable(func): fr = func() fname = func.__name__ @@ -409,18 +414,20 @@ def stat(func: Union[Callable[[], Iterable[C]], Iterable[C]], quick: bool=False) # meh. means it's just a list.. not sure how to generate a name then fr = func fname = f'unnamed_{id(fr)}' - tname = type(fr).__name__ - if tname == 'DataFrame': + type_name = type(fr).__name__ + if type_name == 'DataFrame': # dynamic, because pandas is an optional dependency.. - df = cast(Any, fr) # todo ugh, not sure how to annotate properly + df = cast(Any, fr) # todo ugh, not sure how to annotate properly res = dict( dtypes=df.dtypes.to_dict(), rows=len(df), ) else: res = _stat_iterable(fr, quick=quick) + + stat_name = name if name is not None else fname return { - fname: res, + stat_name: res, } diff --git a/my/core/stats.py b/my/core/stats.py index 42e8cd9..9dfaa04 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -12,16 +12,35 @@ from .common import StatsFun, Stats, stat # TODO maybe could be enough to annotate OUTPUTS or something like that? # then stats could just use them as hints? -def guess_stats(module_name: str, quick: bool=False) -> Optional[StatsFun]: +def guess_stats(module_name: str, quick: bool = False) -> Optional[StatsFun]: providers = guess_data_providers(module_name) if len(providers) == 0: return None def auto_stats() -> Stats: - return {k: stat(v, quick=quick) for k, v in providers.items()} + res = {} + for k, v in providers.items(): + res.update(stat(v, quick=quick, name=k)) + return res + return auto_stats +def test_guess_stats() -> None: + from datetime import datetime + import my.core.tests.auto_stats as M + + auto_stats = guess_stats(M.__name__) + res = auto_stats() + assert res.keys() == {'iter_data'} + + r = res['iter_data'] + assert r == { + 'count': 9, + 'last': datetime(2020, 1, 3, 1, 1, 1), + } + + def guess_data_providers(module_name: str) -> Dict[str, Callable]: module = importlib.import_module(module_name) mfunctions = inspect.getmembers(module, inspect.isfunction) diff --git a/my/core/tests/auto_stats.py b/my/core/tests/auto_stats.py new file mode 100644 index 0000000..2946ab2 --- /dev/null +++ b/my/core/tests/auto_stats.py @@ -0,0 +1,30 @@ +""" +Helper 'module' for test_guess_stats +""" + +from dataclasses import dataclass +from datetime import datetime, timedelta +from pathlib import Path +from typing import Iterable, Sequence + + +@dataclass +class Item: + id: str + dt: datetime + source: Path + + +def inputs() -> Sequence[Path]: + return [ + Path('file1.json'), + Path('file2.json'), + Path('file3.json'), + ] + + +def iter_data() -> Iterable[Item]: + dt = datetime.fromisoformat('2020-01-01 01:01:01') + for path in inputs(): + for i in range(3): + yield Item(id=str(i), dt=dt + timedelta(days=i), source=path) From c335c0c9d8673247dd7141cc9c075a5846467e28 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 21 Oct 2023 23:57:01 +0100 Subject: [PATCH 154/302] core/stats: report datetime of first item in addition to last quite useful for quickly determining time span of a data source --- my/core/common.py | 20 ++++++++++++++------ my/core/stats.py | 1 + 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index b692730..b34d6d2 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -431,22 +431,25 @@ def stat( } -def 
_stat_iterable(it: Iterable[C], quick: bool=False) -> Any: +def _stat_iterable(it: Iterable[C], quick: bool = False) -> Any: from more_itertools import ilen, take, first # todo not sure if there is something in more_itertools to compute this? total = 0 errors = 0 - last = None + first_item = None + last_item = None def funcit(): - nonlocal errors, last, total + nonlocal errors, first_item, last_item, total for x in it: total += 1 if isinstance(x, Exception): errors += 1 else: - last = x + last_item = x + if first_item is None: + first_item = x yield x eit = funcit() @@ -471,8 +474,13 @@ def _stat_iterable(it: Iterable[C], quick: bool=False) -> Any: if errors > 0: res['errors'] = errors - if last is not None: - dt = guess_datetime(last) + if first_item is not None: + dt = guess_datetime(first_item) + if dt is not None: + res['first'] = dt + + if last_item is not None: + dt = guess_datetime(last_item) if dt is not None: res['last'] = dt return res diff --git a/my/core/stats.py b/my/core/stats.py index 9dfaa04..1818b63 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -37,6 +37,7 @@ def test_guess_stats() -> None: r = res['iter_data'] assert r == { 'count': 9, + 'first': datetime(2020, 1, 1, 1, 1, 1), 'last': datetime(2020, 1, 3, 1, 1, 1), } From 86ea605aecbe330731c4bdba7f15a2524ece1808 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sun, 22 Oct 2023 00:07:48 +0100 Subject: [PATCH 155/302] core/stats: enable processing input files, report first and last filename can be useful for quick investigation/testing setup --- my/core/common.py | 20 ++++++++++++-------- my/core/stats.py | 26 ++++++++++++++------------ 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index b34d6d2..602f8af 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -474,15 +474,19 @@ def _stat_iterable(it: Iterable[C], quick: bool = False) -> Any: if errors > 0: res['errors'] = errors - if first_item is not None: - dt = guess_datetime(first_item) - if dt is not None: - res['first'] = dt + def stat_item(item): + if item is None: + return None + if isinstance(item, Path): + return str(item) + return guess_datetime(item) + + if (stat_first := stat_item(first_item)) is not None: + res['first'] = stat_first + + if (stat_last := stat_item(last_item)) is not None: + res['last'] = stat_last - if last_item is not None: - dt = guess_datetime(last_item) - if dt is not None: - res['last'] = dt return res diff --git a/my/core/stats.py b/my/core/stats.py index 1818b63..44735b8 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -31,14 +31,20 @@ def test_guess_stats() -> None: import my.core.tests.auto_stats as M auto_stats = guess_stats(M.__name__) + assert auto_stats is not None res = auto_stats() - assert res.keys() == {'iter_data'} - r = res['iter_data'] - assert r == { - 'count': 9, - 'first': datetime(2020, 1, 1, 1, 1, 1), - 'last': datetime(2020, 1, 3, 1, 1, 1), + assert res == { + 'inputs': { + 'count': 3, + 'first': 'file1.json', + 'last': 'file3.json', + }, + 'iter_data': { + 'count': 9, + 'first': datetime(2020, 1, 1, 1, 1, 1), + 'last': datetime(2020, 1, 3, 1, 1, 1), + }, } @@ -54,7 +60,6 @@ def is_data_provider(fun: Any) -> bool: 1. returns iterable or something like that 2. takes no arguments? (otherwise not callable by stats anyway?) 3. doesn't start with an underscore (those are probably helper functions?) - 4. functions isn't the 'inputs' function (or ends with '_inputs') """ # todo maybe for 2 allow default arguments? 
not sure # one example which could benefit is my.pdfs @@ -74,9 +79,6 @@ def is_data_provider(fun: Any) -> bool: # probably a helper function? if fun.__name__.startswith('_'): return False - # ignore def inputs; something like comment_inputs or backup_inputs should also be ignored - if fun.__name__ == 'inputs' or fun.__name__.endswith('_inputs'): - return False # inspect.signature might return str instead of a proper type object # if from __future__ import annotations is used @@ -116,11 +118,11 @@ def test_is_data_provider() -> None: def inputs() -> Iterator[Any]: yield 1 - assert not idp(inputs) + assert idp(inputs) def producer_inputs() -> Iterator[Any]: yield 1 - assert not idp(producer_inputs) + assert idp(producer_inputs) # return any parameters the user is required to provide - those which don't have default values From f9a1050ceb7c1abe3a5340f70c7899571860a0d6 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 8 Oct 2023 23:37:59 +0100 Subject: [PATCH 156/302] my.instagram.android: more defensive error handling --- my/instagram/android.py | 83 ++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/my/instagram/android.py b/my/instagram/android.py index 8ebbf9f..97733b8 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from datetime import datetime import json from pathlib import Path +import sqlite3 from typing import Iterator, Sequence, Optional, Dict, Union from more_itertools import unique_everseen @@ -22,6 +23,7 @@ from my.core import ( assert_never, ) from my.core.cachew import mcachew +from my.core.error import echain from my.core.sqlite import sqlite_connect_immutable, select from my.config import instagram as user_config @@ -132,6 +134,48 @@ def _parse_message(j: Json) -> Optional[_Message]: ) +def _process_db(db: sqlite3.Connection) -> Iterator[Res[Union[User, _Message]]]: + # TODO ugh. seems like no way to extract username? + # sometimes messages (e.g. media_share) contain it in message field + # but generally it's not present. ugh + for (self_uid,) in select(('user_id',), 'FROM session', db=db): + yield User( + id=str(self_uid), + full_name=config.full_name or 'USERS_OWN_FULL_NAME', + username=config.full_name or 'USERS_OWN_USERNAME', + ) + + for (thread_json,) in select(('thread_info',), 'FROM threads', db=db): + j = json.loads(thread_json) + # todo in principle should leave the thread attached to the message? + # since thread is a group of users? + pre_users = [] + # inviter usually contains our own user + if 'inviter' in j: + # sometimes it's missing (e.g. in broadcast channels) + pre_users.append(j['inviter']) + pre_users.extend(j['recipients']) + for r in pre_users: + # id disappeared and seems that pk_id is in use now (around december 2022) + uid = r.get('id') or r.get('pk_id') + assert uid is not None + yield User( + id=str(uid), # for some reason it's int in the db + full_name=r['full_name'], + username=r['username'], + ) + + for (msg_json,) in select(('message',), 'FROM messages ORDER BY timestamp', db=db): + # eh, seems to contain everything in json? + j = json.loads(msg_json) + try: + m = _parse_message(j) + if m is not None: + yield m + except Exception as e: + yield e + + def _entities() -> Iterator[Res[Union[User, _Message]]]: # NOTE: definitely need to merge multiple, app seems to recycle old messages # TODO: hmm hard to guarantee timestamp ordering when we use synthetic input data... 
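# for reference: echain (from my.core.error) attaches the original exception as the
# wrapper's __cause__, roughly what `raise RuntimeError(...) from e` would do -- a small
# sketch of the wrapping used in the hunk below (values are made up):
#
#   err = echain(RuntimeError('While processing /path/to/db'), cause=e)
#   assert err.__cause__ is e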
@@ -140,40 +184,11 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
     for f in dbs:
         logger.info(f'{f} : processing...')
         with sqlite_connect_immutable(f) as db:
-            # TODO ugh. seems like no way to extract username?
-            # sometimes messages (e.g. media_share) contain it in message field
-            # but generally it's not present. ugh
-            for (self_uid,) in select(('user_id',), 'FROM session', db=db):
-                yield User(
-                    id=str(self_uid),
-                    full_name=config.full_name or 'USERS_OWN_FULL_NAME',
-                    username=config.full_name or 'USERS_OWN_USERNAME',
-                )
-
-            for (thread_json,) in select(('thread_info',), 'FROM threads', db=db):
-                j = json.loads(thread_json)
-                # todo in principle should leave the thread attached to the message?
-                # since thread is a group of users?
-                # inviter usually contains our own user
-                for r in [j['inviter'], *j['recipients']]:
-                    # id disappeared and seems that pk_id is in use now (around december 2022)
-                    uid = r.get('id') or r.get('pk_id')
-                    assert uid is not None
-                    yield User(
-                        id=str(uid),  # for some reason it's int in the db
-                        full_name=r['full_name'],
-                        username=r['username'],
-                    )
-
-            for (msg_json,) in select(('message',), 'FROM messages ORDER BY timestamp', db=db):
-                # eh, seems to contain everything in json?
-                j = json.loads(msg_json)
-                try:
-                    m = _parse_message(j)
-                    if m is not None:
-                        yield m
-                except Exception as e:
-                    yield e
+            try:
+                yield from _process_db(db=db)
+            except Exception as e:
+                # todo use error policy here
+                yield echain(RuntimeError(f'While processing {f}'), cause=e)
 
 
 @mcachew(depends_on=inputs)
From f355a55e06288abdf82e913ce43db4499f348d77 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Mon, 23 Oct 2023 02:19:23 +0100
Subject: [PATCH 157/302] my.instagram.gdpr: process all historic archives +
 better normalising

---
 my/instagram/gdpr.py | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py
index afa4c96..a42d73a 100644
--- a/my/instagram/gdpr.py
+++ b/my/instagram/gdpr.py
@@ -7,7 +7,7 @@ import json
 from pathlib import Path
 from typing import Iterator, Sequence, Dict, Union
 
-from more_itertools import bucket
+from more_itertools import bucket, unique_everseen
 
 from my.core import (
     get_files,
@@ -69,7 +69,20 @@ def _decode(s: str) -> str:
 
 
 def _entities() -> Iterator[Res[Union[User, _Message]]]:
-    last = max(inputs())
+    # it's worth processing all previous exports -- sometimes instagram removes some metadata from newer ones
+    # NOTE: here there are basically two options
+    # - process inputs as is (from oldest to newest)
+    #   this would be more stable wrt newer exports (e.g. existing thread ids won't change)
+    #   the downside is that newer exports seem to have better thread ids, so might be preferable to use them
+    # - process inputs reversed (from newest to oldest)
+    #   the upside is that thread ids/usernames might be better
+    #   the downside is that if for example the user renames, thread ids will change _a lot_, might be undesirable..
+    # (from newest to oldest)
+    for path in inputs():
+        yield from _entitites_from_path(path)
+
+
+def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]:
     # TODO make sure it works both with plan directory
     # idelaly get_files should return the right thing, and we won't have to force ZipPath/match_structure here
     # e.g. possible options are:
@@ -84,10 +97,10 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
     # whereas here I don't need it..
# so for now will just implement this adhoc thing and think about properly fixing later - personal_info = last / 'personal_information' + personal_info = path / 'personal_information' if not personal_info.exists(): # old path, used up to somewhere between feb-aug 2022 - personal_info = last / 'account_information' + personal_info = path / 'account_information' j = json.loads((personal_info / 'personal_information.json').read_text()) [profile] = j['profile_user'] @@ -104,8 +117,8 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: ) yield self_user - files = list(last.rglob('messages/inbox/*/message_*.json')) - assert len(files) > 0, last + files = list(path.rglob('messages/inbox/*/message_*.json')) + assert len(files) > 0, path buckets = bucket(files, key=lambda p: p.parts[-2]) file_map = {k: list(buckets[k]) for k in buckets} @@ -126,7 +139,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: # so I feel like there is just not guaranteed way to correlate :( other_id = fname[-id_len:] # NOTE: no match in android db? - other_username = fname[:-id_len - 1] + other_username = fname[: -id_len - 1] other_full_name = _decode(j['title']) yield User( id=other_id, @@ -135,7 +148,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: ) # todo "thread_type": "Regular" ? - for jm in j['messages']: + for jm in reversed(j['messages']): # in json, they are in reverse order for some reason try: content = None if 'content' in jm: @@ -144,7 +157,15 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: # ugh. for some reason these contain an extra space and that messes up message merging.. content = content.strip() else: - share = jm.get('share') + if (share := jm.get('share')) is not None: + if (share_link := share.get('link')) is not None: + # somewhere around 20231007, instagram removed these from gdpr links and they show up a lot in various diffs + share_link = share_link.replace('feed_type=reshare_chaining&', '') + share_link = share_link.replace('?feed_type=reshare_chaining', '') + share['link'] = share_link + if (share_text := share.get('share_text')) is not None: + share['share_text'] = _decode(share_text) + photos = jm.get('photos') videos = jm.get('videos') cc = share or photos or videos @@ -166,7 +187,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: created=datetime.fromtimestamp(timestamp_ms / 1000), text=content, user_id=user_id, - thread_id=fname, # meh.. but no better way? + thread_id=fname, # meh.. but no better way? ) except Exception as e: yield e @@ -175,7 +196,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: # TODO basically copy pasted from android.py... 
hmm
 def messages() -> Iterator[Res[Message]]:
     id2user: Dict[str, User] = {}
-    for x in _entities():
+    for x in unique_everseen(_entities()):
         if isinstance(x, Exception):
             yield x
             continue
From 414b88178f5e755dfeed1f9a62c9ce989ca7f347 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Mon, 23 Oct 2023 19:43:54 +0100
Subject: [PATCH 158/302] tinder.android: infer user's own name automatically

---
 my/tinder/android.py | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/my/tinder/android.py b/my/tinder/android.py
index a820947..9047a53 100644
--- a/my/tinder/android.py
+++ b/my/tinder/android.py
@@ -3,7 +3,7 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde
 """
 from __future__ import annotations
 
-from collections import defaultdict
+from collections import defaultdict, Counter
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from itertools import chain
@@ -13,16 +13,17 @@ from typing import Sequence, Iterator, Union, Dict, List, Mapping
 
 from more_itertools import unique_everseen
 
-from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware, LazyLogger
+from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware, make_logger
+from my.core.error import echain
 from my.core.sqlite import sqlite_connection
 
+import my.config
 
-logger = LazyLogger(__name__)
+logger = make_logger(__name__)
 
-from my.config import tinder as user_config
 
 @dataclass
-class config(user_config.android):
+class config(my.config.tinder.android):
     # paths[s]/glob to the exported sqlite databases
     export_path: Paths
 
@@ -82,22 +83,31 @@ def inputs() -> Sequence[Path]:
 
 
 _Entity = Union[Person, _Match, _Message]
-Entity  = Union[Person, Match, Message]
+Entity = Union[Person, Match, Message]
 
 
 def _entities() -> Iterator[Res[_Entity]]:
     dbs = inputs()
     for i, db_file in enumerate(dbs):
-        logger.debug(f'processing {db_file} {i}/{len(dbs)}')
+        logger.info(f'processing {db_file} {i}/{len(dbs)}')
         with sqlite_connection(db_file, immutable=True, row_factory='row') as db:
             yield from _handle_db(db)
 
 
 def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]:
     # profile_user_view contains our own user id
+    user_profile_rows = list(db.execute('SELECT * FROM profile_user_view'))
+
+    if len(user_profile_rows) == 0:
+        # shit, sometime in 2023 profile_user_view stopped containing user profile..
+        # presumably the most common from_id/to_id would be our own username
+        counter = Counter([id_ for (id_,) in db.execute('SELECT from_id FROM message UNION ALL SELECT to_id FROM message')])
+        [(you_id, _)] = counter.most_common(1)
+        yield Person(id=you_id, name='you')
+
     for row in chain(
-        db.execute('SELECT * FROM profile_user_view'),
-        db.execute('SELECT * FROM match_person'),
+        user_profile_rows,
+        db.execute('SELECT * FROM match_person'),
     ):
         try:
             yield _parse_person(row)
@@ -135,7 +145,7 @@ def _parse_match(row: sqlite3.Row) -> _Match:
 def _parse_msg(row: sqlite3.Row) -> _Message:
     # note it also has raw_message_data -- not sure which is best to use..
-    sent    = row['sent_date']
+    sent = row['sent_date']
     return _Message(
         sent=datetime.fromtimestamp(sent / 1000, tz=timezone.utc),
         id=row['id'],
@@ -149,7 +159,7 @@ def _parse_msg(row: sqlite3.Row) -> _Message:
 # todo maybe it's rich_entities method?
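# a tiny self-contained illustration of the most_common fallback used in _handle_db above
# (the ids are made up):
#
#   from collections import Counter
#   counter = Counter(['me', 'alice', 'me', 'bob', 'me'])
#   [(you_id, _)] = counter.most_common(1)
#   assert you_id == 'me'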
def entities() -> Iterator[Res[Entity]]: id2person: Dict[str, Person] = {} - id2match : Dict[str, Match ] = {} + id2match: Dict[str, Match] = {} for x in unique_everseen(_entities()): if isinstance(x, Exception): yield x @@ -176,9 +186,9 @@ def entities() -> Iterator[Res[Entity]]: try: match = id2match[x.match_id] from_ = id2person[x.from_id] - to = id2person[x.to_id] + to = id2person[x.to_id] except Exception as e: - yield e + yield echain(RuntimeError(f'while processing {x}'), e) continue yield Message( sent=x.sent, @@ -219,6 +229,8 @@ def match2messages() -> Iterator[Res[Mapping[Match, Sequence[Message]]]]: ml.append(x) continue yield res + + # TODO maybe a more natural return type is Iterator[Res[Tuple[Key, Value]]] # but this doesn't work straight away because the key might have no corresponding values From 72ab2603d5d891b444b67e3e7330e856f5007307 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 23 Oct 2023 21:07:47 +0100 Subject: [PATCH 159/302] my.whatsapp.android: exclude some dummy messages, minor cleanup --- my/whatsapp/android.py | 60 +++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index fbccbf5..0c6bd20 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -11,19 +11,17 @@ from typing import Sequence, Iterator, Optional from more_itertools import unique_everseen -from my.core import get_files, Paths, datetime_aware, Res, LazyLogger, make_config +from my.core import get_files, Paths, datetime_aware, Res, make_logger, make_config from my.core.error import echain, notnone from my.core.sqlite import sqlite_connection +import my.config -from my.config import whatsapp as user_config - - -logger = LazyLogger(__name__) +logger = make_logger(__name__) @dataclass -class Config(user_config.android): +class Config(my.config.whatsapp.android): # paths[s]/glob to the exported sqlite databases export_path: Paths my_user_id: Optional[str] = None @@ -63,11 +61,13 @@ def _process_db(db: sqlite3.Connection): # TODO later, split out Chat/Sender objects separately to safe on object creation, similar to other android data sources chats = {} - for r in db.execute(''' + for r in db.execute( + ''' SELECT raw_string_jid AS chat_id, subject FROM chat_view WHERE chat_id IS NOT NULL /* seems that it might be null for chats that are 'recycled' (the db is more like an LRU cache) */ - '''): + ''' + ): chat_id = r['chat_id'] subject = r['subject'] chat = Chat( @@ -76,12 +76,13 @@ def _process_db(db: sqlite3.Connection): ) chats[chat.id] = chat - senders = {} - for r in db.execute(''' + for r in db.execute( + ''' SELECT _id, raw_string FROM jid - '''): + ''' + ): # TODO seems that msgstore.db doesn't have contact names # perhaps should extract from wa.db and match against wa_contacts.jid? s = Sender( @@ -90,18 +91,25 @@ def _process_db(db: sqlite3.Connection): ) senders[r['_id']] = s - + # NOTE: hmm, seems that message_view or available_message_view use lots of NULL as ... + # so even if it seems as if it has a column (e.g. for attachment path), there is actually no such data + # so makes more sense to just query message column directly # todo message_type? 
mostly 0, but seems all over, even for seemingly normal messages with text - for r in db.execute(''' + for r in db.execute( + ''' SELECT C.raw_string_jid AS chat_id, M.key_id, M.timestamp, sender_jid_row_id, M.from_me, M.text_data, MM.file_path - FROM message AS M - LEFT JOIN chat_view AS C - ON M.chat_row_id = C._id - LEFT JOIN message_media AS MM - ON M._id = MM.message_row_id + FROM message AS M + LEFT JOIN chat_view AS C ON M.chat_row_id = C._id + LEFT JOIN message_media AS MM ON M._id = MM.message_row_id WHERE M.key_id != -1 /* key_id -1 is some sort of fake message where everything is null */ + /* type 7 seems to be some dummy system message. + sometimes contain chat name, but usually null, so ignore them + for normal messages it's 0 + */ + AND M.message_type != 7 ORDER BY M.timestamp - '''): + ''' + ): msg_id: str = notnone(r['key_id']) ts: int = notnone(r['timestamp']) dt = datetime.fromtimestamp(ts / 1000, tz=timezone.utc) @@ -131,28 +139,20 @@ def _process_db(db: sqlite3.Connection): # for group chats our onw id is still 0, but other ids are properly set if from_me: myself_user_id = config.my_user_id or 'MYSELF_USER_ID' - sender = Sender(id=myself_user_id, name=None) + sender = Sender(id=myself_user_id, name=None) # TODO set my own name as well? else: sender = Sender(id=chat.id, name=None) else: sender = senders[sender_row_id] - - - m = Message( - chat=chat, - id=msg_id, - dt=dt, - sender=sender, - text=text - ) + m = Message(chat=chat, id=msg_id, dt=dt, sender=sender, text=text) yield m def _messages() -> Iterator[Res[Message]]: dbs = inputs() for i, f in enumerate(dbs): - logger.debug(f'processing {f} {i}/{len(dbs)}') + logger.info(f'processing {f} {i}/{len(dbs)}') with sqlite_connection(f, immutable=True, row_factory='row') as db: try: yield from _process_db(db) From 0e94e0a9eaecb8f6826eced361ac7b119b73b8d6 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 23 Oct 2023 21:52:05 +0100 Subject: [PATCH 160/302] whatsapp.andrdoid: handle most messages types properly --- my/whatsapp/android.py | 45 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index 0c6bd20..e7c266c 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -94,10 +94,17 @@ def _process_db(db: sqlite3.Connection): # NOTE: hmm, seems that message_view or available_message_view use lots of NULL as ... # so even if it seems as if it has a column (e.g. for attachment path), there is actually no such data # so makes more sense to just query message column directly - # todo message_type? 
mostly 0, but seems all over, even for seemingly normal messages with text for r in db.execute( ''' - SELECT C.raw_string_jid AS chat_id, M.key_id, M.timestamp, sender_jid_row_id, M.from_me, M.text_data, MM.file_path + SELECT + C.raw_string_jid AS chat_id, + M.key_id, M.timestamp, + sender_jid_row_id, + M.from_me, + M.text_data, + MM.file_path, + MM.file_size, + M.message_type FROM message AS M LEFT JOIN chat_view AS C ON M.chat_row_id = C._id LEFT JOIN message_media AS MM ON M._id = MM.message_row_id @@ -116,8 +123,40 @@ def _process_db(db: sqlite3.Connection): text: Optional[str] = r['text_data'] media_file_path: Optional[str] = r['file_path'] + media_file_size: Optional[int] = r['file_size'] - if media_file_path is not None: + message_type = r['message_type'] + + if text is None: + # fmt: off + text = { + 5 : '[MAP LOCATION]', + 10: '[MISSED VOICE CALL]', + 15: '[DELETED]', + 16: '[LIVE LOCATION]', + 64: '[DELETED]', # seems like 'deleted by admin'? + }.get(message_type) + # fmt: on + + # check against known msg types + # fmt: off + if text is None and message_type not in { + 0, # normal + 1, # image + 2, # voice note + 3, # video + 7, # "system" message, e.g. chat name + 8, # document + 9, # also document? + 13, # animated gif? + 20, # webp/sticker? + }: + text = f"[UNKNOWN TYPE {message_type}]" + # fmt: on + + if media_file_size is not None: + # this is always not null for message_media table + # however media_file_path sometimes may be none mm = f'MEDIA: {media_file_path}' if text is None: text = mm From a5c04e789a5241b121817eba2b3ed7d24c695d73 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 24 Oct 2023 01:15:27 +0100 Subject: [PATCH 161/302] twitter.archive: deduplicate results via json.dumps this speeds up processing quite a bit, from 40s to 20s for me, plus removes tons of identical outputs interesting enough, using raw object without json.dumps as key brings unique_everseen to crawl... --- my/twitter/archive.py | 75 ++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 22014df..0ea6b24 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -22,31 +22,48 @@ except ImportError as ie: from dataclasses import dataclass +from datetime import datetime +from itertools import chain +import json # hmm interesting enough, orjson didn't give much speedup here? 
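+# note on the dedup key below: raw Json dicts aren't hashable, so passing them
+# straight to unique_everseen makes it fall back to a linear 'seen' list
+# (quadratic overall) -- serialising to a json.dumps string keeps lookups O(1),
+# which is likely why using the raw object as the key made it crawl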
+from pathlib import Path from functools import cached_property import html -from ..core.common import Paths, datetime_aware -from ..core.error import Res +from typing import ( + Iterator, + List, + Optional, + Sequence, +) + +from more_itertools import unique_everseen + +from my.core import ( + datetime_aware, + get_files, + make_logger, + stat, + Json, + Paths, + Res, + Stats, +) +from my.core import warnings +from my.core.cfg import make_config +from my.core.serialize import dumps as json_dumps + +from .common import TweetId, permalink + @dataclass class twitter_archive(user_config): - export_path: Paths # path[s]/glob to the twitter archive takeout + export_path: Paths # path[s]/glob to the twitter archive takeout ### -from ..core.cfg import make_config config = make_config(twitter_archive) -from datetime import datetime -from typing import List, Optional, NamedTuple, Sequence, Iterator -from pathlib import Path -import json - -from my.core import get_files, make_logger, Json - - - logger = make_logger(__name__) @@ -54,11 +71,9 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) -from .common import TweetId, permalink - - # TODO make sure it's not used anywhere else and simplify interface -class Tweet(NamedTuple): +@dataclass +class Tweet: raw: Json screen_name: str @@ -80,7 +95,7 @@ class Tweet(NamedTuple): res: str = self.raw['full_text'] ## replace shortened URLS - repls = [] # from, to, what + repls = [] # from, to, what for ue in self.entities['urls']: [fr, to] = map(int, ue['indices']) repls.append((fr, to, ue['expanded_url'])) @@ -94,7 +109,7 @@ class Tweet(NamedTuple): parts = [] idx = 0 for fr, to, what in repls: - parts.append(res[idx: fr]) + parts.append(res[idx:fr]) parts.append(what) idx = to parts.append(res[idx:]) @@ -132,7 +147,8 @@ class Tweet(NamedTuple): return self.created_at -class Like(NamedTuple): +@dataclass +class Like: raw: Json screen_name: str @@ -165,13 +181,12 @@ class ZipExport: def __init__(self, archive_path: Path) -> None: self.zpath = archive_path if (self.zpath / 'tweets.csv').exists(): - from ..core.warnings import high - high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.") - self.old_format = False # changed somewhere around 2020.03 + warnings.high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.") + self.old_format = False # changed somewhere around 2020.03 if not (self.zpath / 'Your archive.html').exists(): self.old_format = True - def raw(self, what: str, *, fname: Optional[str]=None) -> Iterator[Json]: + def raw(self, what: str, *, fname: Optional[str] = None) -> Iterator[Json]: logger.info(f'{self.zpath} : processing {what}') path = fname or what @@ -213,16 +228,18 @@ class ZipExport: # todo not sure about list and sorting? although can't hurt considering json is not iterative? def tweets() -> Iterator[Res[Tweet]]: - for inp in inputs(): - yield from sorted(ZipExport(inp).tweets(), key=lambda t: t.dt) + _all = chain.from_iterable(ZipExport(i).tweets() for i in inputs()) + res = unique_everseen(_all, key=json_dumps) + yield from sorted(res, key=lambda t: t.dt) def likes() -> Iterator[Res[Like]]: - for inp in inputs(): - yield from ZipExport(inp).likes() + _all = chain.from_iterable(ZipExport(i).likes() for i in inputs()) + res = unique_everseen(_all, key=json_dumps) + # ugh. likes don't have datetimes.. 
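+    # (so unlike tweets() above, there is no timestamp to sort on -- emit in file order)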
+ yield from res -from ..core import stat, Stats def stats() -> Stats: return { **stat(tweets), From 1f61e853c99c58d16a924ba4a064bd597ca16719 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 24 Oct 2023 23:41:45 +0100 Subject: [PATCH 162/302] reddit.rexport: experiment with using optional cpu pool (used by all of HPI) Enabled by the env variable, specifying how many cores to dedicate, e.g. HPI_CPU_POOL=4 hpi query ... --- my/core/_cpu_pool.py | 33 +++++++++++++++++++++++++++++++++ my/reddit/rexport.py | 24 +++++++++++++++++++----- 2 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 my/core/_cpu_pool.py diff --git a/my/core/_cpu_pool.py b/my/core/_cpu_pool.py new file mode 100644 index 0000000..5ac66de --- /dev/null +++ b/my/core/_cpu_pool.py @@ -0,0 +1,33 @@ +""" +EXPERIMENTAL! use with caution +Manages 'global' ProcessPoolExecutor which is 'managed' by HPI itself, and +can be passed down to DALs to speed up data processing. + +The reason to have it managed by HPI is because we don't want DALs instantiate pools +themselves -- they can't cooperate and it would be hard/infeasible to control +how many cores we want to dedicate to the DAL. + +Enabled by the env variable, specifying how many cores to dedicate +e.g. "HPI_CPU_POOL=4 hpi query ..." +""" +from concurrent.futures import ProcessPoolExecutor +import os +from typing import cast, Optional + + +_NOT_SET = cast(ProcessPoolExecutor, object()) +_INSTANCE: Optional[ProcessPoolExecutor] = _NOT_SET + + +def get_cpu_pool() -> Optional[ProcessPoolExecutor]: + global _INSTANCE + if _INSTANCE is _NOT_SET: + use_cpu_pool = os.environ.get('HPI_CPU_POOL') + if use_cpu_pool is None or int(use_cpu_pool) == 0: + _INSTANCE = None + else: + # NOTE: this won't be cleaned up properly, but I guess it's fine? + # since this it's basically a singleton for the whole process + # , and will be destroyed when python exists + _INSTANCE = ProcessPoolExecutor(max_workers=int(use_cpu_pool)) + return _INSTANCE diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index 1f72133..f166ecd 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -8,6 +8,7 @@ REQUIRES = [ ] from dataclasses import dataclass +import inspect from pathlib import Path from typing import TYPE_CHECKING, Iterator, Sequence @@ -45,14 +46,16 @@ def migration(attrs: Attrs) -> Attrs: ex: uconfig.rexport = attrs['rexport'] attrs['export_path'] = ex.export_path else: - warnings.high("""DEPRECATED! Please modify your reddit config to look like: + warnings.high( + """DEPRECATED! Please modify your reddit config to look like: class reddit: class rexport: export_path: Paths = '/path/to/rexport/data' - """) + """ + ) export_dir = 'export_dir' - if export_dir in attrs: # legacy name + if export_dir in attrs: # legacy name attrs['export_path'] = attrs[export_dir] warnings.high(f'"{export_dir}" is deprecated! 
Please use "export_path" instead."') return attrs @@ -93,8 +96,19 @@ Upvote = dal.Upvote def _dal() -> dal.DAL: - inp = list(inputs()) - return dal.DAL(inp) + sources = list(inputs()) + + ## backwards compatibility (old rexport DAL didn't have cpu_pool argument) + cpu_pool_arg = 'cpu_pool' + pass_cpu_pool = cpu_pool_arg in inspect.signature(dal.DAL).parameters + if pass_cpu_pool: + from my.core._cpu_pool import get_cpu_pool + + kwargs = {cpu_pool_arg: get_cpu_pool()} + else: + kwargs = {} + ## + return dal.DAL(sources, **kwargs) cache = mcachew(depends_on=inputs) From a0910e798dab31794d30d51cba1fc267be2b9b7b Mon Sep 17 00:00:00 2001 From: karlicoss Date: Wed, 25 Oct 2023 02:27:06 +0100 Subject: [PATCH 163/302] core.logging: ignore CollapseLogsHandler if we're not attached to a terminal otherwise fails at os.get_terminal_size --- my/core/logging.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/my/core/logging.py b/my/core/logging.py index accfc2e..5d2af99 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -3,12 +3,12 @@ from __future__ import annotations from functools import lru_cache import logging import os +import sys from typing import Union import warnings def test() -> None: - import sys from typing import Callable M: Callable[[str], None] = lambda s: print(s, file=sys.stderr) @@ -128,13 +128,15 @@ def _setup_handlers_and_formatters(name: str) -> None: logger.addFilter(AddExceptionTraceback()) - ch = logging.StreamHandler() collapse_level = get_collapse_level() - ch = logging.StreamHandler() if collapse_level is None else CollapseLogsHandler(maxlevel=collapse_level) + if collapse_level is None or not sys.stderr.isatty(): + handler = logging.StreamHandler() + else: + handler = CollapseLogsHandler(maxlevel=collapse_level) # default level for handler is NOTSET, which will make it process all messages # we rely on the logger to actually accept/reject log msgs - logger.addHandler(ch) + logger.addHandler(handler) # this attribute is set to True by default, which causes log entries to be passed to root logger (e.g. if you call basicConfig beforehand) # even if log entry is handled by this logger ... not sure what's the point of this behaviour?? 
@@ -151,12 +153,12 @@ def _setup_handlers_and_formatters(name: str) -> None: FORMAT_COLOR = FORMAT.format(start='%(log_color)s', end='%(reset)s') # colorlog should detect tty in principle, but doesn't handle everything for some reason # see https://github.com/borntyping/python-colorlog/issues/71 - if ch.stream.isatty(): + if handler.stream.isatty(): formatter = colorlog.ColoredFormatter(FORMAT_COLOR) else: formatter = logging.Formatter(FORMAT_NOCOLOR) - ch.setFormatter(formatter) + handler.setFormatter(formatter) # by default, logging.exception isn't logging traceback unless called inside of the exception handler From bef0423b4fbb056df72a38b4bd722e9082e87b5c Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 27 Oct 2023 02:14:50 +0100 Subject: [PATCH 164/302] my.zulip.organization: use UTC timestamps, support custom archive names + some cleanup --- my/zulip/organization.py | 65 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/my/zulip/organization.py b/my/zulip/organization.py index 64b5ae3..8725411 100644 --- a/my/zulip/organization.py +++ b/my/zulip/organization.py @@ -2,24 +2,37 @@ Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]] """ from dataclasses import dataclass -from typing import Sequence, Iterator, Dict +from datetime import datetime, timezone +from itertools import count +import json +from pathlib import Path +from typing import Sequence, Iterator, Dict, Union + +from my.core import ( + assert_never, + datetime_aware, + get_files, + stat, + Json, + Paths, + Res, + Stats, +) +from my.core.error import notnone +import my.config -from my.config import zulip as user_config -from ..core import Paths @dataclass -class organization(user_config.organization): +class organization(my.config.zulip.organization): # paths[s]/glob to the exported JSON data export_path: Paths -from pathlib import Path -from ..core import get_files, Json def inputs() -> Sequence[Path]: - return get_files(organization.export_path) - - -from datetime import datetime + # TODO: seems like export ids are kinda random.. + # not sure what's the best way to figure out the last without renaming? + # could use mtime perhaps? + return get_files(organization.export_path, sort=False) @dataclass(frozen=True) @@ -39,16 +52,11 @@ class Sender: # from the data, seems that subjects are completely implicit and determined by name? # streams have ids (can extract from realm/zerver_stream), but unclear how to correlate messages/topics to streams? - @dataclass(frozen=True) class _Message: # todo hmm not sure what would be a good field order.. id: int - sent: datetime - # TODO hmm kinda unclear whether it uses UTC or not?? - # https://github.com/zulip/zulip/blob/0c2e4eec200d986a9a020f3e9a651d27216e0e85/zerver/models.py#L3071-L3076 - # it keeps it tz aware.. but not sure what happens after? 
- # https://github.com/zulip/zulip/blob/1dfddffc8dac744fd6a6fbfd937018074c8bb166/zproject/computed_settings.py#L151 + sent: datetime_aware # double checked and they are in utc subject: str sender_id: int server_id: int @@ -60,7 +68,7 @@ class _Message: @dataclass(frozen=True) class Message: id: int - sent: datetime + sent: datetime_aware subject: str sender: Sender server: Server @@ -76,23 +84,18 @@ class Message: return f'https://{self.server.string_id}.zulipchat.com/#narrow/near/{self.id}' -from typing import Union -from itertools import count -import json -from ..core import Res, assert_never # todo cache it def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: - # TODO hmm -- not sure if max lexicographically will actually be latest? last = max(inputs()) - subdir = last.with_suffix('').stem # there is a directory inside tar.gz - # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath # I guess makes sense to have a special implementation for .tar.gz considering how common are they import tarfile - from ..core.error import notnone tfile = tarfile.open(last) + + subdir = tfile.getnames()[0] # there is a directory inside tar file, first name should be that + with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo: rj = json.load(fo) @@ -114,20 +117,22 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: for j in rj['zerver_userprofile_crossrealm']: # e.g. zulip bot yield Sender( id=j['id'], - full_name=j['email'], # doesn't seem to have anything + full_name=j['email'], # doesn't seem to have anything email=j['email'], ) def _parse_message(j: Json) -> _Message: ds = j['date_sent'] + # fmt: off return _Message( id = j['id'], - sent = datetime.fromtimestamp(ds), + sent = datetime.fromtimestamp(ds, tz=timezone.utc), subject = j['subject'], sender_id = j['sender'], server_id = server.id, content = j['content'], ) + # fmt: on for idx in count(start=1, step=1): fname = f'messages-{idx:06}.json' @@ -172,9 +177,5 @@ def messages() -> Iterator[Res[Message]]: assert_never(x) -from my.core import Stats def stats() -> Stats: - from my.core import stat - return { - **stat(messages) - } + return {**stat(messages)} From 3a25c9042ce2fe08fcc4218a860f572d453ce60b Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 27 Oct 2023 02:27:04 +0100 Subject: [PATCH 165/302] my.hackernews.dogsheep: use utc datetime + minor cleanup --- my/hackernews/dogsheep.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py index aac0b1a..de6c58d 100644 --- a/my/hackernews/dogsheep.py +++ b/my/hackernews/dogsheep.py @@ -4,18 +4,19 @@ Hackernews data via Dogsheep [[hacker-news-to-sqlite][https://github.com/dogshee from __future__ import annotations from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Iterator, Sequence, Optional -from my.core import get_files, Paths, Res +from my.core import get_files, Paths, Res, datetime_aware from my.core.sqlite import sqlite_connection +import my.config -from my.config import hackernews as user_config +from .common import hackernews_link @dataclass -class config(user_config.dogsheep): +class config(my.config.hackernews.dogsheep): # paths[s]/glob to the dogsheep database export_path: Paths @@ -26,24 +27,23 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) -from .common import hackernews_link - # TODO not sure if worth splitting 
into Comment and Story? @dataclass(unsafe_hash=True) class Item: id: str type: str - # TODO is it urc?? - created: datetime + created: datetime_aware # checked and it's utc title: Optional[str] # only present for Story - text_html: Optional[str] # should be present for Comment and might for Story - url: Optional[str] # might be present for Story + text_html: Optional[str] # should be present for Comment and might for Story + url: Optional[str] # might be present for Story # todo process 'deleted'? fields? # todo process 'parent'? @property def permalink(self) -> str: return hackernews_link(self.id) + + # TODO hmm kinda annoying that permalink isn't getting serialized # maybe won't be such a big problem if we used hpi query directly on objects, without jsons? # so we could just take .permalink thing @@ -56,7 +56,7 @@ def items() -> Iterator[Res[Item]]: yield Item( id=r['id'], type=r['type'], - created=datetime.fromtimestamp(r['time']), + created=datetime.fromtimestamp(r['time'], tz=timezone.utc), title=r['title'], # todo hmm maybe a method to strip off html tags would be nice text_html=r['text'], From 32aa87b3ecef292ac99d374c34e40e4b1d6b7b71 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 27 Oct 2023 01:31:36 +0100 Subject: [PATCH 166/302] dcotor: make compileall check a bit more defensive --- my/core/__main__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/my/core/__main__.py b/my/core/__main__.py index 643df50..ca88513 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -174,13 +174,14 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module # note: ugh, annoying that copytree requires a non-existing dir before 3.8. # once we have min version 3.8, can use dirs_exist_ok=True param tdir = Path(td) / 'cfg' - # this will resolve symlinks when copying - shutil.copytree(cfg_path, tdir) # NOTE: compileall still returns code 0 if the path doesn't exist.. 
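    # (easy to double check from a shell: `python -m compileall /does/not/exist; echo $?`)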
# but in our case hopefully it's not an issue cmd = [sys.executable, '-m', 'compileall', '-q', str(tdir)] try: + # this will resolve symlinks when copying + # should be under try/catch since might fail if some symlinks are missing + shutil.copytree(cfg_path, tdir) check_call(cmd) info('syntax check: ' + ' '.join(cmd)) except Exception as e: From fb2b3e07ded353ae0d7a852160c2a9774cca8577 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 27 Oct 2023 23:20:00 +0100 Subject: [PATCH 167/302] my.emfit: cleanup and pass cpu pool --- my/emfit/__init__.py | 68 ++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index cde6ddc..1ec3341 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -8,22 +8,30 @@ REQUIRES = [ 'git+https://github.com/karlicoss/emfitexport', ] +from contextlib import contextmanager +import dataclasses +from datetime import datetime, time, timedelta +import inspect from pathlib import Path -from typing import Dict, List, Iterable, Any, Optional +from typing import Any, Dict, Iterable, Iterator, List, Optional -from ..core import get_files -from ..core.common import mcachew -from ..core.cachew import cache_dir -from ..core.error import Res, set_error_datetime, extract_error_datetime -from ..core.pandas import DataFrameT +from my.core import ( + get_files, + stat, + Res, + Stats, +) +from my.core.common import mcachew +from my.core.cachew import cache_dir +from my.core.error import set_error_datetime, extract_error_datetime +from my.core.pandas import DataFrameT from my.config import emfit as config - import emfitexport.dal as dal -# todo ugh. need to make up my mind on log vs logger naming... I guessl ogger makes more sense -logger = dal.log -Emfit = dal.Emfit + + +Emfit = dal.Emfit # TODO move to common? @@ -39,13 +47,22 @@ def _cachew_depends_on(): # TODO take __file__ into account somehow? @mcachew(cache_path=cache_dir() / 'emfit.cache', depends_on=_cachew_depends_on) def datas() -> Iterable[Res[Emfit]]: - import dataclasses - # data from emfit is coming in UTC. There is no way (I think?) to know the 'real' timezone, and local times matter more for sleep analysis - # TODO actually this is wrong?? check this.. + # TODO actually this is wrong?? there is some sort of local offset in the export emfit_tz = config.timezone - for x in dal.sleeps(config.export_path): + ## backwards compatibility (old DAL didn't have cpu_pool argument) + cpu_pool_arg = 'cpu_pool' + pass_cpu_pool = cpu_pool_arg in inspect.signature(dal.sleeps).parameters + if pass_cpu_pool: + from my.core._cpu_pool import get_cpu_pool + + kwargs = {cpu_pool_arg: get_cpu_pool()} + else: + kwargs = {} + ## + + for x in dal.sleeps(config.export_path, **kwargs): if isinstance(x, Exception): yield x else: @@ -54,6 +71,7 @@ def datas() -> Iterable[Res[Emfit]]: continue # TODO maybe have a helper to 'patch up' all dattetimes in a namedtuple/dataclass? # TODO do the same for jawbone data? 
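            # (such a helper could be, roughly -- an untested sketch:
            #      def astimezone_fields(obj, tz, fields):
            #          return dataclasses.replace(obj, **{f: getattr(obj, f).astimezone(tz) for f in fields})
            #  which would collapse the replace() call below into a single line)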
+ # fmt: off x = dataclasses.replace( x, start =x.start .astimezone(emfit_tz), @@ -61,6 +79,7 @@ def datas() -> Iterable[Res[Emfit]]: sleep_start=x.sleep_start.astimezone(emfit_tz), sleep_end =x.sleep_end .astimezone(emfit_tz), ) + # fmt: on yield x @@ -78,7 +97,7 @@ def pre_dataframe() -> Iterable[Res[Emfit]]: yield r else: err = RuntimeError(f'Multiple sleeps per night, not supported yet: {g}') - set_error_datetime(err, dt=g[0].date) + set_error_datetime(err, dt=datetime.combine(g[0].date, time.min)) g.clear() yield err @@ -94,7 +113,6 @@ def pre_dataframe() -> Iterable[Res[Emfit]]: def dataframe() -> DataFrameT: - from datetime import timedelta dicts: List[Dict[str, Any]] = [] last: Optional[Emfit] = None for s in pre_dataframe(): @@ -102,7 +120,7 @@ def dataframe() -> DataFrameT: if isinstance(s, Exception): edt = extract_error_datetime(s) d = { - 'date' : edt, + 'date': edt, 'error': str(s), } else: @@ -117,6 +135,7 @@ def dataframe() -> DataFrameT: # todo ugh. get rid of hardcoding, just generate the schema automatically # TODO use 'workdays' provider.... + # fmt: off d = { 'date' : dd, @@ -133,25 +152,24 @@ def dataframe() -> DataFrameT: 'hrv_change' : hrv_change, 'respiratory_rate_avg': s.respiratory_rate_avg, } - last = s # meh + # fmt: on + last = s # meh dicts.append(d) - import pandas + return pandas.DataFrame(dicts) -from ..core import stat, Stats def stats() -> Stats: return stat(pre_dataframe) -from contextlib import contextmanager -from typing import Iterator @contextmanager -def fake_data(nights: int=500) -> Iterator: +def fake_data(nights: int = 500) -> Iterator: from my.core.cfg import tmp_config from tempfile import TemporaryDirectory + with TemporaryDirectory() as td: tdir = Path(td) gen = dal.FakeData() @@ -168,5 +186,7 @@ def fake_data(nights: int=500) -> Iterator: # TODO remove/deprecate it? I think used by timeline def get_datas() -> List[Emfit]: # todo ugh. run lint properly - return list(sorted(datas(), key=lambda e: e.start)) # type: ignore + return list(sorted(datas(), key=lambda e: e.start)) # type: ignore + + # TODO move away old entries if there is a diff?? From 70bf51a12568eda67cfe6639c598085b174bd86e Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 27 Oct 2023 21:28:38 +0100 Subject: [PATCH 168/302] core/stats: exclude contextmanagers from guess_stats --- my/core/common.py | 4 ++++ my/core/tests/auto_stats.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index 602f8af..ccebaf2 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -409,6 +409,10 @@ def stat( ) -> Stats: if callable(func): fr = func() + if hasattr(fr, '__enter__') and hasattr(fr, '__exit__'): + # context managers has Iterable type, but they aren't data providers + # sadly doesn't look like there is a way to tell from typing annotations + return {} fname = func.__name__ else: # meh. means it's just a list.. 
not sure how to generate a name then diff --git a/my/core/tests/auto_stats.py b/my/core/tests/auto_stats.py index 2946ab2..2c09b5b 100644 --- a/my/core/tests/auto_stats.py +++ b/my/core/tests/auto_stats.py @@ -1,11 +1,11 @@ """ Helper 'module' for test_guess_stats """ - +from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime, timedelta from pathlib import Path -from typing import Iterable, Sequence +from typing import Iterable, Sequence, Iterator @dataclass @@ -28,3 +28,9 @@ def iter_data() -> Iterable[Item]: for path in inputs(): for i in range(3): yield Item(id=str(i), dt=dt + timedelta(days=i), source=path) + + +@contextmanager +def some_contextmanager() -> Iterator[str]: + # this shouldn't end up in guess_stats because context manager is not a data provider + yield 'hello' From 4f7c9b4a711518fcf685c98790fda90fb2566f83 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 28 Oct 2023 17:23:26 +0100 Subject: [PATCH 169/302] core: move split compat/legacy modules into hpi_compat and compat --- my/core/compat.py | 52 ++-------------------- my/core/{legacy.py => hpi_compat.py} | 64 +++++++++++++++++++++++++--- my/fbmessenger/__init__.py | 2 +- my/github/ghexport.py | 3 +- my/hypothesis.py | 3 +- my/instapaper.py | 3 +- my/pocket.py | 3 +- my/reddit/__init__.py | 2 +- my/reddit/rexport.py | 2 +- 9 files changed, 72 insertions(+), 62 deletions(-) rename my/core/{legacy.py => hpi_compat.py} (61%) diff --git a/my/core/compat.py b/my/core/compat.py index d7937f9..48e194b 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -1,56 +1,12 @@ ''' -Some backwards compatibility stuff/deprecation helpers +Contains backwards compatibility helpers for different python versions. +If something is relevant to HPI itself, please put it in .hpi_compat instead ''' +import os import sys -from types import ModuleType from typing import TYPE_CHECKING -from . import warnings -from .common import LazyLogger - -logger = LazyLogger('my.core.compat') - - -def pre_pip_dal_handler( - name: str, - e: ModuleNotFoundError, - cfg, - requires=[], -) -> ModuleType: - ''' - https://github.com/karlicoss/HPI/issues/79 - ''' - if e.name != name: - # the module itself was imported, so the problem is with some dependencies - raise e - try: - dal = _get_dal(cfg, name) - warnings.high(f''' -Specifying modules' dependencies in the config or in my/config/repos is deprecated! -Please install {' '.join(requires)} as PIP packages (see the corresponding README instructions). 
-'''.strip(), stacklevel=2) - except ModuleNotFoundError: - dal = None - - if dal is None: - # probably means there was nothing in the old config in the first place - # so we should raise the original exception - raise e - return dal - - -def _get_dal(cfg, module_name: str): - mpath = getattr(cfg, module_name, None) - if mpath is not None: - from .common import import_dir - return import_dir(mpath, '.dal') - else: - from importlib import import_module - return import_module(f'my.config.repos.{module_name}.dal') - - -import os windows = os.name == 'nt' @@ -60,7 +16,7 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa source.backup(dest, **kwargs) -# can remove after python3.9 +# can remove after python3.9 (although need to keep the method itself for bwd compat) def removeprefix(text: str, prefix: str) -> str: if text.startswith(prefix): return text[len(prefix):] diff --git a/my/core/legacy.py b/my/core/hpi_compat.py similarity index 61% rename from my/core/legacy.py rename to my/core/hpi_compat.py index 3ad121d..9ef103a 100644 --- a/my/core/legacy.py +++ b/my/core/hpi_compat.py @@ -1,16 +1,20 @@ -# I think 'compat' should be for python-specific compat stuff, whereas this for HPI specific backwards compatibility +""" +Contains various backwards compatibility/deprecation helpers relevant to HPI itself. +(as opposed to .compat module which implements compatibility between python versions) +""" import os import inspect import re +from types import ModuleType from typing import List -from my.core import warnings as W +from my.core import warnings def handle_legacy_import( - parent_module_name: str, - legacy_submodule_name: str, - parent_module_path: List[str], + parent_module_name: str, + legacy_submodule_name: str, + parent_module_path: List[str], ) -> bool: ### # this is to trick mypy into treating this as a proper namespace package @@ -19,6 +23,7 @@ def handle_legacy_import( # - https://github.com/karlicoss/hpi_namespace_experiment # - discussion here https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/extending.20HPI/near/269946944 from pkgutil import extend_path + parent_module_path[:] = extend_path(parent_module_path, parent_module_name) # 'this' source tree ends up first in the pythonpath when we extend_path() # so we need to move 'this' source tree towards the end to make sure we prioritize overlays @@ -52,9 +57,54 @@ def handle_legacy_import( is_legacy_import = not (imported_as_parent or importing_submodule) if is_legacy_import and not autocompleting_module_cli: - W.high(f'''\ + warnings.high( + f'''\ importing {parent_module_name} is DEPRECATED! \ Instead, import from {parent_module_name}.{legacy_submodule_name} or {parent_module_name}.all \ See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. -''') +''' + ) return is_legacy_import + + +def pre_pip_dal_handler( + name: str, + e: ModuleNotFoundError, + cfg, + requires=[], +) -> ModuleType: + ''' + https://github.com/karlicoss/HPI/issues/79 + ''' + if e.name != name: + # the module itself was imported, so the problem is with some dependencies + raise e + try: + dal = _get_dal(cfg, name) + warnings.high( + f''' +Specifying modules' dependencies in the config or in my/config/repos is deprecated! +Please install {' '.join(requires)} as PIP packages (see the corresponding README instructions). 
+'''.strip(), + stacklevel=2, + ) + except ModuleNotFoundError: + dal = None + + if dal is None: + # probably means there was nothing in the old config in the first place + # so we should raise the original exception + raise e + return dal + + +def _get_dal(cfg, module_name: str): + mpath = getattr(cfg, module_name, None) + if mpath is not None: + from .common import import_dir + + return import_dir(mpath, '.dal') + else: + from importlib import import_module + + return import_module(f'my.config.repos.{module_name}.dal') diff --git a/my/fbmessenger/__init__.py b/my/fbmessenger/__init__.py index 3919c44..40fb235 100644 --- a/my/fbmessenger/__init__.py +++ b/my/fbmessenger/__init__.py @@ -19,7 +19,7 @@ REQUIRES = [ ] -from my.core.legacy import handle_legacy_import +from my.core.hpi_compat import handle_legacy_import is_legacy_import = handle_legacy_import( parent_module_name=__name__, legacy_submodule_name='export', diff --git a/my/github/ghexport.py b/my/github/ghexport.py index 9eebbf0..d446c35 100644 --- a/my/github/ghexport.py +++ b/my/github/ghexport.py @@ -35,7 +35,8 @@ config = make_config(github, migration=migration) try: from ghexport import dal except ModuleNotFoundError as e: - from my.core.compat import pre_pip_dal_handler + from my.core.hpi_compat import pre_pip_dal_handler + dal = pre_pip_dal_handler('ghexport', e, config, requires=REQUIRES) ############################ diff --git a/my/hypothesis.py b/my/hypothesis.py index 370854a..822fe9d 100644 --- a/my/hypothesis.py +++ b/my/hypothesis.py @@ -35,7 +35,8 @@ config = make_config(hypothesis) try: from hypexport import dal except ModuleNotFoundError as e: - from .core.compat import pre_pip_dal_handler + from my.core.hpi_compat import pre_pip_dal_handler + dal = pre_pip_dal_handler('hypexport', e, config, requires=REQUIRES) ############################ diff --git a/my/instapaper.py b/my/instapaper.py index 1ab62c2..df1f70b 100644 --- a/my/instapaper.py +++ b/my/instapaper.py @@ -28,7 +28,8 @@ config = make_config(instapaper) try: from instapexport import dal except ModuleNotFoundError as e: - from .core.compat import pre_pip_dal_handler + from my.core.hpi_compat import pre_pip_dal_handler + dal = pre_pip_dal_handler('instapexport', e, config, requires=REQUIRES) ############################ diff --git a/my/pocket.py b/my/pocket.py index 912806d..2a7bdcb 100644 --- a/my/pocket.py +++ b/my/pocket.py @@ -28,7 +28,8 @@ config = make_config(pocket) try: from pockexport import dal except ModuleNotFoundError as e: - from .core.compat import pre_pip_dal_handler + from my.core.hpi_compat import pre_pip_dal_handler + dal = pre_pip_dal_handler('pockexport', e, config, requires=REQUIRES) ############################ diff --git a/my/reddit/__init__.py b/my/reddit/__init__.py index 22813f1..e81aaf9 100644 --- a/my/reddit/__init__.py +++ b/my/reddit/__init__.py @@ -19,7 +19,7 @@ REQUIRES = [ ] -from my.core.legacy import handle_legacy_import +from my.core.hpi_compat import handle_legacy_import is_legacy_import = handle_legacy_import( parent_module_name=__name__, legacy_submodule_name='rexport', diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index f166ecd..a7be39b 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -67,7 +67,7 @@ config = make_config(reddit, migration=migration) try: from rexport import dal except ModuleNotFoundError as e: - from my.core.compat import pre_pip_dal_handler + from my.core.hpi_compat import pre_pip_dal_handler dal = pre_pip_dal_handler('rexport', e, config, requires=REQUIRES) # TODO ugh. 
this would import too early From d88a1b9933329cade72da7fb39c398ed6ceafa73 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 28 Oct 2023 17:46:47 +0100 Subject: [PATCH 170/302] my.hypothesis: explose data as iterators instead of lists also add an adapter to support migrating in backwards compatible manner --- my/core/hpi_compat.py | 41 +++++++++++++++++++++++++-- my/hypothesis.py | 65 +++++++++++++++++-------------------------- 2 files changed, 64 insertions(+), 42 deletions(-) diff --git a/my/core/hpi_compat.py b/my/core/hpi_compat.py index 9ef103a..61121de 100644 --- a/my/core/hpi_compat.py +++ b/my/core/hpi_compat.py @@ -6,9 +6,9 @@ import os import inspect import re from types import ModuleType -from typing import List +from typing import Iterator, List, Optional, TypeVar -from my.core import warnings +from . import warnings def handle_legacy_import( @@ -108,3 +108,40 @@ def _get_dal(cfg, module_name: str): from importlib import import_module return import_module(f'my.config.repos.{module_name}.dal') + + +V = TypeVar('V') + + +# named to be kinda consistent with more_itertools, e.g. more_itertools.always_iterable +class always_supports_sequence(Iterator[V]): + """ + Helper to make migration from Sequence/List to Iterable/Iterator type backwards compatible + """ + + def __init__(self, it: Iterator[V]) -> None: + self.it = it + self._list: Optional[List] = None + + def __iter__(self) -> Iterator[V]: + return self.it.__iter__() + + def __next__(self) -> V: + return self.it.__next__() + + def __getattr__(self, name): + return getattr(self.it, name) + + @property + def aslist(self) -> List[V]: + if self._list is None: + qualname = getattr(self.it, '__qualname__', '') # defensive just in case + warnings.medium(f'Using {qualname} as list is deprecated. 
Migrate to iterative processing or call list() explicitly.') + self._list = list(self.it) + return self._list + + def __len__(self) -> int: + return len(self.aslist) + + def __getitem__(self, i: int) -> V: + return self.aslist[i] diff --git a/my/hypothesis.py b/my/hypothesis.py index 822fe9d..257e739 100644 --- a/my/hypothesis.py +++ b/my/hypothesis.py @@ -5,21 +5,23 @@ REQUIRES = [ 'git+https://github.com/karlicoss/hypexport', ] from dataclasses import dataclass -from datetime import datetime -from typing import Callable - -from .core import Paths - -from my.config import hypothesis as user_config - -REQUIRES = [ - 'git+https://github.com/karlicoss/hypexport', -] +from pathlib import Path +from typing import Iterator, Sequence +from my.core import ( + get_files, + stat, + Paths, + Res, + Stats, +) +from my.core.cfg import make_config +from my.core.hpi_compat import always_supports_sequence +import my.config @dataclass -class hypothesis(user_config): +class hypothesis(my.config.hypothesis): ''' Uses [[https://github.com/karlicoss/hypexport][hypexport]] outputs ''' @@ -28,7 +30,6 @@ class hypothesis(user_config): export_path: Paths -from .core.cfg import make_config config = make_config(hypothesis) @@ -39,37 +40,28 @@ except ModuleNotFoundError as e: dal = pre_pip_dal_handler('hypexport', e, config, requires=REQUIRES) -############################ - -from typing import List -from .core.error import Res, sort_res_by Highlight = dal.Highlight -Page = dal.Page +Page = dal.Page + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) def _dal() -> dal.DAL: - from .core import get_files - sources = get_files(config.export_path) - return dal.DAL(sources) + return dal.DAL(inputs()) # TODO they are in reverse chronological order... -def highlights() -> List[Res[Highlight]]: - # todo hmm. otherwise mypy complans - key: Callable[[Highlight], datetime] = lambda h: h.created - return sort_res_by(_dal().highlights(), key=key) +def highlights() -> Iterator[Res[Highlight]]: + return always_supports_sequence(_dal().highlights()) -# TODO eh. always provide iterators? although sort_res_by could be neat too... 
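# (note: highlights()/pages() now return always_supports_sequence wrappers, so
#  legacy callers that index or len() the result keep working, just with a
#  deprecation warning nudging them towards iterative processing)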
-def pages() -> List[Res[Page]]: - # note: mypy report shows "No Anys on this line here", apparently a bug with type aliases - # https://github.com/python/mypy/issues/8594 - key: Callable[[Page], datetime] = lambda h: h.created - return sort_res_by(_dal().pages(), key=key) +def pages() -> Iterator[Res[Page]]: + return always_supports_sequence(_dal().pages()) -from .core import stat, Stats def stats() -> Stats: return { **stat(highlights), @@ -77,12 +69,5 @@ def stats() -> Stats: } -def _main() -> None: - for page in get_pages(): - print(page) - -if __name__ == '__main__': - _main() - -get_highlights = highlights # todo deprecate -get_pages = pages # todo deprecate +get_highlights = highlights # todo deprecate +get_pages = pages # todo deprecate From edea2c2e75d5351d4c03a9736d1b91938687aa41 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 28 Oct 2023 19:16:39 +0100 Subject: [PATCH 171/302] my.kobo: add hightlights method to return Hightlight objects iteratively also minor cleanup --- my/kobo.py | 60 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/my/kobo.py b/my/kobo.py index 337d9c7..85bc50f 100644 --- a/my/kobo.py +++ b/my/kobo.py @@ -1,54 +1,69 @@ """ [[https://uk.kobobooks.com/products/kobo-aura-one][Kobo]] e-ink reader: annotations and reading stats """ +from __future__ import annotations REQUIRES = [ 'kobuddy', ] +from dataclasses import dataclass +from typing import Iterator -from .core import Paths, dataclass -from my.config import kobo as user_config -@dataclass -class kobo(user_config): - ''' - Uses [[https://github.com/karlicoss/kobuddy#as-a-backup-tool][kobuddy]] outputs. - ''' - # path[s]/glob to the exported databases - export_path: Paths +from my.core import ( + get_files, + stat, + Paths, + Stats, +) +from my.core.cfg import make_config +import my.config - -from .core.cfg import make_config -config = make_config(kobo) - -from .core import get_files import kobuddy -# todo not sure about this glob.. -kobuddy.DATABASES = list(get_files(config.export_path, glob='*.sqlite')) - -######################### - -# hmm, explicit imports make pylint a bit happier? from kobuddy import Highlight, get_highlights from kobuddy import * +@dataclass +class kobo(my.config.kobo): + ''' + Uses [[https://github.com/karlicoss/kobuddy#as-a-backup-tool][kobuddy]] outputs. + ''' + + # path[s]/glob to the exported databases + export_path: Paths + + +config = make_config(kobo) + +# TODO not ideal to set it here.. should switch kobuddy to use a proper DAL +kobuddy.DATABASES = list(get_files(config.export_path)) + + +def highlights() -> Iterator[Highlight]: + return kobuddy._iter_highlights() + -from .core import stat, Stats def stats() -> Stats: - return stat(get_highlights) + return stat(highlights) + ## TODO hmm. not sure if all this really belongs here?... perhaps orger? from typing import Callable, Union, List + # TODO maybe type over T? 
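# (a version generic over T could look roughly like this sketch:
#      T = TypeVar('T')
#      Predicate = Callable[[T], bool]
#      Predicatish = Union[T, Predicate[T]]
#  though the bare-value shorthand only really makes sense for str equality here)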
_Predicate = Callable[[str], bool] Predicatish = Union[str, _Predicate] + + def from_predicatish(p: Predicatish) -> _Predicate: if isinstance(p, str): + def ff(s): return s == p + return ff else: return p @@ -69,6 +84,7 @@ def get_todos() -> List[Highlight]: if ann is None: ann = '' return 'todo' in ann.lower().split() + return by_annotation(with_todo) From 6821fbc2feaad310f5b89c2eb95175a4e9dce6a9 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 28 Oct 2023 20:21:09 +0100 Subject: [PATCH 172/302] core/config: implement a warning if config is imported from the dir other than MY_CONFIG this should help with identifying setup issues --- my/core/init.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/my/core/init.py b/my/core/init.py index 6bf766e..bec3a9a 100644 --- a/my/core/init.py +++ b/my/core/init.py @@ -14,6 +14,7 @@ Please let me know if you are aware of a better way of dealing with this! # separate function to present namespace pollution def setup_config() -> None: + from pathlib import Path import sys import warnings @@ -47,6 +48,23 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-mo Importing 'my.config' failed! (error: {ex}). This is likely to result in issues. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. """) + else: + # defensive just in case -- __file__ may not be present if there is some dynamic magic involved + used_config_file = getattr(my.config, '__file__', None) + if used_config_file is not None: + used_config_path = Path(used_config_file) + try: + # will crash if it's imported from other dir? + used_config_path.relative_to(mycfg_dir) + except ValueError: + # TODO maybe implement a strict mode where these warnings will be errors? + warnings.warn( + f""" +Expected my.config to be located at {mycfg_dir}, but instead its path is {used_config_path}. +This will likely cause issues down the line -- double check {mycfg_dir} structure. +See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. 
+""", + ) setup_config() From f668208bce86ffab9652f67f235d75434e200e4d Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 28 Oct 2023 01:11:38 +0100 Subject: [PATCH 173/302] my.stackexchange.stexport: small cleanup & stat improvements --- my/stackexchange/stexport.py | 40 +++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/my/stackexchange/stexport.py b/my/stackexchange/stexport.py index 6286c83..812a155 100644 --- a/my/stackexchange/stexport.py +++ b/my/stackexchange/stexport.py @@ -5,24 +5,38 @@ REQUIRES = [ 'git+https://github.com/karlicoss/stexport', ] -### config -from my.config import stackexchange as user_config -from ..core import dataclass, PathIsh, make_config +from dataclasses import dataclass + +from stexport import dal + +from my.core import ( + PathIsh, + Stats, + get_files, + make_config, + stat, +) +import my.config + + @dataclass -class stackexchange(user_config): +class stackexchange(my.config.stackexchange): ''' Uses [[https://github.com/karlicoss/stexport][stexport]] outputs ''' - export_path: PathIsh # path to GDPR zip file -config = make_config(stackexchange) -### -from stexport import dal + export_path: PathIsh + + +config = make_config(stackexchange) +# TODO kinda annoying it's resolving gdpr path here (and fails during make_config if gdpr path isn't available) +# I guess it's a good argument to avoid clumping configs together +# or move to my.config.stackexchange.stexport +### # todo lru cache? def _dal() -> dal.DAL: - from ..core import get_files inputs = get_files(config.export_path) return dal.DAL(inputs) @@ -32,7 +46,9 @@ def site(name: str) -> dal.SiteDAL: return _dal().site_dal(name) -from ..core import stat, Stats def stats() -> Stats: - s = site('stackoverflow') - return stat(s.questions) + res = {} + for name in _dal().sites(): + s = site(name=name) + res.update({name: stat(s.questions, name='questions')}) + return res From bd27bd4c24a1a726cdd9ac87b5e3b9ac76502acc Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 28 Oct 2023 22:01:50 +0100 Subject: [PATCH 174/302] docs: add documentation on logging during HPI module development --- doc/MODULE_DESIGN.org | 100 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 91 insertions(+), 9 deletions(-) diff --git a/doc/MODULE_DESIGN.org b/doc/MODULE_DESIGN.org index c0ab4f6..7aedf2f 100644 --- a/doc/MODULE_DESIGN.org +++ b/doc/MODULE_DESIGN.org @@ -2,6 +2,19 @@ Some thoughts on modules, how to structure them, and adding your own/extending H This is slightly more advanced, and would be useful if you're trying to extend HPI by developing your own modules, or contributing back to HPI +* TOC +:PROPERTIES: +:TOC: :include all :depth 1 :force (nothing) :ignore (this) :local (nothing) +:END: +:CONTENTS: +- [[#allpy][all.py]] +- [[#module-count][module count]] +- [[#single-file-modules][single file modules]] +- [[#adding-new-modules][Adding new modules]] +- [[#an-extendable-module-structure][An Extendable module structure]] +- [[#logging-guidelines][Logging guidelines]] +:END: + * all.py Some modules have lots of different sources for data. For example, ~my.location~ (location data) has lots of possible sources -- from ~my.google.takeout.parser~, using the ~gpslogger~ android app, or through geo locating ~my.ip~ addresses. 
For a module with multiple possible sources, its common to split it into files like: @@ -234,16 +247,85 @@ It could be argued that namespace packages and editable installs are a bit compl There's no requirement to use this for individual modules, it just seems to be the best solution we've arrived at so far -* Logging +* Logging guidelines +HPI doesn't enforce any specific logging mechanism, you're free to use whatever you prefer in your modules. -The ~my.core~ module exports a ~make_logger~ function which works nicely with -~cachew~ and gives you colored logs. You can use it like this: +However there are some general guidelines for developing modules that can make them more pleasant to use. -#+begin_src python - from my.core import make_logger +- each module should have its unique logger, the easiest way to ensure that is simply use module's ~__name__~ attribute as the logger name - logger = make_logger(__name__) + In addition, this ensures the logger hierarchy reflect the package hierarchy. + For instance, if you initialize the logger for =my.module= with specific settings, the logger for =my.module.helper= would inherit these settings. See more on that [[ https://docs.python.org/3/library/logging.html?highlight=logging#logger-objects][in python docs]]. - # or to set a custom level - logger = make_logger(__name__, level='warning') -#+end_src + As a bonus, if you use the module ~__name__~, this logger will be automatically be picked up and used by ~cachew~. + +- often modules are processing multiple files, extracting data from each one ([[https://beepb00p.xyz/exports.html#types][incremental/synthetic exports]]) + + It's nice to log each file name you're processing as =logger.info= so the user of module gets a sense of progress. + If possible, add the index of file you're processing and the total count. + + #+begin_src python + def process_all_data(): + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + # :>{width} to align the logs vertically + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + yield from process_path(path) + #+end_src + + If there is a lot of logging happening related to a specific path, instead of adding path to each logging message manually, consider using [[https://docs.python.org/3/library/logging.html?highlight=loggeradapter#logging.LoggerAdapter][LoggerAdapter]]. + +- log exceptions, but sparingly + + Generally it's a good practice to call ~logging.exception~ from the ~except~ clause, so it's immediately visible where the errors are happening. + + However, in HPI, instead of crashing on exceptions we often behave defensively and ~yield~ them instead (see [[https://beepb00p.xyz/mypy-error-handling.html][mypy assisted error handling]]). + + In this case logging every time may become a bit spammy, so use exception logging sparingly in this case. + Typically it's best to rely on the downstream data consumer to handle the exceptions properly. 
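+
+  A sketch of the pattern (using the usual ~Res~ convention; ~inputs~ and ~_process_path~ are placeholders):
+
+  #+begin_src python
+  def messages() -> Iterator[Res[Message]]:
+      for path in inputs():
+          try:
+              yield from _process_path(path)
+          except Exception as e:
+              # no logger.exception here -- downstream consumers decide how to report it
+              yield e
+  #+end_src
+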
+ +- instead of =logging.getLogger=, it's best to use =my.core.make_logger= + + #+begin_src python + from my.core import make_logger + + logger = make_logger(__name__) + + # or to set a custom level + logger = make_logger(__name__, level='warning') + #+end_src + + This sets up some nicer defaults over standard =logging= module: + + - colored logs (via =colorlog= library) + - =INFO= as the initial logging level (instead of default =ERROR=) + - logging full exception trace when even when logging outside of the exception handler + + This is particularly useful for [[https://beepb00p.xyz/mypy-error-handling.html][mypy assisted error handling]]. + + By default, =logging= only logs the exception message (without the trace) in this case, which makes errors harder to debug. + - control logging level from the shell via ~LOGGING_LEVEL_*~ env variable + + This can be useful to suppress logging output if it's too spammy, or showing more output for debugging. + + E.g. ~LOGGING_LEVEL_my_instagram_gdpr=DEBUG hpi query my.instagram.gdpr.messages~ + + - experimental: passing env variable ~LOGGING_COLLAPSE=~ will "collapse" logging with the same level + + Instead of printing new logging line each time, it will 'redraw' the last logged line with a new logging message. + + This can be convenient if there are too many logs, you just need logging to get a sense of progress. + + - experimental: passing env variable ~ENLIGHTEN_ENABLE=yes~ will display TUI progress bars in some cases + + See [[https://github.com/Rockhopper-Technologies/enlighten#readme][https://github.com/Rockhopper-Technologies/enlighten#readme]] + + This can be convenient for showing the progress of parallel processing of different files from HPI: + + #+BEGIN_EXAMPLE + ghexport.dal[111] 29%|████████████████████ | 29/100 [00:03<00:07, 10.03 files/s] + rexport.dal[comments] 17%|████████ | 115/682 [00:03<00:14, 39.15 files/s] + my.instagram.android 0%|▎ | 3/2631 [00:02<34:50, 1.26 files/s] + #+END_EXAMPLE From ea195e3d17b3065f5a700773161072b6e025962c Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sun, 29 Oct 2023 00:11:58 +0100 Subject: [PATCH 175/302] general: improve logging during file processing in various modules --- my/hackernews/harmonic.py | 9 ++++++--- my/hackernews/materialistic.py | 13 ++++++++++--- my/tinder/android.py | 10 ++++++---- my/whatsapp/android.py | 12 +++++++----- 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/my/hackernews/harmonic.py b/my/hackernews/harmonic.py index f78c3ef..6070510 100644 --- a/my/hackernews/harmonic.py +++ b/my/hackernews/harmonic.py @@ -81,10 +81,13 @@ _PREFIX = 'com.simon.harmonichackernews.KEY_SHARED_PREFERENCES' def _saved() -> Iterator[Res[Saved]]: - for p in inputs(): - logger.info(f'processing: {p}') + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') # TODO defensive for each item! 
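        # (i.e. wrap the per-item parsing in try/except and yield the exception,
        #  following the Res convention used elsewhere in this series)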
- tr = etree.parse(p) + tr = etree.parse(path) res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORIES_STRINGS"]'))) cached_ids = [x.text.split('-')[0] for x in res] diff --git a/my/hackernews/materialistic.py b/my/hackernews/materialistic.py index eddf053..4d5cd47 100644 --- a/my/hackernews/materialistic.py +++ b/my/hackernews/materialistic.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Iterator, NamedTuple, Sequence from more_itertools import unique_everseen -from my.core import get_files, datetime_aware +from my.core import get_files, datetime_aware, make_logger from my.core.sqlite import sqlite_connection from my.config import materialistic as config # todo migrate config to my.hackernews.materialistic @@ -15,6 +15,9 @@ from my.config import materialistic as config # todo migrate config to my.hacke from .common import hackernews_link +logger = make_logger(__name__) + + def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -51,8 +54,12 @@ class Saved(NamedTuple): def _all_raw() -> Iterator[Row]: - for db in inputs(): - with sqlite_connection(db, immutable=True, row_factory='dict') as conn: + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + with sqlite_connection(path, immutable=True, row_factory='dict') as conn: yield from conn.execute('SELECT * FROM saved ORDER BY time') diff --git a/my/tinder/android.py b/my/tinder/android.py index 9047a53..0ba9739 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -87,10 +87,12 @@ Entity = Union[Person, Match, Message] def _entities() -> Iterator[Res[_Entity]]: - dbs = inputs() - for i, db_file in enumerate(dbs): - logger.info(f'processing {db_file} {i}/{len(dbs)}') - with sqlite_connection(db_file, immutable=True, row_factory='row') as db: + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + with sqlite_connection(path, immutable=True, row_factory='row') as db: yield from _handle_db(db) diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index e7c266c..b82c353 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -189,14 +189,16 @@ def _process_db(db: sqlite3.Connection): def _messages() -> Iterator[Res[Message]]: - dbs = inputs() - for i, f in enumerate(dbs): - logger.info(f'processing {f} {i}/{len(dbs)}') - with sqlite_connection(f, immutable=True, row_factory='row') as db: + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + with sqlite_connection(path, immutable=True, row_factory='row') as db: try: yield from _process_db(db) except Exception as e: - yield echain(RuntimeError(f'While processing {f}'), cause=e) + yield echain(RuntimeError(f'While processing {path}'), cause=e) def messages() -> Iterator[Res[Message]]: From f28f68b14b799d88dd5b7061ba17ac4802e9f915 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sun, 29 Oct 2023 21:58:13 +0000 Subject: [PATCH 176/302] general: enhancle logging for various modules --- my/bluemaestro.py | 18 ++++++++++-------- my/fbmessenger/android.py | 12 +++++++----- my/github/gdpr.py | 2 ++ my/instagram/android.py | 12 +++++++----- my/lastfm.py | 15 ++++++++++----- 5 files changed, 36 insertions(+), 23 deletions(-) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index 
1586426..8f05aac 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -65,22 +65,24 @@ def is_bad_table(name: str) -> bool: @mcachew(depends_on=inputs) def measurements() -> Iterable[Res[Measurement]]: # todo ideally this would be via arguments... but needs to be lazy - dbs = inputs() + paths = inputs() + total = len(paths) + width = len(str(total)) last: Optional[datetime] = None # tables are immutable, so can save on processing.. processed_tables: Set[str] = set() - for f in dbs: - logger.info('processing %s', f) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') tot = 0 new = 0 # todo assert increasing timestamp? - with sqlite_connect_immutable(f) as db: + with sqlite_connect_immutable(path) as db: db_dt: Optional[datetime] = None try: datas = db.execute( - f'SELECT "{f.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index' + f'SELECT "{path.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index' ) oldfmt = True db_dts = list(db.execute('SELECT last_download FROM info'))[0][0] @@ -156,7 +158,7 @@ def measurements() -> Iterable[Res[Measurement]]: upper = timedelta(days=10) # kinda arbitrary if not (db_dt - lower < dt < db_dt + timedelta(days=10)): # todo could be more defenive?? - yield RuntimeError('timestamp too far out', f, name, db_dt, dt) + yield RuntimeError('timestamp too far out', path, name, db_dt, dt) continue # err.. sometimes my values are just interleaved with these for no apparent reason??? @@ -164,7 +166,7 @@ def measurements() -> Iterable[Res[Measurement]]: yield RuntimeError('the weird sensor bug') continue - assert -60 <= temp <= 60, (f, dt, temp) + assert -60 <= temp <= 60, (path, dt, temp) ## tot += 1 @@ -181,7 +183,7 @@ def measurements() -> Iterable[Res[Measurement]]: dewpoint=dewp, ) yield p - logger.debug('%s: new %d/%d', f, new, tot) + logger.debug(f'{path}: new {new}/{tot}') # logger.info('total items: %d', len(merged)) # for k, v in merged.items(): # # TODO shit. quite a few of them have varying values... how is that freaking possible???? diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index 38551b4..a5e6749 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -75,14 +75,16 @@ class Message(_BaseMessage): Entity = Union[Sender, Thread, _Message] def _entities() -> Iterator[Res[Entity]]: - dbs = inputs() - for i, f in enumerate(dbs): - logger.debug(f'processing {f} {i}/{len(dbs)}') - with sqlite_connection(f, immutable=True, row_factory='row') as db: + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + with sqlite_connection(path, immutable=True, row_factory='row') as db: try: yield from _process_db(db) except Exception as e: - yield echain(RuntimeError(f'While processing {f}'), cause=e) + yield echain(RuntimeError(f'While processing {path}'), cause=e) def _normalise_user_id(ukey: str) -> str: diff --git a/my/github/gdpr.py b/my/github/gdpr.py index 1ff0f93..1fde7c9 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -46,6 +46,8 @@ def inputs() -> Sequence[Path]: def events() -> Iterable[Res[Event]]: last = max(inputs()) + logger.info(f'extracting data from {last}') + # a bit naughty and ad-hoc, but we will generify reading from tar.gz. 
once we have more examples # another one is zulip archive if last.is_dir(): diff --git a/my/instagram/android.py b/my/instagram/android.py index 97733b8..eace1c0 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -180,15 +180,17 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: # NOTE: definitely need to merge multiple, app seems to recycle old messages # TODO: hmm hard to guarantee timestamp ordering when we use synthetic input data... # todo use TypedDict? - dbs = inputs() - for f in dbs: - logger.info(f'{f} : processing...') - with sqlite_connect_immutable(f) as db: + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + with sqlite_connect_immutable(path) as db: try: yield from _process_db(db=db) except Exception as e: # todo use error policy here - yield echain(RuntimeError(f'While processing {f}'), cause=e) + yield echain(RuntimeError(f'While processing {path}'), cause=e) @mcachew(depends_on=inputs) diff --git a/my/lastfm.py b/my/lastfm.py index 97c112c..90484b4 100644 --- a/my/lastfm.py +++ b/my/lastfm.py @@ -2,9 +2,13 @@ Last.fm scrobbles ''' -from .core import Paths, dataclass +from my.core import Paths, dataclass, make_logger from my.config import lastfm as user_config + +logger = make_logger(__name__) + + @dataclass class lastfm(user_config): """ @@ -13,7 +17,7 @@ class lastfm(user_config): export_path: Paths -from .core.cfg import make_config +from my.core.cfg import make_config config = make_config(lastfm) @@ -22,7 +26,7 @@ import json from pathlib import Path from typing import NamedTuple, Sequence, Iterable -from .core.common import mcachew, Json, get_files +from my.core.common import mcachew, Json, get_files def inputs() -> Sequence[Path]: @@ -64,19 +68,20 @@ class Scrobble(NamedTuple): @mcachew(depends_on=inputs) def scrobbles() -> Iterable[Scrobble]: last = max(inputs()) + logger.info(f'loading data from {last}') j = json.loads(last.read_text()) for raw in reversed(j): yield Scrobble(raw=raw) -from .core import stat, Stats +from my.core import stat, Stats def stats() -> Stats: return stat(scrobbles) def fill_influxdb() -> None: - from .core import influxdb + from my.core import influxdb # todo needs to be more automatic sd = (dict( dt=x.dt, From 79ce8e84ecb293c2f2b07224991daea70e9f0bb8 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Mon, 30 Oct 2023 02:28:12 +0000 Subject: [PATCH 177/302] fbmessenger.android: support processing msys database seems that threads_db2 stopped updating some time ago, and msys contains all new data now --- my/fbmessenger/android.py | 91 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 7 deletions(-) diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index a5e6749..d14b653 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -49,6 +49,7 @@ class Thread: id: str name: Optional[str] # isn't set for groups or one to one messages + # todo not sure about order of fields... 
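The progress-logging pattern these patches roll out across modules comes down to nested f-string format specs: the index is right-aligned to the width of the total so consecutive log lines line up. A minimal self-contained sketch of just that pattern; the logger setup and the paths argument are illustrative, not taken from any particular module:

    import logging
    from pathlib import Path
    from typing import Sequence

    logger = logging.getLogger(__name__)

    def process_all(paths: Sequence[Path]) -> None:
        total = len(paths)
        width = len(str(total))  # e.g. 3 when there are 120 inputs
        for idx, path in enumerate(paths):
            # nested format spec right-aligns both numbers: 'processing [  7/120] ...'
            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')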
@dataclass class _BaseMessage: @@ -74,6 +75,8 @@ class Message(_BaseMessage): Entity = Union[Sender, Thread, _Message] + + def _entities() -> Iterator[Res[Entity]]: paths = inputs() total = len(paths) @@ -82,7 +85,11 @@ def _entities() -> Iterator[Res[Entity]]: logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') with sqlite_connection(path, immutable=True, row_factory='row') as db: try: - yield from _process_db(db) + use_msys = len(list(db.execute('SELECT * FROM sqlite_master WHERE name = "logging_events_v2"'))) > 0 + if use_msys: + yield from _process_db_msys(db) + else: + yield from _process_db_threads_db2(db) except Exception as e: yield echain(RuntimeError(f'While processing {path}'), cause=e) @@ -91,7 +98,7 @@ def _normalise_user_id(ukey: str) -> str: # trying to match messages.author from fbchat prefix = 'FACEBOOK:' assert ukey.startswith(prefix), ukey - return ukey[len(prefix):] + return ukey[len(prefix) :] def _normalise_thread_id(key) -> str: @@ -99,7 +106,74 @@ def _normalise_thread_id(key) -> str: return key.split(':')[1] -def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: +# NOTE: this is sort of copy pasted from other _process_db method +# maybe later could unify them +def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: + senders: Dict[str, Sender] = {} + for r in db.execute('SELECT CAST(id AS TEXT) AS id, name FROM contacts'): + s = Sender( + id=r['id'], + name=r['name'], + ) + senders[s.id] = s + yield s + + # TODO can we get it from db? could infer as the most common id perhaps? + self_id = config.facebook_id + thread_users: Dict[str, List[Sender]] = {} + for r in db.execute('SELECT CAST(thread_key AS TEXT) AS thread_key, CAST(contact_id AS TEXT) AS contact_id FROM participants'): + thread_key = r['thread_key'] + user_key = r['contact_id'] + if self_id is not None and user_key == self_id: + # exclude yourself, otherwise it's just spammy to show up in all participants + continue + + ll = thread_users.get(thread_key) + if ll is None: + ll = [] + thread_users[thread_key] = ll + ll.append(senders[user_key]) + + # 15 is a weird thread that doesn't have any participants and messages + for r in db.execute('SELECT CAST(thread_key AS TEXT) AS thread_key, thread_name FROM threads WHERE thread_type != 15'): + thread_key = r['thread_key'] + name = r['thread_name'] + if name is None: + users = thread_users[thread_key] + name = ', '.join([u.name or u.id for u in users]) + yield Thread( + id=thread_key, + name=name, + ) + + # TODO should be quicker to explicitly specify columns rather than SELECT * + # should probably add it to module development tips? + for r in db.execute( + ''' + SELECT + message_id, + timestamp_ms, + text, + CAST(thread_key AS TEXT) AS thread_key, + CAST(sender_id AS TEXT) AS sender_id, + reply_source_id + FROM messages + ORDER BY timestamp_ms /* they aren't in order in the database, so need to sort */ + ''' + ): + yield _Message( + id=r['message_id'], + # TODO double check utc + dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000, tz=timezone.utc), + # is_incoming=False, TODO?? 
+ text=r['text'], + thread_id=r['thread_key'], + sender_id=r['sender_id'], + reply_to_id=r['reply_source_id'], + ) + + +def _process_db_threads_db2(db: sqlite3.Connection) -> Iterator[Res[Entity]]: senders: Dict[str, Sender] = {} for r in db.execute('''SELECT * FROM thread_users'''): # for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None @@ -142,22 +216,25 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]: name=name, ) - for r in db.execute(''' + for r in db.execute( + ''' SELECT *, json_extract(sender, "$.user_key") AS user_key FROM messages WHERE msg_type NOT IN ( -1, /* these don't have any data at all, likely immediately deleted or something? */ 2 /* these are 'left group' system messages, also a bit annoying since they might reference nonexistent users */ ) ORDER BY timestamp_ms /* they aren't in order in the database, so need to sort */ - '''): + ''' + ): yield _Message( id=r['msg_id'], - dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000, tz=timezone.utc), # double checked against some messages in different timezone + # double checked against some messages in different timezone + dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000, tz=timezone.utc), # is_incoming=False, TODO?? text=r['text'], thread_id=_normalise_thread_id(r['thread_key']), sender_id=_normalise_user_id(r['user_key']), - reply_to_id=r['message_replied_to_id'] + reply_to_id=r['message_replied_to_id'], ) From d6786084cabd7f42158e7b2c84a6dfefcea3fc63 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 30 Oct 2023 22:13:59 +0000 Subject: [PATCH 178/302] general: deprecate some old methods by hiding behind TYPE_CHECKING --- my/hypothesis.py | 8 +++++--- my/pocket.py | 7 +++++-- my/polar.py | 4 +++- my/youtube/takeout.py | 6 ++++-- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/my/hypothesis.py b/my/hypothesis.py index 257e739..55fff64 100644 --- a/my/hypothesis.py +++ b/my/hypothesis.py @@ -6,7 +6,7 @@ REQUIRES = [ ] from dataclasses import dataclass from pathlib import Path -from typing import Iterator, Sequence +from typing import Iterator, Sequence, TYPE_CHECKING from my.core import ( get_files, @@ -69,5 +69,7 @@ def stats() -> Stats: } -get_highlights = highlights # todo deprecate -get_pages = pages # todo deprecate +if not TYPE_CHECKING: + # "deprecate" by hiding from mypy + get_highlights = highlights + get_pages = pages diff --git a/my/pocket.py b/my/pocket.py index 2a7bdcb..b638fba 100644 --- a/my/pocket.py +++ b/my/pocket.py @@ -5,6 +5,7 @@ REQUIRES = [ 'git+https://github.com/karlicoss/pockexport', ] from dataclasses import dataclass +from typing import TYPE_CHECKING from .core import Paths @@ -61,5 +62,7 @@ def stats() -> Stats: # todo deprecate? 
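A word on the TYPE_CHECKING trick this patch applies everywhere: typing.TYPE_CHECKING is False at runtime, so anything defined under `if not TYPE_CHECKING:` keeps working for existing callers, while mypy treats the branch as never taken and flags any remaining usages of the old names. A minimal sketch of the pattern, with illustrative names:

    from typing import Iterator, TYPE_CHECKING

    def highlights() -> Iterator[str]:
        yield from ()

    if not TYPE_CHECKING:
        # "deprecate" by hiding from mypy: the alias is still importable at
        # runtime, but type checkers no longer know the old name exists
        get_highlights = highlights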
-def get_articles() -> Sequence[Article]: - return list(articles()) +if not TYPE_CHECKING: + # "deprecate" by hiding from mypy + def get_articles() -> Sequence[Article]: + return list(articles()) diff --git a/my/polar.py b/my/polar.py index 0f2ee82..fe59d00 100644 --- a/my/polar.py +++ b/my/polar.py @@ -249,4 +249,6 @@ def get_entries() -> List[Result]: ## deprecated -Error = Exception # for backwards compat with Orger; can remove later +if not TYPE_CHECKING: + # "deprecate" by hiding from mypy + Error = Exception # for backwards compat with Orger; can remove later diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index a3a2dda..79b4549 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -1,4 +1,4 @@ -from typing import NamedTuple, List, Iterable +from typing import NamedTuple, List, Iterable, TYPE_CHECKING from ..core import datetime_aware, Res, LazyLogger from ..core.compat import removeprefix @@ -99,7 +99,9 @@ def stats() -> Stats: ### deprecated stuff (keep in my.media.youtube) -get_watched = watched +if not TYPE_CHECKING: + # "deprecate" by hiding from mypy + get_watched = watched def _watched_legacy() -> Iterable[Watched]: From 71cb66df5ffb557750f838228501f38eb6c94759 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Tue, 31 Oct 2023 00:42:17 +0000 Subject: [PATCH 179/302] core: add helper for more_itertools.unique_everseen to check that all types involved are hashable Otherwise unique_everseen performance may degrade to quadratic rather than linear. For now hidden behind HPI_CHECK_UNIQUE_EVERSEEN flag. Also switch some modules to use it --- my/core/common.py | 73 ++++++++++++++++++++++++++++++++++++- my/fbmessenger/android.py | 5 +-- my/instagram/android.py | 5 +-- my/instagram/gdpr.py | 5 ++- my/tinder/android.py | 5 +-- my/twitter/talon.py | 7 ++-- my/vk/vk_messages_backup.py | 8 ++-- my/whatsapp/android.py | 5 +-- 8 files changed, 90 insertions(+), 23 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index ccebaf2..85b9386 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -6,7 +6,25 @@ from contextlib import contextmanager import os import sys import types -from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple, TYPE_CHECKING, NoReturn +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + NoReturn, + Optional, + Sequence, + TYPE_CHECKING, + Tuple, + TypeVar, + Union, + cast, + get_args, + get_type_hints, + get_origin, ) import warnings from . import warnings as core_warnings @@ -628,6 +646,59 @@ def assert_never(value: NoReturn) -> NoReturn: assert False, f'Unhandled value: {value} ({type(value).__name__})' +def _check_all_hashable(fun): + # TODO ok, take callable? + hints = get_type_hints(fun) + # TODO needs to be defensive like in cachew? + return_type = hints.get('return') + # TODO check if None + origin = get_origin(return_type) # Iterator etc?
+ (arg,) = get_args(return_type) + # options we wanna handle are simple type on the top level or union + arg_origin = get_origin(arg) + + if sys.version_info[:2] >= (3, 10): + is_uniontype = arg_origin is types.UnionType + else: + is_uniontype = False + + is_union = arg_origin is Union or is_uniontype + if is_union: + to_check = get_args(arg) + else: + to_check = (arg,) + + no_hash = [ + t + for t in to_check + # seems that objects that have not overridden hash have the attribute but it's set to None + if getattr(t, '__hash__', None) is None + ] + assert len(no_hash) == 0, f'Types {no_hash} are not hashable, this will result in significant performance downgrade for unique_everseen' + + +_UET = TypeVar('_UET') +_UEU = TypeVar('_UEU') + + +def unique_everseen( + fun: Callable[[], Iterable[_UET]], + key: Optional[Callable[[_UET], _UEU]] = None, +) -> Iterator[_UET]: + # TODO support normal iterable as well? + import more_itertools + + # NOTE: it has to take original callable, because otherwise we don't have access to generator type annotations + iterable = fun() + + if key is None: + # todo check key return type as well? but it's more likely to be hashable + if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None: + _check_all_hashable(fun) + + return more_itertools.unique_everseen(iterable=iterable, key=key) + + ## legacy imports, keeping them here for backwards compatibility from functools import cached_property as cproperty from typing import Literal diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index d14b653..fa313ea 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -9,9 +9,8 @@ from pathlib import Path import sqlite3 from typing import Iterator, Sequence, Optional, Dict, Union, List -from more_itertools import unique_everseen - from my.core import get_files, Paths, datetime_aware, Res, assert_never, LazyLogger, make_config +from my.core.common import unique_everseen from my.core.error import echain from my.core.sqlite import sqlite_connection @@ -242,7 +241,7 @@ def messages() -> Iterator[Res[Message]]: senders: Dict[str, Sender] = {} msgs: Dict[str, Message] = {} threads: Dict[str, Thread] = {} - for x in unique_everseen(_entities()): + for x in unique_everseen(_entities): if isinstance(x, Exception): yield x continue diff --git a/my/instagram/android.py b/my/instagram/android.py index eace1c0..ea5ee35 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -10,8 +10,6 @@ from pathlib import Path import sqlite3 from typing import Iterator, Sequence, Optional, Dict, Union -from more_itertools import unique_everseen - from my.core import ( get_files, Paths, @@ -22,6 +20,7 @@ from my.core import ( Res, assert_never, ) +from my.core.common import unique_everseen from my.core.cachew import mcachew from my.core.error import echain from my.core.sqlite import sqlite_connect_immutable, select @@ -196,7 +195,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: @mcachew(depends_on=inputs) def messages() -> Iterator[Res[Message]]: id2user: Dict[str, User] = {} - for x in unique_everseen(_entities()): + for x in unique_everseen(_entities): if isinstance(x, Exception): yield x continue diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index a42d73a..233f040 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -7,7 +7,7 @@ import json from pathlib import Path from typing import Iterator, Sequence, Dict, Union -from more_itertools import bucket, unique_everseen +from more_itertools import bucket from my.core import 
( get_files, @@ -17,6 +17,7 @@ from my.core import ( assert_never, make_logger, ) +from my.core.common import unique_everseen from my.config import instagram as user_config @@ -196,7 +197,7 @@ def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]: # TODO basically copy pasted from android.py... hmm def messages() -> Iterator[Res[Message]]: id2user: Dict[str, User] = {} - for x in unique_everseen(_entities()): + for x in unique_everseen(_entities): if isinstance(x, Exception): yield x continue diff --git a/my/tinder/android.py b/my/tinder/android.py index 0ba9739..7e5f535 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -11,9 +11,8 @@ from pathlib import Path import sqlite3 from typing import Sequence, Iterator, Union, Dict, List, Mapping -from more_itertools import unique_everseen - from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware, make_logger +from my.core.common import unique_everseen from my.core.error import echain from my.core.sqlite import sqlite_connection import my.config @@ -162,7 +161,7 @@ def _parse_msg(row: sqlite3.Row) -> _Message: def entities() -> Iterator[Res[Entity]]: id2person: Dict[str, Person] = {} id2match: Dict[str, Match] = {} - for x in unique_everseen(_entities()): + for x in unique_everseen(_entities): if isinstance(x, Exception): yield x continue diff --git a/my/twitter/talon.py b/my/twitter/talon.py index e43f600..306a735 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -9,9 +9,8 @@ import re import sqlite3 from typing import Iterator, Sequence, Union -from more_itertools import unique_everseen - from my.core import Paths, Res, datetime_aware, get_files +from my.core.common import unique_everseen from my.core.sqlite import sqlite_connection from .common import TweetId, permalink @@ -133,7 +132,7 @@ def _parse_tweet(row: sqlite3.Row) -> Tweet: def tweets() -> Iterator[Res[Tweet]]: - for x in unique_everseen(_entities()): + for x in unique_everseen(_entities): if isinstance(x, Exception): yield x elif isinstance(x, _IsTweet): @@ -141,7 +140,7 @@ def tweets() -> Iterator[Res[Tweet]]: def likes() -> Iterator[Res[Tweet]]: - for x in unique_everseen(_entities()): + for x in unique_everseen(_entities): if isinstance(x, Exception): yield x elif isinstance(x, _IsFavorire): diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index 089605b..1837385 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -5,12 +5,12 @@ VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totkton from datetime import datetime from dataclasses import dataclass import json -from typing import Dict, Iterator, NamedTuple +from typing import Dict, Iterator -from more_itertools import unique_everseen import pytz -from my.core import stat, Stats, Json, Res, datetime_aware +from my.core import stat, Stats, Json, Res, datetime_aware, get_files +from my.core.common import unique_everseen from my.config import vk_messages_backup as config @@ -147,7 +147,7 @@ def _messages() -> Iterator[Res[Message]]: def messages() -> Iterator[Res[Message]]: # seems that during backup messages were sometimes duplicated.. 
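Note the shape of the call sites in this patch: they now pass the callable (e.g. _entities, _messages) rather than the iterator, because the helper needs the function object to read its return-type annotation and verify the yielded type is hashable before deduplicating. A rough standalone sketch of the core idea, much simplified relative to the my.core.common implementation (no Union handling, no env-flag gating):

    from typing import Callable, Iterable, Iterator, TypeVar, get_args, get_type_hints

    import more_itertools

    T = TypeVar('T')

    def unique_everseen(fun: Callable[[], Iterable[T]]) -> Iterator[T]:
        # taking the callable rather than the iterator gives access to its annotations
        (item_type,) = get_args(get_type_hints(fun)['return'])
        # unhashable items silently degrade more_itertools.unique_everseen
        # from linear to quadratic time, so fail fast instead
        assert getattr(item_type, '__hash__', None) is not None, item_type
        return more_itertools.unique_everseen(fun())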
- yield from unique_everseen(_messages()) + yield from unique_everseen(_messages) def stats() -> Stats: diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index b82c353..295d831 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -9,9 +9,8 @@ from pathlib import Path import sqlite3 from typing import Sequence, Iterator, Optional -from more_itertools import unique_everseen - from my.core import get_files, Paths, datetime_aware, Res, make_logger, make_config +from my.core.common import unique_everseen from my.core.error import echain, notnone from my.core.sqlite import sqlite_connection import my.config @@ -202,4 +201,4 @@ def _messages() -> Iterator[Res[Message]]: def messages() -> Iterator[Res[Message]]: - yield from unique_everseen(_messages()) + yield from unique_everseen(_messages) From 24da04f142c6f81c6e3dc9850485179c9a81b7f5 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 1 Nov 2023 01:47:55 +0000 Subject: [PATCH 180/302] ci: fix wrong release command --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cf85155..f49c6b5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -100,4 +100,4 @@ jobs: if: github.event_name != 'pull_request' && startsWith(github.event.ref, 'refs/tags') env: TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} - run: pip3 install --user --upgrade build twine && .ci/release --test + run: pip3 install --user --upgrade build twine && .ci/release From 105928238f9cd250db215092f1710827b97bb03e Mon Sep 17 00:00:00 2001 From: karlicoss Date: Tue, 31 Oct 2023 00:47:31 +0000 Subject: [PATCH 181/302] vk_messages_backup: some cleanup + switch to get_files --- my/vk/vk_messages_backup.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index 1837385..c73587f 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -2,8 +2,8 @@ VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) ''' # note: could reuse the original repo, but little point I guess since VK closed their API -from datetime import datetime from dataclasses import dataclass +from datetime import datetime import json from typing import Dict, Iterator @@ -22,6 +22,7 @@ TZ = pytz.timezone('Europe/Moscow') Uid = int + @dataclass(frozen=True) class User: id: Uid @@ -45,8 +46,10 @@ class Message: Users = Dict[Uid, User] + + def users() -> Users: - files = list(sorted(config.storage_path.glob('user_*.json'))) + files = get_files(config.storage_path, glob='user_*.json') res = {} for f in files: j = json.loads(f.read_text()) @@ -60,6 +63,8 @@ def users() -> Users: GROUP_CHAT_MIN_ID = 2000000000 + + def _parse_chat(*, msg: Json, udict: Users) -> Chat: # exported with newer api, peer_id is a proper identifier both for users and chats peer_id = msg.get('peer_id') @@ -88,13 +93,13 @@ def _parse_chat(*, msg: Json, udict: Users) -> Chat: def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: mid = msg['id'] - md = msg['date'] + md = msg['date'] dt = datetime.fromtimestamp(md, tz=TZ) # todo attachments? e.g. url could be an attachment # todo might be forwarded? 
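One convention that nearly every module touched in these patches relies on: per-file failures are yielded into the result stream as values (the Res type) rather than aborting iteration, with echain attaching the offending path as context. A simplified self-contained sketch of that shape; Res and the helpers here only mimic my.core.error, they are not the actual implementation:

    from pathlib import Path
    from typing import Iterator, Sequence, TypeVar, Union

    T = TypeVar('T')
    Res = Union[T, Exception]  # simplified stand-in for my.core.Res

    def _process_file(path: Path) -> Iterator[str]:
        yield path.read_text()

    def messages(paths: Sequence[Path]) -> Iterator[Res[str]]:
        for path in paths:
            try:
                yield from _process_file(path)
            except Exception as e:
                err = RuntimeError(f'While processing {path}')
                err.__cause__ = e  # roughly what my.core.error.echain does
                # one corrupt export shouldn't kill the whole stream
                yield err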
- mb = msg.get('body') + mb = msg.get('body') if mb is None: mb = msg.get('text') assert mb is not None, msg @@ -103,7 +108,7 @@ def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: if out: user = udict[config.user_id] else: - mu = msg.get('user_id') or msg.get('from_id') + mu = msg.get('user_id') or msg.get('from_id') assert mu is not None, msg user = udict[mu] return Message( @@ -118,8 +123,7 @@ def _parse_msg(*, msg: Json, chat: Chat, udict: Users) -> Message: def _messages() -> Iterator[Res[Message]]: udict = users() - uchats = list(sorted(config.storage_path.glob('userchat_*.json' ))) + \ - list(sorted(config.storage_path.glob('groupchat_*.json'))) + uchats = get_files(config.storage_path, glob='userchat_*.json') + get_files(config.storage_path, glob='groupchat_*.json') for f in uchats: j = json.loads(f.read_text()) # ugh. very annoying, sometimes not possible to extract title from last message From 7631f1f2e498e8a1829c02e3543bb8dd88770ffd Mon Sep 17 00:00:00 2001 From: karlicoss Date: Thu, 2 Nov 2023 00:27:09 +0000 Subject: [PATCH 182/302] monzo.monzoexport: initial module --- my/config.py | 5 +++++ my/monzo/monzoexport.py | 45 +++++++++++++++++++++++++++++++++++++++++ tox.ini | 1 + 3 files changed, 51 insertions(+) create mode 100644 my/monzo/monzoexport.py diff --git a/my/config.py b/my/config.py index 9cc9c11..ac44f41 100644 --- a/my/config.py +++ b/my/config.py @@ -269,3 +269,8 @@ class whatsapp: class harmonic: export_path: Paths + + +class monzo: + class monzoexport: + export_path: Paths diff --git a/my/monzo/monzoexport.py b/my/monzo/monzoexport.py new file mode 100644 index 0000000..3aa0cf5 --- /dev/null +++ b/my/monzo/monzoexport.py @@ -0,0 +1,45 @@ +""" +Monzo transactions data (using https://github.com/karlicoss/monzoexport ) +""" +REQUIRES = [ + 'git+https://github.com/karlicoss/monzoexport', +] + +from dataclasses import dataclass +from pathlib import Path +from typing import Sequence, Iterator + +from my.core import ( + Paths, + get_files, + make_logger, +) +import my.config + + +@dataclass +class config(my.config.monzo.monzoexport): + ''' + Uses [[https://github.com/karlicoss/monzoexport][monzoexport]] outputs.
+ ''' + + export_path: Paths + '''path[s]/glob to the exported JSON data''' + + +logger = make_logger(__name__) + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + + +import monzoexport.dal as dal + + +def _dal() -> dal.DAL: + return dal.DAL(inputs()) + + +def transactions() -> Iterator[dal.MonzoTransaction]: + return _dal().transactions() diff --git a/tox.ini b/tox.ini index ac0a68d..dad6d9b 100644 --- a/tox.ini +++ b/tox.ini @@ -153,6 +153,7 @@ commands = my.ip.all \ my.kobo \ my.location.gpslogger \ + my.monzo.monzoexport \ my.orgmode \ my.pdfs \ my.pinboard \ From 5630621ec1be12144f6779f02984a79eacd90d24 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Mon, 6 Nov 2023 22:25:00 +0000 Subject: [PATCH 183/302] my.pinboard: some cleanup --- my/pinboard.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/my/pinboard.py b/my/pinboard.py index 354f15c..ef4ca36 100644 --- a/my/pinboard.py +++ b/my/pinboard.py @@ -5,22 +5,34 @@ REQUIRES = [ 'git+https://github.com/karlicoss/pinbexport', ] -from my.config import pinboard as config +from dataclasses import dataclass +from pathlib import Path +from typing import Iterator, Sequence +from my.core import get_files, Paths, Res +import my.config import pinbexport.dal as pinbexport + +@dataclass +class config(my.config.pinboard): # TODO rename to pinboard.pinbexport? + # TODO rename to export_path? + export_dir: Paths + + +# TODO not sure if should keep this import here? Bookmark = pinbexport.Bookmark +def inputs() -> Sequence[Path]: + return get_files(config.export_dir) + + # yep; clearly looks that the purpose of my. package is to wire files to DAL implicitly; otherwise it's just passtrhough. def dal() -> pinbexport.DAL: - from .core import get_files - inputs = get_files(config.export_dir) # todo rename to export_path - model = pinbexport.DAL(inputs) - return model + return pinbexport.DAL(inputs()) -from typing import Iterable -def bookmarks() -> Iterable[pinbexport.Bookmark]: +def bookmarks() -> Iterator[Res[pinbexport.Bookmark]]: return dal().bookmarks() From 4ac3bbb101973135c81bc84a9d40282081034851 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Mon, 6 Nov 2023 23:31:56 +0000 Subject: [PATCH 184/302] my.bumble.android: fix message deduplication --- my/bumble/android.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/my/bumble/android.py b/my/bumble/android.py index 86c9d1e..3a159da 100644 --- a/my/bumble/android.py +++ b/my/bumble/android.py @@ -106,10 +106,11 @@ def _handle_db(db: sqlite3.Connection) -> Iterator[EntitiesRes]: def _key(r: EntitiesRes): if isinstance(r, _Message): - if '&srv_width=' in r.text: + if '/hidden?' in r.text: # ugh. seems that image URLs change all the time in the db? # can't access them without login anyway # so use a different key for such messages + # todo maybe normalize text instead? 
since it's gonna always trigger diffs down the line return (r.id, r.created) return r From 19353e996d2d757bf43126c4149abdc7be482681 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Mon, 6 Nov 2023 23:42:22 +0000 Subject: [PATCH 185/302] my.hackernews.harmonic: use orjson + add __hash__ for Saved object plus some minor cleanup --- my/hackernews/harmonic.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/my/hackernews/harmonic.py b/my/hackernews/harmonic.py index 6070510..3b4ae61 100644 --- a/my/hackernews/harmonic.py +++ b/my/hackernews/harmonic.py @@ -1,16 +1,16 @@ """ [[https://play.google.com/store/apps/details?id=com.simon.harmonichackernews][Harmonic]] app for Hackernews """ -REQUIRES = ['lxml'] +REQUIRES = ['lxml', 'orjson'] from dataclasses import dataclass from datetime import datetime, timezone -import json +import orjson from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Sequence, TypedDict, cast from lxml import etree -from more_itertools import unique_everseen, one +from more_itertools import one from my.core import ( Paths, @@ -21,15 +21,16 @@ from my.core import ( make_logger, stat, ) +from my.core.common import unique_everseen +import my.config from .common import hackernews_link, SavedBase -from my.config import harmonic as user_config logger = make_logger(__name__) @dataclass -class harmonic(user_config): +class harmonic(my.config.harmonic): export_path: Paths @@ -76,6 +77,10 @@ class Saved(SavedBase): def hackernews_link(self) -> str: return hackernews_link(self.uid) + def __hash__(self) -> int: + # meh. but seems like the easiest and fastest way to hash a dict? + return hash(orjson.dumps(self.raw)) + _PREFIX = 'com.simon.harmonichackernews.KEY_SHARED_PREFERENCES' @@ -95,7 +100,7 @@ def _saved() -> Iterator[Res[Saved]]: cached: Dict[str, Cached] = {} for sid in cached_ids: res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORY{sid}"]'))) - j = json.loads(res.text) + j = orjson.loads(res.text) cached[sid] = j res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_BOOKMARKS"]'))) @@ -112,7 +117,7 @@ def _saved() -> Iterator[Res[Saved]]: def saved() -> Iterator[Res[Saved]]: - yield from unique_everseen(_saved()) + yield from unique_everseen(_saved) def stats() -> Stats: From 33f8d867e23e3459035196be328a6768c126d609 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Tue, 7 Nov 2023 21:07:32 +0000 Subject: [PATCH 186/302] my.browser.export: cleanup - make logging INFO (default) -- otherwise it's too quiet when processing lots of databases - can pass inputs to cachew directly now --- my/browser/export.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/my/browser/export.py b/my/browser/export.py index 46a4217..ce5a6de 100644 --- a/my/browser/export.py +++ b/my/browser/export.py @@ -4,20 +4,18 @@ Parses browser history using [[http://github.com/seanbreckenridge/browserexport] REQUIRES = ["browserexport"] -from my.config import browser as user_config -from my.core import Paths, dataclass - - -@dataclass -class config(user_config.export): - # path[s]/glob to your backed up browser history sqlite files - export_path: Paths - - +from dataclasses import dataclass from pathlib import Path -from typing import Iterator, Sequence, List +from typing import Iterator, Sequence -from my.core import Stats, get_files, LazyLogger +import my.config +from my.core import ( + Paths, + Stats, + get_files, + make_logger, + stat, +) from my.core.common import
mcachew from browserexport.merge import read_and_merge, Visit @@ -25,7 +23,13 @@ from browserexport.merge import read_and_merge, Visit from .common import _patch_browserexport_logs -logger = LazyLogger(__name__, level="warning") +@dataclass +class config(my.config.browser.export): + # path[s]/glob to your backed up browser history sqlite files + export_path: Paths + + +logger = make_logger(__name__) _patch_browserexport_logs(logger.level) @@ -34,16 +38,10 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) -def _cachew_depends_on() -> List[str]: - return [str(f) for f in inputs()] - - -@mcachew(depends_on=_cachew_depends_on, logger=logger) +@mcachew(depends_on=inputs, logger=logger) def history() -> Iterator[Visit]: yield from read_and_merge(inputs()) def stats() -> Stats: - from my.core import stat - return {**stat(history)} From e547acfa5907bbd53dd72604f5f6a7b70cf33946 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Tue, 7 Nov 2023 21:09:48 +0000 Subject: [PATCH 187/302] general: update minimal cachew version had quite a few useful fixes/performance optimizations since --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 42ffeaa..b857662 100644 --- a/setup.py +++ b/setup.py @@ -59,7 +59,7 @@ def main() -> None: # todo document these? 'orjson', # for my.core.serialize 'pyfzf_iter', # for my.core.denylist - 'cachew>=0.8.0', + 'cachew>=0.15.20231019 ', 'mypy', # used for config checks 'colorlog', # for colored logs 'enlighten', # for CLI progress bars From ac5f71c68b91462180cf78e8f454b502052b2cc9 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 10 Nov 2023 01:59:21 +0000 Subject: [PATCH 188/302] my.jawbone: get rid of matplotlib import on top level --- my/jawbone/__init__.py | 15 +++++++++------ tox.ini | 1 - 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 4b41242..7f4d6bd 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Dict, Any, List, Iterable import json from functools import lru_cache @@ -155,9 +157,6 @@ def stats(): #### NOTE: most of the stuff below is deprecated and remnants of my old code! 
#### sorry for it, feel free to remove if you don't need it -import matplotlib.pyplot as plt # type: ignore -from matplotlib.figure import Figure # type: ignore -from matplotlib.axes import Axes # type: ignore def hhmm(time: datetime): return time.strftime("%H:%M") @@ -168,9 +167,10 @@ def hhmm(time: datetime): # fromstart = time - sleep.created # return fromstart / tick -import matplotlib.dates as mdates # type: ignore -def plot_one(sleep: SleepEntry, fig: Figure, axes: Axes, xlims=None, showtext=True): +def plot_one(sleep: SleepEntry, fig, axes, xlims=None, showtext=True): + import matplotlib.dates as mdates # type: ignore[import-not-found] + span = sleep.completed - sleep.created print(f"{sleep.xid} span: {span}") @@ -253,7 +253,10 @@ def predicate(sleep: SleepEntry): # TODO move to dashboard -def plot(): +def plot() -> None: + from matplotlib.figure import Figure # type: ignore[import-not-found] + import matplotlib.pyplot as plt # type: ignore[import-not-found] + # TODO FIXME melatonin data melatonin_data = {} # type: ignore[var-annotated] diff --git a/tox.ini b/tox.ini index dad6d9b..0de357f 100644 --- a/tox.ini +++ b/tox.ini @@ -171,7 +171,6 @@ commands = -p {[testenv]package_name} \ --exclude 'my/coding/codeforces.py' \ --exclude 'my/coding/topcoder.py' \ - --exclude 'my/jawbone/.*' \ --txt-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \ {posargs} From 65c617ed94e39cd3d12d58c5d6b35f894f41f892 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 10 Nov 2023 02:00:37 +0000 Subject: [PATCH 189/302] my.emfit: add missing properties to fake data generator --- my/emfit/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index 1ec3341..30b693c 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -170,6 +170,8 @@ def fake_data(nights: int = 500) -> Iterator: from my.core.cfg import tmp_config from tempfile import TemporaryDirectory + import pytz + with TemporaryDirectory() as td: tdir = Path(td) gen = dal.FakeData() @@ -178,6 +180,8 @@ def fake_data(nights: int = 500) -> Iterator: class override: class emfit: export_path = tdir + excluded_sids = () + timezone = pytz.timezone('Europe/London') # meh with tmp_config(modules=__name__, config=override) as cfg: yield cfg From 70bb9ed0c5e5aa3e4996c61880af1a5328a66017 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 10 Nov 2023 02:03:20 +0000 Subject: [PATCH 190/302] location.google_takeout_semantic: handle None visitConfidence --- my/location/google_takeout_semantic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/my/location/google_takeout_semantic.py b/my/location/google_takeout_semantic.py index fcf7f01..b4f16db 100644 --- a/my/location/google_takeout_semantic.py +++ b/my/location/google_takeout_semantic.py @@ -54,8 +54,9 @@ def locations() -> Iterator[Res[Location]]: for g in events(): if isinstance(g, SemanticLocation): - if g.visitConfidence < require_confidence: - logger.debug(f"Skipping {g} due to low confidence ({g.visitConfidence}))") + visitConfidence = g.visitConfidence + if visitConfidence is None or visitConfidence < require_confidence: + logger.debug(f"Skipping {g} due to low confidence ({visitConfidence}))") continue yield Location( lon=g.lng, From 996169aa295775f45d4828f9a1893840091a24ae Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 10 Nov 2023 22:09:48 +0000 Subject: [PATCH 191/302] time.tz.via_location: more consistent behaviour wrt caching previously it was possible for cachew to never properly initialize the cache
if you only queried some dates in the past, because we never made it to the end of _iter_tzs. Also some minor cleanup --- my/time/tz/via_location.py | 160 ++++++++++++++++++------------------- 1 file changed, 79 insertions(+), 81 deletions(-) diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 1ed1ba7..612341a 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -6,6 +6,24 @@ REQUIRES = [ 'timezonefinder', ] +from collections import Counter +from dataclasses import dataclass +from datetime import date, datetime +from functools import lru_cache +import heapq +from itertools import groupby +import os +from typing import Iterator, Optional, Tuple, Any, List, Iterable, Set, Dict + +import pytz + +from my.core import make_logger, stat, Stats, datetime_aware +from my.core.common import mcachew +from my.core.source import import_source +from my.core.warnings import high + +from my.location.common import LatLon + ## user might not have tz config section, so makes sense to be more defensive about it # todo might be useful to extract a helper for this @@ -27,8 +45,6 @@ if 'user_config' not in globals(): ## -from my.core import dataclass @dataclass class config(user_config): # less precise, but faster @@ -46,55 +62,33 @@ class config(user_config): _iter_tz_refresh_time: int = 6 -from collections import Counter -from datetime import date, datetime -from functools import lru_cache -from itertools import groupby -from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable, Set +logger = make_logger(__name__) -import heapq -import pytz -from more_itertools import seekable -from my.core.common import LazyLogger, mcachew, tzdatetime -from my.core.source import import_source -logger = LazyLogger(__name__, level='warning') -@lru_cache(2) +@lru_cache(None) def _timezone_finder(fast: bool) -> Any: if fast: # less precise, but faster from timezonefinder import TimezoneFinderL as Finder else: - from timezonefinder import TimezoneFinder as Finder # type: ignore + from timezonefinder import TimezoneFinder as Finder # type: ignore return Finder(in_memory=True) -# todo move to common? -Zone = str - - -# NOTE: for now only daily resolution is supported... later will implement something more efficient -class DayWithZone(NamedTuple): - day: date - zone: Zone - - -from my.location.common import LatLon - # for backwards compatibility -def _locations() -> Iterator[Tuple[LatLon, datetime]]: +def _locations() -> Iterator[Tuple[LatLon, datetime_aware]]: try: import my.location.all + for loc in my.location.all.locations(): if loc.accuracy is not None and loc.accuracy > config.require_accuracy: continue yield ((loc.lat, loc.lon), loc.dt) except Exception as e: - from my.core.warnings import high - logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e) + logger.exception( + "Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e + ) high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data") import my.location.google @@ -102,10 +96,22 @@ def _locations() -> Iterator[Tuple[LatLon, datetime]]: for gloc in my.location.google.locations(): yield ((gloc.lat, gloc.lon), gloc.dt) + # TODO: could use heapmerge or sort the underlying iterators somehow?
# see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934 -def _sorted_locations() -> List[Tuple[LatLon, datetime]]: - return list(sorted(_locations(), key=lambda x: x[1])) +def _sorted_locations() -> List[Tuple[LatLon, datetime_aware]]: + return sorted(_locations(), key=lambda x: x[1]) + + +# todo move to common? +Zone = str + + +# NOTE: for now only daily resolution is supported... later will implement something more efficient +@dataclass(unsafe_hash=True) +class DayWithZone: + day: date + zone: Zone def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> Iterator[DayWithZone]: @@ -120,20 +126,22 @@ def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> I # TODO this is probably a bit expensive... test & benchmark ldt = dt.astimezone(tz) ndate = ldt.date() - #if pdt is not None and ndate < pdt.date(): + # if pdt is not None and ndate < pdt.date(): # # TODO for now just drop and collect the stats # # I guess we'd have minor drops while air travel... # warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}") # continue - #pdt = ldt - z = tz.zone; assert z is not None + # pdt = ldt + z = tz.zone + assert z is not None yield DayWithZone(day=ndate, zone=z) + # Note: this takes a while, as the upstream since _locations isn't sorted, so this # has to do an iterative sort of the entire my.locations.all list def _iter_local_dates() -> Iterator[DayWithZone]: - finder = _timezone_finder(fast=config.fast) # rely on the default - #pdt = None + finder = _timezone_finder(fast=config.fast) # rely on the default + # pdt = None # TODO: warnings doesn't actually warn? # warnings = [] @@ -157,7 +165,7 @@ def _iter_local_dates_fallback() -> Iterator[DayWithZone]: yield from _find_tz_for_locs(_timezone_finder(fast=config.fast), _fallback_locations()) -def most_common(lst: List[DayWithZone]) -> DayWithZone: +def most_common(lst: Iterator[DayWithZone]) -> DayWithZone: res, _ = Counter(lst).most_common(1)[0] return res @@ -181,59 +189,49 @@ def _iter_tz_depends_on() -> str: # refresh _iter_tzs every few hours -- don't think a better depends_on is possible dynamically -@mcachew(logger=logger, depends_on=_iter_tz_depends_on) +@mcachew(depends_on=_iter_tz_depends_on) def _iter_tzs() -> Iterator[DayWithZone]: # since we have no control over what order the locations are returned, # we need to sort them first before we can do a groupby - local_dates: List[DayWithZone] = list(_iter_local_dates()) - local_dates.sort(key=lambda p: p.day) + by_day = lambda p: p.day + + local_dates: List[DayWithZone] = sorted(_iter_local_dates(), key=by_day) logger.debug(f"no. of items using exact locations: {len(local_dates)}") - local_dates_fallback: List[DayWithZone] = list(_iter_local_dates_fallback()) - local_dates_fallback.sort(key=lambda p: p.day) + local_dates_fallback: List[DayWithZone] = sorted(_iter_local_dates_fallback(), key=by_day) # find days that are in fallback but not in local_dates (i.e., missing days) - local_dates_set: Set[date] = set(d.day for d in local_dates) + local_dates_set: Set[date] = {d.day for d in local_dates} use_fallback_days: List[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set] logger.debug(f"no. 
of items being used from fallback locations: {len(use_fallback_days)}") # combine local_dates and missing days from fallback into a sorted list - all_dates = heapq.merge(local_dates, use_fallback_days, key=lambda p: p.day) + all_dates = heapq.merge(local_dates, use_fallback_days, key=by_day) + # todo could probably use heapify here instead of heapq.merge? - for d, gr in groupby(all_dates, key=lambda p: p.day): - logger.info(f"processed {d}{', using fallback' if d in local_dates_set else ''}") - zone = most_common(list(gr)).zone + for d, gr in groupby(all_dates, key=by_day): + logger.debug(f"processed {d}{', using fallback' if d in local_dates_set else ''}") + zone = most_common(gr).zone yield DayWithZone(day=d, zone=zone) @lru_cache(1) -def loc_tz_getter() -> Iterator[DayWithZone]: - # seekable makes it cache the emitted values - return seekable(_iter_tzs()) +def _day2zone() -> Dict[date, pytz.BaseTzInfo]: + # NOTE: kinda unfortunate that this will have to process all days before returning result for just one + # however otherwise cachew cache might never be initialized properly + # so we'll always end up recomputing everyting during subsequent runs + return {dz.day: pytz.timezone(dz.zone) for dz in _iter_tzs()} -# todo expose zone names too? -@lru_cache(maxsize=None) def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]: - sit = loc_tz_getter() - # todo hmm. seeking is not super efficient... might need to use some smarter dict-based cache - # hopefully, this method itself caches stuff forthe users, so won't be too bad - sit.seek(0) # type: ignore - - zone: Optional[str] = None - for x, tz in sit: - if x == d: - zone = tz - if x >= d: - break - return None if zone is None else pytz.timezone(zone) + return _day2zone().get(d) # ok to cache, there are only a few home locations? -@lru_cache(maxsize=None) +@lru_cache(None) def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]: (lat, lng) = loc - finder = _timezone_finder(fast=False) # ok to use slow here for better precision + finder = _timezone_finder(fast=False) # ok to use slow here for better precision zone = finder.timezone_at(lat=lat, lng=lng) if zone is None: # TODO shouldn't really happen, warn? @@ -242,7 +240,7 @@ def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]: return pytz.timezone(zone) -def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: +def get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: ''' Given a datetime, returns the timezone for that date. ''' @@ -258,16 +256,14 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: # that datetime is between, else fallback on your first home location, so it acts # as a last resort from my.location.fallback import via_home as home + loc = list(home.estimate_location(dt)) assert len(loc) == 1, f"should only have one home location, received {loc}" return _get_home_tz(loc=(loc[0].lat, loc[0].lon)) -# expose as 'public' function -get_tz = _get_tz - -def localize(dt: datetime) -> tzdatetime: - tz = _get_tz(dt) +def localize(dt: datetime) -> datetime_aware: + tz = get_tz(dt) if tz is None: # TODO -- this shouldn't really happen.. 
think about it carefully later return dt @@ -275,20 +271,17 @@ def localize(dt: datetime) -> tzdatetime: return tz.localize(dt) -from ...core import stat, Stats -def stats(quick: bool=False) -> Stats: +def stats(quick: bool = False) -> Stats: if quick: prev, config.sort_locations = config.sort_locations, False - res = { - 'first': next(_iter_local_dates()) - } + res = {'first': next(_iter_local_dates())} config.sort_locations = prev return res # TODO not sure what would be a good stat() for this module... # might be nice to print some actual timezones? # there aren't really any great iterables to expose - import os VIA_LOCATION_START_YEAR = int(os.environ.get("VIA_LOCATION_START_YEAR", 1990)) + def localized_years(): last = datetime.now().year + 2 # note: deliberately take + 2 years, so the iterator exhausts. otherwise stuff might never get cached @@ -296,4 +289,9 @@ def stats(quick: bool=False) -> Stats: for Y in range(VIA_LOCATION_START_YEAR, last): dt = datetime.fromisoformat(f'{Y}-01-01 01:01:01') yield localize(dt) + return stat(localized_years) + + +# deprecated -- still used in some other modules so need to keep +_get_tz = get_tz From 657ce08ac85b49711464accebc07b230619d9f8a Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 10 Nov 2023 22:28:28 +0000 Subject: [PATCH 192/302] fix mypy issues after mypy/libraries updates --- my/core/_deprecated/kompress.py | 8 ++++---- my/core/cachew.py | 2 +- my/core/preinit.py | 2 +- my/core/query.py | 2 +- my/core/query_range.py | 2 +- my/instagram/common.py | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index cd1bd9d..25b8a20 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -162,7 +162,7 @@ class ZipPath(zipfile_Path): # NOTE: is_dir/is_file might not behave as expected, the base class checks it only based on the slash in path # seems that root/at are not exposed in the docs, so might be an implementation detail - root: zipfile.ZipFile + root: zipfile.ZipFile # type: ignore[assignment] at: str @property @@ -191,14 +191,14 @@ class ZipPath(zipfile_Path): # note: seems that zip always uses forward slash, regardless OS? return zipfile_Path(self.root, self.at + '/') - def rglob(self, glob: str) -> Sequence[ZipPath]: + def rglob(self, glob: str) -> Iterator[ZipPath]: # note: not 100% sure about the correctness, but seem fine? 
# Path.match() matches from the right, so need to rpaths = [p for p in self.root.namelist() if p.startswith(self.at)] rpaths = [p for p in rpaths if Path(p).match(glob)] - return [ZipPath(self.root, p) for p in rpaths] + return (ZipPath(self.root, p) for p in rpaths) - def relative_to(self, other: ZipPath) -> Path: + def relative_to(self, other: ZipPath) -> Path: # type: ignore[override, unused-ignore] assert self.filepath == other.filepath, (self.filepath, other.filepath) return self.subpath.relative_to(other.subpath) diff --git a/my/core/cachew.py b/my/core/cachew.py index 7dd62d2..cc5a95b 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -7,7 +7,7 @@ import sys from typing import Optional, Iterator, cast, TYPE_CHECKING, TypeVar, Callable, overload, Union, Any, Type import warnings -import appdirs +import appdirs # type: ignore[import-untyped] PathIsh = Union[str, Path] # avoid circular import from .common diff --git a/my/core/preinit.py b/my/core/preinit.py index 88bcb27..8c0f6a4 100644 --- a/my/core/preinit.py +++ b/my/core/preinit.py @@ -4,7 +4,7 @@ from pathlib import Path # - it's imported from my.core.init (so we wan't to keep this file as small/reliable as possible, hence not common or something) # - we still need this function in __main__, so has to be separate from my/core/init.py def get_mycfg_dir() -> Path: - import appdirs + import appdirs # type: ignore[import-untyped] import os # not sure if that's necessary, i.e. could rely on PYTHONPATH instead # on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path? diff --git a/my/core/query.py b/my/core/query.py index 4e00569..7c22838 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -178,7 +178,7 @@ pass 'drop_exceptions' to ignore exceptions""") return lambda o: o.get(key, default) # type: ignore[union-attr] else: if hasattr(obj, key): - return lambda o: getattr(o, key, default) # type: ignore[arg-type] + return lambda o: getattr(o, key, default) # Note: if the attribute you're ordering by is an Optional type, # and on some objects it'll return None, the getattr(o, field_name, default) won't diff --git a/my/core/query_range.py b/my/core/query_range.py index a1cfaed..afde933 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -327,7 +327,7 @@ def select_range( # we should search for on each value in the iterator if order_value is None and order_by_value_type is not None: # search for that type on the iterator object - order_value = lambda o: isinstance(o, order_by_value_type) # type: ignore + order_value = lambda o: isinstance(o, order_by_value_type) # if the user supplied a order_key, and/or we've generated an order_value, create # the function that accesses that type on each value in the iterator diff --git a/my/instagram/common.py b/my/instagram/common.py index 36c6b83..4df07a1 100644 --- a/my/instagram/common.py +++ b/my/instagram/common.py @@ -68,6 +68,6 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: if user is not None: repls['user'] = user if len(repls) > 0: - m = replace(m, **repls) # type: ignore[type-var, misc] # ugh mypy is confused because of Protocol? + m = replace(m, **repls) # type: ignore[type-var] # ugh mypy is confused because of Protocol? 
mmap[k] = m yield m From 7b1cec9326609583c4ae4c542dd352df70ba9c11 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Fri, 10 Nov 2023 23:02:11 +0000 Subject: [PATCH 193/302] codeforces/topcoder: move to top level and check in ci --- my/{coding => }/codeforces.py | 15 +++++---------- my/{coding => }/topcoder.py | 16 +++++----------- tox.ini | 2 -- 3 files changed, 10 insertions(+), 23 deletions(-) rename my/{coding => }/codeforces.py (87%) rename my/{coding => }/topcoder.py (83%) diff --git a/my/coding/codeforces.py b/my/codeforces.py similarity index 87% rename from my/coding/codeforces.py rename to my/codeforces.py index a8b0f65..a97c360 100644 --- a/my/coding/codeforces.py +++ b/my/codeforces.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 from my.config import codeforces as config # type: ignore[attr-defined] @@ -8,8 +7,8 @@ import json from typing import NamedTuple, Dict, Iterator -from ..core import get_files, Res, unwrap -from ..core.konsume import ignore, wrap +from my.core import get_files, Res +from my.core.konsume import ignore, wrap Cid = int @@ -72,20 +71,16 @@ class Competition(NamedTuple): ignore(json, 'rank', 'oldRating', 'newRating') -def iter_data() -> Iterator[Res[Competition]]: +def data() -> Iterator[Res[Competition]]: cmap = get_contests() last = max(get_files(config.export_path, 'codeforces*.json')) with wrap(json.loads(last.read_text())) as j: - j['status'].ignore() - res = j['result'].zoom() + j['status'].ignore() # type: ignore[index] + res = j['result'].zoom() # type: ignore[index] for c in list(res): # TODO maybe we want 'iter' method?? ignore(c, 'handle', 'ratingUpdateTimeSeconds') yield from Competition.make(cmap=cmap, json=c) c.consume() # TODO maybe if they are all empty, no need to consume?? - - -def get_data(): - return list(sorted(iter_data(), key=Competition.when.fget)) diff --git a/my/coding/topcoder.py b/my/topcoder.py similarity index 83% rename from my/coding/topcoder.py rename to my/topcoder.py index 96bcdf7..7432379 100644 --- a/my/coding/topcoder.py +++ b/my/topcoder.py @@ -1,16 +1,14 @@ -#!/usr/bin/env python3 from my.config import topcoder as config # type: ignore[attr-defined] from datetime import datetime from functools import cached_property import json -from typing import NamedTuple, Dict, Iterator +from typing import NamedTuple, Iterator -from ..core import get_files, Res, unwrap, Json -from ..core.error import Res, unwrap -from ..core.konsume import zoom, wrap, ignore +from my.core import get_files, Res, Json +from my.core.konsume import zoom, wrap, ignore def _get_latest() -> Json: @@ -54,11 +52,11 @@ class Competition(NamedTuple): ) -def iter_data() -> Iterator[Res[Competition]]: +def data() -> Iterator[Res[Competition]]: with wrap(_get_latest()) as j: ignore(j, 'id', 'version') - res = j['result'].zoom() + res = j['result'].zoom() # type: ignore[index] ignore(res, 'success', 'status', 'metadata') cont = res['content'].zoom() @@ -77,7 +75,3 @@ class Competition(NamedTuple): yield from Competition.make(json=c) c.consume() - -def get_data(): - return list(sorted(iter_data(), key=Competition.when.fget)) - diff --git a/tox.ini b/tox.ini index 0de357f..a48e0df 100644 --- a/tox.ini +++ b/tox.ini @@ -169,8 +169,6 @@ commands = {envpython} -m mypy --install-types --non-interactive \ -p {[testenv]package_name} \ - --exclude 'my/coding/codeforces.py' \ - --exclude 'my/coding/topcoder.py' \ --txt-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \ {posargs} From 37643c098ff60e7cf1260da69e9d2e820f2b9850
From: karlicoss Date: Fri, 10 Nov 2023 23:03:55 +0000 Subject: [PATCH 194/302] tox: remove cat coverage index from tox, it's not very useful anyway --- tox.ini | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tox.ini b/tox.ini index a48e0df..06d1476 100644 --- a/tox.ini +++ b/tox.ini @@ -116,7 +116,6 @@ commands = [testenv:mypy-core] -allowlist_externals = cat commands = {envpython} -m pip install --use-pep517 -e .[testing,optional] {envpython} -m pip install orgparse # used it core.orgmode? @@ -127,13 +126,11 @@ commands = --txt-report .coverage.mypy-core \ --html-report .coverage.mypy-core \ {posargs} - cat .coverage.mypy-core/index.txt # specific modules that are known to be mypy compliant (to avoid false negatives) # todo maybe split into separate jobs? need to add comment how to run [testenv:mypy-misc] -allowlist_externals = cat commands = {envpython} -m pip install --use-pep517 -e .[testing,optional] @@ -172,8 +169,6 @@ commands = --txt-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \ {posargs} - # txt report is a bit more convenient to view on CI - cat .coverage.mypy-misc/index.txt {envpython} -m mypy --install-types --non-interactive \ tests From bde43d6a7a8817c60da2c2f47d64a8f71132f27b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 11 Nov 2023 00:28:13 +0000 Subject: [PATCH 195/302] my.body.sleep: massive speedup for average temperature calculation --- my/body/sleep/common.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/my/body/sleep/common.py b/my/body/sleep/common.py index 7bc1021..e84c8d5 100644 --- a/my/body/sleep/common.py +++ b/my/body/sleep/common.py @@ -17,15 +17,21 @@ class Combine: bdf = BM.dataframe() temp = bdf['temp'] + # sort index and drop nans, otherwise indexing with [start: end] gonna complain + temp = pd.Series( + temp.values, + index=pd.to_datetime(temp.index, utc=True) + ).sort_index() + temp = temp.loc[temp.index.dropna()] + def calc_avg_temperature(row): start = row['sleep_start'] end = row['sleep_end'] if pd.isna(start) or pd.isna(end): return None - between = (start <= temp.index) & (temp.index <= end) # on no temp data, returns nan, ok - return temp[between].mean() + return temp[start: end].mean() df['avg_temp'] = df.apply(calc_avg_temperature, axis=1) return df From 09e0f66892d8d336698193312ed8773386a25ef7 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sun, 19 Nov 2023 17:49:16 +0000 Subject: [PATCH 196/302] tox: disable --parallel flag in hpi module install It's been so flaky it ends up taking more time to merge stuff. 
See https://github.com/karlicoss/HPI/issues/306 --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 06d1476..e51d0b6 100644 --- a/tox.ini +++ b/tox.ini @@ -134,7 +134,7 @@ commands = commands = {envpython} -m pip install --use-pep517 -e .[testing,optional] - {envpython} -m my.core module install --parallel \ + {envpython} -m my.core module install \ my.arbtt \ my.browser.export \ my.coding.commits \ From a843407e40d05960650b6e1cee6cd0a63f8f09d1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 19 Nov 2023 22:45:02 +0000 Subject: [PATCH 197/302] core/compat: move fromisoformat to .core.compat module --- my/arbtt.py | 7 ++++--- my/core/common.py | 28 ++++++++++------------------ my/core/compat.py | 39 +++++++++++++++++++++++++++++++++++++++ my/core/query_range.py | 4 ++-- my/polar.py | 8 ++++---- my/rss/feedbin.py | 5 +++-- my/runnerup.py | 7 ++++--- my/stackexchange/gdpr.py | 5 +++-- 8 files changed, 69 insertions(+), 34 deletions(-) diff --git a/my/arbtt.py b/my/arbtt.py index 5683515..941a05f 100644 --- a/my/arbtt.py +++ b/my/arbtt.py @@ -23,7 +23,7 @@ def inputs() -> Sequence[Path]: from .core import dataclass, Json, PathIsh, datetime_aware -from .core.common import isoparse +from .core.compat import fromisoformat @dataclass @@ -39,6 +39,7 @@ class Entry: @property def dt(self) -> datetime_aware: # contains utc already + # TODO after python>=3.11, could just use fromisoformat ds = self.json['date'] elen = 27 lds = len(ds) @@ -46,10 +47,10 @@ class Entry: # ugh. sometimes contains less that 6 decimal points ds = ds[:-1] + '0' * (elen - lds) + 'Z' elif lds > elen: - # ahd sometimes more... + # and sometimes more... ds = ds[:elen - 1] + 'Z' - return isoparse(ds) + return fromisoformat(ds) @property def active(self) -> Optional[str]: diff --git a/my/core/common.py b/my/core/common.py index 85b9386..f1441a9 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -313,20 +313,18 @@ class classproperty(Generic[_R]): # def __get__(self) -> _R: # return self.f() -# TODO deprecate in favor of datetime_aware -tzdatetime = datetime +# for now just serves documentation purposes... but one day might make it statically verifiable where possible? +# TODO e.g. maybe use opaque mypy alias? +datetime_naive = datetime +datetime_aware = datetime -# TODO doctests? -def isoparse(s: str) -> tzdatetime: - """ - Parses timestamps formatted like 2020-05-01T10:32:02.925961Z - """ - # TODO could use dateutil? but it's quite slow as far as I remember.. - # TODO support non-utc.. somehow? - assert s.endswith('Z'), s - s = s[:-1] + '+00:00' - return datetime.fromisoformat(s) +# TODO deprecate +tzdatetime = datetime_aware + + +# TODO deprecate (although could be used in modules) +from .compat import fromisoformat as isoparse import re @@ -590,12 +588,6 @@ def asdict(thing: Any) -> Json: raise TypeError(f'Could not convert object {thing} to dict') -# for now just serves documentation purposes... but one day might make it statically verifiable where possible? -# TODO e.g. maybe use opaque mypy alias? -datetime_naive = datetime -datetime_aware = datetime - - def assert_subpackage(name: str) -> None: # can lead to some unexpected issues if you 'import cachew' which being in my/core directory.. so let's protect against it # NOTE: if we use overlay, name can be smth like my.origg.my.core.cachew ... 
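The arbtt part of this patch also isolates a quirk worth spelling out: the export sometimes carries fewer or more than 6 fractional-second digits, so the string is padded or trimmed to the fixed 27-character shape before parsing. The same normalization as a standalone sketch (the constants match the diff above):

from datetime import datetime

def _normalize(ds: str) -> str:
    elen = 27  # len('YYYY-MM-DDTHH:MM:SS.ffffffZ')
    lds = len(ds)
    if lds < elen:
        ds = ds[:-1] + '0' * (elen - lds) + 'Z'  # pad missing decimal digits
    elif lds > elen:
        ds = ds[:elen - 1] + 'Z'  # trim extra digits
    return ds

for raw in ('2017-07-18T18:59:38.21731Z', '2017-07-18T18:59:38.217310123Z'):
    fixed = _normalize(raw)
    assert len(fixed) == 27
    # same 'Z' workaround as fromisoformat in compat (python<3.11 can't parse it natively)
    assert datetime.fromisoformat(fixed[:-1] + '+00:00').microsecond == 217310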
diff --git a/my/core/compat.py b/my/core/compat.py index 48e194b..9cdea27 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -76,3 +76,42 @@ if sys.version_info[:2] <= (3, 9): return lo else: from bisect import bisect_left + + +from datetime import datetime +if sys.version_info[:2] >= (3, 11): + fromisoformat = datetime.fromisoformat +else: + def fromisoformat(date_string: str) -> datetime: + # didn't support Z as "utc" before 3.11 + if date_string.endswith('Z'): + # NOTE: can be removed from 3.11? + # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat + date_string = date_string[:-1] + '+00:00' + return datetime.fromisoformat(date_string) + + +def test_fromisoformat() -> None: + from datetime import timezone + + # feedbin has this format + assert fromisoformat('2020-05-01T10:32:02.925961Z') == datetime( + 2020, 5, 1, 10, 32, 2, 925961, timezone.utc, + ) + + # polar has this format + assert fromisoformat('2018-11-28T22:04:01.304Z') == datetime( + 2018, 11, 28, 22, 4, 1, 304000, timezone.utc, + ) + + # stackexchange, runnerup has this format + assert fromisoformat('2020-11-30T00:53:12Z') == datetime( + 2020, 11, 30, 0, 53, 12, 0, timezone.utc, + ) + + # arbtt has this format (sometimes less/more than 6 digits in milliseconds) + # TODO doesn't work atm, not sure if really should be supported... + # maybe should have flags for weird formats? + # assert isoparse('2017-07-18T18:59:38.21731Z') == datetime( + # 2017, 7, 18, 18, 59, 38, 217310, timezone.utc, + # ) diff --git a/my/core/query_range.py b/my/core/query_range.py index afde933..dfb9e55 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -24,7 +24,7 @@ from .query import ( ET, ) -from .common import isoparse +from .compat import fromisoformat timedelta_regex = re.compile(r"^((?P[\.\d]+?)w)?((?P[\.\d]+?)d)?((?P[\.\d]+?)h)?((?P[\.\d]+?)m)?((?P[\.\d]+?)s)?$") @@ -78,7 +78,7 @@ def parse_datetime_float(date_str: str) -> float: except ValueError: pass try: - return isoparse(ds).timestamp() + return fromisoformat(ds).timestamp() except (AssertionError, ValueError): pass diff --git a/my/polar.py b/my/polar.py index fe59d00..cd2c719 100644 --- a/my/polar.py +++ b/my/polar.py @@ -42,7 +42,7 @@ from typing import List, Dict, Iterable, NamedTuple, Sequence, Optional import json from .core import LazyLogger, Json, Res -from .core.common import isoparse +from .core.compat import fromisoformat from .core.error import echain, sort_res_by from .core.konsume import wrap, Zoomable, Wdict @@ -145,7 +145,7 @@ class Loader: cmap[hlid] = ccs ccs.append(Comment( cid=cid.value, - created=isoparse(crt.value), + created=fromisoformat(crt.value), text=html.value, # TODO perhaps coonvert from html to text or org? 
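Since the stdlib behaviour that compat.fromisoformat papers over changed in 3.11, a quick check of what the shim buys on older pythons, assuming only the compat module defined above:

import sys
from datetime import datetime, timezone

from my.core.compat import fromisoformat

s = '2020-11-30T00:53:12Z'
assert fromisoformat(s) == datetime(2020, 11, 30, 0, 53, 12, tzinfo=timezone.utc)

if sys.version_info[:2] < (3, 11):
    try:
        datetime.fromisoformat(s)
    except ValueError:
        pass  # the trailing 'Z' is only accepted natively from 3.11 on
    else:
        raise AssertionError('expected ValueError')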
)) v.consume() @@ -183,7 +183,7 @@ class Loader: yield Highlight( hid=hid, - created=isoparse(crt), + created=fromisoformat(crt), selection=text, comments=tuple(comments), tags=tuple(htags), @@ -221,7 +221,7 @@ class Loader: path = Path(config.polar_dir) / 'stash' / filename yield Book( - created=isoparse(added), + created=fromisoformat(added), uid=self.uid, path=path, title=title, diff --git a/my/rss/feedbin.py b/my/rss/feedbin.py index 8ba25b8..6160abc 100644 --- a/my/rss/feedbin.py +++ b/my/rss/feedbin.py @@ -7,7 +7,8 @@ from my.config import feedbin as config from pathlib import Path from typing import Sequence -from ..core.common import listify, get_files, isoparse +from ..core.common import listify, get_files +from ..core.compat import fromisoformat from .common import Subscription @@ -22,7 +23,7 @@ def parse_file(f: Path): raw = json.loads(f.read_text()) for r in raw: yield Subscription( - created_at=isoparse(r['created_at']), + created_at=fromisoformat(r['created_at']), title=r['title'], url=r['site_url'], id=r['id'], diff --git a/my/runnerup.py b/my/runnerup.py index f12d9b3..ca09466 100644 --- a/my/runnerup.py +++ b/my/runnerup.py @@ -11,7 +11,8 @@ from pathlib import Path from typing import Iterable from .core import Res, get_files -from .core.common import isoparse, Json +from .core.common import Json +from .core.compat import fromisoformat import tcxparser # type: ignore[import-untyped] @@ -44,7 +45,7 @@ def _parse(f: Path) -> Workout: return { 'id' : f.name, # not sure? - 'start_time' : isoparse(tcx.started_at), + 'start_time' : fromisoformat(tcx.started_at), 'duration' : timedelta(seconds=tcx.duration), 'sport' : sport, 'heart_rate_avg': tcx.hr_avg, @@ -58,7 +59,7 @@ def _parse(f: Path) -> Workout: # tcx.hr_values(), # # todo cadence? # ): - # t = isoparse(ts) + # t = fromisoformat(ts) def workouts() -> Iterable[Res[Workout]]: diff --git a/my/stackexchange/gdpr.py b/my/stackexchange/gdpr.py index 18b2b4d..2f3b98d 100644 --- a/my/stackexchange/gdpr.py +++ b/my/stackexchange/gdpr.py @@ -16,7 +16,8 @@ config = make_config(stackexchange) # TODO just merge all of them and then filter?.. not sure -from ..core.common import Json, isoparse +from ..core.common import Json +from ..core.compat import fromisoformat from typing import NamedTuple, Iterable from datetime import datetime class Vote(NamedTuple): @@ -25,7 +26,7 @@ class Vote(NamedTuple): @property def when(self) -> datetime: - return isoparse(self.j['eventTime']) + return fromisoformat(self.j['eventTime']) # todo Url return type? 
@property From 224ba521e36ae0326f433c4adadcfafdb7da3c3e Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 3 Dec 2023 09:49:05 -0800 Subject: [PATCH 198/302] gpslogger: catch broken xml file error --- my/location/gpslogger.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py index 17f828f..29e2547 100644 --- a/my/location/gpslogger.py +++ b/my/location/gpslogger.py @@ -23,6 +23,7 @@ from pathlib import Path from typing import Iterator, Sequence, List import gpxpy +from gpxpy.gpx import GPXXMLSyntaxException from more_itertools import unique_everseen from my.core import Stats, LazyLogger @@ -58,7 +59,11 @@ def locations() -> Iterator[Location]: def _extract_locations(path: Path) -> Iterator[Location]: with path.open("r") as gf: - gpx_obj = gpxpy.parse(gf) + try: + gpx_obj = gpxpy.parse(gf) + except GPXXMLSyntaxException as e: + logger.warning("failed to parse XML %s: %s", path, e) + return for track in gpx_obj.tracks: for segment in track.segments: for point in segment.points: From 84d835962dcabb93c5e0cd31af2e43dc1c42020f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 20 Dec 2023 02:06:41 +0000 Subject: [PATCH 199/302] docs: some documentation/thoughts on properly implementing overlay packages --- doc/OVERLAYS.org | 144 +++++++++++++++++++ doc/overlays/install_packages.sh | 4 + doc/overlays/main/setup.py | 17 +++ doc/overlays/main/src/my/py.typed | 0 doc/overlays/main/src/my/reddit.py | 11 ++ doc/overlays/main/src/my/twitter/all.py | 7 + doc/overlays/main/src/my/twitter/common.py | 11 ++ doc/overlays/main/src/my/twitter/gdpr.py | 9 ++ doc/overlays/overlay/setup.py | 17 +++ doc/overlays/overlay/src/my/py.typed | 0 doc/overlays/overlay/src/my/twitter/all.py | 8 ++ doc/overlays/overlay/src/my/twitter/talon.py | 9 ++ 12 files changed, 237 insertions(+) create mode 100644 doc/OVERLAYS.org create mode 100755 doc/overlays/install_packages.sh create mode 100644 doc/overlays/main/setup.py create mode 100644 doc/overlays/main/src/my/py.typed create mode 100644 doc/overlays/main/src/my/reddit.py create mode 100644 doc/overlays/main/src/my/twitter/all.py create mode 100644 doc/overlays/main/src/my/twitter/common.py create mode 100644 doc/overlays/main/src/my/twitter/gdpr.py create mode 100644 doc/overlays/overlay/setup.py create mode 100644 doc/overlays/overlay/src/my/py.typed create mode 100644 doc/overlays/overlay/src/my/twitter/all.py create mode 100644 doc/overlays/overlay/src/my/twitter/talon.py diff --git a/doc/OVERLAYS.org b/doc/OVERLAYS.org new file mode 100644 index 0000000..3ac6b07 --- /dev/null +++ b/doc/OVERLAYS.org @@ -0,0 +1,144 @@ +NOTE this kinda overlaps with [[file:MODULE_DESIGN.org][the module design doc]], should be unified in the future. + +# This is describing TODO +# TODO goals +# - overrides +# - proper mypy support +# - TODO reusing parent modules? + +# You can see them TODO in overlays dir + +Consider a toy package/module structure with minimal code, wihout any actual data parsing, just for demonstration purposes. + +- =main= package structure + # TODO do links + + - =my/twitter/gdpr.py= + Extracts Twitter data from GDPR archive. + - =my/twitter/all.py= + Merges twitter data from multiple sources (only =gdpr= in this case), so data consumers are agnostic of specific data sources used. + This will be overriden by =overlay=. + - =my/twitter/common.py= + Contains helper function to merge data, so they can be reused by overlay's =all.py=. 
+ - =my/reddit.py= + Extracts Reddit data -- this won't be overridden by the overlay, we just keep it for demonstration purposes. + +- =overlay= package structure + + - =my/twitter/talon.py= + Extracts Twitter data from Talon android app. + - =my/twitter/all.py= + Override for =all.py= from =main= package -- it merges together data from =gpdr= and =talon= modules. + +# TODO mention resolution? reorder_editable + +* Installing + +NOTE: this was tested with =python 3.10= and =pip 23.3.2=. + +To install, we run: + +: pip3 install --user -e overlay/ +: pip3 install --user -e main/ + +# TODO mention non-editable installs (this bit will still work with non-editable install) + +As a result, we get: + +: pip3 list | grep hpi +: hpi-main 0.0.0 /project/main/src +: hpi-overlay 0.0.0 /project/overlay/src + +: cat ~/.local/lib/python3.10/site-packages/easy-install.pth +: /project/overlay/src +: /project/main/src + +(the order above is important, so =overlay= takes precedence over =main= TODO link) + +Verify the setup: + +: $ python3 -c 'import my; print(my.__path__)' +: _NamespacePath(['/project/overlay/src/my', '/project/main/src/my']) + +This basically means that modules will be searched in both paths, with overlay taking precedence. + +* Testing + +: $ python3 -c 'import my.reddit as R; print(R.upvotes())' +: [main] my.reddit hello +: ['reddit upvote1', 'reddit upvote2'] + +Just as expected here, =my.reddit= is imported from the =main= package, since it doesn't exist in =overlay=. + +Let's theck twitter now: + +: $ python3 -c 'import my.twitter.all as T; print(T.tweets())' +: [overlay] my.twitter.all hello +: [main] my.twitter.common hello +: [main] my.twitter.gdpr hello +: [overlay] my.twitter.talon hello +: ['gdpr tweet 1', 'gdpr tweet 2', 'talon tweet 1', 'talon tweet 2'] + +As expected, =my.twitter.all= was imported from the =overlay=. +As you can see it's merged data from =gdpr= (from =main= package) and =talon= (from =overlay= package). + +So far so good, let's see how it works with mypy. + +* Mypy support + +To check that mypy works as expected I injected some statements in modules that have no impact on runtime, +but should trigger mypy, like this =trigger_mypy_error: str = 123=: + +Let's run it: + +: $ mypy --namespace-packages --strict -p my +: overlay/src/my/twitter/talon.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str") +: [assignment] +: trigger_mypy_error: str = 123 +: ^ +: Found 1 error in 1 file (checked 4 source files) + +Hmm, this did find the statement in the =overlay=, but missed everything from =main= (e.g. =reddit.py= and =gdpr.py= should have also triggered the check). + +First, let's check which sources mypy is processing: + +: $ mypy --namespace-packages --strict -p my -v 2>&1 | grep BuildSource +: LOG: Found source: BuildSource(path='/project/overlay/src/my', module='my', has_text=False, base_dir=None) +: LOG: Found source: BuildSource(path='/project/overlay/src/my/twitter', module='my.twitter', has_text=False, base_dir=None) +: LOG: Found source: BuildSource(path='/project/overlay/src/my/twitter/all.py', module='my.twitter.all', has_text=False, base_dir=None) +: LOG: Found source: BuildSource(path='/project/overlay/src/my/twitter/talon.py', module='my.twitter.talon', has_text=False, base_dir=None) + +So seems like mypy is not processing anything from =main= package at all? 
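As an aside before digging further into mypy: the runtime resolution from the Verify step can be reproduced without pip at all, since PEP 420 namespace lookup simply follows sys.path order. A hypothetical check, run from the directory containing both packages:

import sys

sys.path.insert(0, 'main/src')
sys.path.insert(0, 'overlay/src')  # earlier entries win, so overlay shadows main

import my
print(my.__path__)  # expected: .../overlay/src/my first, then .../main/src/my

import my.twitter.all as T
print(T.__file__)  # expected to resolve into overlay/src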
+ +At this point I cloned mypy, put a breakpoint, and found out this is the culprit: https://github.com/python/mypy/blob/1dd8e7fe654991b01bd80ef7f1f675d9e3910c3a/mypy/modulefinder.py#L288 + +This basically returns the first path where it finds =my= package, which happens to be the overlay in this case. +So everything else is ignored? + +It even seems to have a test for a similar usecase, which is quite sad. +https://github.com/python/mypy/blob/1dd8e7fe654991b01bd80ef7f1f675d9e3910c3a/mypy/test/testmodulefinder.py#L64-L71 + +TODO For now I'm going to open an issue in mypy repository and ask why is that the case. + +But ok, maybe mypy treats =main= as an external package somhow but still type checks it properly? +Let's see what's going on with imports: + +: $ mypy --namespace-packages --strict -p my --follow-imports=error +: overlay/src/my/twitter/talon.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str") +: [assignment] +: trigger_mypy_error: str = 123 +: ^ +: overlay/src/my/twitter/all.py:3: error: Import of "my.twitter.common" ignored [misc] +: from .common import merge +: ^ +: overlay/src/my/twitter/all.py:6: error: Import of "my.twitter.gdpr" ignored [misc] +: from . import gdpr +: ^ +: overlay/src/my/twitter/all.py:6: note: (Using --follow-imports=error, module not passed on command line) +: overlay/src/my/twitter/all.py: note: In function "tweets": +: overlay/src/my/twitter/all.py:8: error: Returning Any from function declared to return "List[str]" [no-any-return] +: return merge(gdpr, talon) +: ^ +: Found 4 errors in 2 files (checked 4 source files) + +Nope -- looks like it's completely unawareof =main=, and what's worst, by default (without tweaking =--follow-imports=), these errors would be suppressed. diff --git a/doc/overlays/install_packages.sh b/doc/overlays/install_packages.sh new file mode 100755 index 0000000..3fc38d3 --- /dev/null +++ b/doc/overlays/install_packages.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -eux +pip3 install --user -e overlay/ +pip3 install --user -e main/ diff --git a/doc/overlays/main/setup.py b/doc/overlays/main/setup.py new file mode 100644 index 0000000..51ac55c --- /dev/null +++ b/doc/overlays/main/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup, find_namespace_packages # type: ignore + + +def main() -> None: + pkgs = find_namespace_packages('src') + pkg = min(pkgs) + setup( + name='hpi-main', + zip_safe=False, + packages=pkgs, + package_dir={'': 'src'}, + package_data={pkg: ['py.typed']}, + ) + + +if __name__ == '__main__': + main() diff --git a/doc/overlays/main/src/my/py.typed b/doc/overlays/main/src/my/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/doc/overlays/main/src/my/reddit.py b/doc/overlays/main/src/my/reddit.py new file mode 100644 index 0000000..ae4e481 --- /dev/null +++ b/doc/overlays/main/src/my/reddit.py @@ -0,0 +1,11 @@ +print(f'[main] {__name__} hello') + + +def upvotes() -> list[str]: + return [ + 'reddit upvote1', + 'reddit upvote2', + ] + + +trigger_mypy_error: str = 123 diff --git a/doc/overlays/main/src/my/twitter/all.py b/doc/overlays/main/src/my/twitter/all.py new file mode 100644 index 0000000..feb9fce --- /dev/null +++ b/doc/overlays/main/src/my/twitter/all.py @@ -0,0 +1,7 @@ +print(f'[main] {__name__} hello') + +from .common import merge + +def tweets() -> list[str]: + from . 
import gdpr + return merge(gdpr) diff --git a/doc/overlays/main/src/my/twitter/common.py b/doc/overlays/main/src/my/twitter/common.py new file mode 100644 index 0000000..4121b5b --- /dev/null +++ b/doc/overlays/main/src/my/twitter/common.py @@ -0,0 +1,11 @@ +print(f'[main] {__name__} hello') + +from typing import Protocol + +class Source(Protocol): + def tweets(self) -> list[str]: + ... + +def merge(*sources: Source) -> list[str]: + from itertools import chain + return list(chain.from_iterable(src.tweets() for src in sources)) diff --git a/doc/overlays/main/src/my/twitter/gdpr.py b/doc/overlays/main/src/my/twitter/gdpr.py new file mode 100644 index 0000000..22ea220 --- /dev/null +++ b/doc/overlays/main/src/my/twitter/gdpr.py @@ -0,0 +1,9 @@ +print(f'[main] {__name__} hello') + +def tweets() -> list[str]: + return [ + 'gdpr tweet 1', + 'gdpr tweet 2', + ] + +trigger_mypy_error: str = 123 diff --git a/doc/overlays/overlay/setup.py b/doc/overlays/overlay/setup.py new file mode 100644 index 0000000..aaaa244 --- /dev/null +++ b/doc/overlays/overlay/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup, find_namespace_packages # type: ignore + + +def main() -> None: + pkgs = find_namespace_packages('src') + pkg = min(pkgs) + setup( + name='hpi-overlay', + zip_safe=False, + packages=pkgs, + package_dir={'': 'src'}, + package_data={pkg: ['py.typed']}, + ) + + +if __name__ == '__main__': + main() diff --git a/doc/overlays/overlay/src/my/py.typed b/doc/overlays/overlay/src/my/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/doc/overlays/overlay/src/my/twitter/all.py b/doc/overlays/overlay/src/my/twitter/all.py new file mode 100644 index 0000000..895d84b --- /dev/null +++ b/doc/overlays/overlay/src/my/twitter/all.py @@ -0,0 +1,8 @@ +print(f'[overlay] {__name__} hello') + +from .common import merge + +def tweets() -> list[str]: + from . import gdpr + from . import talon + return merge(gdpr, talon) diff --git a/doc/overlays/overlay/src/my/twitter/talon.py b/doc/overlays/overlay/src/my/twitter/talon.py new file mode 100644 index 0000000..782236c --- /dev/null +++ b/doc/overlays/overlay/src/my/twitter/talon.py @@ -0,0 +1,9 @@ +print(f'[overlay] {__name__} hello') + +def tweets() -> list[str]: + return [ + 'talon tweet 1', + 'talon tweet 2', + ] + +trigger_mypy_error: str = 123 From adbc0e73a26d1db53badc577e17c90f8a483973f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 20 Dec 2023 18:07:34 +0000 Subject: [PATCH 200/302] docs: add note about directly checking overlays with mypy --- doc/OVERLAYS.org | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/doc/OVERLAYS.org b/doc/OVERLAYS.org index 3ac6b07..8580d53 100644 --- a/doc/OVERLAYS.org +++ b/doc/OVERLAYS.org @@ -32,7 +32,7 @@ Consider a toy package/module structure with minimal code, wihout any actual dat # TODO mention resolution? reorder_editable -* Installing +* Installing (editable install) NOTE: this was tested with =python 3.10= and =pip 23.3.2=. @@ -62,7 +62,7 @@ Verify the setup: This basically means that modules will be searched in both paths, with overlay taking precedence. -* Testing +* Testing (editable install) : $ python3 -c 'import my.reddit as R; print(R.upvotes())' : [main] my.reddit hello @@ -84,7 +84,7 @@ As you can see it's merged data from =gdpr= (from =main= package) and =talon= (f So far so good, let's see how it works with mypy. 
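One more note on the toy =common.py= above before the mypy experiments: =merge= leans on structural typing, which is what lets plain modules act as data sources without any registration. A condensed sketch of the same idea:

from itertools import chain
from typing import Protocol

class Source(Protocol):
    def tweets(self) -> list[str]: ...

def merge(*sources: Source) -> list[str]:
    return list(chain.from_iterable(src.tweets() for src in sources))

class FakeSource:  # hypothetical stand-in for a module such as my.twitter.gdpr
    def tweets(self) -> list[str]:
        return ['tweet']

assert merge(FakeSource(), FakeSource()) == ['tweet', 'tweet']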
-* Mypy support +* Mypy support (editable install) To check that mypy works as expected I injected some statements in modules that have no impact on runtime, but should trigger mypy, like this =trigger_mypy_error: str = 123=: @@ -118,7 +118,7 @@ So everything else is ignored? It even seems to have a test for a similar usecase, which is quite sad. https://github.com/python/mypy/blob/1dd8e7fe654991b01bd80ef7f1f675d9e3910c3a/mypy/test/testmodulefinder.py#L64-L71 -TODO For now I'm going to open an issue in mypy repository and ask why is that the case. +For now, I opened an issue in mypy repository https://github.com/python/mypy/issues/16683 But ok, maybe mypy treats =main= as an external package somhow but still type checks it properly? Let's see what's going on with imports: @@ -142,3 +142,39 @@ Let's see what's going on with imports: : Found 4 errors in 2 files (checked 4 source files) Nope -- looks like it's completely unawareof =main=, and what's worst, by default (without tweaking =--follow-imports=), these errors would be suppressed. + +* What if we don't install at all? +Instead of editable install let's try running mypy directly over source files + +First let's only check =main= package: + +: $ MYPYPATH=main/src mypy --namespace-packages --strict -p my +: main/src/my/twitter/gdpr.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str") [assignment] +: trigger_mypy_error: str = 123 +: ^~~ +: main/src/my/reddit.py:11: error: Incompatible types in assignment (expression has type "int", variable has type "str") [assignment] +: trigger_mypy_error: str = 123 +: ^~~ +: Found 2 errors in 2 files (checked 6 source files) + +As expected, it found both errors. + +Now with overlay as well: + +: $ MYPYPATH=overlay/src:main/src mypy --namespace-packages --strict -p my +: overlay/src/my/twitter/all.py:6: note: In module imported here: +: main/src/my/twitter/gdpr.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str") [assignment] +: trigger_mypy_error: str = 123 +: ^~~ +: overlay/src/my/twitter/talon.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str") +: [assignment] +: trigger_mypy_error: str = 123 +: ^~~ +: Found 2 errors in 2 files (checked 4 source files) + +Interesting enough, this is slightly better than the editable install (it detected error in =gdpr.py= as well). +But still no =reddit.py= error. + +TODO possibly worth submitting to mypy issue tracker as well... + +Overall it seems that properly type checking modules in overlays (especially the ones actually overriding/extending base modules) is kinda problematic. 
From a8f8858cb188bf4517cab2338e4ec3894f8fce7a Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 22 Dec 2023 02:46:41 +0000 Subject: [PATCH 201/302] docs: document more experiments with overlays in docs --- doc/OVERLAYS.org | 142 +++++++++++++++++- doc/overlays/overlay2/setup.py | 17 +++ doc/overlays/overlay2/src/my/py.typed | 0 .../overlay2/src/my/twitter/__init__.py | 13 ++ doc/overlays/overlay3/setup.py | 17 +++ doc/overlays/overlay3/src/my/py.typed | 0 doc/overlays/overlay3/src/my/twitter/_hook.py | 9 ++ 7 files changed, 196 insertions(+), 2 deletions(-) create mode 100644 doc/overlays/overlay2/setup.py create mode 100644 doc/overlays/overlay2/src/my/py.typed create mode 100644 doc/overlays/overlay2/src/my/twitter/__init__.py create mode 100644 doc/overlays/overlay3/setup.py create mode 100644 doc/overlays/overlay3/src/my/py.typed create mode 100644 doc/overlays/overlay3/src/my/twitter/_hook.py diff --git a/doc/OVERLAYS.org b/doc/OVERLAYS.org index 8580d53..98687b7 100644 --- a/doc/OVERLAYS.org +++ b/doc/OVERLAYS.org @@ -1,5 +1,7 @@ NOTE this kinda overlaps with [[file:MODULE_DESIGN.org][the module design doc]], should be unified in the future. +Relevant discussion about overlays: https://github.com/karlicoss/HPI/issues/102 + # This is describing TODO # TODO goals # - overrides @@ -62,7 +64,7 @@ Verify the setup: This basically means that modules will be searched in both paths, with overlay taking precedence. -* Testing (editable install) +* Testing runtime behaviour (editable install) : $ python3 -c 'import my.reddit as R; print(R.upvotes())' : [main] my.reddit hello @@ -143,6 +145,30 @@ Let's see what's going on with imports: Nope -- looks like it's completely unawareof =main=, and what's worst, by default (without tweaking =--follow-imports=), these errors would be suppressed. +What if we check =my.twitter= directly? + +: $ mypy --namespace-packages --strict -p my.twitter --follow-imports=error +: overlay/src/my/twitter/talon.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str") +: [assignment] +: trigger_mypy_error: str = 123 +: ^~~ +: overlay/src/my/twitter: error: Ancestor package "my" ignored [misc] +: overlay/src/my/twitter: note: (Using --follow-imports=error, submodule passed on command line) +: overlay/src/my/twitter/all.py:3: error: Import of "my.twitter.common" ignored [misc] +: from .common import merge +: ^ +: overlay/src/my/twitter/all.py:3: note: (Using --follow-imports=error, module not passed on command line) +: overlay/src/my/twitter/all.py:6: error: Import of "my.twitter.gdpr" ignored [misc] +: from . import gdpr +: ^ +: overlay/src/my/twitter/all.py: note: In function "tweets": +: overlay/src/my/twitter/all.py:8: error: Returning Any from function declared to return "list[str]" [no-any-return] +: return merge(gdpr, talon) +: ^~~~~~~~~~~~~~~~~~~~~~~~~ +: Found 5 errors in 3 files (checked 3 source files) + +Now we're also getting =error: Ancestor package "my" ignored [misc]= .. not ideal. + * What if we don't install at all? Instead of editable install let's try running mypy directly over source files @@ -177,4 +203,116 @@ But still no =reddit.py= error. TODO possibly worth submitting to mypy issue tracker as well... -Overall it seems that properly type checking modules in overlays (especially the ones actually overriding/extending base modules) is kinda problematic. 
+Overall it seems that properly type checking HPI setup as a whole is kinda problematic, especially if the modules actually override/extend base modules. + +* Modifying (monkey patching) original module in the overlay +Let's say we want to modify/monkey patch =my.twitter.talon= module from =main=, for example, convert "gdpr" to uppercase, i.e. =tweet.replace('gdpr', 'GDPR')=. + +# TODO see overlay2/ + +I think our options are: + +- symlink to the 'parent' packages, e.g. =main= in the case + + Alternatively, somehow install =main= under a different name/alias (managed by pip). + + This is discussed here: https://github.com/karlicoss/HPI/issues/102 + + The main upside is that it's relatively simple and (sort of works with mypy). + + There are a few big downsides: + - creates a parallel package hierarchy (to the one maintained by pip), symlinks will need to be carefully managed manually + + This may not be such a huge deal if you don't have too many overlays. + However this results in problems if you're trying to switch between two different HPI checkouts (e.g. stable and development). If you have symlinks into "stable" from the overlay then stable modules will sometimes be picked up when you're expecting "development" package. + + - symlinks pointing outside of the source tree might cause pip install to go into infinite loop + + - it modifies the package name + + This may potentially result in some confusing behaviours. + + One thing I noticed for example is that cachew caches might get duplicated. + + - it might not work in all cases or might result in recursive imports + +- do not shadow the original module + + Basically instead of shadowing via namespace package mechanism and creating identically named module, + create some sort of hook that would patch the original =my.twitter.talon= module from =main=. + + The downside is that it's a bit unclear where to do that, we need some sort of entry point? + + - it could be some global dynamic hook defined in the overlay, and then executed from =my.core= + + However, it's a bit intrusive, and unclear how to handle errors. E.g. what if we're monkey patching a module that we weren't intending to use, don't have dependencies installed and it's crashing? + + Perhaps core could support something like =_hook= in each of HPI's modules? + Note that it can't be =my.twitter.all=, since we might want to override =.all= itself. + + The downside is is this probably not going to work well with =tmp_config= and such -- we'll need to somehow execute the hook again on reloading the module? + + - ideally we'd have something that integrates with =importlib= and executed automatically when module is imported? + + TODO explore these: + + - https://stackoverflow.com/questions/43571737/how-to-implement-an-import-hook-that-can-modify-the-source-code-on-the-fly-using + - https://github.com/brettlangdon/importhook + + This one is pretty intrusive, and has some issues, e.g. https://github.com/brettlangdon/importhook/issues/4 + + Let's try it: + : $ PYTHONPATH=overlay3/src:main/src python3 -c 'import my.twitter._hook; import my.twitter.all as M; print(M.tweets())' + : [main] my.twitter.all hello + : [main] my.twitter.common hello + : [main] my.twitter.gdpr hello + : EXECUTING IMPORT HOOK! + : ['GDPR tweet 1', 'GDPR tweet 2'] + + Ok it worked, and seems pretty neat. + However sadly it doesn't work with =tmp_config= (TODO add a proper demo?) + Not sure if it's more of an issue with =tmp_config= implementation (which is very hacky), or =importhook= itself? 
+ + In addition, still the question is where to put the hook itself, but in that case even a global one could be fine. + + - define hook in =my/twitter/__init__.py= + + Basically, use =extend_path= to make it behave like a namespace package, but in addition, patch original =my.twitter.talon=? + + : $ cat overlay2/src/my/twitter/__init__.py + : print(f'[overlay2] {__name__} hello') + : + : from pkgutil import extend_path + : __path__ = extend_path(__path__, __name__) + : + : def hack_gdpr_module() -> None: + : from . import gdpr + : tweets_orig = gdpr.tweets + : def tweets_patched(): + : return [t.replace('gdpr', 'GDPR') for t in tweets_orig()] + : gdpr.tweets = tweets_patched + : + : hack_gdpr_module() + + This actually seems to work?? + + : PYTHONPATH=overlay2/src:main/src python3 -c 'import my.twitter.all as M; print(M.tweets())' + : [overlay2] my.twitter hello + : [main] my.twitter.gdpr hello + : [main] my.twitter.all hello + : [main] my.twitter.common hello + : ['GDPR tweet 1', 'GDPR tweet 2'] + + However, this doesn't stack, i.e. if the 'parent' overlay had its own =__init__.py=, it won't get called. + +- shadow the original module and temporarily modify =__path__= before importing the same module from the parent overlay + + This approach is implemented in =my.core.experimental.import_original_module= + + TODO demonstrate it properly, but I think that also works in a 'chain' of overlays + + Seems like that option is the most promising so far, albeit very hacky. + +Note that none of these options work well with mypy (since it's all dynamic hackery), even if you disregard the issues described in the previous sections. + +# TODO .pkg files? somewhat interesting... https://github.com/python/cpython/blob/3.12/Lib/pkgutil.py#L395-L410 diff --git a/doc/overlays/overlay2/setup.py b/doc/overlays/overlay2/setup.py new file mode 100644 index 0000000..e34e7de --- /dev/null +++ b/doc/overlays/overlay2/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup, find_namespace_packages # type: ignore + + +def main() -> None: + pkgs = find_namespace_packages('src') + pkg = min(pkgs) + setup( + name='hpi-overlay2', + zip_safe=False, + packages=pkgs, + package_dir={'': 'src'}, + package_data={pkg: ['py.typed']}, + ) + + +if __name__ == '__main__': + main() diff --git a/doc/overlays/overlay2/src/my/py.typed b/doc/overlays/overlay2/src/my/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/doc/overlays/overlay2/src/my/twitter/__init__.py b/doc/overlays/overlay2/src/my/twitter/__init__.py new file mode 100644 index 0000000..9c5674f --- /dev/null +++ b/doc/overlays/overlay2/src/my/twitter/__init__.py @@ -0,0 +1,13 @@ +print(f'[overlay2] {__name__} hello') + +from pkgutil import extend_path +__path__ = extend_path(__path__, __name__) + +def hack_gdpr_module() -> None: + from . 
import gdpr + tweets_orig = gdpr.tweets + def tweets_patched(): + return [t.replace('gdpr', 'GDPR') for t in tweets_orig()] + gdpr.tweets = tweets_patched + +hack_gdpr_module() diff --git a/doc/overlays/overlay3/setup.py b/doc/overlays/overlay3/setup.py new file mode 100644 index 0000000..106a115 --- /dev/null +++ b/doc/overlays/overlay3/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup, find_namespace_packages # type: ignore + + +def main() -> None: + pkgs = find_namespace_packages('src') + pkg = min(pkgs) + setup( + name='hpi-overlay3', + zip_safe=False, + packages=pkgs, + package_dir={'': 'src'}, + package_data={pkg: ['py.typed']}, + ) + + +if __name__ == '__main__': + main() diff --git a/doc/overlays/overlay3/src/my/py.typed b/doc/overlays/overlay3/src/my/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/doc/overlays/overlay3/src/my/twitter/_hook.py b/doc/overlays/overlay3/src/my/twitter/_hook.py new file mode 100644 index 0000000..ce249ae --- /dev/null +++ b/doc/overlays/overlay3/src/my/twitter/_hook.py @@ -0,0 +1,9 @@ +import importhook + +@importhook.on_import('my.twitter.gdpr') +def on_import(gdpr): + print("EXECUTING IMPORT HOOK!") + tweets_orig = gdpr.tweets + def tweets_patched(): + return [t.replace('gdpr', 'GDPR') for t in tweets_orig()] + gdpr.tweets = tweets_patched From 3d75abafe96c6ff10563a7c0f96cf3eed7455410 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 23 Dec 2023 02:33:23 +0000 Subject: [PATCH 202/302] my.twitter.android: some initial work on parsing sqlite databases from official Android app --- my/twitter/android.py | 93 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 my/twitter/android.py diff --git a/my/twitter/android.py b/my/twitter/android.py new file mode 100644 index 0000000..ac834df --- /dev/null +++ b/my/twitter/android.py @@ -0,0 +1,93 @@ +""" +Data from offficial app for Android +""" +from struct import unpack_from, calcsize + +from my.core.sqlite import sqlite_connect_immutable + + +def _parse_content(data: bytes): + pos = 0 + + def skip(count: int) -> None: + nonlocal pos + pos += count + + def getstring(slen: int) -> str: + if slen == 1: + lfmt = '>B' + elif slen == 2: + lfmt = '>H' + else: + raise RuntimeError + + (sz,) = unpack_from(lfmt, data, offset=pos) + skip(slen) + assert sz > 0 + assert sz <= 10000 # sanity check? + + # soo, this is how it should ideally work: + # (ss,) = unpack_from(f'{sz}s', data, offset=pos) + # skip(sz) + # however sometimes there is a discrepancy between string length in header and actual length (if you stare at the data) + # example is 1725868458246570412 + # wtf??? (see logging below) + + # ughhhh + seps = [ + b'I\x08', + b'I\x09', + ] + sep_idxs = [data[pos:].find(sep) for sep in seps] + sep_idxs = [i for i in sep_idxs if i != -1] + assert len(sep_idxs) > 0 + sep_idx = min(sep_idxs) + + # print("EXPECTED LEN", sz, "GOT", sep_idx, "DIFF", sep_idx - sz) + + zz = data[pos : pos + sep_idx] + return zz.decode('utf8') + + skip(2) # always starts with 4a03? + + (xx,) = unpack_from('B', data, offset=pos) + skip(1) + # print("TYPE:", xx) + + # wtf is this... maybe it's a bitmask? + slen = { + 66 : 1, + 67 : 2, + 106: 1, + 107: 2, + }[xx] + + try: + print(getstring(slen=slen)) + finally: + pass + # print(data[pos:]) + + +PATH_TO_DB = '/path/to/db' + + +with sqlite_connect_immutable(PATH_TO_DB) as db: + # TODO use statuses table instead? + # has r_ent_content?? + # TODO hmm r_ent_content contains expanded urls? + # but they are still ellipsized? e.g. 
you can check 1692905005479580039 + # TODO also I think content table has mappings from short urls to full, need to extract + for (tid, blob, blob2) in db.execute( + f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1', + ): + if blob is None: # TODO exclude in sql query? + continue + print("----") + try: + print("PARSING", tid) + _parse_content(blob) + # _parse_content(blob2) + except UnicodeDecodeError as ue: + raise ue + # print("DECODING ERROR FOR ", tid, ue.object) From a4a7bc41b90e3968dfa39b31712f6fb9edc74570 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 23 Dec 2023 23:14:21 +0000 Subject: [PATCH 203/302] my.twitter.android: extract entities --- my/twitter/android.py | 52 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/my/twitter/android.py b/my/twitter/android.py index ac834df..dbb4946 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -1,6 +1,7 @@ """ Data from offficial app for Android """ +import re from struct import unpack_from, calcsize from my.core.sqlite import sqlite_connect_immutable @@ -46,6 +47,7 @@ def _parse_content(data: bytes): # print("EXPECTED LEN", sz, "GOT", sep_idx, "DIFF", sep_idx - sz) zz = data[pos : pos + sep_idx] + skip(sep_idx) return zz.decode('utf8') skip(2) # always starts with 4a03? @@ -62,11 +64,51 @@ def _parse_content(data: bytes): 107: 2, }[xx] - try: - print(getstring(slen=slen)) - finally: - pass - # print(data[pos:]) + text = getstring(slen=slen) + + # after the main tweet text it contains entities (e.g. shortened urls) + # however couldn't reverse engineer the schema properly, the links are kinda all over the place + + # TODO this also contains image alt descriptions? + # see 1665029077034565633 + + extracted = [] + linksep = 0x6a + while True: + m = re.search(b'\x6a.http', data[pos:]) + if m is None: + break + + qq = m.start() + pos += qq + + while True: + if data[pos] != linksep: + break + pos += 1 + (sz,) = unpack_from('B', data, offset=pos) + pos += 1 + (ss,) = unpack_from(f'{sz}s', data, offset=pos) + pos += sz + extracted.append(ss) + + replacements = {} + i = 0 + while i < len(extracted): + if b'https://t.co/' in extracted[i]: + key = extracted[i].decode('utf8') + value = extracted[i + 1].decode('utf8') + i += 2 + replacements[key] = value + else: + i += 1 + + for k, v in replacements.items(): + text = text.replace(k, v) + assert 'https://t.co/' not in text # make sure we detected all links + + print(text) + PATH_TO_DB = '/path/to/db' From 51209c547e7a756d4e21b02512a875a474a26c28 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sun, 24 Dec 2023 00:06:29 +0000 Subject: [PATCH 204/302] my.twitter.android: refactor into a proper module for now only extracting bookmarks, will use it for some time and see how it goes --- my/config.py | 2 + my/core/common.py | 1 + my/twitter/android.py | 119 +++++++++++++++++++++++++++++++++--------- 3 files changed, 96 insertions(+), 26 deletions(-) diff --git a/my/config.py b/my/config.py index ac44f41..e9b0ec8 100644 --- a/my/config.py +++ b/my/config.py @@ -177,6 +177,8 @@ class twitter_archive: class twitter: class talon: export_path: Paths + class android: + export_path: Paths class twint: diff --git a/my/core/common.py b/my/core/common.py index f1441a9..c429c8c 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -686,6 +686,7 @@ def unique_everseen( if key is None: # todo check key return type as well? 
but it's more likely to be hashable if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None: + # TODO return better error here, e.g. if there is no return type it crashes _check_all_hashable(fun) return more_itertools.unique_everseen(iterable=iterable, key=key) diff --git a/my/twitter/android.py b/my/twitter/android.py index dbb4946..be411e3 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -1,13 +1,49 @@ """ -Data from offficial app for Android +Twitter data from offficial app for Android """ -import re -from struct import unpack_from, calcsize +from __future__ import annotations +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +import re +from struct import unpack_from +from typing import Iterator, Sequence + +from my.core import datetime_aware, get_files, LazyLogger, Paths, Res +from my.core.common import unique_everseen from my.core.sqlite import sqlite_connect_immutable +import my.config -def _parse_content(data: bytes): +from .common import permalink + +logger = LazyLogger(__name__) + + +@dataclass +class config(my.config.twitter.android): + # paths[s]/glob to the exported sqlite databases + export_path: Paths + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + + +@dataclass(unsafe_hash=True) +class Tweet: + id_str: str + created_at: datetime_aware + screen_name: str + text: str + + @property + def permalink(self) -> str: + return permalink(screen_name=self.screen_name, id=self.id_str) + + +def _parse_content(data: bytes) -> str: pos = 0 def skip(count: int) -> None: @@ -107,29 +143,60 @@ def _parse_content(data: bytes): text = text.replace(k, v) assert 'https://t.co/' not in text # make sure we detected all links - print(text) + return text - -PATH_TO_DB = '/path/to/db' +def _process_one(f: Path) -> Iterator[Res[Tweet]]: + with sqlite_connect_immutable(f) as db: + # NOTE: + # - it also has statuses_r_ent_content which has entities' links replaced + # but they are still ellipsized (e.g. check 1692905005479580039) + # so let's just uses statuses_content + # - there is also timeline_created_at, but they look like made up timestamps + # don't think they represent bookmarking time + # - not sure what's timeline_type? + # seems like 30 means bookmarks? + # there is one tweet with timeline type 18, but it has timeline_is_preview=1 + for ( + tweet_id, + user_name, + user_username, + created_ms, + blob, + ) in db.execute( + ''' + SELECT + statuses_status_id, + users_name, + users_username, + statuses_created, + CAST(statuses_content AS BLOB) + FROM timeline_view + WHERE statuses_bookmarked = 1 + ORDER BY timeline_sort_index DESC + ''', + ): + if blob is None: # TODO exclude in sql query? + continue + yield Tweet( + id_str=tweet_id, + # TODO double check it's utc? + created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc), + screen_name=user_username, + text=_parse_content(blob), + ) -with sqlite_connect_immutable(PATH_TO_DB) as db: - # TODO use statuses table instead? - # has r_ent_content?? - # TODO hmm r_ent_content contains expanded urls? - # but they are still ellipsized? e.g. you can check 1692905005479580039 - # TODO also I think content table has mappings from short urls to full, need to extract - for (tid, blob, blob2) in db.execute( - f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1', - ): - if blob is None: # TODO exclude in sql query? 
- continue - print("----") - try: - print("PARSING", tid) - _parse_content(blob) - # _parse_content(blob2) - except UnicodeDecodeError as ue: - raise ue - # print("DECODING ERROR FOR ", tid, ue.object) +def bookmarks() -> Iterator[Res[Tweet]]: + # TODO might need to sort by timeline_sort_index again? + # not sure if each database contains full history of bookmarks (likely not!) + def it() -> Iterator[Res[Tweet]]: + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + yield from _process_one(path) + + # TODO hmm maybe unique_everseen should be a decorator? + return unique_everseen(it) From 1c452b12d44e8ac79fd9cd3b2590b4dae61c2fba Mon Sep 17 00:00:00 2001 From: karlicoss Date: Thu, 28 Dec 2023 00:04:49 +0000 Subject: [PATCH 205/302] twitter.android: extract likes and own tweets as well --- my/twitter/android.py | 91 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/my/twitter/android.py b/my/twitter/android.py index be411e3..a05d15a 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -1,5 +1,5 @@ """ -Twitter data from offficial app for Android +Twitter data from official app for Android """ from __future__ import annotations @@ -28,6 +28,9 @@ class config(my.config.twitter.android): def inputs() -> Sequence[Path]: + # NOTE: individual databases are very patchy. + # e.g. some contain hundreds of my bookmarks, whereas other contain just a few + # good motivation for synthetic exports return get_files(config.export_path) @@ -146,17 +149,48 @@ def _parse_content(data: bytes) -> str: return text -def _process_one(f: Path) -> Iterator[Res[Tweet]]: +# NOTE: +# - it also has statuses_r_ent_content which has entities' links replaced +# but they are still ellipsized (e.g. check 1692905005479580039) +# so let's just uses statuses_content +# - there is also timeline_created_at, but they look like made up timestamps +# don't think they represent bookmarking time +# - timeline_type +# 7, 8, 9: some sort of notifications or cursors, should exclude +# 17: ??? some cursors but also tweets +# 18: ??? relatively few, maybe 20 of them, also they all have timeline_is_preview=1? +# most of them have our own id as timeline_sender? +# I think it might actually be 'replies' tab -- also contains some retweets etc +# 26: ??? very low sort index +# 28: weird, contains lots of our own tweets, but also a bunch of unrelated.. +# 29: seems like it contains the favorites! +# 30: seems like it contains the bookmarks +# 34: contains some tweets -- not sure.. +# 63: contains the bulk of data +# 69: ??? just a few tweets +# - timeline_data_type +# 1 : the bulk of tweets, but also some notifications etc?? +# 2 : who-to-follow/community-to-join. contains a couple of tweets, but their corresponding status_id is NULL +# 8 : who-to-follow/notfication +# 13: semantic-core/who-to-follow +# 14: cursor +# 17: trends +# 27: notification +# 31: some superhero crap +# 37: semantic-core +# 42: community-to-join +# - timeline_entity_type +# 1 : contains the bulk of data -- either tweet-*/promoted-tweet-*. However some notification-* and some just contain raw ids?? +# 11: some sort of 'superhero-superhero' crap +# 13: always cursors +# 15: tweet-*/tweet:*/home-conversation-*/trends-*/and lots of other crap +# 31: always notification-* +# - timeline_data_type_group +# 0 : tweets? +# 6 : always notifications?? 
+# 42: tweets (bulk of them) +def _process_one(f: Path, *, where: str) -> Iterator[Res[Tweet]]: with sqlite_connect_immutable(f) as db: - # NOTE: - # - it also has statuses_r_ent_content which has entities' links replaced - # but they are still ellipsized (e.g. check 1692905005479580039) - # so let's just uses statuses_content - # - there is also timeline_created_at, but they look like made up timestamps - # don't think they represent bookmarking time - # - not sure what's timeline_type? - # seems like 30 means bookmarks? - # there is one tweet with timeline type 18, but it has timeline_is_preview=1 for ( tweet_id, user_name, @@ -164,7 +198,7 @@ def _process_one(f: Path) -> Iterator[Res[Tweet]]: created_ms, blob, ) in db.execute( - ''' + f''' SELECT statuses_status_id, users_name, @@ -172,12 +206,14 @@ def _process_one(f: Path) -> Iterator[Res[Tweet]]: statuses_created, CAST(statuses_content AS BLOB) FROM timeline_view - WHERE statuses_bookmarked = 1 + WHERE timeline_data_type == 1 /* the only one containing tweets (among with some other stuff) */ + AND timeline_data_type_group != 6 /* excludes notifications (some of them even have statuses_bookmarked == 1) */ + AND {where} ORDER BY timeline_sort_index DESC ''', + # TODO not sure about timeline_sort_index for favorites ): - if blob is None: # TODO exclude in sql query? - continue + assert blob is not None # just in case, but should be filtered by the sql query yield Tweet( id_str=tweet_id, # TODO double check it's utc? @@ -186,17 +222,34 @@ def _process_one(f: Path) -> Iterator[Res[Tweet]]: text=_parse_content(blob), ) - -def bookmarks() -> Iterator[Res[Tweet]]: +def _entities(*, where: str) -> Iterator[Res[Tweet]]: # TODO might need to sort by timeline_sort_index again? - # not sure if each database contains full history of bookmarks (likely not!) def it() -> Iterator[Res[Tweet]]: paths = inputs() total = len(paths) width = len(str(total)) for idx, path in enumerate(paths): logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') - yield from _process_one(path) + yield from _process_one(path, where=where) # TODO hmm maybe unique_everseen should be a decorator? return unique_everseen(it) + +def bookmarks() -> Iterator[Res[Tweet]]: + # NOTE: in principle we get the bulk of bookmarks via timeline_type == 30 filter + # however we still might miss on a few (I think the timeline_type 30 only refreshes when you enter bookmarks in the app) + # if you bookmarked in the home feed, it might end up as status_bookmarked == 1 but not necessarily as timeline_type 30 + return _entities(where='statuses_bookmarked == 1') + +def likes() -> Iterator[Res[Tweet]]: + # NOTE: similarly to bookmarks, we could use timeline_type == 29, but it's only refreshed if we actually open likes tab + return _entities(where='statuses_favorited == 1') + +def tweets() -> Iterator[Res[Tweet]]: + # NOTE: seemed like the only way to distinguish our own user reliably? 
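Zooming out from the SQL for a second: the accessors here all funnel through the same snapshot-merging pattern, where every sqlite export is processed in order and unique_everseen collapses the overlap between snapshots. The pattern in isolation, with plain ints standing in for tweets:

from my.core.common import unique_everseen

snapshots = [[1, 2, 3], [2, 3, 4], [4, 5]]  # stand-ins for per-database results

def it():
    for snap in snapshots:
        yield from snap

# keeps the first occurrence of each (hashable) item, preserving order
assert list(unique_everseen(it)) == [1, 2, 3, 4, 5]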
+ # could also try matching on users._id == 1, but not sure if it's guaranteed + select_self_user_id = 'SELECT user_id FROM users WHERE extended_profile_fields IS NOT NULL' + + # NOTE: where timeline_type == 18 covers quite a few of our on tweets, but not everything + # querying by our own user id seems the most exhaustive + return _entities(where=f'timeline_sender_id == ({select_self_user_id})') From a0ce666024b0c96a1a36741ba50fc36cbc687183 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Thu, 28 Dec 2023 00:13:01 +0000 Subject: [PATCH 206/302] my.youtube.takeout: fix exception handling --- my/youtube/takeout.py | 1 + 1 file changed, 1 insertion(+) diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index 79b4549..8fe8f2c 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -37,6 +37,7 @@ def watched() -> Iterable[Res[Watched]]: for e in events(): if isinstance(e, Exception): yield e + continue if not isinstance(e, Activity): continue From 3ec362fce90ae9f8d5b6a9b248c07168ba194089 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 28 Dec 2023 18:07:57 +0000 Subject: [PATCH 207/302] fbmessenger.android: expose contacts --- my/fbmessenger/android.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index fa313ea..ecf0eb0 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -111,12 +111,18 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: senders: Dict[str, Sender] = {} for r in db.execute('SELECT CAST(id AS TEXT) AS id, name FROM contacts'): s = Sender( - id=r['id'], + id=r['id'], # looks like it's server id? same used on facebook site name=r['name'], ) + # NOTE https://www.messenger.com/t/{contant_id} for permalink senders[s.id] = s yield s + # TODO what is fb transport?? + # TODO what are client_contacts?? has pk or something + + # TODO client_threads/client_messages -- possibly for end to end encryption or something? + # TODO can we get it from db? could infer as the most common id perhaps? 
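On the one-line youtube fix above: it is the standard Res idiom in these modules, where errors are yielded as values and the loop must continue to the next item afterwards, otherwise the checks past the yield run against an Exception. A distilled sketch:

from typing import Iterator, Union

def events() -> Iterator[Union[Exception, int]]:
    yield 1
    yield RuntimeError('bad item')
    yield 2

def watched() -> Iterator[Union[Exception, int]]:
    for e in events():
        if isinstance(e, Exception):
            yield e
            continue  # without this, the processing below would also see the error
        yield e * 10

assert [x for x in watched() if not isinstance(x, Exception)] == [10, 20]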
self_id = config.facebook_id thread_users: Dict[str, List[Sender]] = {} @@ -237,6 +243,15 @@ def _process_db_threads_db2(db: sqlite3.Connection) -> Iterator[Res[Entity]]: ) +def contacts() -> Iterator[Res[Sender]]: + for x in unique_everseen(_entities): + if isinstance(x, Exception): + yield x + continue + if isinstance(x, Sender): + yield x + + def messages() -> Iterator[Res[Message]]: senders: Dict[str, Sender] = {} msgs: Dict[str, Message] = {} From 1b187b2c1b35544462eb674052279373e2c88971 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 29 Dec 2023 00:53:14 +0000 Subject: [PATCH 208/302] whatsapp.android: expose all entities extracted from the db --- my/whatsapp/android.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index 295d831..3dfed3e 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path import sqlite3 -from typing import Sequence, Iterator, Optional +from typing import Union, Sequence, Iterator, Optional from my.core import get_files, Paths, datetime_aware, Res, make_logger, make_config from my.core.common import unique_everseen @@ -56,7 +56,10 @@ class Message: text: Optional[str] -def _process_db(db: sqlite3.Connection): +Entity = Union[Chat, Sender, Message] + + +def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: # TODO later, split out Chat/Sender objects separately to safe on object creation, similar to other android data sources chats = {} @@ -73,6 +76,7 @@ def _process_db(db: sqlite3.Connection): id=chat_id, name=subject, ) + yield chat chats[chat.id] = chat senders = {} @@ -88,6 +92,7 @@ def _process_db(db: sqlite3.Connection): id=r['raw_string'], name=None, ) + yield s senders[r['_id']] = s # NOTE: hmm, seems that message_view or available_message_view use lots of NULL as ... @@ -187,7 +192,7 @@ def _process_db(db: sqlite3.Connection): yield m -def _messages() -> Iterator[Res[Message]]: +def _entities() -> Iterator[Res[Entity]]: paths = inputs() total = len(paths) width = len(str(total)) @@ -200,5 +205,14 @@ def _messages() -> Iterator[Res[Message]]: yield echain(RuntimeError(f'While processing {path}'), cause=e) +def entities() -> Iterator[Res[Entity]]: + return unique_everseen(_entities) + + def messages() -> Iterator[Res[Message]]: - yield from unique_everseen(_messages) + # TODO hmm, specify key=lambda m: m.id? + # not sure since might be useful to keep track of sender changes etc + # probably best not to, or maybe query messages/senders separately and merge later? 
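Since entities() is exposed, narrowed accessors all take the same shape as the messages() filter just below; for instance, a hypothetical chats() accessor for this module (all names imported or defined earlier in the file) would be:

def chats() -> Iterator[Res[Chat]]:
    for e in entities():
        if isinstance(e, (Exception, Chat)):
            yield e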
+ for e in entities(): + if isinstance(e, (Exception, Message)): + yield e From 93e475795dabd1ff366c8f1e5993cf566eed5252 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Dec 2023 16:52:14 -0800 Subject: [PATCH 209/302] google takeout: support multiple locales uses the known locales in google_takeout_parser to determine the expected paths for each locale, and performs a partial match on the paths to detect and use match_structure --- my/google/takeout/parser.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 96acfff..2322ef0 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -64,13 +64,19 @@ def inputs() -> Sequence[Path]: return get_files(config.takeout_path) -EXPECTED = ( - "My Activity", - "Chrome", - "Location History", - "Youtube", - "YouTube and YouTube Music", -) +try: + from google_takeout_parser.locales.main import get_paths_for_functions + + EXPECTED = tuple(get_paths_for_functions()) + +except ImportError: + EXPECTED = ( + "My Activity", + "Chrome", + "Location History", + "Youtube", + "YouTube and YouTube Music", + ) google_takeout_version = str(getattr(google_takeout_parser, '__version__', 'unknown')) From 87a8a7781bfb335dfc139a62ca0e04bb8045087f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 1 Jan 2024 23:15:43 +0000 Subject: [PATCH 210/302] my.google.maps: initial module for extracting places data from Android app --- my/config.py | 4 + my/google/maps/_android_protobuf.py | 113 ++++++++++++++++ my/google/maps/android.py | 202 ++++++++++++++++++++++++++++ tox.ini | 1 + 4 files changed, 320 insertions(+) create mode 100644 my/google/maps/_android_protobuf.py create mode 100644 my/google/maps/android.py diff --git a/my/config.py b/my/config.py index e9b0ec8..a92b2bc 100644 --- a/my/config.py +++ b/my/config.py @@ -68,6 +68,10 @@ class pinboard: export_dir: Paths = '' class google: + class maps: + class android: + export_path: Paths = '' + takeout_path: Paths = '' diff --git a/my/google/maps/_android_protobuf.py b/my/google/maps/_android_protobuf.py new file mode 100644 index 0000000..1d43ae0 --- /dev/null +++ b/my/google/maps/_android_protobuf.py @@ -0,0 +1,113 @@ +from my.core import __NOT_HPI_MODULE__ + +# NOTE: this tool was quite useful https://github.com/aj3423/aproto + +from google.protobuf import descriptor_pool, descriptor_pb2, message_factory + +TYPE_STRING = descriptor_pb2.FieldDescriptorProto.TYPE_STRING +TYPE_BYTES = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES +TYPE_UINT64 = descriptor_pb2.FieldDescriptorProto.TYPE_UINT64 +TYPE_MESSAGE = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE + +OPTIONAL = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL +REQUIRED = descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED + + +def get_place_protos(): + f1 = descriptor_pb2.DescriptorProto(name='xf1') + # TODO 2 -> 5 is address? 2 -> 6 is a pair of coordinates + f1.field.add(name='title', number=3, type=TYPE_STRING, label=REQUIRED) + f1.field.add(name='note' , number=4, type=TYPE_STRING, label=OPTIONAL) + # TODO what's the difference between required and optional? doesn't impact decoding? 
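To make the dynamic-descriptor machinery concrete: the same plumbing used by make_parser below can also round-trip a message, which is a handy way to sanity-check a guessed schema against raw blobs. A self-contained sketch (field number 3 mirrors the 'title' guess above):

from google.protobuf import descriptor_pb2, descriptor_pool, message_factory

TYPE_STRING = descriptor_pb2.FieldDescriptorProto.TYPE_STRING
REQUIRED = descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED

msg_proto = descriptor_pb2.DescriptorProto(name='Demo')
msg_proto.field.add(name='title', number=3, type=TYPE_STRING, label=REQUIRED)

fdp = descriptor_pb2.FileDescriptorProto(name='demo.proto', package='demo')
fdp.message_type.add().CopyFrom(msg_proto)

pool = descriptor_pool.DescriptorPool()
pool.Add(fdp)
Demo = message_factory.MessageFactory(pool).GetPrototype(pool.FindMessageTypeByName('demo.Demo'))

blob = Demo(title='hello').SerializeToString()  # b'\x1a\x05hello': tag (3<<3)|2, then length, then bytes
assert Demo.FromString(blob).title == 'hello'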
+ + ts = descriptor_pb2.DescriptorProto(name='Timestamp') + ts.field.add(name='seconds', number=1, type=TYPE_UINT64, label=REQUIRED) + ts.field.add(name='nanos' , number=2, type=TYPE_UINT64, label=REQUIRED) + + f1.field.add(name='created', number=10 ,type=TYPE_MESSAGE, label=REQUIRED, type_name=ts.name) + f1.field.add(name='updated', number=11 ,type=TYPE_MESSAGE, label=REQUIRED, type_name=ts.name) + + f2 = descriptor_pb2.DescriptorProto(name='xf2') + f2.field.add(name='addr1', number=2, type=TYPE_STRING, label=REQUIRED) + f2.field.add(name='addr2', number=3, type=TYPE_STRING, label=REQUIRED) + f2.field.add(name='f21' , number=4, type=TYPE_BYTES , label=REQUIRED) + f2.field.add(name='f22' , number=5, type=TYPE_UINT64, label=REQUIRED) + f2.field.add(name='f23' , number=6, type=TYPE_STRING, label=REQUIRED) + # NOTE: this also contains place ID + + f3 = descriptor_pb2.DescriptorProto(name='xf3') + # NOTE: looks like it's the same as 'updated' from above?? + f3.field.add(name='f31', number=1, type=TYPE_UINT64, label=OPTIONAL) + + descriptor_proto = descriptor_pb2.DescriptorProto(name='PlaceParser') + descriptor_proto.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=f1.name) + descriptor_proto.field.add(name='f2', number=2, type=TYPE_MESSAGE, label=REQUIRED, type_name=f2.name) + descriptor_proto.field.add(name='f3', number=3, type=TYPE_MESSAGE, label=OPTIONAL, type_name=f3.name) + descriptor_proto.field.add(name='f4', number=4, type=TYPE_STRING , label=OPTIONAL) + # NOTE: f4 is the list id + + return [descriptor_proto, ts, f1, f2, f3] + + +def get_labeled_protos(): + address = descriptor_pb2.DescriptorProto(name='address') + # 1: address + # 2: parts of address (multiple) + # 3: full address + address.field.add(name='full', number=3, type=TYPE_STRING, label=REQUIRED) + + main = descriptor_pb2.DescriptorProto(name='LabeledParser') + # field 1 contains item type and item id + main.field.add(name='title' , number=3, type=TYPE_STRING , label=REQUIRED) + main.field.add(name='address', number=5, type=TYPE_MESSAGE, label=OPTIONAL, type_name=address.name) + + return [main, address] + + +def get_list_protos(): + f1 = descriptor_pb2.DescriptorProto(name='xf1') + f1.field.add(name='name', number=5, type=TYPE_STRING, label=REQUIRED) + + main = descriptor_pb2.DescriptorProto(name='ListParser') + main.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=f1.name) + main.field.add(name='f2', number=2, type=TYPE_STRING , label=REQUIRED) + + return [main, f1] + + +def make_parser(main, *extras): + file_descriptor_proto = descriptor_pb2.FileDescriptorProto(name='dynamic.proto', package='dynamic_package') + for proto in [main, *extras]: + file_descriptor_proto.message_type.add().CopyFrom(proto) + + pool = descriptor_pool.DescriptorPool() + file_descriptor = pool.Add(file_descriptor_proto) + + message_descriptor = pool.FindMessageTypeByName(f'{file_descriptor_proto.package}.{main.name}') + factory = message_factory.MessageFactory(pool) + dynamic_message_class = factory.GetPrototype(message_descriptor) + + return dynamic_message_class + + +place_parser_class = make_parser(*get_place_protos()) +labeled_parser_class = make_parser(*get_labeled_protos()) +list_parser_class = make_parser(*get_list_protos()) + + +def parse_place(blob: bytes): + m = place_parser_class() + m.ParseFromString(blob) + return m + + +def parse_labeled(blob: bytes): + m = labeled_parser_class() + m.ParseFromString(blob) + return m + + +def parse_list(blob: bytes): + msg = list_parser_class() + 
msg.ParseFromString(blob)
+    return msg
diff --git a/my/google/maps/android.py b/my/google/maps/android.py
new file mode 100644
index 0000000..279231a
--- /dev/null
+++ b/my/google/maps/android.py
@@ -0,0 +1,202 @@
+"""
+Extracts data from the official Google Maps app for Android (uses gmm_sync.db for now)
+"""
+from __future__ import annotations
+
+REQUIRES = [
+    "protobuf",  # for parsing blobs from the database
+]
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Iterator, Optional, Sequence
+from urllib.parse import quote
+
+from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
+from my.core.common import unique_everseen
+from my.core.sqlite import sqlite_connection
+
+import my.config
+
+from ._android_protobuf import parse_labeled, parse_list, parse_place
+
+
+logger = LazyLogger(__name__)
+
+
+@dataclass
+class config(my.config.google.maps.android):
+    # path[s]/glob to the exported sqlite databases
+    export_path: Paths
+
+
+def inputs() -> Sequence[Path]:
+    # TODO not sure if need to use all dbs? possibly the last one contains everything?
+    return get_files(config.export_path)
+
+
+PlaceId = str
+ListId = str
+ListName = str
+
+
+@dataclass(eq=True, frozen=True)
+class Location:
+    lat: float
+    lon: float
+
+    @property
+    def url(self) -> str:
+        return f'https://maps.google.com/?q={self.lat},{self.lon}'
+
+
+@dataclass(unsafe_hash=True)
+class Place:
+    id: PlaceId
+    list_name: ListName  # TODO maybe best to keep list id?
+    created_at: datetime_aware  # TODO double check it's utc?
+    updated_at: datetime_aware  # TODO double check it's utc?
+    title: str
+    location: Location
+    address: Optional[str]
+    note: Optional[str]
+
+    @property
+    def place_url(self) -> str:
+        title = quote(self.title)
+        return f'https://www.google.com/maps/place/{title}/data=!4m2!3m1!1s{self.id}'
+
+    @property
+    def location_url(self) -> str:
+        return self.location.url
+
+
+def _process_one(f: Path):
+    with sqlite_connection(f, row_factory='row') as conn:
+        msg: Any
+
+        lists: dict[ListId, ListName] = {}
+        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 13'):  # 13 looks like lists (e.g. saved/favorited etc)
+            server_id = row['server_id']
+
+            if server_id is None:
+                # this is the case for Travel plans, Followed places, Offers
+                # todo alternatively could use string_index column instead maybe?
+                continue
+
+            blob = row['item_proto']
+            msg = parse_list(blob)
+            name = msg.f1.name
+            lists[server_id] = name
+
+        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 11'):  # this looks like 'Labeled' list
+            ts = row['timestamp'] / 1000
+            created = datetime.fromtimestamp(ts, tz=timezone.utc)
+
+            server_id = row['server_id']
+            [item_type, item_id] = server_id.split(':')
+            if item_type != '3':
+                # the ones that are not 3 are home/work address?
+                continue
+
+            blob = row['item_proto']
+            msg = parse_labeled(blob)
+            address = msg.address.full
+            if address == '':
+                address = None
+
+            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)
+
+            yield Place(
+                id=item_id,
+                list_name='Labeled',
+                created_at=created,
+                updated_at=created,  # doesn't look like it has 'updated'?
+                title=msg.title,
+                location=location,
+                address=address,
+                note=None,  # don't think these allow notes
+            )
+
+        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14'):  # this looks like actual individual places
+            server_id = row['server_id']
+            [list_id, _, id1, id2] = server_id.split(':')
+            item_id = f'{id1}:{id2}'
+
+            list_name = lists[list_id]
+
+            blob = row['item_proto']
+            msg = parse_place(blob)
+            title = msg.f1.title
+            note = msg.f1.note
+            if note == '':  # seems that protobuf does that?
+                note = None
+
+            # TODO double check timezone
+            created = datetime.fromtimestamp(msg.f1.created.seconds, tz=timezone.utc).replace(microsecond=msg.f1.created.nanos // 1000)
+
+            # NOTE: this one seems to be the same as row['timestamp']
+            updated = datetime.fromtimestamp(msg.f1.updated.seconds, tz=timezone.utc).replace(microsecond=msg.f1.updated.nanos // 1000)
+
+            address = msg.f2.addr1  # NOTE: there is also addr2, but they seem identical :shrug:
+            if address == '':
+                address = None
+
+            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)
+
+            place = Place(
+                id=item_id,
+                list_name=list_name,
+                created_at=created,
+                updated_at=updated,
+                title=title,
+                location=location,
+                address=address,
+                note=note,
+            )
+
+            # ugh. in my case it's violated by one place by about 1 second??
+            # assert place.created_at <= place.updated_at
+            yield place
+
+
+def saved() -> Iterator[Res[Place]]:
+    def it() -> Iterator[Res[Place]]:
+        paths = inputs()
+        total = len(paths)
+        width = len(str(total))
+        for idx, path in enumerate(paths):
+            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
+            yield from _process_one(path)
+    return unique_everseen(it)
+
+
+# Summary of databases on Android (as of 20240101)
+# -1_optimized_threads.notifications.db -- empty
+# 1_optimized_threads.notifications.db -- empty
+# 1_tasks.notifications.db -- empty
+# -1_threads.notifications.db -- empty
+# 1_threads.notifications.db -- doesn't look like anything interesting, some trip anniversaries etc?
+# 1_thread_surveys.notifications.db -- empty
+# 2_threads.notifications.db -- empty
+# accounts.notifications.db -- just one row with account id
+# brella_example_store -- empty
+# gmm_myplaces.db -- contains just a few places? I think it's a subset of "Labeled"
+# gmm_storage.db -- pretty huge, like 50Mb. I suspect it contains cache for places on maps or something
+# gmm_sync.db -- processed above
+# gnp_fcm_database -- list of accounts
+# google_app_measurement_local.db -- empty
+# inbox_notifications.db -- nothing interesting
+# _room_notifications.db -- trip anniversaries?
+# lighter_messaging_1.db -- empty +# lighter_messaging_2.db -- empty +# lighter_registration.db -- empty +# peopleCache__com.google_14.db -- contacts cache or something +# portable_geller_.db -- looks like analytics +# primes_example_store -- looks like analytics +# pseudonymous_room_notifications.db -- looks like analytics +# ue3.db -- empty +# ugc_photos_location_data.db -- empty +# ugc-sync.db -- empty +# updates-tab-visit.db -- empty diff --git a/tox.ini b/tox.ini index e51d0b6..25874f4 100644 --- a/tox.ini +++ b/tox.ini @@ -143,6 +143,7 @@ commands = my.fbmessenger.export \ my.github.ghexport \ my.goodreads \ + my.google.maps.android \ my.google.takeout.parser \ my.hackernews.harmonic \ my.hypothesis \ From 7236024c7a2e2af56c846ae27036e5f281b6c44b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 12 Mar 2024 21:59:35 +0000 Subject: [PATCH 211/302] my.twitter.android: better detection of own user id --- my/twitter/android.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/my/twitter/android.py b/my/twitter/android.py index a05d15a..fc5d230 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -149,6 +149,24 @@ def _parse_content(data: bytes) -> str: return text +_SELECT_OWN_TWEETS = '_SELECT_OWN_TWEETS' + + +def get_own_user_id(conn) -> str: + # unclear what's the reliable way to query it, so we use multiple different ones and arbitrate + # NOTE: 'SELECT DISTINCT ev_owner_id FROM lists' doesn't work, might include lists from other people? + res = set() + for q in [ + 'SELECT DISTINCT list_mapping_user_id FROM list_mapping', + 'SELECT DISTINCT owner_id FROM cursors', + 'SELECT DISTINCT user_id FROM users WHERE _id == 1', + ]: + for (r,) in conn.execute(q): + res.add(r) + assert len(res) == 1, res + return str(list(res)[0]) + + # NOTE: # - it also has statuses_r_ent_content which has entities' links replaced # but they are still ellipsized (e.g. check 1692905005479580039) @@ -191,6 +209,10 @@ def _parse_content(data: bytes) -> str: # 42: tweets (bulk of them) def _process_one(f: Path, *, where: str) -> Iterator[Res[Tweet]]: with sqlite_connect_immutable(f) as db: + if _SELECT_OWN_TWEETS in where: + own_user_id = get_own_user_id(db) + where = where.replace(_SELECT_OWN_TWEETS, own_user_id) + for ( tweet_id, user_name, @@ -222,6 +244,7 @@ def _process_one(f: Path, *, where: str) -> Iterator[Res[Tweet]]: text=_parse_content(blob), ) + def _entities(*, where: str) -> Iterator[Res[Tweet]]: # TODO might need to sort by timeline_sort_index again? def it() -> Iterator[Res[Tweet]]: @@ -235,21 +258,20 @@ def _entities(*, where: str) -> Iterator[Res[Tweet]]: # TODO hmm maybe unique_everseen should be a decorator? return unique_everseen(it) + def bookmarks() -> Iterator[Res[Tweet]]: # NOTE: in principle we get the bulk of bookmarks via timeline_type == 30 filter # however we still might miss on a few (I think the timeline_type 30 only refreshes when you enter bookmarks in the app) # if you bookmarked in the home feed, it might end up as status_bookmarked == 1 but not necessarily as timeline_type 30 return _entities(where='statuses_bookmarked == 1') + def likes() -> Iterator[Res[Tweet]]: # NOTE: similarly to bookmarks, we could use timeline_type == 29, but it's only refreshed if we actually open likes tab return _entities(where='statuses_favorited == 1') -def tweets() -> Iterator[Res[Tweet]]: - # NOTE: seemed like the only way to distinguish our own user reliably? 
- # could also try matching on users._id == 1, but not sure if it's guaranteed - select_self_user_id = 'SELECT user_id FROM users WHERE extended_profile_fields IS NOT NULL' +def tweets() -> Iterator[Res[Tweet]]: # NOTE: where timeline_type == 18 covers quite a few of our on tweets, but not everything # querying by our own user id seems the most exhaustive - return _entities(where=f'timeline_sender_id == ({select_self_user_id})') + return _entities(where=f'timeline_sender_id == {_SELECT_OWN_TWEETS}') From 0f3d09915cb26860d00535f78289fe1a3bed6794 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 12 Mar 2024 22:03:21 +0000 Subject: [PATCH 212/302] ci: update actions versions --- .github/workflows/main.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f49c6b5..53d8e53 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -45,11 +45,11 @@ jobs: # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation - run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: recursive fetch-depth: 0 # nicer to have all git history when debugging/for tests @@ -61,12 +61,12 @@ jobs: - run: bash .ci/run - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: .coverage.mypy-misc_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-misc/ - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: .coverage.mypy-core_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-core/ @@ -79,11 +79,11 @@ jobs: # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation - run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: '3.8' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: recursive From 477b7e8fd30192b82547573fc78fa99128763f99 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 12 Mar 2024 22:03:46 +0000 Subject: [PATCH 213/302] docs: minor update to overlays docs --- doc/OVERLAYS.org | 4 ++++ doc/overlays/install_packages.sh | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/OVERLAYS.org b/doc/OVERLAYS.org index 98687b7..1e6cf8f 100644 --- a/doc/OVERLAYS.org +++ b/doc/OVERLAYS.org @@ -64,6 +64,10 @@ Verify the setup: This basically means that modules will be searched in both paths, with overlay taking precedence. +** Installing with =--use-pep517= + +See here for discussion https://github.com/seanbreckenridge/reorder_editable/issues/2, but TLDR it should work similarly. 
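+
+For example (a sketch, assuming the same main/overlay layout as above):
+
+: $ pip3 install --user --use-pep517 -e main/
+: $ pip3 install --user --use-pep517 -e overlay/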
+
 * Testing runtime behaviour (editable install)
 
 : $ python3 -c 'import my.reddit as R; print(R.upvotes())'
diff --git a/doc/overlays/install_packages.sh b/doc/overlays/install_packages.sh
index 3fc38d3..5853e08 100755
--- a/doc/overlays/install_packages.sh
+++ b/doc/overlays/install_packages.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 set -eux
-pip3 install --user -e overlay/
-pip3 install --user -e main/
+pip3 install --user "$@" -e main/
+pip3 install --user "$@" -e overlay/

From 751ed02f4303ee79d8d89cab3f7175514b1c2f19 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Tue, 12 Mar 2024 22:16:35 +0000
Subject: [PATCH 214/302] tests: pin pytest version to <8 for now, having some test collection errors

https://docs.pytest.org/en/stable/changelog.html#collection-changes
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b857662..e6bc9fa 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@ def main() -> None:
         install_requires=INSTALL_REQUIRES,
         extras_require={
             'testing': [
-                'pytest',
+                'pytest<8',  # FIXME <8 is temporary workaround till we fix collection with pytest 8; see https://docs.pytest.org/en/stable/changelog.html#collection-changes
                 'ruff',
                 'mypy',
                 'lxml',  # for mypy coverage

From 103ea2096ee842dd10172ae68574173875a6dbea Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Wed, 13 Mar 2024 00:18:12 +0000
Subject: [PATCH 215/302] my.coding.commits: fix for git repo discovery after fdfind v9

---
 my/coding/commits.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/my/coding/commits.py b/my/coding/commits.py
index 67ee77d..51f9222 100644
--- a/my/coding/commits.py
+++ b/my/coding/commits.py
@@ -155,6 +155,7 @@ def git_repos_in(roots: List[Path]) -> List[Path]:
         _fd_path(),
         # '--follow', # right, not so sure about follow... make configurable?
         '--hidden',
+        '--no-ignore',  # otherwise doesn't go inside .git directory (from fd v9)
         '--full-path',
         '--type', 'f',
         '/HEAD',  # judging by is_git_dir, it should always be here..

From 8a8a1ebb0e8eeaa612bf78e8f3aa6c2fc3942df4 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Wed, 3 Apr 2024 19:58:44 +0100
Subject: [PATCH 216/302] my.tinder.android: better error handling and fix case with empty db

---
 my/tinder/android.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/my/tinder/android.py b/my/tinder/android.py
index 7e5f535..56ee1cb 100644
--- a/my/tinder/android.py
+++ b/my/tinder/android.py
@@ -92,7 +92,10 @@ def _entities() -> Iterator[Res[_Entity]]:
     for idx, path in enumerate(paths):
         logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
         with sqlite_connection(path, immutable=True, row_factory='row') as db:
-            yield from _handle_db(db)
+            try:
+                yield from _handle_db(db)
+            except Exception as e:
+                yield e
 
 
 def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]:
@@ -103,8 +106,9 @@ def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]:
     # shit, sometime in 2023 profile_user_view stopped containing user profile..
     # presumably the most common from_id/to_id would be our own username
     counter = Counter([id_ for (id_,) in db.execute('SELECT from_id FROM message UNION ALL SELECT to_id FROM message')])
-    [(you_id, _)] = counter.most_common(1)
-    yield Person(id=you_id, name='you')
+    if len(counter) > 0:  # this might happen if db is empty (e.g. user got logged out)
+        [(you_id, _)] = counter.most_common(1)
+        yield Person(id=you_id, name='you')
 
     for row in chain(
         user_profile_rows,

From 35dd5d82a0e399f684fb3d09763bab44146927fc Mon Sep 17 00:00:00 2001
From: seanbreckenridge
Date: Wed, 5 Jun 2024 14:03:03 -0700
Subject: [PATCH 217/302] smscalls: parse mms from smscalls export (#370)

* initial mms exploration
---
 my/smscalls.py    | 162 +++++++++++++++++++++++++++++++++++++++++-
 tests/smscalls.py |   3 +-
 2 files changed, 163 insertions(+), 2 deletions(-)

diff --git a/my/smscalls.py b/my/smscalls.py
index dbcf8b2..23fb5cc 100644
--- a/my/smscalls.py
+++ b/my/smscalls.py
@@ -20,7 +20,7 @@ config = make_config(smscalls)
 
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import NamedTuple, Iterator, Set, Tuple, Optional
+from typing import NamedTuple, Iterator, Set, Tuple, Optional, Any, Dict, List
 
 from lxml import etree
 
@@ -150,6 +150,165 @@ def _extract_messages(path: Path) -> Iterator[Res[Message]]:
         )
 
 
+class MMSContentPart(NamedTuple):
+    sequence_index: int
+    content_type: str
+    filename: str
+    text: Optional[str]
+    data: Optional[str]
+
+
+class MMS(NamedTuple):
+    dt: datetime
+    dt_readable: str
+    parts: List[MMSContentPart]
+    # NOTE: this is often something like 'Name 1, Name 2', but might be different depending on your client
+    who: Optional[str]
+    # NOTE: This can be a single phone number, or multiple, split by '~' or ','. It's better to think
+    # of this as a 'key' or 'conversation ID', phone numbers are also present in 'addresses'
+    phone_number: str
+    addresses: List[Tuple[str, int]]
+    # 1 = Received, 2 = Sent, 3 = Draft, 4 = Outbox
+    message_type: int
+
+    @property
+    def from_user(self) -> str:
+        # since these can be group messages, we can't just check message_type,
+        # we need to iterate through and find who sent it
+        # who is CC/'To' is not obvious in many message clients
+        #
+        # 129 = BCC, 130 = CC, 151 = To, 137 = From
+        for (addr, _type) in self.addresses:
+            if _type == 137:
+                return addr
+        else:
+            # hmm, maybe return instead? but this probably shouldn't happen, means
+            # something is very broken
+            raise RuntimeError(f'No from address matching 137 found in {self.addresses}')
+
+    @property
+    def from_me(self) -> bool:
+        return self.message_type == 2
+
+
+def mms() -> Iterator[Res[MMS]]:
+    files = get_files(config.export_path, glob='sms-*.xml')
+
+    emitted: Set[Tuple[datetime, Optional[str], str]] = set()
+    for p in files:
+        for c in _extract_mms(p):
+            if isinstance(c, Exception):
+                yield c
+                continue
+            key = (c.dt, c.phone_number, c.from_user)
+            if key in emitted:
+                continue
+            emitted.add(key)
+            yield c
+
+
+def _resolve_null_str(value: Optional[str]) -> Optional[str]:
+    if value is None:
+        return None
+    # hmm.. there's some risk of the text actually being 'null', but there's
+    # no way to distinguish that from XML values
+    if value == 'null':
+        return None
+    return value
+
+
+def _extract_mms(path: Path) -> Iterator[Res[MMS]]:
+    tr = etree.parse(str(path))
+
+    for mxml in tr.findall('mms'):
+        dt = mxml.get('date')
+        dt_readable = mxml.get('readable_date')
+        message_type = mxml.get('msg_box')
+
+        who = mxml.get('contact_name')
+        if who is not None and who in UNKNOWN:
+            who = None
+        phone_number = mxml.get('address')
+
+        if dt is None or dt_readable is None or message_type is None or phone_number is None:
+            mxml_str = etree.tostring(mxml).decode('utf-8')
+            yield RuntimeError(f'Missing one or more required attributes [date, readable_date, msg_box, address] in {mxml_str}')
+            continue
+
+        addresses: List[Tuple[str, int]] = []
+        for addr_parent in mxml.findall('addrs'):
+            for addr in addr_parent.findall('addr'):
+                addr_data = addr.attrib
+                user_address = addr_data.get('address')
+                user_type = addr_data.get('type')
+                if user_address is None or user_type is None:
+                    addr_str = etree.tostring(addr_parent).decode()
+                    yield RuntimeError(f'Missing one or more required attributes [address, type] in {addr_str}')
+                    continue
+                if not user_type.isdigit():
+                    yield RuntimeError(f'Invalid address type {user_type} {type(user_type)}, cannot convert to number')
+                    continue
+                addresses.append((user_address, int(user_type)))
+
+        content: List[MMSContentPart] = []
+
+        for part_root in mxml.findall('parts'):
+
+            for part in part_root.findall('part'):
+
+                # the first item is an SMIL XML element encoded as a string which describes
+                # how the rest of the parts are laid out
+                # https://www.w3.org/TR/SMIL3/smil-timing.html#Timing-TimeContainerSyntax
+                # An example:
+                #
+                #
+                # This seems pretty useless, so we should try and skip it, and just return the
+                # text/images/data
+                #
+                # man, attrib is some internal cpython ._Attrib type which can't
+                # be typed by any sort of mappingproxy. maybe a protocol could work..?
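+                # something along these lines might express it (a hypothetical sketch, not used here):
+                #   class _AttribLike(Protocol):
+                #       def get(self, key: str) -> Optional[str]: ...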
+ part_data: Dict[str, Any] = part.attrib # type: ignore + seq: Optional[str] = part_data.get('seq') + if seq == '-1': + continue + + if seq is None or not seq.isdigit(): + yield RuntimeError(f'seq must be a number, was seq={seq} {type(seq)} in {part_data}') + continue + + charset_type: Optional[str] = _resolve_null_str(part_data.get('ct')) + filename: Optional[str] = _resolve_null_str(part_data.get('name')) + # in some cases (images, cards), the filename is set in 'cl' instead + if filename is None: + filename = _resolve_null_str(part_data.get('cl')) + text: Optional[str] = _resolve_null_str(part_data.get('text')) + data: Optional[str] = _resolve_null_str(part_data.get('data')) + + if charset_type is None or filename is None or (text is None and data is None): + yield RuntimeError(f'Missing one or more required attributes [ct, name, (text, data)] must be present in {part_data}') + continue + + content.append( + MMSContentPart( + sequence_index=int(seq), + content_type=charset_type, + filename=filename, + text=text, + data=data + ) + ) + + yield MMS( + dt=_parse_dt_ms(dt), + dt_readable=dt_readable, + who=who, + phone_number=phone_number, + message_type=int(message_type), + parts=content, + addresses=addresses, + ) + + # See https://github.com/karlicoss/HPI/pull/90#issuecomment-702422351 # for potentially parsing timezone from the readable_date def _parse_dt_ms(d: str) -> datetime: @@ -162,4 +321,5 @@ def stats() -> Stats: return { **stat(calls), **stat(messages), + **stat(mms), } diff --git a/tests/smscalls.py b/tests/smscalls.py index d063de1..ef78786 100644 --- a/tests/smscalls.py +++ b/tests/smscalls.py @@ -4,6 +4,7 @@ from my.tests.common import skip_if_not_karlicoss as pytestmark # TODO implement via stat? def test() -> None: - from my.smscalls import calls, messages + from my.smscalls import calls, messages, mms assert len(list(calls())) > 10 assert len(list(messages())) > 10 + assert len(list(mms())) > 10 From c9c0e1954313b1ae531791ea1a361265ace2776f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 3 Aug 2024 15:03:55 +0100 Subject: [PATCH 218/302] my.instagram.gdpr: fix for new format --- my/instagram/gdpr.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 233f040..1415d55 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -1,6 +1,7 @@ """ Instagram data (uses [[https://www.instagram.com/download/request][official GDPR export]]) """ + from dataclasses import dataclass from datetime import datetime import json @@ -103,7 +104,12 @@ def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]: # old path, used up to somewhere between feb-aug 2022 personal_info = path / 'account_information' - j = json.loads((personal_info / 'personal_information.json').read_text()) + personal_info_json = personal_info / 'personal_information.json' + if not personal_info_json.exists(): + # new path, started somewhere around april 2024 + personal_info_json = personal_info / 'personal_information' / 'personal_information.json' + + j = json.loads(personal_info_json.read_text()) [profile] = j['profile_user'] pdata = profile['string_map_data'] username = pdata['Username']['value'] From 0e6dd32afe1a49de3dacecaede7ee149ab67e4b9 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 3 Aug 2024 15:27:11 +0100 Subject: [PATCH 219/302] ci: minor fixes after mypy update --- my/core/common.py | 12 ++++++------ my/core/error.py | 4 ++-- my/core/query.py | 4 ++-- my/core/query_range.py | 2 +- 
my/core/serialize.py | 7 ++++--- my/instagram/android.py | 2 +- ruff.toml | 2 +- tox.ini | 2 +- 8 files changed, 18 insertions(+), 17 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index c429c8c..f7bb010 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -1,6 +1,7 @@ from glob import glob as do_glob from pathlib import Path from datetime import datetime +from dataclasses import is_dataclass, asdict as dataclasses_asdict import functools from contextlib import contextmanager import os @@ -292,15 +293,14 @@ Json = Dict[str, Any] from typing import TypeVar, Callable, Generic -_C = TypeVar('_C') _R = TypeVar('_R') # https://stackoverflow.com/a/5192374/706389 class classproperty(Generic[_R]): - def __init__(self, f: Callable[[_C], _R]) -> None: + def __init__(self, f: Callable[..., _R]) -> None: self.f = f - def __get__(self, obj: None, cls: _C) -> _R: + def __get__(self, obj, cls) -> _R: return self.f(cls) @@ -580,9 +580,9 @@ def asdict(thing: Any) -> Json: # todo exception? if isinstance(thing, dict): return thing - import dataclasses as D - if D.is_dataclass(thing): - return D.asdict(thing) + if is_dataclass(thing): + assert not isinstance(thing, type) # to help mypy + return dataclasses_asdict(thing) if is_namedtuple(thing): return thing._asdict() raise TypeError(f'Could not convert object {thing} to dict') diff --git a/my/core/error.py b/my/core/error.py index e1737c1..fa59137 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -195,7 +195,7 @@ def warn_my_config_import_error(err: Union[ImportError, AttributeError], help_ur import click if help_url is None: help_url = MODULE_SETUP_URL - if type(err) == ImportError: + if type(err) is ImportError: if err.name != 'my.config': return False # parse name that user attempted to import @@ -207,7 +207,7 @@ You may be missing the '{section_name}' section from your config. 
See {help_url}\ """, fg='yellow', err=True) return True - elif type(err) == AttributeError: + elif type(err) is AttributeError: # test if user had a nested config block missing # https://github.com/karlicoss/HPI/issues/223 if hasattr(err, 'obj') and hasattr(err, "name"): diff --git a/my/core/query.py b/my/core/query.py index 7c22838..071f7e0 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -214,7 +214,7 @@ def _determine_order_by_value_key(obj_res: ET) -> Any: Returns either the class, or a tuple of the dictionary keys """ key = obj_res.__class__ - if key == dict: + if key is dict: # assuming same keys signify same way to determine ordering return tuple(obj_res.keys()) # type: ignore[union-attr] return key @@ -583,7 +583,7 @@ def test_couldnt_determine_order() -> None: res = list(select(iter([object()]), order_value=lambda o: isinstance(o, datetime))) assert len(res) == 1 assert isinstance(res[0], Unsortable) - assert type(res[0].obj) == object + assert type(res[0].obj) is object # same value type, different keys, with clashing keys diff --git a/my/core/query_range.py b/my/core/query_range.py index dfb9e55..2b3a3d3 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -471,7 +471,7 @@ def test_range_predicate() -> None: ) # filter from 0 to 5 - rn: Optional[RangeTuple] = RangeTuple("0", "5", None) + rn: RangeTuple = RangeTuple("0", "5", None) zero_to_five_filter: Optional[Where] = int_filter_func(unparsed_range=rn) assert zero_to_five_filter is not None # this is just a Where function, given some input it return True/False if the value is allowed diff --git a/my/core/serialize.py b/my/core/serialize.py index c5f4cba..1f55f40 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -1,5 +1,5 @@ import datetime -import dataclasses +from dataclasses import is_dataclass, asdict from pathlib import Path from decimal import Decimal from typing import Any, Optional, Callable, NamedTuple @@ -33,8 +33,9 @@ def _default_encode(obj: Any) -> Any: # convert paths to their string representation if isinstance(obj, Path): return str(obj) - if dataclasses.is_dataclass(obj): - return dataclasses.asdict(obj) + if is_dataclass(obj): + assert not isinstance(obj, type) # to help mypy + return asdict(obj) if isinstance(obj, Exception): return error_to_json(obj) # if something was stored as 'decimal', you likely diff --git a/my/instagram/android.py b/my/instagram/android.py index ea5ee35..96b75d2 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -92,7 +92,7 @@ class MessageError(RuntimeError): super().__init__(msg_id, *rest) self.rest = rest - def __hash__(self, other): + def __hash__(self): return hash(self.rest) def __eq__(self, other) -> bool: diff --git a/ruff.toml b/ruff.toml index 0be93e0..54f621c 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,4 @@ -ignore = [ +lint.ignore = [ ### too opinionated style checks "E501", # too long lines "E702", # Multiple statements on one line (semicolon) diff --git a/tox.ini b/tox.ini index 25874f4..0676eef 100644 --- a/tox.ini +++ b/tox.ini @@ -26,7 +26,7 @@ passenv = [testenv:ruff] commands = {envpython} -m pip install --use-pep517 -e .[testing] - {envpython} -m ruff my/ + {envpython} -m ruff check my/ # just the very core tests with minimal dependencies From d5fccf1874f5306445830377d5418776ccaa4ede Mon Sep 17 00:00:00 2001 From: karlicoss Date: Thu, 28 Dec 2023 02:07:27 +0000 Subject: [PATCH 220/302] twitter.android: more comments on timeline types --- my/twitter/android.py | 11 ++++++----- 1 file changed, 6 
insertions(+), 5 deletions(-)

diff --git a/my/twitter/android.py b/my/twitter/android.py
index fc5d230..af16b11 100644
--- a/my/twitter/android.py
+++ b/my/twitter/android.py
@@ -1,6 +1,7 @@
 """
 Twitter data from official app for Android
 """
+
 from __future__ import annotations
 
 from dataclasses import dataclass
@@ -93,12 +94,11 @@ def _parse_content(data: bytes) -> str:
 
         (xx,) = unpack_from('B', data, offset=pos)
         skip(1)
-        # print("TYPE:", xx)
 
         # wtf is this... maybe it's a bitmask?
         slen = {
-            66 : 1,
-            67 : 2,
+            66: 1,
+            67: 2,
             106: 1,
             107: 2,
         }[xx]
@@ -112,7 +112,7 @@ def _parse_content(data: bytes) -> str:
     # see 1665029077034565633
 
     extracted = []
-    linksep = 0x6a
+    linksep = 0x6A
     while True:
         m = re.search(b'\x6a.http', data[pos:])
         if m is None:
@@ -175,7 +175,8 @@ def get_own_user_id(conn) -> str:
 #   don't think they represent bookmarking time
 # - timeline_type
 #   7, 8, 9: some sort of notifications or cursors, should exclude
-#   17: ??? some cursors but also tweets
+#   14: some conversation thread stuff?
+#   17: ??? some cursors but also tweets NOTE: they seem to contribute to user's tweets data, so make sure not to delete
 #   18: ??? relatively few, maybe 20 of them, also they all have timeline_is_preview=1?
 #       most of them have our own id as timeline_sender?
 #       I think it might actually be 'replies' tab -- also contains some retweets etc

From 9e72672b4f765dc1807ac15990322a0a4087648b Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Sat, 3 Aug 2024 16:43:10 +0100
Subject: [PATCH 221/302] legacy google takeout: fix timezone localization

---
 my/google/takeout/html.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/my/google/takeout/html.py b/my/google/takeout/html.py
index 5d65a86..d393957 100644
--- a/my/google/takeout/html.py
+++ b/my/google/takeout/html.py
@@ -5,12 +5,14 @@ Google Takeout exports: browsing history, search/youtube/google play activity
 from enum import Enum
 import re
 from pathlib import Path
-from datetime import datetime, timezone
+from datetime import datetime
 from html.parser import HTMLParser
 from typing import List, Optional, Any, Callable, Iterable, Tuple
 from collections import OrderedDict
 from urllib.parse import unquote
 
+import pytz
+
 from ...core.time import abbr_to_timezone
 
 
@@ -29,7 +31,8 @@ def parse_dt(s: str) -> datetime:
         # old takeouts didn't have timezone
         # hopefully it was utc? Legacy, so not that much of an issue anymore..
        # todo although maybe worth adding timezone from location provider?
-        tz = timezone.utc
+        # note: need to use pytz here for localize call later
+        tz = pytz.utc
     else:
         s, tzabbr = s.rsplit(maxsplit=1)
         tz = abbr_to_timezone(tzabbr)

From 652ee9b875275d25ca4cbfdee4f63c8f822b3ba7 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Sat, 3 Aug 2024 18:45:22 +0100
Subject: [PATCH 222/302] fbmessenger.android: fix minor issue with processing thread participants

---
 my/fbmessenger/android.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py
index ecf0eb0..8a4bf4c 100644
--- a/my/fbmessenger/android.py
+++ b/my/fbmessenger/android.py
@@ -1,6 +1,7 @@
 """
 Messenger data from Android app database (in =/data/data/com.facebook.orca/databases/threads_db2=)
 """
+
 from __future__ import annotations
 
 from dataclasses import dataclass
@@ -111,7 +112,7 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
     senders: Dict[str, Sender] = {}
     for r in db.execute('SELECT CAST(id AS TEXT) AS id, name FROM contacts'):
         s = Sender(
-            id=r['id'], # looks like it's server id? same used on facebook site
+            id=r['id'],  # looks like it's server id? same used on facebook site
             name=r['name'],
         )
         # NOTE https://www.messenger.com/t/{contact_id} for permalink
@@ -129,14 +130,17 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
     for r in db.execute('SELECT CAST(thread_key AS TEXT) AS thread_key, CAST(contact_id AS TEXT) AS contact_id FROM participants'):
         thread_key = r['thread_key']
         user_key = r['contact_id']
-        if self_id is not None and user_key == self_id:
-            # exclude yourself, otherwise it's just spammy to show up in all participants
-            continue
 
         ll = thread_users.get(thread_key)
         if ll is None:
             ll = []
             thread_users[thread_key] = ll
+
+        if self_id is not None and user_key == self_id:
+            # exclude yourself, otherwise it's just spammy to show up in all participants
+            # TODO not sure about that, maybe change later
+            continue
+
         ll.append(senders[user_key])
 
     # 15 is a weird thread that doesn't have any participants and messages

From 2c63fe25c0aedcf11f83053cc1fcd1868fb7bacc Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Mon, 5 Aug 2024 22:53:25 +0100
Subject: [PATCH 223/302] my.twitter.android: get data from statuses table rather than timeline_view

---
 my/twitter/android.py | 95 +++++++++++++++++++++++++++----------------
 1 file changed, 60 insertions(+), 35 deletions(-)

diff --git a/my/twitter/android.py b/my/twitter/android.py
index af16b11..f40ad0e 100644
--- a/my/twitter/android.py
+++ b/my/twitter/android.py
@@ -9,7 +9,7 @@ from datetime import datetime, timezone
 from pathlib import Path
 import re
 from struct import unpack_from
-from typing import Iterator, Sequence
+from typing import Iterator, Sequence, Set
 
 from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
 from my.core.common import unique_everseen
@@ -209,41 +209,66 @@ def get_own_user_id(conn) -> str:
 # 6 : always notifications??
 # 42: tweets (bulk of them)
 def _process_one(f: Path, *, where: str) -> Iterator[Res[Tweet]]:
+    # meh... maybe separate this function into special ones for tweets/bookmarks/likes
+    select_own = _SELECT_OWN_TWEETS in where
     with sqlite_connect_immutable(f) as db:
-        if _SELECT_OWN_TWEETS in where:
+        if select_own:
             own_user_id = get_own_user_id(db)
-            where = where.replace(_SELECT_OWN_TWEETS, own_user_id)
+            db_where = where.replace(_SELECT_OWN_TWEETS, own_user_id)
+        else:
+            db_where = where
 
-        for (
-            tweet_id,
-            user_name,
-            user_username,
-            created_ms,
-            blob,
-        ) in db.execute(
-            f'''
+        # NOTE: we used to get this from 'timeline_view'
+        # however seems that it's missing a fair amount of data that's present in statuses table...
+        QUERY = '''
             SELECT
-            statuses_status_id,
-            users_name,
-            users_username,
-            statuses_created,
-            CAST(statuses_content AS BLOB)
-            FROM timeline_view
-            WHERE timeline_data_type == 1 /* the only one containing tweets (among with some other stuff) */
-            AND timeline_data_type_group != 6 /* excludes notifications (some of them even have statuses_bookmarked == 1) */
-            AND {where}
-            ORDER BY timeline_sort_index DESC
-            ''',
-            # TODO not sure about timeline_sort_index for favorites
-        ):
-            assert blob is not None  # just in case, but should be filtered by the sql query
-            yield Tweet(
-                id_str=tweet_id,
-                # TODO double check it's utc?
-                created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
-                screen_name=user_username,
-                text=_parse_content(blob),
-            )
+            CAST(statuses.status_id AS TEXT), /* int by default */
+            users.username,
+            statuses.created,
+            CAST(statuses.content AS BLOB),
+            statuses.quoted_tweet_id
+            FROM statuses FULL OUTER JOIN users
+            ON statuses.author_id == users.user_id
+            WHERE
+            /* there are sometimes a few shitty statuses in the db with weird ids which are duplicating other tweets
+               don't want to filter by status_id < 10 ** 10, since there might legit be statuses with low ids?
+               so this is the best I came up with..
+            */
+            NOT (statuses.in_r_user_id == -1 AND statuses.in_r_status_id == -1 AND statuses.conversation_id == 0)
+        '''
+
+        def _query_one(*, where: str, quoted: Set[int]) -> Iterator[Res[Tweet]]:
+            for (
+                tweet_id,
+                user_username,
+                created_ms,
+                blob,
+                quoted_id,
+            ) in db.execute(f'{QUERY} AND {where}'):
+                quoted.add(quoted_id)  # if no quoted tweet, id is 0 here
+
+                try:
+                    content = _parse_content(blob)
+                except Exception as e:
+                    yield e
+                    continue
+
+                yield Tweet(
+                    id_str=tweet_id,
+                    # TODO double check it's utc?
+                    created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
+                    screen_name=user_username,
+                    text=content,
+                )
+
+        quoted: Set[int] = set()
+        yield from _query_one(where=db_where, quoted=quoted)
+        # get quoted tweets 'recursively'
+        # TODO maybe do it for favs/bookmarks too? not sure
+        while select_own and len(quoted) > 0:
+            db_where = 'status_id IN (' + ','.join(map(str, sorted(quoted))) + ')'
+            quoted = set()
+            yield from _query_one(where=db_where, quoted=quoted)
 
 
 def _entities(*, where: str) -> Iterator[Res[Tweet]]:
@@ -264,15 +289,15 @@ def bookmarks() -> Iterator[Res[Tweet]]:
     # NOTE: in principle we get the bulk of bookmarks via timeline_type == 30 filter
     # however we still might miss on a few (I think the timeline_type 30 only refreshes when you enter bookmarks in the app)
     # if you bookmarked in the home feed, it might end up as status_bookmarked == 1 but not necessarily as timeline_type 30
-    return _entities(where='statuses_bookmarked == 1')
+    return _entities(where='statuses.bookmarked == 1')
 
 
 def likes() -> Iterator[Res[Tweet]]:
     # NOTE: similarly to bookmarks, we could use timeline_type == 29, but it's only refreshed if we actually open likes tab
-    return _entities(where='statuses_favorited == 1')
+    return _entities(where='statuses.favorited == 1')
 
 
 def tweets() -> Iterator[Res[Tweet]]:
     # NOTE: where timeline_type == 18 covers quite a few of our own tweets, but not everything
     # querying by our own user id seems the most exhaustive
-    return _entities(where=f'timeline_sender_id == {_SELECT_OWN_TWEETS}')
+    return _entities(where=f'users.user_id == {_SELECT_OWN_TWEETS} OR statuses.retweeted == 1')

From b615ba10b15df8dee8406a62b56feb829ab1fbf9 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Mon, 5 Aug 2024 23:15:49 +0100
Subject: [PATCH 224/302] ci: temporarily suppress pandas mypy error in check_dateish

---
 my/core/pandas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/my/core/pandas.py b/my/core/pandas.py
index 1b7a644..621682f 100644
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@@ -50,7 +50,7 @@ def check_dateish(s: SeriesT[S1]) -> Iterable[str]:
     all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all()
     if not all_timestamps:
         return  # not sure why it would happen, but ok
-    tzs = s.map(lambda x: x.tzinfo).drop_duplicates()
+    tzs = s.map(lambda x: x.tzinfo).drop_duplicates()  # type: ignore[union-attr, var-annotated, arg-type, return-value, unused-ignore]
     examples = s[tzs.index]
     # todo not so sure this warning is that useful... except for stuff without tz
     yield f'''

From 3aebc573e80f3686e218eeb924a29b92221e5020 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Tue, 6 Aug 2024 20:41:50 +0100
Subject: [PATCH 225/302] tests: use updated conftest from pymplate, this allows running individual test modules properly

e.g. pytest --pyargs my.core.tests.test_get_files
---
 conftest.py                     | 47 +++++++++++++++++++++++++++++++++
 my/core/tests/test_get_files.py |  9 +++++--
 2 files changed, 54 insertions(+), 2 deletions(-)
 create mode 100644 conftest.py

diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..b959cfa
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,47 @@
+# this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly
+# without it, pytest can't discover the package root for some reason
+# also see https://github.com/karlicoss/pytest_namespace_pkgs for more
+
+import os
+import pathlib
+from typing import Optional
+
+import _pytest.main
+import _pytest.pathlib
+
+# we consider all dirs in repo/ to be namespace packages
+root_dir = pathlib.Path(__file__).absolute().parent.resolve()  # / 'src'
+assert root_dir.exists(), root_dir
+
+# TODO assert it contains package name?? maybe get it via setuptools..
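+# (e.g. for a hypothetical checkout at /repo, namespace_pkg_dirs below would end up
+# as something like ['/repo/my', '/repo/tests'])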
+ +namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()] + +# resolve_package_path is called from _pytest.pathlib.import_path +# takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem +resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path +def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]: + result = path # search from the test file upwards + for parent in result.parents: + if str(parent) in namespace_pkg_dirs: + return parent + if os.name == 'nt': + # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx + if path.name == 'conftest.py': + return resolve_pkg_path_orig(path) + raise RuntimeError("Couldn't determine path for ", path) +_pytest.pathlib.resolve_package_path = resolve_package_path + + +# without patching, the orig function returns just a package name for some reason +# (I think it's used as a sort of fallback) +# so we need to point it at the absolute path properly +# not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure.. +search_pypath_orig = _pytest.main.search_pypath +def search_pypath(module_name: str) -> str: + mpath = root_dir / module_name.replace('.', os.sep) + if not mpath.is_dir(): + mpath = mpath.with_suffix('.py') + assert mpath.exists(), mpath # just in case + return str(mpath) +_pytest.main.search_pypath = search_pypath diff --git a/my/core/tests/test_get_files.py b/my/core/tests/test_get_files.py index 2bdc903..e9f216a 100644 --- a/my/core/tests/test_get_files.py +++ b/my/core/tests/test_get_files.py @@ -175,12 +175,17 @@ TMP = tempfile.gettempdir() test_path = Path(TMP) / 'hpi_test' -def setup(): +@pytest.fixture(autouse=True) +def prepare(): teardown() test_path.mkdir() + try: + yield + finally: + teardown() -def teardown(): +def teardown() -> None: if test_path.is_dir(): shutil.rmtree(test_path) From fb8e9909a498c16a92e276e13e40ab8fb1ce189c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 6 Aug 2024 23:09:56 +0100 Subject: [PATCH 226/302] tests: simplify tests for my.core.serialize a bit and simplify tox file --- my/core/pytest.py | 22 ++++++ my/core/serialize.py | 127 ++++++++++++++++++++++------------ setup.py | 5 +- tests/serialize.py | 1 - tests/serialize_simplejson.py | 23 ------ tox.ini | 19 +---- 6 files changed, 109 insertions(+), 88 deletions(-) create mode 100644 my/core/pytest.py delete mode 100644 tests/serialize.py delete mode 100644 tests/serialize_simplejson.py diff --git a/my/core/pytest.py b/my/core/pytest.py new file mode 100644 index 0000000..a2596fb --- /dev/null +++ b/my/core/pytest.py @@ -0,0 +1,22 @@ +""" +Helpers to prevent depending on pytest in runtime +""" + +from .common import assert_subpackage; assert_subpackage(__name__) + +import sys +import typing + +under_pytest = 'pytest' in sys.modules + +if typing.TYPE_CHECKING or under_pytest: + import pytest + + parametrize = pytest.mark.parametrize +else: + + def parametrize(*args, **kwargs): + def wrapper(f): + return f + + return wrapper diff --git a/my/core/serialize.py b/my/core/serialize.py index 1f55f40..563e114 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -2,11 +2,12 @@ import datetime from dataclasses import is_dataclass, asdict from pathlib import Path from decimal import Decimal -from typing import Any, Optional, Callable, NamedTuple +from typing import Any, Optional, Callable, NamedTuple, Protocol from functools import lru_cache from .common import 
is_namedtuple from .error import error_to_json +from .pytest import parametrize # note: it would be nice to combine the 'asdict' and _default_encode to some function # that takes a complex python object and returns JSON-compatible fields, while still @@ -16,6 +17,8 @@ from .error import error_to_json DefaultEncoder = Callable[[Any], Any] +Dumps = Callable[[Any], str] + def _default_encode(obj: Any) -> Any: """ @@ -75,22 +78,29 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: kwargs["default"] = use_default - try: - import orjson + prefer_factory: Optional[str] = kwargs.pop('_prefer_factory', None) + + def orjson_factory() -> Optional[Dumps]: + try: + import orjson + except ModuleNotFoundError: + return None # todo: add orjson.OPT_NON_STR_KEYS? would require some bitwise ops # most keys are typically attributes from a NT/Dataclass, # so most seem to work: https://github.com/ijl/orjson#opt_non_str_keys - def _orjson_dumps(obj: Any) -> str: + def _orjson_dumps(obj: Any) -> str: # TODO rename? # orjson returns json as bytes, encode to string return orjson.dumps(obj, **kwargs).decode('utf-8') return _orjson_dumps - except ModuleNotFoundError: - pass - try: - from simplejson import dumps as simplejson_dumps + def simplejson_factory() -> Optional[Dumps]: + try: + from simplejson import dumps as simplejson_dumps + except ModuleNotFoundError: + return None + # if orjson couldn't be imported, try simplejson # This is included for compatibility reasons because orjson # is rust-based and compiling on rarer architectures may not work @@ -105,18 +115,37 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: return _simplejson_dumps - except ModuleNotFoundError: - pass + def stdlib_factory() -> Optional[Dumps]: + import json + from .warnings import high - import json - from .warnings import high + high( + "You might want to install 'orjson' to support serialization for lots more types! If that does not work for you, you can install 'simplejson' instead" + ) - high("You might want to install 'orjson' to support serialization for lots more types! 
If that does not work for you, you can install 'simplejson' instead") + def _stdlib_dumps(obj: Any) -> str: + return json.dumps(obj, **kwargs) - def _stdlib_dumps(obj: Any) -> str: - return json.dumps(obj, **kwargs) + return _stdlib_dumps - return _stdlib_dumps + factories = { + 'orjson': orjson_factory, + 'simplejson': simplejson_factory, + 'stdlib': stdlib_factory, + } + + if prefer_factory is not None: + factory = factories[prefer_factory] + res = factory() + assert res is not None, prefer_factory + return res + + for factory in factories.values(): + res = factory() + if res is not None: + return res + else: + raise RuntimeError("Should not happen!") def dumps( @@ -154,8 +183,17 @@ def dumps( return _dumps_factory(default=default, **kwargs)(obj) -def test_serialize_fallback() -> None: - import json as jsn # dont cause possible conflicts with module code +@parametrize('factory', ['orjson', 'simplejson', 'stdlib']) +def test_dumps(factory: str) -> None: + import pytest + + orig_dumps = globals()['dumps'] # hack to prevent error from using local variable before declaring + + def dumps(*args, **kwargs) -> str: + kwargs['_prefer_factory'] = factory + return orig_dumps(*args, **kwargs) + + import json as json_builtin # dont cause possible conflicts with module code # can't use a namedtuple here, since the default json.dump serializer # serializes namedtuples as tuples, which become arrays @@ -166,36 +204,12 @@ def test_serialize_fallback() -> None: # the lru_cache'd warning may have already been sent, # so checking may be nondeterministic? import warnings + with warnings.catch_warnings(): warnings.simplefilter("ignore") - res = jsn.loads(dumps(X)) + res = json_builtin.loads(dumps(X)) assert res == [5, 5.0] - -# this needs to be defined here to prevent a mypy bug -# see https://github.com/python/mypy/issues/7281 -class _A(NamedTuple): - x: int - y: float - - -def test_nt_serialize() -> None: - import json as jsn # dont cause possible conflicts with module code - import orjson # import to make sure this is installed - - res: str = dumps(_A(x=1, y=2.0)) - assert res == '{"x":1,"y":2.0}' - - # test orjson option kwarg - data = {datetime.date(year=1970, month=1, day=1): 5} - res2 = jsn.loads(dumps(data, option=orjson.OPT_NON_STR_KEYS)) - assert res2 == {'1970-01-01': 5} - - -def test_default_serializer() -> None: - import pytest - import json as jsn # dont cause possible conflicts with module code - class Unserializable: def __init__(self, x: int): self.x = x @@ -209,7 +223,7 @@ def test_default_serializer() -> None: def _serialize(self) -> Any: return {"x": self.x, "y": self.y} - res = jsn.loads(dumps(WithUnderscoreSerialize(6))) + res = json_builtin.loads(dumps(WithUnderscoreSerialize(6))) assert res == {"x": 6, "y": 6.0} # test passing additional 'default' func @@ -221,5 +235,26 @@ def test_default_serializer() -> None: # this serializes both Unserializable, which is a custom type otherwise # not handled, and timedelta, which is handled by the '_default_encode' # in the 'wrapped_default' function - res2 = jsn.loads(dumps(Unserializable(10), default=_serialize_with_default)) + res2 = json_builtin.loads(dumps(Unserializable(10), default=_serialize_with_default)) assert res2 == {"x": 10, "y": 10.0} + + if factory == 'orjson': + import orjson + + # test orjson option kwarg + data = {datetime.date(year=1970, month=1, day=1): 5} + res2 = json_builtin.loads(dumps(data, option=orjson.OPT_NON_STR_KEYS)) + assert res2 == {'1970-01-01': 5} + + +@parametrize('factory', ['orjson', 'simplejson']) +def 
test_dumps_namedtuple(factory: str) -> None: + import json as json_builtin # dont cause possible conflicts with module code + import orjson # import to make sure this is installed + + class _A(NamedTuple): + x: int + y: float + + res: str = dumps(_A(x=1, y=2.0), _prefer_factory=factory) + assert json_builtin.loads(res) == {'x': 1, 'y': 2.0} diff --git a/setup.py b/setup.py index e6bc9fa..ab96616 100644 --- a/setup.py +++ b/setup.py @@ -47,13 +47,16 @@ def main() -> None: install_requires=INSTALL_REQUIRES, extras_require={ 'testing': [ - 'pytest<8', # FIXME <8 is temporary workaround till we fix collection with pytest 8; see https://docs.pytest.org/en/stable/changelog.html#collection-changes + 'pytest', 'ruff', 'mypy', 'lxml', # for mypy coverage # used in some tests.. although shouldn't rely on it 'pandas', + + 'orjson', # for my.core.serialize and denylist + 'simplejson', # for my.core.serialize ], 'optional': [ # todo document these? diff --git a/tests/serialize.py b/tests/serialize.py deleted file mode 100644 index d9ee9a3..0000000 --- a/tests/serialize.py +++ /dev/null @@ -1 +0,0 @@ -from my.core.serialize import * diff --git a/tests/serialize_simplejson.py b/tests/serialize_simplejson.py deleted file mode 100644 index d421a15..0000000 --- a/tests/serialize_simplejson.py +++ /dev/null @@ -1,23 +0,0 @@ -''' -This file should only run when simplejson is installed, -but orjson is not installed to check compatibility -''' - -# none of these should fail - -import json -import simplejson -import pytest - -from my.core.serialize import dumps, _A - -def test_simplejson_fallback() -> None: - - # should fail to import - with pytest.raises(ModuleNotFoundError): - import orjson - - # simplejson should serialize namedtuple properly - res: str = dumps(_A(x=1, y=2.0)) - assert json.loads(res) == {"x": 1, "y": 2.0} - diff --git a/tox.ini b/tox.ini index 0676eef..0b75b44 100644 --- a/tox.ini +++ b/tox.ini @@ -34,14 +34,11 @@ commands = commands = {envpython} -m pip install --use-pep517 -e .[testing] - # seems that denylist tests rely on it? ideally we should get rid of this in tests-core - {envpython} -m pip install orjson - {envpython} -m pytest \ # importlib is the new suggested import-mode # without it test package names end up as core.tests.* instead of my.core.tests.* --import-mode=importlib \ - --pyargs my.core \ + --pyargs {[testenv]package_name}.core \ # ignore orgmode because it imports orgparse # tbh not sure if it even belongs to core, maybe move somewhere else.. # same with pandas? @@ -49,9 +46,6 @@ commands = # causes error during test collection on 3.8 # dataset is deprecated anyway so whatever --ignore my/core/dataset.py \ - # this test uses orjson which is an optional dependency - # it would be covered by tests-all - -k 'not test_nt_serialize' \ {posargs} @@ -63,14 +57,7 @@ setenv = MY_CONFIG = nonexistent commands = {envpython} -m pip install --use-pep517 -e .[testing] - # installed to test my.core.serialize while using simplejson and not orjson - {envpython} -m pip install simplejson - {envpython} -m pytest \ - tests/serialize_simplejson.py \ - {posargs} - {envpython} -m pip install cachew - {envpython} -m pip install orjson {envpython} -m my.core module install my.location.google {envpython} -m pip install ijson # optional dependency @@ -103,9 +90,7 @@ commands = {envpython} -m pytest tests \ # ignore some tests which might take a while to run on ci.. 
--ignore tests/takeout.py \ - --ignore tests/extra/polar.py \ - # dont run simplejson compatibility test since orjson is now installed - --ignore tests/serialize_simplejson.py \ + --ignore tests/extra/polar.py {posargs} From 074e24c3098aed3da81155c51f6afb1c28debc46 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 6 Aug 2024 23:44:01 +0100 Subject: [PATCH 227/302] general: deprecate my.core.dataset and simplify tox file --- my/core/_deprecated/dataset.py | 12 ++++++++++++ my/core/dataset.py | 32 +++----------------------------- my/core/serialize.py | 3 +-- tox.ini | 3 --- 4 files changed, 16 insertions(+), 34 deletions(-) create mode 100644 my/core/_deprecated/dataset.py diff --git a/my/core/_deprecated/dataset.py b/my/core/_deprecated/dataset.py new file mode 100644 index 0000000..9cca2fd --- /dev/null +++ b/my/core/_deprecated/dataset.py @@ -0,0 +1,12 @@ +from ..common import PathIsh +from ..sqlite import sqlite_connect_immutable + + +def connect_readonly(db: PathIsh): + import dataset # type: ignore + + # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122 + # todo not sure if mode=ro has any benefit, but it doesn't work on read-only filesystems + # maybe it should autodetect readonly filesystems and apply this? not sure + creator = lambda: sqlite_connect_immutable(db) + return dataset.connect('sqlite:///', engine_kwargs={'creator': creator}) diff --git a/my/core/dataset.py b/my/core/dataset.py index 31de4f4..40237a0 100644 --- a/my/core/dataset.py +++ b/my/core/dataset.py @@ -1,31 +1,5 @@ -from __future__ import annotations -from .common import assert_subpackage; assert_subpackage(__name__) +from . import warnings -from .common import PathIsh -from .sqlite import sqlite_connect_immutable +warnings.high(f"{__name__} is deprecated, please use dataset directly if you need or switch to my.core.sqlite") -## sadly dataset doesn't have any type definitions -from typing import Iterable, Iterator, Dict, Optional, Any, Protocol -from contextlib import AbstractContextManager - - -# NOTE: may not be true in general, but will be in the vast majority of cases -row_type_T = Dict[str, Any] - - -class TableT(Iterable, Protocol): - def find(self, *, order_by: Optional[str]=None) -> Iterator[row_type_T]: ... - - -class DatabaseT(AbstractContextManager['DatabaseT'], Protocol): - def __getitem__(self, table: str) -> TableT: ... -## - -# TODO wonder if also need to open without WAL.. test this on read-only directory/db file -def connect_readonly(db: PathIsh) -> DatabaseT: - import dataset # type: ignore - # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122 - # todo not sure if mode=ro has any benefit, but it doesn't work on read-only filesystems - # maybe it should autodetect readonly filesystems and apply this? 
not sure - creator = lambda: sqlite_connect_immutable(db) - return dataset.connect('sqlite:///', engine_kwargs={'creator': creator}) +from ._deprecated.dataset import * diff --git a/my/core/serialize.py b/my/core/serialize.py index 563e114..b5b1b3a 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -2,7 +2,7 @@ import datetime from dataclasses import is_dataclass, asdict from pathlib import Path from decimal import Decimal -from typing import Any, Optional, Callable, NamedTuple, Protocol +from typing import Any, Optional, Callable, NamedTuple from functools import lru_cache from .common import is_namedtuple @@ -250,7 +250,6 @@ def test_dumps(factory: str) -> None: @parametrize('factory', ['orjson', 'simplejson']) def test_dumps_namedtuple(factory: str) -> None: import json as json_builtin # dont cause possible conflicts with module code - import orjson # import to make sure this is installed class _A(NamedTuple): x: int diff --git a/tox.ini b/tox.ini index 0b75b44..02acdfc 100644 --- a/tox.ini +++ b/tox.ini @@ -43,9 +43,6 @@ commands = # tbh not sure if it even belongs to core, maybe move somewhere else.. # same with pandas? --ignore my/core/orgmode.py \ - # causes error during test collection on 3.8 - # dataset is deprecated anyway so whatever - --ignore my/core/dataset.py \ {posargs} From 34593c032d7360f09642da889a6ca0b236bda810 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 7 Aug 2024 00:55:30 +0100 Subject: [PATCH 228/302] tests: move more tests into core, more consistent tests running in tox --- my/core/tests/auto_stats.py | 1 + my/core/tests/common.py | 12 +++++ my/core/tests/test_cachew.py | 53 ++++++++++++++++++++ my/core/tests/test_common.py | 54 +++++++++++++++++++++ my/tests/common.py | 2 +- tests/misc.py | 94 ------------------------------------ tox.ini | 10 ++-- 7 files changed, 127 insertions(+), 99 deletions(-) create mode 100644 my/core/tests/common.py create mode 100644 my/core/tests/test_cachew.py create mode 100644 my/core/tests/test_common.py delete mode 100644 tests/misc.py diff --git a/my/core/tests/auto_stats.py b/my/core/tests/auto_stats.py index 2c09b5b..bf4764c 100644 --- a/my/core/tests/auto_stats.py +++ b/my/core/tests/auto_stats.py @@ -1,6 +1,7 @@ """ Helper 'module' for test_guess_stats """ + from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime, timedelta diff --git a/my/core/tests/common.py b/my/core/tests/common.py new file mode 100644 index 0000000..d6fb71e --- /dev/null +++ b/my/core/tests/common.py @@ -0,0 +1,12 @@ +import os + +import pytest + + +V = 'HPI_TESTS_USES_OPTIONAL_DEPS' + +# TODO use it for serialize tests that are using simplejson/orjson? +skip_if_uses_optional_deps = pytest.mark.skipif( + V not in os.environ, + reason=f'test only works when optional dependencies are installed. Set env variable {V}=true to override.', +) diff --git a/my/core/tests/test_cachew.py b/my/core/tests/test_cachew.py new file mode 100644 index 0000000..86344fd --- /dev/null +++ b/my/core/tests/test_cachew.py @@ -0,0 +1,53 @@ +from .common import skip_if_uses_optional_deps as pytestmark + +from typing import List + +# TODO ugh, this is very messy.. need to sort out config overriding here + + +def test_cachew() -> None: + from cachew import settings + + settings.ENABLE = True # by default it's off in tests (see conftest.py) + + from my.core.common import mcachew + + called = 0 + + # TODO ugh. 
need doublewrap or something to avoid having to pass parens + @mcachew() + def cf() -> List[int]: + nonlocal called + called += 1 + return [1, 2, 3] + + list(cf()) + cc = called + # todo ugh. how to clean cache? + # assert called == 1 # precondition, to avoid turdes from previous tests + + assert list(cf()) == [1, 2, 3] + assert called == cc + + +def test_cachew_dir_none() -> None: + from cachew import settings + + settings.ENABLE = True # by default it's off in tests (see conftest.py) + + from my.core.cachew import cache_dir + from my.core.common import mcachew + from my.core.core_config import _reset_config as reset + + with reset() as cc: + cc.cache_dir = None + called = 0 + + @mcachew(cache_path=cache_dir() / 'ctest') + def cf() -> List[int]: + nonlocal called + called += 1 + return [called, called, called] + + assert list(cf()) == [1, 1, 1] + assert list(cf()) == [2, 2, 2] diff --git a/my/core/tests/test_common.py b/my/core/tests/test_common.py new file mode 100644 index 0000000..a2019e4 --- /dev/null +++ b/my/core/tests/test_common.py @@ -0,0 +1,54 @@ +from typing import Iterable, List +import warnings + +from ..common import ( + warn_if_empty, + _warn_iterable, +) + + +def test_warn_if_empty() -> None: + @warn_if_empty + def nonempty() -> Iterable[str]: + yield 'a' + yield 'aba' + + @warn_if_empty + def empty() -> List[int]: + return [] + + # should be rejected by mypy! + # todo how to actually test it? + # @warn_if_empty + # def baad() -> float: + # return 0.00 + + # reveal_type(nonempty) + # reveal_type(empty) + + with warnings.catch_warnings(record=True) as w: + assert list(nonempty()) == ['a', 'aba'] + assert len(w) == 0 + + eee = empty() + assert eee == [] + assert len(w) == 1 + + +def test_warn_iterable() -> None: + i1: List[str] = ['a', 'b'] + i2: Iterable[int] = iter([1, 2, 3]) + # reveal_type(i1) + # reveal_type(i2) + x1 = _warn_iterable(i1) + x2 = _warn_iterable(i2) + # vvvv this should be flagged by mypy + # _warn_iterable(123) + # reveal_type(x1) + # reveal_type(x2) + with warnings.catch_warnings(record=True) as w: + assert x1 is i1 # should be unchanged! + assert len(w) == 0 + + assert list(x2) == [1, 2, 3] + assert len(w) == 0 diff --git a/my/tests/common.py b/my/tests/common.py index c8d88ff..e3060e1 100644 --- a/my/tests/common.py +++ b/my/tests/common.py @@ -9,7 +9,7 @@ V = 'HPI_TESTS_KARLICOSS' skip_if_not_karlicoss = pytest.mark.skipif( V not in os.environ, - reason=f'test only works on @karlicoss data for now. Set evn variable {V}=true to override.', + reason=f'test only works on @karlicoss data for now. Set env variable {V}=true to override.', ) diff --git a/tests/misc.py b/tests/misc.py deleted file mode 100644 index 7e666d7..0000000 --- a/tests/misc.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Iterable, List -import warnings -from my.core import warn_if_empty - - -def test_warn_if_empty() -> None: - @warn_if_empty - def nonempty() -> Iterable[str]: - yield 'a' - yield 'aba' - - @warn_if_empty - def empty() -> List[int]: - return [] - - # should be rejected by mypy! - # todo how to actually test it? 
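For context, warn_if_empty itself is not touched by this patch -- only its tests move from tests/misc.py into my.core.tests. A rough sketch of what a decorator like this could look like (hypothetical, not the actual my.core implementation; presumably the real one also wraps lazy iterators so they can be checked on consumption):

import warnings
from functools import wraps

def warn_if_empty_sketch(fn):
    # warn when a data provider returns nothing, e.g. due to a misconfigured export path
    @wraps(fn)
    def wrapped(*args, **kwargs):
        res = fn(*args, **kwargs)
        if isinstance(res, (list, tuple)) and len(res) == 0:
            warnings.warn(f'{fn.__name__} returned no data, this might be an error')
        return res
    return wrapped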
- # @warn_if_empty - # def baad() -> float: - # return 0.00 - - # reveal_type(nonempty) - # reveal_type(empty) - - with warnings.catch_warnings(record=True) as w: - assert list(nonempty()) == ['a', 'aba'] - assert len(w) == 0 - - eee = empty() - assert eee == [] - assert len(w) == 1 - - -def test_warn_iterable() -> None: - from my.core.common import _warn_iterable - i1: List[str] = ['a', 'b'] - i2: Iterable[int] = iter([1, 2, 3]) - # reveal_type(i1) - # reveal_type(i2) - x1 = _warn_iterable(i1) - x2 = _warn_iterable(i2) - # vvvv this should be flagged by mypy - # _warn_iterable(123) - # reveal_type(x1) - # reveal_type(x2) - with warnings.catch_warnings(record=True) as w: - assert x1 is i1 # should be unchanged! - assert len(w) == 0 - - assert list(x2) == [1, 2, 3] - assert len(w) == 0 - - -def test_cachew() -> None: - from cachew import settings - settings.ENABLE = True # by default it's off in tests (see conftest.py) - - from my.core.cachew import cache_dir - from my.core.common import mcachew - - called = 0 - # FIXME ugh. need doublewrap or something - @mcachew() - def cf() -> List[int]: - nonlocal called - called += 1 - return [1, 2, 3] - - list(cf()) - cc = called - # todo ugh. how to clean cache? - # assert called == 1 # precondition, to avoid turdes from previous tests - - assert list(cf()) == [1, 2, 3] - assert called == cc - - -def test_cachew_dir_none() -> None: - from cachew import settings - settings.ENABLE = True # by default it's off in tests (see conftest.py) - - from my.core.cachew import cache_dir - from my.core.common import mcachew - from my.core.core_config import _reset_config as reset - with reset() as cc: - cc.cache_dir = None - called = 0 - @mcachew(cache_path=cache_dir() / 'ctest') - def cf() -> List[int]: - nonlocal called - called += 1 - return [called, called, called] - assert list(cf()) == [1, 1, 1] - assert list(cf()) == [2, 2, 2] diff --git a/tox.ini b/tox.ini index 02acdfc..248469e 100644 --- a/tox.ini +++ b/tox.ini @@ -48,9 +48,11 @@ commands = # todo maybe also have core tests and misc tests? since ideally want them without dependencies [testenv:tests-all] -# deliberately set to nonexistent path to check the fallback logic -# TODO not sure if need it? -setenv = MY_CONFIG = nonexistent +setenv = + # deliberately set to nonexistent path to check the fallback logic + # TODO not sure if need it? + MY_CONFIG=nonexistent + HPI_TESTS_USES_OPTIONAL_DEPS=true commands = {envpython} -m pip install --use-pep517 -e .[testing] @@ -81,7 +83,7 @@ commands = # importlib is the new suggested import-mode # without it test package names end up as core.tests.* instead of my.core.tests.* --import-mode=importlib \ - --pyargs {[testenv]package_name}.tests \ + --pyargs {[testenv]package_name}.core {[testenv]package_name}.tests \ {posargs} {envpython} -m pytest tests \ From c69a0b43baeffb1fae5f35b5914f07040d1c30b6 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 10 Aug 2024 18:35:30 +0300 Subject: [PATCH 229/302] my.vk.favorites: some minor cleanup --- my/vk/favorites.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/my/vk/favorites.py b/my/vk/favorites.py index eb1a89b..9caae6d 100644 --- a/my/vk/favorites.py +++ b/my/vk/favorites.py @@ -1,29 +1,27 @@ -# todo: uses my private export script? 
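The substantive change in the vk diff below is replacing naive UTC timestamps with timezone-aware ones. This is plain stdlib behaviour, nothing repo-specific:

from datetime import datetime, timezone

naive = datetime.utcfromtimestamp(123)                # tzinfo is None; deprecated since python 3.12
aware = datetime.fromtimestamp(123, tz=timezone.utc)  # tzinfo is timezone.utc
assert naive.tzinfo is None
assert aware.replace(tzinfo=None) == naive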
-from datetime import datetime
+# todo: uses my private export script?
+from dataclasses import dataclass
+from datetime import datetime, timezone
 import json
-from typing import NamedTuple, Iterable, Sequence, Optional
+from typing import Iterator, Iterable, Optional
+
+from my.core import Json, datetime_aware, stat, Stats
+from my.core.error import Res
 
 from my.config import vk as config  # type: ignore[attr-defined]
 
 
-class Favorite(NamedTuple):
-    dt: datetime
+@dataclass
+class Favorite:
+    dt: datetime_aware
     title: str
     url: Optional[str]
     text: str
 
 
-from ..core import Json
-from ..core.error import Res
-
-
 skip = (
     'graffiti',
     'poll',
-
-    # TODO could be useful..
-    'note',
+    'note',  # TODO could be useful..
     'doc',
     'audio',
     'photo',
@@ -32,10 +30,11 @@ skip = (
     'page',
 )
 
+
 def parse_fav(j: Json) -> Favorite:
     # TODO copy_history??
     url = None
-    title = '' # TODO ???
+    title = ''  # TODO ???
     atts = j.get('attachments', [])
     for a in atts:
         if any(k in a for k in skip):
@@ -47,14 +46,14 @@ def parse_fav(j: Json) -> Favorite:
 
     # TODO would be nice to include user
     return Favorite(
-        dt=datetime.utcfromtimestamp(j['date']),
+        dt=datetime.fromtimestamp(j['date'], tz=timezone.utc),
         title=title,
         url=url,
         text=j['text'],
    )
 
 
-def _iter_favs() -> Iterable[Res]:
+def _iter_favs() -> Iterator[Res]:
     jj = json.loads(config.favs_file.read_text())
     for j in jj:
         try:
@@ -65,7 +64,7 @@ def _iter_favs() -> Iterable[Res]:
         yield ex
 
 
-def favorites() -> Sequence[Res]:
+def favorites() -> Iterable[Res]:
     it = _iter_favs()
     # trick to sort errors along with the actual objects
     # TODO wonder if there is a shorter way?
@@ -76,12 +75,11 @@ def favorites() -> Sequence[Res]:
     for i, f in enumerate(favs):
         if not isinstance(f, Exception):
             prev = f.dt
-        keys.append((prev, i)) # include index to resolve ties
+        keys.append((prev, i))  # include index to resolve ties
     sorted_items = [p[1] for p in sorted(zip(keys, favs))]
     #
     return sorted_items
 
 
-def stats():
-    from ..core import stat
+def stats() -> Stats:
     return stat(favorites)

From 069264ce52046d0feeee9fd46850339354256900 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Sat, 10 Aug 2024 19:28:28 +0300
Subject: [PATCH 230/302] core.common: get rid of deprecated utcfromtimestamp

---
 my/core/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/my/core/common.py b/my/core/common.py
index f7bb010..98dbacb 100644
--- a/my/core/common.py
+++ b/my/core/common.py
@@ -511,10 +511,10 @@ def _stat_iterable(it: Iterable[C], quick: bool = False) -> Any:
 
 
 def test_stat_iterable() -> None:
-    from datetime import datetime, timedelta
+    from datetime import datetime, timedelta, timezone
     from typing import NamedTuple
 
-    dd = datetime.utcfromtimestamp(123)
+    dd = datetime.fromtimestamp(123, tz=timezone.utc)
     day = timedelta(days=3)
 
     X = NamedTuple('X', [('x', int), ('d', datetime)])

From 1e1e8d8494c5296d276c1191d8581f6d2df3833c Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Sat, 10 Aug 2024 23:42:32 +0300
Subject: [PATCH 231/302] my.topcoder: get rid of kjson in favor of using builtin dict methods

---
 my/core/compat.py |   6 +++
 my/topcoder.py    | 104 ++++++++++++++++++++++++++--------------------
 2 files changed, 65 insertions(+), 45 deletions(-)

diff --git a/my/core/compat.py b/my/core/compat.py
index 9cdea27..e984695 100644
--- a/my/core/compat.py
+++ b/my/core/compat.py
@@ -115,3 +115,9 @@ def test_fromisoformat() -> None:
     # assert isoparse('2017-07-18T18:59:38.21731Z') == datetime(
     #     2017, 7, 18, 18, 59, 38, 217310, timezone.utc,
     # )
+
+
+if sys.version_info[:2] >= (3,
10): + from types import NoneType +else: + NoneType = type(None) diff --git a/my/topcoder.py b/my/topcoder.py index 7432379..d9631dc 100644 --- a/my/topcoder.py +++ b/my/topcoder.py @@ -1,77 +1,91 @@ from my.config import topcoder as config # type: ignore[attr-defined] -from datetime import datetime +from dataclasses import dataclass from functools import cached_property import json -from typing import NamedTuple, Iterator +from pathlib import Path +from typing import Iterator, Sequence + +from my.core import get_files, Res, datetime_aware +from my.core.compat import fromisoformat, NoneType -from my.core import get_files, Res, Json -from my.core.konsume import zoom, wrap, ignore +def inputs() -> Sequence[Path]: + return get_files(config.export_path) -def _get_latest() -> Json: - pp = max(get_files(config.export_path)) - return json.loads(pp.read_text()) - - -class Competition(NamedTuple): +@dataclass +class Competition: contest_id: str contest: str percentile: float - dates: str + date_str: str @cached_property def uid(self) -> str: return self.contest_id - def __hash__(self): - return hash(self.contest_id) - @cached_property - def when(self) -> datetime: - return datetime.strptime(self.dates, '%Y-%m-%dT%H:%M:%S.%fZ') + def when(self) -> datetime_aware: + return fromisoformat(self.date_str) @cached_property def summary(self) -> str: return f'participated in {self.contest}: {self.percentile:.0f}' @classmethod - def make(cls, json) -> Iterator[Res['Competition']]: - ignore(json, 'rating', 'placement') - cid = json['challengeId'].zoom().value - cname = json['challengeName'].zoom().value - percentile = json['percentile'].zoom().value - dates = json['date'].zoom().value + def make(cls, j) -> Iterator[Res['Competition']]: + assert isinstance(j.pop('rating'), float) + assert isinstance(j.pop('placement'), int) + + cid = j.pop('challengeId') + cname = j.pop('challengeName') + percentile = j.pop('percentile') + date_str = j.pop('date') + yield cls( contest_id=cid, contest=cname, percentile=percentile, - dates=dates, + date_str=date_str, ) +def _parse_one(p: Path) -> Iterator[Res[Competition]]: + j = json.loads(p.read_text()) + + # this is kind of an experiment to parse it exhaustively, making sure we don't miss any data + assert isinstance(j.pop('version'), str) + assert isinstance(j.pop('id'), str) + [j] = j.values() # zoom in + + assert j.pop('success') is True, j + assert j.pop('status') == 200, j + assert j.pop('metadata') is None, j + [j] = j.values() # zoom in + + # todo hmm, potentially error handling could be nicer since .pop just reports key error + # also by the time error is reported, key is already removed? + for k in ['handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy']: + # check it's primitive + assert isinstance(j.pop(k), (str, bool, float, int, NoneType)), k + + j.pop('DEVELOP') # TODO how to handle it? 
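The idiom in this parser -- pop every key you recognize, 'zoom' into single-entry dicts, and finally assert the mapping is empty -- makes unexpected fields in the export fail loudly instead of being silently ignored. The same pattern on a toy payload (hypothetical data, just to show the moving parts):

j = {'id': 'x', 'result': {'content': {'score': 123}}}
assert isinstance(j.pop('id'), str)
[j] = j.values()   # 'zoom in': fails unless exactly one key remains
[j] = j.values()
score = j.pop('score')
assert len(j) == 0, j  # anything left over means the format changed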
+    [j] = j.values()  # zoom in, DATA_SCIENCE section
+
+    mm = j.pop('MARATHON_MATCH')
+    [mm] = mm.values()  # zoom into history
+
+    srm = j.pop('SRM')
+    [srm] = srm.values()  # zoom into history
+
+    assert len(j) == 0, j
+
+    for c in mm + srm:
+        yield from Competition.make(j=c)
+
+
 def data() -> Iterator[Res[Competition]]:
-    with wrap(_get_latest()) as j:
-        ignore(j, 'id', 'version')
-
-        res = j['result'].zoom()  # type: ignore[index]
-        ignore(res, 'success', 'status', 'metadata')
-
-        cont = res['content'].zoom()
-        ignore(cont, 'handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy')
-
-        cont['DEVELOP'].ignore()  # TODO handle it??
-        ds = cont['DATA_SCIENCE'].zoom()
-
-        mar, srm = zoom(ds, 'MARATHON_MATCH', 'SRM')
-
-        mar = mar['history'].zoom()
-        srm = srm['history'].zoom()
-        # TODO right, I guess I could rely on pylint for unused variables??
-
-        for c in mar + srm:
-            yield from Competition.make(json=c)
-            c.consume()
-
+    *_, last = inputs()
+    return _parse_one(last)

From 1317914bfff217e747edcc93f8167b2bad14f3fb Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Mon, 12 Aug 2024 14:56:18 +0300
Subject: [PATCH 232/302] general: add 'destructive parsing' (kinda what we
 were doing in my.core.konsume) to my.experimental

also some cleanup for my.codeforces and my.topcoder
---
 my/codeforces.py                       | 126 ++++++++++++-------------
 my/core/konsume.py                     |  31 ++++++
 my/experimental/destructive_parsing.py |  60 ++++++++++++
 my/topcoder.py                         |  63 +++++++------
 4 files changed, 183 insertions(+), 97 deletions(-)
 create mode 100644 my/experimental/destructive_parsing.py

diff --git a/my/codeforces.py b/my/codeforces.py
index a97c360..7b37ec6 100644
--- a/my/codeforces.py
+++ b/my/codeforces.py
@@ -1,86 +1,80 @@
-from my.config import codeforces as config  # type: ignore[attr-defined]
-
-
+from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import cached_property
 import json
-from typing import NamedTuple, Dict, Iterator
+from pathlib import Path
+from typing import Dict, Iterator, Sequence
+
+from my.core import get_files, Res, datetime_aware
+from my.core.common import assert_never
+
+from my.config import codeforces as config  # type: ignore[attr-defined]
+
+
+def inputs() -> Sequence[Path]:
+    return get_files(config.export_path)
 
-Cid = int
-
-class Contest(NamedTuple):
-    cid: Cid
-    when: datetime
-
-    @classmethod
-    def make(cls, j) -> 'Contest':
-        return cls(
-            cid=j['id'],
-            when=datetime.fromtimestamp(j['startTimeSeconds'], tz=timezone.utc),
-        )
-
-Cmap = Dict[Cid, Contest]
+ContestId = int
 
 
+@dataclass
+class Contest:
+    contest_id: ContestId
+    when: datetime_aware
+    name: str
 
-def get_contests() -> Cmap:
-    last = max(get_files(config.export_path, 'allcontests*.json'))
-    j = json.loads(last.read_text())
-    d = {}
-    for c in j['result']:
-        cc = Contest.make(c)
-        d[cc.cid] = cc
-    return d
 
+@dataclass
+class Competition:
+    contest: Contest
+    old_rating: int
+    new_rating: int
 
-class Competition(NamedTuple):
-    contest_id: Cid
-    contest: str
-    cmap: Cmap
-
     @cached_property
-    def uid(self) -> Cid:
-        return self.contest_id
+    def when(self) -> datetime_aware:
+        return self.contest.when
 
-    def __hash__(self):
-        return hash(self.contest_id)
-
-    @cached_property
-    def when(self) -> datetime:
-        return self.cmap[self.uid].when
 
+# todo not sure if parser is the best name? hmm
+class Parser:
+    def __init__(self, *, inputs: Sequence[Path]) -> None:
+        self.inputs = inputs
+        self.contests: Dict[ContestId, Contest] = {}
+
+    def _parse_allcontests(self, p: Path) -> Iterator[Contest]:
+        j = json.loads(p.read_text())
+        for c in j['result']:
+            yield Contest(
+                contest_id=c['id'],
+                when=datetime.fromtimestamp(c['startTimeSeconds'], tz=timezone.utc),
+                name=c['name'],
+            )
+
+    def _parse_competitions(self, p: Path) -> Iterator[Competition]:
+        j = json.loads(p.read_text())
+        for c in j['result']:
+            contest_id = c['contestId']
+            contest = self.contests[contest_id]
+            yield Competition(
+                contest=contest,
+                old_rating=c['oldRating'],
+                new_rating=c['newRating'],
+            )
+
+    def parse(self) -> Iterator[Res[Competition]]:
+        for path in inputs():
+            if 'allcontests' in path.name:
+                # these contain information about all CF contests along with useful metadata
+                for contest in self._parse_allcontests(path):
+                    # TODO some method to assert on mismatch if it exists? not sure
+                    self.contests[contest.contest_id] = contest
+            elif 'codeforces' in path.name:
+                # these contain only contests the user participated in
+                yield from self._parse_competitions(path)
+            else:
+                raise RuntimeError("shouldn't happen")  # TODO switch to compat.assert_never
 
 
 def data() -> Iterator[Res[Competition]]:
-    cmap = get_contests()
-    last = max(get_files(config.export_path, 'codeforces*.json'))
-
-    with wrap(json.loads(last.read_text())) as j:
-        j['status'].ignore()  # type: ignore[index]
-        res = j['result'].zoom()  # type: ignore[index]
-
-        for c in list(res):  # TODO maybe we want 'iter' method??
-            ignore(c, 'handle', 'ratingUpdateTimeSeconds')
-            yield from Competition.make(cmap=cmap, json=c)
-            c.consume()
-            # TODO maybe if they are all empty, no need to consume??
+    return Parser(inputs=inputs()).parse()
diff --git a/my/core/konsume.py b/my/core/konsume.py
index 588bfe1..10bea8d 100644
--- a/my/core/konsume.py
+++ b/my/core/konsume.py
@@ -209,3 +209,34 @@ def test_zoom() -> None:
 
 
 # TODO type check this...
+
+# TODO feels like the whole thing kind of unnecessarily complex
+# - cons:
+#   - in most cases this is not even needed? who cares if we miss a few attributes?
+# - pro: on the other hand it could be interesting to know about new attributes in data,
+#   and without this kind of processing we wouldn't even know
+# alternatives
+# - manually process data
+#   e.g. use asserts, dict.pop and dict.values() methods to unpack things
+#   - pros:
+#     - very simple, since uses built in syntax
+#     - very performant, as fast as it gets
+#     - very flexible, easy to adjust behaviour
+#   - cons:
+#     - can forget to assert about extra entities etc, so error prone
+#     - if we do something like =assert j.pop('status') == 200, j=, by the time assert happens we already popped item -- makes error handling harder
+#     - a bit verbose.. so probably requires some helper functions though (could be much leaner than current konsume though)
+#     - if we assert, then terminates parsing too early, if we're defensive then inflates the code a lot with if statements
+#     - TODO perhaps combine warnings somehow or at least only emit once per module?
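For comparison with the pop/assert style weighed in these notes: the match operator mentioned below (python 3.10+) can express the same unpack-and-check-leftovers logic declaratively. A rough sketch only -- nothing in this repo currently uses it:

def parse_result(j: dict):
    match j:
        case {'success': True, 'status': 200, 'content': content, **rest}:
            # **rest captures any keys not listed, so leftovers can still be checked
            assert not rest, rest
            return content
        case _:
            raise RuntimeError(f'unexpected shape: {j}')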
+# - hmm actually tbh if we carefully go through everything and don't make copies, then only requires one assert at the very end? +# - TODO this is kinda useful? https://discuss.python.org/t/syntax-for-dictionnary-unpacking-to-variables/18718 +# operator.itemgetter? +# - TODO can use match operator in python for this? quite nice actually! and allows for dynamic behaviour +# only from 3.10 tho, and gonna be tricky to do dynamic defensive behaviour with this +# - TODO in a sense, blenser already would hint if some meaningful fields aren't being processed? only if they are changing though +# - define a "schema" for data, then just recursively match data against the schema? +# possibly pydantic already does something like that? not sure about performance though +# pros: +# - much simpler to extend and understand what's going on +# cons: +# - more rigid, so it becomes tricky to do dynamic stuff (e.g. if schema actually changes) diff --git a/my/experimental/destructive_parsing.py b/my/experimental/destructive_parsing.py new file mode 100644 index 0000000..3fc739c --- /dev/null +++ b/my/experimental/destructive_parsing.py @@ -0,0 +1,60 @@ +from dataclasses import dataclass +from typing import Any, Iterator, List, Tuple + +from my.core import assert_never +from my.core.compat import NoneType + + +# TODO Popper? not sure +@dataclass +class Helper: + manager: 'Manager' + item: Any # todo realistically, list or dict? could at least type as indexable or something + path: Tuple[str, ...] + + def pop_if_primitive(self, *keys: str) -> None: + """ + The idea that primitive TODO + """ + item = self.item + for k in keys: + v = item[k] + if isinstance(v, (str, bool, float, int, NoneType)): + item.pop(k) # todo kinda unfortunate to get dict item twice.. but not sure if can avoid? + + def check(self, key: str, expected: Any) -> None: + actual = self.item.pop(key) + assert actual == expected, (key, actual, expected) + + def zoom(self, key: str) -> 'Helper': + return self.manager.helper(item=self.item.pop(key), path=self.path + (key,)) + + +def is_empty(x) -> bool: + if isinstance(x, dict): + return len(x) == 0 + elif isinstance(x, list): + return all(map(is_empty, x)) + else: + assert_never(x) + + +class Manager: + def __init__(self) -> None: + self.helpers: List[Helper] = [] + + def helper(self, item: Any, *, path: Tuple[str, ...] = ()) -> Helper: + res = Helper(manager=self, item=item, path=path) + self.helpers.append(res) + return res + + def check(self) -> Iterator[Exception]: + remaining = [] + for h in self.helpers: + # TODO recursively check it's primitive? 
+            if is_empty(h.item):
+                continue
+            remaining.append((h.path, h.item))
+        if len(remaining) == 0:
+            return
+        yield RuntimeError(f'Unparsed items remaining: {remaining}')
diff --git a/my/topcoder.py b/my/topcoder.py
index d9631dc..8e39252 100644
--- a/my/topcoder.py
+++ b/my/topcoder.py
@@ -1,6 +1,3 @@
-from my.config import topcoder as config  # type: ignore[attr-defined]
-
-
 from dataclasses import dataclass
 from functools import cached_property
 import json
@@ -8,7 +5,10 @@
 from pathlib import Path
 from typing import Iterator, Sequence
 
 from my.core import get_files, Res, datetime_aware
-from my.core.compat import fromisoformat, NoneType
+from my.core.compat import fromisoformat
+from my.experimental.destructive_parsing import Manager
+
+from my.config import topcoder as config  # type: ignore[attr-defined]
 
 
 def inputs() -> Sequence[Path]:
@@ -30,10 +30,6 @@ class Competition:
     def when(self) -> datetime_aware:
         return fromisoformat(self.date_str)
 
-    @cached_property
-    def summary(self) -> str:
-        return f'participated in {self.contest}: {self.percentile:.0f}'
-
     @classmethod
     def make(cls, j) -> Iterator[Res['Competition']]:
         assert isinstance(j.pop('rating'), float)
@@ -53,38 +49,43 @@ class Competition:
 
 def _parse_one(p: Path) -> Iterator[Res[Competition]]:
-    j = json.loads(p.read_text())
+    d = json.loads(p.read_text())
 
-    # this is kind of an experiment to parse it exhaustively, making sure we don't miss any data
-    assert isinstance(j.pop('version'), str)
-    assert isinstance(j.pop('id'), str)
-    [j] = j.values()  # zoom in
+    # TODO manager should be a context manager?
+    m = Manager()
 
-    assert j.pop('success') is True, j
-    assert j.pop('status') == 200, j
-    assert j.pop('metadata') is None, j
-    [j] = j.values()  # zoom in
+    h = m.helper(d)
+    h.pop_if_primitive('version', 'id')
 
-    # todo hmm, potentially error handling could be nicer since .pop just reports key error
-    # also by the time error is reported, key is already removed?
-    for k in ['handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy']:
-        # check it's primitive
-        assert isinstance(j.pop(k), (str, bool, float, int, NoneType)), k
+    h = h.zoom('result')
+    h.check('success', True)
+    h.check('status', 200)
+    h.pop_if_primitive('metadata')
 
-    j.pop('DEVELOP')  # TODO how to handle it?
-    [j] = j.values()  # zoom in, DATA_SCIENCE section
+    h = h.zoom('content')
+    h.pop_if_primitive('handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy')
 
-    mm = j.pop('MARATHON_MATCH')
-    [mm] = mm.values()  # zoom into history
+    # NOTE at the moment it's empty for me, but it will result in an error later if there is some data here
+    h.zoom('DEVELOP').zoom('subTracks')
 
-    srm = j.pop('SRM')
-    [srm] = srm.values()  # zoom into history
+    h = h.zoom('DATA_SCIENCE')
+    # TODO multi zoom? not sure which axis, e.g.
+    # zoom('SRM', 'history') or zoom('SRM', 'MARATHON_MATCH')
+    # or zoom(('SRM', 'history'), ('MARATHON_MATCH', 'history'))
+    srms = h.zoom('SRM').zoom('history')
+    mms = h.zoom('MARATHON_MATCH').zoom('history')
 
-    assert len(j) == 0, j
-
-    for c in mm + srm:
+    for c in srms.item + mms.item:
+        # NOTE: so here we are actually just using pure dicts in .make method
+        # this is kinda ok since it will be checked by parent Helper
+        # but also expects cooperation from .make method (e.g. popping items from the dict)
+        # could also wrap in helper and pass to .make .. not sure
+        # an argument could be made that .make isn't really a class method..
+ # it's pretty specific to this parser onl yield from Competition.make(j=c) + yield from m.check() + def data() -> Iterator[Res[Competition]]: *_, last = inputs() From a7439c7846868350d1d6473a891a0114e3aafeb4 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 12 Aug 2024 15:45:59 +0300 Subject: [PATCH 233/302] general: move assert_never to my.core.compat as it's in stdlib from 3.11 rely on typing-extensions for fallback introducing typing-extensions dependency without fallback, should be ok since it's in the top 10 of popular packages --- my/bumble/android.py | 3 ++- my/codeforces.py | 3 +-- my/core/__init__.py | 6 +++--- my/core/common.py | 15 ++++++++++----- my/core/compat.py | 12 ++++++++++++ my/core/sqlite.py | 3 ++- my/experimental/destructive_parsing.py | 3 +-- my/fbmessenger/android.py | 3 ++- my/tinder/android.py | 3 ++- setup.py | 1 + 10 files changed, 36 insertions(+), 16 deletions(-) diff --git a/my/bumble/android.py b/my/bumble/android.py index 3a159da..54a0441 100644 --- a/my/bumble/android.py +++ b/my/bumble/android.py @@ -54,9 +54,10 @@ class Message(_BaseMessage): import json from typing import Union -from ..core import Res, assert_never +from ..core import Res import sqlite3 from ..core.sqlite import sqlite_connect_immutable, select +from my.core.compat import assert_never EntitiesRes = Res[Union[Person, _Message]] diff --git a/my/codeforces.py b/my/codeforces.py index 7b37ec6..f2d150a 100644 --- a/my/codeforces.py +++ b/my/codeforces.py @@ -6,7 +6,6 @@ from pathlib import Path from typing import Dict, Iterator, Sequence from my.core import get_files, Res, datetime_aware -from my.core.common import assert_never from my.config import codeforces as config # type: ignore[attr-defined] @@ -73,7 +72,7 @@ class Parser: # these contain only contests the user participated in yield from self._parse_competitions(path) else: - raise RuntimeError("shouldn't happen") # TODO switch to compat.assert_never + raise RuntimeError(f"shouldn't happen: {path.name}") def data() -> Iterator[Res[Competition]]: diff --git a/my/core/__init__.py b/my/core/__init__.py index d753760..c79e36e 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -4,7 +4,7 @@ from .common import Json from .common import warn_if_empty from .common import stat, Stats from .common import datetime_naive, datetime_aware -from .common import assert_never +from .compat import assert_never from .cfg import make_config from .error import Res, unwrap @@ -26,7 +26,7 @@ __all__ = [ 'warn_if_empty', 'stat', 'Stats', 'datetime_aware', 'datetime_naive', - 'assert_never', + 'assert_never', # TODO maybe deprecate from use in my.core? will be in stdlib soon 'make_config', @@ -34,7 +34,7 @@ __all__ = [ 'Res', 'unwrap', - 'dataclass', 'Path', + 'dataclass', 'Path', # TODO deprecate these from use in my.core ] diff --git a/my/core/common.py b/my/core/common.py index 98dbacb..9874bed 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -27,7 +27,10 @@ from typing import ( get_origin, ) import warnings + from . import warnings as core_warnings +from . import compat +from .compat import deprecated # some helper functions PathIsh = Union[Path, str] @@ -633,11 +636,6 @@ class DummyExecutor(Executor): self._shutdown = True -# see https://hakibenita.com/python-mypy-exhaustive-checking#exhaustiveness-checking -def assert_never(value: NoReturn) -> NoReturn: - assert False, f'Unhandled value: {value} ({type(value).__name__})' - - def _check_all_hashable(fun): # TODO ok, take callable? 
hints = get_type_hints(fun) @@ -693,6 +691,13 @@ def unique_everseen( ## legacy imports, keeping them here for backwards compatibility +## hiding behind TYPE_CHECKING so it works in runtime +## in principle, warnings.deprecated decorator should cooperate with mypy, but doesn't look like it works atm? +## perhaps it doesn't work when it's used from typing_extensions +if not TYPE_CHECKING: + assert_never = deprecated('use my.core.compat.assert_never instead')(compat.assert_never) + +# TODO wrap in deprecated decorator as well? from functools import cached_property as cproperty from typing import Literal from .cachew import mcachew diff --git a/my/core/compat.py b/my/core/compat.py index e984695..2c1687d 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -121,3 +121,15 @@ if sys.version_info[:2] >= (3, 10): from types import NoneType else: NoneType = type(None) + + +if sys.version_info[:2] >= (3, 13): + from warnings import deprecated +else: + from typing_extensions import deprecated + + +if sys.version_info[:2] >= (3, 11): + from typing import assert_never +else: + from typing_extensions import assert_never diff --git a/my/core/sqlite.py b/my/core/sqlite.py index e04f6fc..2580e15 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -9,7 +9,8 @@ from tempfile import TemporaryDirectory from typing import Tuple, Any, Iterator, Callable, Optional, Union, Literal -from .common import PathIsh, assert_never +from .common import PathIsh +from .compat import assert_never def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection: diff --git a/my/experimental/destructive_parsing.py b/my/experimental/destructive_parsing.py index 3fc739c..05c5920 100644 --- a/my/experimental/destructive_parsing.py +++ b/my/experimental/destructive_parsing.py @@ -1,8 +1,7 @@ from dataclasses import dataclass from typing import Any, Iterator, List, Tuple -from my.core import assert_never -from my.core.compat import NoneType +from my.core.compat import NoneType, assert_never # TODO Popper? not sure diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index 8a4bf4c..bc06114 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -10,8 +10,9 @@ from pathlib import Path import sqlite3 from typing import Iterator, Sequence, Optional, Dict, Union, List -from my.core import get_files, Paths, datetime_aware, Res, assert_never, LazyLogger, make_config +from my.core import get_files, Paths, datetime_aware, Res, LazyLogger, make_config from my.core.common import unique_everseen +from my.core.compat import assert_never from my.core.error import echain from my.core.sqlite import sqlite_connection diff --git a/my/tinder/android.py b/my/tinder/android.py index 56ee1cb..d9b256b 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -11,8 +11,9 @@ from pathlib import Path import sqlite3 from typing import Sequence, Iterator, Union, Dict, List, Mapping -from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware, make_logger +from my.core import Paths, get_files, Res, stat, Stats, datetime_aware, make_logger from my.core.common import unique_everseen +from my.core.compat import assert_never from my.core.error import echain from my.core.sqlite import sqlite_connection import my.config diff --git a/setup.py b/setup.py index ab96616..cf4b79f 100644 --- a/setup.py +++ b/setup.py @@ -5,6 +5,7 @@ from setuptools import setup, find_namespace_packages # type: ignore INSTALL_REQUIRES = [ 'pytz', # even though it's not needed by the core, it's so common anyway... 
+ 'typing-extensions', # one of the most common pypi packages, ok to depend for core 'appdirs', # very common, and makes it portable 'more-itertools', # it's just too useful and very common anyway 'decorator' , # less pain in writing correct decorators. very mature and stable, so worth keeping in core From 973c4205df8908ec47a0139780d947ca63368936 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 12 Aug 2024 16:46:21 +0300 Subject: [PATCH 234/302] core: cleanup deprecations, exclude from type checking and show runtime warnings among affected things: - core.common.assert_never - core.common.cproperty - core.common.isoparse - core.common.mcachew - core.common.the - core.common.tzdatetime - core.compat.sqlite_backup --- my/bluemaestro.py | 2 +- my/browser/export.py | 2 +- my/coding/commits.py | 3 +- my/core/common.py | 62 +++++++++++++----------- my/core/compat.py | 67 ++++++++++++++------------ my/core/tests/test_cachew.py | 4 +- my/core/tests/test_get_files.py | 4 +- my/emfit/__init__.py | 3 +- my/github/ghexport.py | 5 +- my/google/takeout/parser.py | 3 +- my/lastfm.py | 3 +- my/location/google.py | 4 +- my/location/google_takeout.py | 7 ++- my/location/google_takeout_semantic.py | 7 ++- my/location/gpslogger.py | 3 +- my/orgmode.py | 3 +- my/pdfs.py | 3 +- my/photos/main.py | 8 +-- my/reddit/rexport.py | 2 +- my/rescuetime.py | 2 +- my/rss/feedbin.py | 8 ++- my/time/tz/common.py | 10 ++-- my/time/tz/main.py | 4 +- my/time/tz/via_location.py | 2 +- 24 files changed, 118 insertions(+), 103 deletions(-) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index 8f05aac..3e25cae 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -21,7 +21,7 @@ from my.core import ( Stats, influxdb, ) -from my.core.common import mcachew +from my.core.cachew import mcachew from my.core.error import unwrap from my.core.pandas import DataFrameT, as_dataframe from my.core.sqlite import sqlite_connect_immutable diff --git a/my/browser/export.py b/my/browser/export.py index ce5a6de..1b428b5 100644 --- a/my/browser/export.py +++ b/my/browser/export.py @@ -16,7 +16,7 @@ from my.core import ( make_logger, stat, ) -from my.core.common import mcachew +from my.core.cachew import mcachew from browserexport.merge import read_and_merge, Visit diff --git a/my/coding/commits.py b/my/coding/commits.py index 51f9222..dac3b1f 100644 --- a/my/coding/commits.py +++ b/my/coding/commits.py @@ -14,8 +14,7 @@ from typing import List, Optional, Iterator, Set, Sequence, cast from my.core import PathIsh, LazyLogger, make_config -from my.core.cachew import cache_dir -from my.core.common import mcachew +from my.core.cachew import cache_dir, mcachew from my.core.warnings import high diff --git a/my/core/common.py b/my/core/common.py index 9874bed..460a658 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -14,7 +14,6 @@ from typing import ( Iterable, Iterator, List, - NoReturn, Optional, Sequence, TYPE_CHECKING, @@ -70,17 +69,6 @@ T = TypeVar('T') K = TypeVar('K') V = TypeVar('V') -# TODO deprecate? more_itertools.one should be used -def the(l: Iterable[T]) -> T: - it = iter(l) - try: - first = next(it) - except StopIteration: - raise RuntimeError('Empty iterator?') - assert all(e == first for e in it) - return first - - # TODO more_itertools.bucket? 
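The TODO above refers to more_itertools.bucket, which splits a single iterable into keyed sub-iterables; a later patch in this series indeed deprecates group_by_key in its favor. For reference:

import more_itertools

groups = more_itertools.bucket([1, 2, 3, 4, 5], key=lambda x: x % 2)
assert list(groups[0]) == [2, 4]
assert list(groups[1]) == [1, 3, 5]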
def group_by_key(l: Iterable[T], key: Callable[[T], K]) -> Dict[K, List[T]]: res: Dict[K, List[T]] = {} @@ -322,14 +310,6 @@ datetime_naive = datetime datetime_aware = datetime -# TODO deprecate -tzdatetime = datetime_aware - - -# TODO deprecate (although could be used in modules) -from .compat import fromisoformat as isoparse - - import re # https://stackoverflow.com/a/295466/706389 def get_valid_filename(s: str) -> str: @@ -554,7 +534,7 @@ def test_guess_datetime() -> None: from dataclasses import dataclass from typing import NamedTuple - dd = isoparse('2021-02-01T12:34:56Z') + dd = compat.fromisoformat('2021-02-01T12:34:56Z') # ugh.. https://github.com/python/mypy/issues/7281 A = NamedTuple('A', [('x', int)]) @@ -690,15 +670,41 @@ def unique_everseen( return more_itertools.unique_everseen(iterable=iterable, key=key) -## legacy imports, keeping them here for backwards compatibility +### legacy imports, keeping them here for backwards compatibility ## hiding behind TYPE_CHECKING so it works in runtime ## in principle, warnings.deprecated decorator should cooperate with mypy, but doesn't look like it works atm? ## perhaps it doesn't work when it's used from typing_extensions if not TYPE_CHECKING: - assert_never = deprecated('use my.core.compat.assert_never instead')(compat.assert_never) -# TODO wrap in deprecated decorator as well? -from functools import cached_property as cproperty -from typing import Literal -from .cachew import mcachew -## + @deprecated('use my.core.compat.assert_never instead') + def assert_never(*args, **kwargs): + return compat.assert_never(*args, **kwargs) + + @deprecated('use my.core.compat.fromisoformat instead') + def isoparse(*args, **kwargs): + return compat.fromisoformat(*args, **kwargs) + + @deprecated('use more_itertools.one instead') + def the(*args, **kwargs): + import more_itertools + + return more_itertools.one(*args, **kwargs) + + @deprecated('use functools.cached_property instead') + def cproperty(*args, **kwargs): + import functools + + return functools.cached_property(*args, **kwargs) + + # todo wrap these in deprecated decorator as well? + from .cachew import mcachew # noqa: F401 + + from typing import Literal # noqa: F401 + + # TODO hmm how to deprecate it in runtime? tricky cause it's actually a class? + tzdatetime = datetime_aware +else: + from .compat import Never + + tzdatetime = Never # makes it invalid as a type while working in runtime +### diff --git a/my/core/compat.py b/my/core/compat.py index 2c1687d..d73c60c 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -2,56 +2,58 @@ Contains backwards compatibility helpers for different python versions. If something is relevant to HPI itself, please put it in .hpi_compat instead ''' -import os + import sys from typing import TYPE_CHECKING -windows = os.name == 'nt' +if sys.version_info[:2] >= (3, 13): + from warnings import deprecated +else: + from typing_extensions import deprecated # keeping just for backwards compatibility, used to have compat implementation for 3.6 -import sqlite3 -def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwargs) -> None: - source.backup(dest, **kwargs) +if not TYPE_CHECKING: + import sqlite3 + + @deprecated('use .backup method on sqlite3.Connection directly instead') + def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwargs) -> None: + # TODO warn here? 
+ source.backup(dest, **kwargs) # can remove after python3.9 (although need to keep the method itself for bwd compat) def removeprefix(text: str, prefix: str) -> str: if text.startswith(prefix): - return text[len(prefix):] + return text[len(prefix) :] return text -## used to have compat function before 3.8 for these -from functools import cached_property -from typing import Literal, Protocol, TypedDict +## used to have compat function before 3.8 for these, keeping for runtime back compatibility +if not TYPE_CHECKING: + from functools import cached_property + from typing import Literal, Protocol, TypedDict +else: + from typing_extensions import Literal, Protocol, TypedDict ## if sys.version_info[:2] >= (3, 10): from typing import ParamSpec else: - if TYPE_CHECKING: - from typing_extensions import ParamSpec - else: - from typing import NamedTuple, Any - # erm.. I guess as long as it's not crashing, whatever... - class _ParamSpec: - def __call__(self, args): - class _res: - args = None - kwargs = None - return _res - ParamSpec = _ParamSpec() + from typing_extensions import ParamSpec # bisect_left doesn't have a 'key' parameter (which we use) # till python3.10 if sys.version_info[:2] <= (3, 9): from typing import List, TypeVar, Any, Optional, Callable + X = TypeVar('X') + # copied from python src + # fmt: off def bisect_left(a: List[Any], x: Any, lo: int=0, hi: Optional[int]=None, *, key: Optional[Callable[..., Any]]=None) -> int: if lo < 0: raise ValueError('lo must be non-negative') @@ -74,19 +76,22 @@ if sys.version_info[:2] <= (3, 9): else: hi = mid return lo + # fmt: on + else: from bisect import bisect_left from datetime import datetime + if sys.version_info[:2] >= (3, 11): fromisoformat = datetime.fromisoformat else: + # fromisoformat didn't support Z as "utc" before 3.11 + # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat + def fromisoformat(date_string: str) -> datetime: - # didn't support Z as "utc" before 3.11 if date_string.endswith('Z'): - # NOTE: can be removed from 3.11? - # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat date_string = date_string[:-1] + '+00:00' return datetime.fromisoformat(date_string) @@ -94,6 +99,7 @@ else: def test_fromisoformat() -> None: from datetime import timezone + # fmt: off # feedbin has this format assert fromisoformat('2020-05-01T10:32:02.925961Z') == datetime( 2020, 5, 1, 10, 32, 2, 925961, timezone.utc, @@ -108,6 +114,7 @@ def test_fromisoformat() -> None: assert fromisoformat('2020-11-30T00:53:12Z') == datetime( 2020, 11, 30, 0, 53, 12, 0, timezone.utc, ) + # fmt: on # arbtt has this format (sometimes less/more than 6 digits in milliseconds) # TODO doesn't work atm, not sure if really should be supported... 
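For reference, the key= parameter whose backport is copied above lets bisection search a sorted list by a derived value; on python 3.10+ this is just stdlib behaviour:

from bisect import bisect_left  # 3.10+; older versions go through the backport above

events = [(1, 'a'), (5, 'b'), (9, 'c')]  # sorted by timestamp
idx = bisect_left(events, 5, key=lambda e: e[0])
assert idx == 1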
@@ -123,13 +130,13 @@ else: NoneType = type(None) -if sys.version_info[:2] >= (3, 13): - from warnings import deprecated -else: - from typing_extensions import deprecated - - if sys.version_info[:2] >= (3, 11): from typing import assert_never else: from typing_extensions import assert_never + + +if sys.version_info[:2] >= (3, 11): + from typing import Never +else: + from typing_extensions import Never diff --git a/my/core/tests/test_cachew.py b/my/core/tests/test_cachew.py index 86344fd..5f7dd65 100644 --- a/my/core/tests/test_cachew.py +++ b/my/core/tests/test_cachew.py @@ -10,7 +10,7 @@ def test_cachew() -> None: settings.ENABLE = True # by default it's off in tests (see conftest.py) - from my.core.common import mcachew + from my.core.cachew import mcachew called = 0 @@ -36,7 +36,7 @@ def test_cachew_dir_none() -> None: settings.ENABLE = True # by default it's off in tests (see conftest.py) from my.core.cachew import cache_dir - from my.core.common import mcachew + from my.core.cachew import mcachew from my.core.core_config import _reset_config as reset with reset() as cc: diff --git a/my/core/tests/test_get_files.py b/my/core/tests/test_get_files.py index e9f216a..52e43f8 100644 --- a/my/core/tests/test_get_files.py +++ b/my/core/tests/test_get_files.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING import zipfile from ..common import get_files -from ..compat import windows from ..kompress import CPath, ZipPath import pytest @@ -56,8 +55,9 @@ def test_single_file() -> None: ''' assert get_files('/tmp/hpi_test/file.ext') == (Path('/tmp/hpi_test/file.ext'),) + is_windows = os.name == 'nt' "if the path starts with ~, we expand it" - if not windows: # windows doesn't have bashrc.. ugh + if not is_windows: # windows doesn't have bashrc.. ugh assert get_files('~/.bashrc') == (Path('~').expanduser() / '.bashrc',) diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index 30b693c..7fae8ea 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -21,8 +21,7 @@ from my.core import ( Res, Stats, ) -from my.core.common import mcachew -from my.core.cachew import cache_dir +from my.core.cachew import cache_dir, mcachew from my.core.error import set_error_datetime, extract_error_datetime from my.core.pandas import DataFrameT diff --git a/my/github/ghexport.py b/my/github/ghexport.py index d446c35..9dc8fd5 100644 --- a/my/github/ghexport.py +++ b/my/github/ghexport.py @@ -42,10 +42,11 @@ except ModuleNotFoundError as e: ############################ from functools import lru_cache +from pathlib import Path from typing import Tuple, Dict, Sequence, Optional -from my.core import get_files, Path, LazyLogger -from my.core.common import mcachew +from my.core import get_files, LazyLogger +from my.core.cachew import mcachew from .common import Event, parse_dt, Results, EventIds diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 2322ef0..952c9b6 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -19,7 +19,8 @@ import os from typing import List, Sequence, cast from pathlib import Path from my.core import make_config, dataclass -from my.core.common import Stats, LazyLogger, mcachew, get_files, Paths +from my.core.cachew import mcachew +from my.core.common import Stats, LazyLogger, get_files, Paths from my.core.error import ErrorPolicy from my.core.structure import match_structure diff --git a/my/lastfm.py b/my/lastfm.py index 90484b4..64ef1b3 100644 --- a/my/lastfm.py +++ b/my/lastfm.py @@ -26,7 +26,8 @@ import json from pathlib import Path from 
typing import NamedTuple, Sequence, Iterable -from my.core.common import mcachew, Json, get_files +from my.core.cachew import mcachew +from my.core.common import Json, get_files def inputs() -> Sequence[Path]: diff --git a/my/location/google.py b/my/location/google.py index ed37231..c1539e7 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -19,8 +19,8 @@ import re # pip3 install geopy import geopy # type: ignore -from ..core.common import LazyLogger, mcachew -from ..core.cachew import cache_dir +from my.core.common import LazyLogger +from my.core.cachew import cache_dir, mcachew from my.core.warnings import high diff --git a/my/location/google_takeout.py b/my/location/google_takeout.py index a1c1403..2fac270 100644 --- a/my/location/google_takeout.py +++ b/my/location/google_takeout.py @@ -9,7 +9,8 @@ from typing import Iterator from my.google.takeout.parser import events, _cachew_depends_on from google_takeout_parser.models import Location as GoogleLocation -from my.core.common import mcachew, LazyLogger, Stats +from my.core.cachew import mcachew +from my.core.common import LazyLogger, stat, Stats from .common import Location logger = LazyLogger(__name__) @@ -33,6 +34,4 @@ def locations() -> Iterator[Location]: def stats() -> Stats: - from my.core import stat - - return {**stat(locations)} + return stat(locations) diff --git a/my/location/google_takeout_semantic.py b/my/location/google_takeout_semantic.py index b4f16db..014959c 100644 --- a/my/location/google_takeout_semantic.py +++ b/my/location/google_takeout_semantic.py @@ -13,7 +13,8 @@ from my.google.takeout.parser import events, _cachew_depends_on as _parser_cache from google_takeout_parser.models import PlaceVisit as SemanticLocation from my.core import dataclass, make_config -from my.core.common import mcachew, LazyLogger, Stats +from my.core.cachew import mcachew +from my.core.common import LazyLogger, Stats, stat from my.core.error import Res from .common import Location @@ -72,6 +73,4 @@ def locations() -> Iterator[Res[Location]]: def stats() -> Stats: - from my.core import stat - - return {**stat(locations)} + return stat(locations) diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py index 29e2547..8fb59d0 100644 --- a/my/location/gpslogger.py +++ b/my/location/gpslogger.py @@ -27,7 +27,8 @@ from gpxpy.gpx import GPXXMLSyntaxException from more_itertools import unique_everseen from my.core import Stats, LazyLogger -from my.core.common import get_files, mcachew +from my.core.cachew import mcachew +from my.core.common import get_files from .common import Location diff --git a/my/orgmode.py b/my/orgmode.py index 8293b74..c27f5a7 100644 --- a/my/orgmode.py +++ b/my/orgmode.py @@ -12,8 +12,7 @@ import re from typing import List, Sequence, Iterable, NamedTuple, Optional, Tuple from my.core import get_files -from my.core.common import mcachew -from my.core.cachew import cache_dir +from my.core.cachew import cache_dir, mcachew from my.core.orgmode import collect from my.config import orgmode as user_config diff --git a/my/pdfs.py b/my/pdfs.py index 5355d8a..3305eca 100644 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -15,8 +15,9 @@ from typing import NamedTuple, List, Optional, Iterator, Sequence from my.core import LazyLogger, get_files, Paths, PathIsh +from my.core.cachew import mcachew from my.core.cfg import Attrs, make_config -from my.core.common import mcachew, group_by_key +from my.core.common import group_by_key from my.core.error import Res, split_errors diff --git a/my/photos/main.py b/my/photos/main.py 
index c491ac1..622d475 100644 --- a/my/photos/main.py +++ b/my/photos/main.py @@ -13,11 +13,11 @@ import json from pathlib import Path from typing import Optional, NamedTuple, Iterator, Iterable, List -from geopy.geocoders import Nominatim # type: ignore +from geopy.geocoders import Nominatim # type: ignore -from ..core.common import LazyLogger, mcachew, fastermime -from ..core.error import Res, sort_res_by -from ..core.cachew import cache_dir +from my.core.common import LazyLogger, fastermime +from my.core.error import Res, sort_res_by +from my.core.cachew import cache_dir, mcachew from my.config import photos as config # type: ignore[attr-defined] diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index a7be39b..6a6be61 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -20,8 +20,8 @@ from my.core import ( Paths, Stats, ) +from my.core.cachew import mcachew from my.core.cfg import make_config, Attrs -from my.core.common import mcachew from my.config import reddit as uconfig diff --git a/my/rescuetime.py b/my/rescuetime.py index 75684d9..774b587 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -10,7 +10,7 @@ from datetime import timedelta from typing import Sequence, Iterable from my.core import get_files, make_logger -from my.core.common import mcachew +from my.core.cachew import mcachew from my.core.error import Res, split_errors from my.config import rescuetime as config diff --git a/my/rss/feedbin.py b/my/rss/feedbin.py index 6160abc..16d4417 100644 --- a/my/rss/feedbin.py +++ b/my/rss/feedbin.py @@ -7,8 +7,8 @@ from my.config import feedbin as config from pathlib import Path from typing import Sequence -from ..core.common import listify, get_files -from ..core.compat import fromisoformat +from my.core.common import listify, get_files +from my.core.compat import fromisoformat from .common import Subscription @@ -33,12 +33,10 @@ def parse_file(f: Path): from typing import Iterable from .common import SubscriptionState def states() -> Iterable[SubscriptionState]: - # meh - from dateutil.parser import isoparse for f in inputs(): # TODO ugh. depends on my naming. not sure if useful? dts = f.stem.split('_')[-1] - dt = isoparse(dts) + dt = fromisoformat(dts) subs = parse_file(f) yield dt, subs diff --git a/my/time/tz/common.py b/my/time/tz/common.py index e2c428d..107410a 100644 --- a/my/time/tz/common.py +++ b/my/time/tz/common.py @@ -1,7 +1,7 @@ from datetime import datetime -from typing import Callable, cast +from typing import Callable, Literal, cast -from ...core.common import tzdatetime, Literal +from my.core.common import datetime_aware ''' @@ -30,7 +30,11 @@ def default_policy() -> TzPolicy: return 'keep' -def localize_with_policy(lfun: Callable[[datetime], tzdatetime], dt: datetime, policy: TzPolicy=default_policy()) -> tzdatetime: +def localize_with_policy( + lfun: Callable[[datetime], datetime_aware], + dt: datetime, + policy: TzPolicy=default_policy() +) -> datetime_aware: tz = dt.tzinfo if tz is None: return lfun(dt) diff --git a/my/time/tz/main.py b/my/time/tz/main.py index 624d7aa..6180160 100644 --- a/my/time/tz/main.py +++ b/my/time/tz/main.py @@ -2,10 +2,10 @@ Timezone data provider, used to localize timezone-unaware timestamps for other modules ''' from datetime import datetime -from ...core.common import tzdatetime +from my.core.common import datetime_aware # todo hmm, kwargs isn't mypy friendly.. but specifying types would require duplicating default args. 
uhoh
-def localize(dt: datetime, **kwargs) -> tzdatetime:
+def localize(dt: datetime, **kwargs) -> datetime_aware:
     # todo document patterns for combining multiple data sources
     # e.g. see https://github.com/karlicoss/HPI/issues/89#issuecomment-716495136
     from . import via_location as L
diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py
index 612341a..b66ff8a 100644
--- a/my/time/tz/via_location.py
+++ b/my/time/tz/via_location.py
@@ -17,8 +17,8 @@ from typing import Iterator, Optional, Tuple, Any, List, Iterable, Set, Dict
 
 import pytz
 
+from my.core.cachew import mcachew
 from my.core import make_logger, stat, Stats, datetime_aware
-from my.core.common import mcachew
 from my.core.source import import_source
 from my.core.warnings import high
 

From c64d7f5b676026e145340eca6865f534ff3b2ae4 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Tue, 13 Aug 2024 10:22:39 +0300
Subject: [PATCH 235/302] core: cleanup itertools-style helpers

- deprecate group_by_key, should use more_itertools.bucket instead
- move make_dict and ensure_unique to my.core.utils.itertools
---
 my/core/common.py          | 94 +++++++------------------------------
 my/core/utils/itertools.py | 77 +++++++++++++++++++++++++++++++
 my/github/ghexport.py      |  5 +-
 my/jawbone/__init__.py     | 12 +++--
 my/pdfs.py                 |  6 ++-
 my/rtm.py                  | 11 +++--
 my/tests/reddit.py         |  7 +--
 7 files changed, 119 insertions(+), 93 deletions(-)
 create mode 100644 my/core/utils/itertools.py

diff --git a/my/core/common.py b/my/core/common.py
index 460a658..0be4dae 100644
--- a/my/core/common.py
+++ b/my/core/common.py
@@ -65,84 +65,6 @@ def import_dir(path: PathIsh, extra: str='') -> types.ModuleType:
     return import_from(p.parent, p.name + extra)
 
 
-T = TypeVar('T')
-K = TypeVar('K')
-V = TypeVar('V')
-
-# TODO more_itertools.bucket?
-def group_by_key(l: Iterable[T], key: Callable[[T], K]) -> Dict[K, List[T]]:
-    res: Dict[K, List[T]] = {}
-    for i in l:
-        kk = key(i)
-        lst = res.get(kk, [])
-        lst.append(i)
-        res[kk] = lst
-    return res
-
-
-def _identity(v: T) -> V:  # type: ignore[type-var]
-    return cast(V, v)
-
-
-# ugh. nothing in more_itertools?
-def ensure_unique(
-    it: Iterable[T],
-    *,
-    key: Callable[[T], K],
-    value: Callable[[T], V]=_identity,
-    key2value: Optional[Dict[K, V]]=None
-) -> Iterable[T]:
-    if key2value is None:
-        key2value = {}
-    for i in it:
-        k = key(i)
-        v = value(i)
-        pv = key2value.get(k, None)
-        if pv is not None:
-            raise RuntimeError(f"Duplicate key: {k}. Previous value: {pv}, new value: {v}")
-        key2value[k] = v
-        yield i
-
-
-def test_ensure_unique() -> None:
-    import pytest
-    assert list(ensure_unique([1, 2, 3], key=lambda i: i)) == [1, 2, 3]
-
-    dups = [1, 2, 1, 4]
-    # this works because it's lazy
-    it = ensure_unique(dups, key=lambda i: i)
-
-    # but forcing throws
-    with pytest.raises(RuntimeError, match='Duplicate key'):
-        list(it)
-
-    # hacky way to force distinct objects?
- list(ensure_unique(dups, key=lambda i: object())) - - -def make_dict( - it: Iterable[T], - *, - key: Callable[[T], K], - value: Callable[[T], V]=_identity -) -> Dict[K, V]: - res: Dict[K, V] = {} - uniques = ensure_unique(it, key=key, value=value, key2value=res) - for _ in uniques: - pass # force the iterator - return res - - -def test_make_dict() -> None: - it = range(5) - d = make_dict(it, key=lambda i: i, value=lambda i: i % 2) - assert d == {0: 0, 1: 1, 2: 0, 3: 1, 4: 0} - - # check type inference - d2: Dict[str, int ] = make_dict(it, key=lambda i: str(i)) - d3: Dict[str, bool] = make_dict(it, key=lambda i: str(i), value=lambda i: i % 2 == 0) - - # https://stackoverflow.com/a/12377059/706389 def listify(fn=None, wrapper=list): """ @@ -696,6 +618,22 @@ if not TYPE_CHECKING: return functools.cached_property(*args, **kwargs) + @deprecated('use more_itertools.bucket instead') + def group_by_key(l, key): + res = {} + for i in l: + kk = key(i) + lst = res.get(kk, []) + lst.append(i) + res[kk] = lst + return res + + @deprecated('use my.core.utils.make_dict instead') + def make_dict(*args, **kwargs): + from .utils import itertools as UI + + return UI.make_dict(*args, **kwargs) + # todo wrap these in deprecated decorator as well? from .cachew import mcachew # noqa: F401 diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py new file mode 100644 index 0000000..78b91de --- /dev/null +++ b/my/core/utils/itertools.py @@ -0,0 +1,77 @@ +""" +Various helpers/transforms of iterators + +Ideally this should be as small as possible and we should rely on stdlib itertools or more_itertools +""" + +from typing import Callable, Dict, Iterable, TypeVar, cast + + +T = TypeVar('T') +K = TypeVar('K') +V = TypeVar('V') + + +def _identity(v: T) -> V: # type: ignore[type-var] + return cast(V, v) + + +# ugh. nothing in more_itertools? +# perhaps duplicates_everseen? but it doesn't yield non-unique elements? +def ensure_unique(it: Iterable[T], *, key: Callable[[T], K]) -> Iterable[T]: + key2item: Dict[K, T] = {} + for i in it: + k = key(i) + pi = key2item.get(k, None) + if pi is not None: + raise RuntimeError(f"Duplicate key: {k}. Previous value: {pi}, new value: {i}") + key2item[k] = i + yield i + + +def test_ensure_unique() -> None: + import pytest + + assert list(ensure_unique([1, 2, 3], key=lambda i: i)) == [1, 2, 3] + + dups = [1, 2, 1, 4] + # this works because it's lazy + it = ensure_unique(dups, key=lambda i: i) + + # but forcing throws + with pytest.raises(RuntimeError, match='Duplicate key'): + list(it) + + # hacky way to force distinct objects? + list(ensure_unique(dups, key=lambda i: object())) + + +def make_dict( + it: Iterable[T], + *, + key: Callable[[T], K], + # TODO make value optional instead? but then will need a typing override for it? 
+ value: Callable[[T], V] = _identity, +) -> Dict[K, V]: + with_keys = ((key(i), i) for i in it) + uniques = ensure_unique(with_keys, key=lambda p: p[0]) + res: Dict[K, V] = {} + for k, i in uniques: + res[k] = i if value is None else value(i) + return res + + +def test_make_dict() -> None: + import pytest + + it = range(5) + d = make_dict(it, key=lambda i: i, value=lambda i: i % 2) + assert d == {0: 0, 1: 1, 2: 0, 3: 1, 4: 0} + + it = range(5) + with pytest.raises(RuntimeError, match='Duplicate key'): + d = make_dict(it, key=lambda i: i % 2, value=lambda i: i) + + # check type inference + d2: Dict[str, int] = make_dict(it, key=lambda i: str(i)) + d3: Dict[str, bool] = make_dict(it, key=lambda i: str(i), value=lambda i: i % 2 == 0) diff --git a/my/github/ghexport.py b/my/github/ghexport.py index 9dc8fd5..80106a5 100644 --- a/my/github/ghexport.py +++ b/my/github/ghexport.py @@ -65,11 +65,10 @@ def _dal() -> dal.DAL: @mcachew(depends_on=inputs) def events() -> Results: - from my.core.common import ensure_unique - key = lambda e: object() if isinstance(e, Exception) else e.eid + # key = lambda e: object() if isinstance(e, Exception) else e.eid # crap. sometimes API events can be repeated with exactly the same payload and different id # yield from ensure_unique(_events(), key=key) - yield from _events() + return _events() def _events() -> Results: diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 7f4d6bd..0659bc6 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -108,16 +108,22 @@ def load_sleeps() -> List[SleepEntry]: from ..core.error import Res, set_error_datetime, extract_error_datetime def pre_dataframe() -> Iterable[Res[SleepEntry]]: + from more_itertools import bucket + sleeps = load_sleeps() # todo emit error if graph doesn't exist?? sleeps = [s for s in sleeps if s.graph.exists()] # TODO careful.. 
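The group_by_key to more_itertools.bucket conversions in this patch all follow the same idiom; a minimal standalone sketch with toy data (not part of the patch):

    from more_itertools import bucket

    items = ['apple', 'avocado', 'banana']
    bucketed = bucket(items, key=lambda s: s[0])  # lazy, nothing is consumed yet
    # iterating a bucket yields the distinct keys; indexing yields each group
    grouped = {k: list(bucketed[k]) for k in bucketed}
    assert grouped == {'a': ['apple', 'avocado'], 'b': ['banana']}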
- from ..core.common import group_by_key - for dd, group in group_by_key(sleeps, key=lambda s: s.date_).items(): + + bucketed = bucket(sleeps, key=lambda s: s.date_) + + for dd in bucketed: + group = list(bucketed[dd]) if len(group) == 1: yield group[0] else: err = RuntimeError(f'Multiple sleeps per night, not supported yet: {group}') - set_error_datetime(err, dt=dd) # type: ignore[arg-type] + dt = datetime.combine(dd, time.min) + set_error_datetime(err, dt=dt) logger.exception(err) yield err diff --git a/my/pdfs.py b/my/pdfs.py index 3305eca..b3ef85d 100644 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -17,10 +17,10 @@ from typing import NamedTuple, List, Optional, Iterator, Sequence from my.core import LazyLogger, get_files, Paths, PathIsh from my.core.cachew import mcachew from my.core.cfg import Attrs, make_config -from my.core.common import group_by_key from my.core.error import Res, split_errors +from more_itertools import bucket import pdfannots @@ -169,7 +169,9 @@ def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Re ait = annotations() vit, eit = split_errors(ait, ET=Exception) - for k, g in group_by_key(vit, key=lambda a: a.path).items(): + bucketed = bucket(vit, key=lambda a: a.path) + for k in bucketed: + g = list(bucketed[k]) yield Pdf(path=Path(k), annotations=g) yield from eit diff --git a/my/rtm.py b/my/rtm.py index 8d41e7a..56f4d07 100644 --- a/my/rtm.py +++ b/my/rtm.py @@ -11,13 +11,15 @@ from functools import cached_property import re from typing import Dict, List, Iterator -from .core.common import LazyLogger, get_files, group_by_key, make_dict +from my.core.common import LazyLogger, get_files +from my.core.utils.itertools import make_dict from my.config import rtm as config -import icalendar # type: ignore -from icalendar.cal import Todo # type: ignore +from more_itertools import bucket +import icalendar # type: ignore +from icalendar.cal import Todo # type: ignore logger = LazyLogger(__name__) @@ -96,7 +98,8 @@ class DAL: def get_todos_by_title(self) -> Dict[str, List[MyTodo]]: todos = self.all_todos() - return group_by_key(todos, lambda todo: todo.title) + bucketed = bucket(todos, lambda todo: todo.title) + return {k: list(bucketed[k]) for k in bucketed} def dal(): diff --git a/my/tests/reddit.py b/my/tests/reddit.py index 4af95ae..fb8d6d2 100644 --- a/my/tests/reddit.py +++ b/my/tests/reddit.py @@ -1,9 +1,10 @@ from my.core.cfg import tmp_config -from my.core.common import make_dict +from my.core.utils.itertools import ensure_unique # todo ugh, it's discovered as a test??? 
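The reddit test below swaps make_dict for consume(ensure_unique(...)); a standalone sketch of that idiom (toy data, not part of the patch):

    from more_itertools import consume

    from my.core.utils.itertools import ensure_unique

    items = [('a', 1), ('b', 2)]
    # ensure_unique is lazy, so consume() forces it; a duplicate key raises RuntimeError
    consume(ensure_unique(items, key=lambda p: p[0]))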
from .common import testdata +from more_itertools import consume import pytest # deliberately use mixed style imports on the top level and inside the methods to test tmp_config stuff @@ -36,8 +37,8 @@ def test_saves() -> None: saves = list(saved()) assert len(saves) > 0 - # just check that they are unique (makedict will throw) - make_dict(saves, key=lambda s: s.sid) + # will throw if not unique + consume(ensure_unique(saves, key=lambda s: s.sid)) def test_preserves_extra_attr() -> None: From 66c08a6c8067bcee00876f1cd726ff2a3290567f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 14 Aug 2024 10:59:47 +0300 Subject: [PATCH 236/302] core.common: move listify to core.utils.itertools, use better typing annotations for it also some minor refactoring of my.rss --- my/core/common.py | 31 +++++++--------------------- my/core/utils/itertools.py | 41 +++++++++++++++++++++++++++++++++++++- my/rss/all.py | 3 ++- my/rss/common.py | 28 +++++++++++++------------- my/rss/feedbin.py | 29 ++++++++++----------------- my/rss/feedly.py | 14 ++++++------- 6 files changed, 81 insertions(+), 65 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index 0be4dae..920657a 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -65,29 +65,6 @@ def import_dir(path: PathIsh, extra: str='') -> types.ModuleType: return import_from(p.parent, p.name + extra) -# https://stackoverflow.com/a/12377059/706389 -def listify(fn=None, wrapper=list): - """ - Wraps a function's return value in wrapper (e.g. list) - Useful when an algorithm can be expressed more cleanly as a generator - """ - def listify_return(fn): - @functools.wraps(fn) - def listify_helper(*args, **kw): - return wrapper(fn(*args, **kw)) - return listify_helper - if fn is None: - return listify_return - return listify_return(fn) - - -# todo use in bluemaestro -# def dictify(fn=None, key=None, value=None): -# def md(it): -# return make_dict(it, key=key, value=value) -# return listify(fn=fn, wrapper=md) - - from .logging import setup_logger, LazyLogger @@ -628,12 +605,18 @@ if not TYPE_CHECKING: res[kk] = lst return res - @deprecated('use my.core.utils.make_dict instead') + @deprecated('use my.core.utils.itertools.make_dict instead') def make_dict(*args, **kwargs): from .utils import itertools as UI return UI.make_dict(*args, **kwargs) + @deprecated('use my.core.utils.itertools.listify instead') + def listify(*args, **kwargs): + from .utils import itertools as UI + + return UI.listify(*args, **kwargs) + # todo wrap these in deprecated decorator as well? 
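The patch below reimplements listify on top of the decorator library so that parameter types are preserved; a sketch of the resulting behaviour (assuming the my.core.utils.itertools.listify added below):

    from typing import Iterator, List

    from my.core.utils.itertools import listify

    @listify
    def squares(n: int) -> Iterator[int]:
        for i in range(n):
            yield i * i

    res: List[int] = squares(4)  # mypy now sees List[int] rather than Iterator[int]
    assert res == [0, 1, 4, 9]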
from .cachew import mcachew # noqa: F401 diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index 78b91de..cab4b2c 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -4,8 +4,11 @@ Various helpers/transforms of iterators Ideally this should be as small as possible and we should rely on stdlib itertools or more_itertools """ -from typing import Callable, Dict, Iterable, TypeVar, cast +from typing import Callable, Dict, Iterable, Iterator, TypeVar, List, cast, TYPE_CHECKING +from ..compat import ParamSpec + +from decorator import decorator T = TypeVar('T') K = TypeVar('K') @@ -75,3 +78,39 @@ def test_make_dict() -> None: # check type inference d2: Dict[str, int] = make_dict(it, key=lambda i: str(i)) d3: Dict[str, bool] = make_dict(it, key=lambda i: str(i), value=lambda i: i % 2 == 0) + + +LFP = ParamSpec('LFP') +LV = TypeVar('LV') + + +@decorator +def _listify(func: Callable[LFP, Iterable[LV]], *args: LFP.args, **kwargs: LFP.kwargs) -> List[LV]: + """ + Wraps a function's return value in wrapper (e.g. list) + Useful when an algorithm can be expressed more cleanly as a generator + """ + return list(func(*args, **kwargs)) + + +# ugh. decorator library has stub types, but they are way too generic? +# tried implementing my own stub, but failed -- not sure if it's possible at all? +# so seems easiest to just use specialize instantiations of decorator instead +if TYPE_CHECKING: + + def listify(func: Callable[LFP, Iterable[LV]]) -> Callable[LFP, List[LV]]: ... + +else: + listify = _listify + + +def test_listify() -> None: + @listify + def it() -> Iterator[int]: + yield 1 + yield 2 + + res = it() + from typing_extensions import assert_type # TODO move to compat? + assert_type(res, List[int]) + assert res == [1, 2] diff --git a/my/rss/all.py b/my/rss/all.py index 61f9fab..b4dbdbd 100644 --- a/my/rss/all.py +++ b/my/rss/all.py @@ -1,6 +1,7 @@ ''' Unified RSS data, merged from different services I used historically ''' + # NOTE: you can comment out the sources you're not using from . import feedbin, feedly @@ -12,5 +13,5 @@ def subscriptions() -> Iterable[Subscription]: # TODO google reader? yield from compute_subscriptions( feedbin.states(), - feedly .states(), + feedly.states(), ) diff --git a/my/rss/common.py b/my/rss/common.py index f3893b7..54067d6 100644 --- a/my/rss/common.py +++ b/my/rss/common.py @@ -1,30 +1,32 @@ -# shared Rss stuff -from datetime import datetime -from typing import NamedTuple, Optional, List, Dict +from my.core import __NOT_HPI_MODULE__ + +from dataclasses import dataclass, replace +from itertools import chain +from typing import Optional, List, Dict, Iterable, Tuple, Sequence + +from my.core import warn_if_empty, datetime_aware -class Subscription(NamedTuple): +@dataclass +class Subscription: title: str url: str - id: str # TODO not sure about it... + id: str # TODO not sure about it... # eh, not all of them got reasonable 'created' time - created_at: Optional[datetime] - subscribed: bool=True + created_at: Optional[datetime_aware] + subscribed: bool = True -from typing import Iterable, Tuple, Sequence # snapshot of subscriptions at time -SubscriptionState = Tuple[datetime, Sequence[Subscription]] +SubscriptionState = Tuple[datetime_aware, Sequence[Subscription]] -from ..core import warn_if_empty @warn_if_empty def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscription]: """ Keeps track of everything I ever subscribed to. 
In addition, keeps track of unsubscribed as well (so you'd remember when and why you unsubscribed) """ - from itertools import chain states = list(chain.from_iterable(sources)) # TODO keep 'source'/'provider'/'service' attribute? @@ -45,7 +47,5 @@ def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscri res = [] for u, x in sorted(by_url.items()): present = u in last_urls - res.append(x._replace(subscribed=present)) + res.append(replace(x, subscribed=present)) return res - -from ..core import __NOT_HPI_MODULE__ diff --git a/my/rss/feedbin.py b/my/rss/feedbin.py index 16d4417..dc13a17 100644 --- a/my/rss/feedbin.py +++ b/my/rss/feedbin.py @@ -2,24 +2,22 @@ Feedbin RSS reader """ -from my.config import feedbin as config - +import json from pathlib import Path -from typing import Sequence +from typing import Iterator, Sequence -from my.core.common import listify, get_files +from my.core import get_files, stat, Stats from my.core.compat import fromisoformat -from .common import Subscription +from .common import Subscription, SubscriptionState + +from my.config import feedbin as config def inputs() -> Sequence[Path]: return get_files(config.export_path) -import json - -@listify -def parse_file(f: Path): +def parse_file(f: Path) -> Iterator[Subscription]: raw = json.loads(f.read_text()) for r in raw: yield Subscription( @@ -30,19 +28,14 @@ def parse_file(f: Path): ) -from typing import Iterable -from .common import SubscriptionState -def states() -> Iterable[SubscriptionState]: +def states() -> Iterator[SubscriptionState]: for f in inputs(): # TODO ugh. depends on my naming. not sure if useful? dts = f.stem.split('_')[-1] dt = fromisoformat(dts) - subs = parse_file(f) + subs = list(parse_file(f)) yield dt, subs -def stats(): - from more_itertools import ilen, last - return { - 'subscriptions': ilen(last(states())[1]) - } +def stats() -> Stats: + return stat(states) diff --git a/my/rss/feedly.py b/my/rss/feedly.py index 4611ced..55bcf9b 100644 --- a/my/rss/feedly.py +++ b/my/rss/feedly.py @@ -1,14 +1,15 @@ """ Feedly RSS reader """ + from my.config import feedly as config from datetime import datetime, timezone import json from pathlib import Path -from typing import Iterable, Sequence +from typing import Iterator, Sequence -from ..core.common import listify, get_files +from my.core import get_files from .common import Subscription, SubscriptionState @@ -16,13 +17,12 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) -@listify -def parse_file(f: Path): +def parse_file(f: Path) -> Iterator[Subscription]: raw = json.loads(f.read_text()) for r in raw: # err, some even don't have website.. 
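A standalone sketch of the merge behaviour my/rss/common.py implements above, assuming (per its docstring) that an entry is marked subscribed only if it appears in the most recent snapshot:

    from datetime import datetime, timezone

    from my.rss.common import Subscription, compute_subscriptions

    s_a = Subscription(title='Blog A', url='https://a.example', id='a', created_at=None)
    s_b = Subscription(title='Blog B', url='https://b.example', id='b', created_at=None)
    dt1 = datetime(2020, 1, 1, tzinfo=timezone.utc)
    dt2 = datetime(2021, 1, 1, tzinfo=timezone.utc)

    subs = compute_subscriptions([(dt1, [s_a, s_b])], [(dt2, [s_a])])
    # Blog B is absent from the latest snapshot, so it's kept but marked unsubscribed
    assert {s.url: s.subscribed for s in subs} == {'https://a.example': True, 'https://b.example': False}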
rid = r['id'] - website = r.get('website', rid) # meh + website = r.get('website', rid) # meh yield Subscription( created_at=None, title=r['title'], @@ -31,9 +31,9 @@ def parse_file(f: Path): ) -def states() -> Iterable[SubscriptionState]: +def states() -> Iterator[SubscriptionState]: for f in inputs(): dts = f.stem.split('_')[-1] dt = datetime.strptime(dts, '%Y%m%d%H%M%S').replace(tzinfo=timezone.utc) - subs = parse_file(f) + subs = list(parse_file(f)) yield dt, subs From 770dba5506ecd96bf52ca47cd880b7b2ca62c8c7 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 14 Aug 2024 11:28:50 +0300 Subject: [PATCH 237/302] core.common: move away import related stuff to my.core.utils.imports moving without backward compatibility, since it's extremely unlikely they are used for any external modules in fact, unclear if these methods still have much value at all, but keeping for now just in case --- my/core/common.py | 31 ------------------------------- my/core/compat.py | 10 ++-------- my/core/hpi_compat.py | 2 +- my/core/utils/imports.py | 37 +++++++++++++++++++++++++++++++++++++ my/core/utils/itertools.py | 3 ++- my/demo.py | 2 +- tests/extra/polar.py | 2 +- 7 files changed, 44 insertions(+), 43 deletions(-) create mode 100644 my/core/utils/imports.py diff --git a/my/core/common.py b/my/core/common.py index 920657a..389dedc 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -34,37 +34,6 @@ from .compat import deprecated # some helper functions PathIsh = Union[Path, str] -# TODO only used in tests? not sure if useful at all. -def import_file(p: PathIsh, name: Optional[str] = None) -> types.ModuleType: - p = Path(p) - if name is None: - name = p.stem - import importlib.util - spec = importlib.util.spec_from_file_location(name, p) - assert spec is not None, f"Fatal error; Could not create module spec from {name} {p}" - foo = importlib.util.module_from_spec(spec) - loader = spec.loader; assert loader is not None - loader.exec_module(foo) - return foo - - -def import_from(path: PathIsh, name: str) -> types.ModuleType: - path = str(path) - try: - sys.path.append(path) - import importlib - return importlib.import_module(name) - finally: - sys.path.remove(path) - - -def import_dir(path: PathIsh, extra: str='') -> types.ModuleType: - p = Path(path) - if p.parts[0] == '~': - p = p.expanduser() # TODO eh. not sure about this.. 
- return import_from(p.parent, p.name + extra) - - from .logging import setup_logger, LazyLogger diff --git a/my/core/compat.py b/my/core/compat.py index d73c60c..7bbe509 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -131,12 +131,6 @@ else: if sys.version_info[:2] >= (3, 11): - from typing import assert_never + from typing import assert_never, assert_type, Never else: - from typing_extensions import assert_never - - -if sys.version_info[:2] >= (3, 11): - from typing import Never -else: - from typing_extensions import Never + from typing_extensions import assert_never, assert_type, Never diff --git a/my/core/hpi_compat.py b/my/core/hpi_compat.py index 61121de..3c567d9 100644 --- a/my/core/hpi_compat.py +++ b/my/core/hpi_compat.py @@ -101,7 +101,7 @@ Please install {' '.join(requires)} as PIP packages (see the corresponding READM def _get_dal(cfg, module_name: str): mpath = getattr(cfg, module_name, None) if mpath is not None: - from .common import import_dir + from .utils.imports import import_dir return import_dir(mpath, '.dal') else: diff --git a/my/core/utils/imports.py b/my/core/utils/imports.py new file mode 100644 index 0000000..efd8e9a --- /dev/null +++ b/my/core/utils/imports.py @@ -0,0 +1,37 @@ +import importlib +import importlib.util +from pathlib import Path +import sys +from typing import Optional +from types import ModuleType + +from ..common import PathIsh + + +# TODO only used in tests? not sure if useful at all. +def import_file(p: PathIsh, name: Optional[str] = None) -> ModuleType: + p = Path(p) + if name is None: + name = p.stem + spec = importlib.util.spec_from_file_location(name, p) + assert spec is not None, f"Fatal error; Could not create module spec from {name} {p}" + foo = importlib.util.module_from_spec(spec) + loader = spec.loader; assert loader is not None + loader.exec_module(foo) + return foo + + +def import_from(path: PathIsh, name: str) -> ModuleType: + path = str(path) + sys.path.append(path) + try: + return importlib.import_module(name) + finally: + sys.path.remove(path) + + +def import_dir(path: PathIsh, extra: str = '') -> ModuleType: + p = Path(path) + if p.parts[0] == '~': + p = p.expanduser() # TODO eh. not sure about this.. + return import_from(p.parent, p.name + extra) diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index cab4b2c..7046acf 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -105,12 +105,13 @@ else: def test_listify() -> None: + from ..compat import assert_type + @listify def it() -> Iterator[int]: yield 1 yield 2 res = it() - from typing_extensions import assert_type # TODO move to compat? 
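A quick usage sketch for the relocated import helpers (a throwaway module file, loaded by path; not part of the patch):

    import tempfile
    from pathlib import Path

    from my.core.utils.imports import import_file

    with tempfile.TemporaryDirectory() as td:
        p = Path(td) / 'mymod.py'
        p.write_text('x = 123')
        mod = import_file(p)  # module name defaults to the file stem
        assert mod.x == 123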
assert_type(res, List[int]) assert res == [1, 2] diff --git a/my/demo.py b/my/demo.py index 75954d6..645be4f 100644 --- a/my/demo.py +++ b/my/demo.py @@ -23,7 +23,7 @@ class demo(user_config): def external_module(self): rpath = self.external if rpath is not None: - from .core.common import import_dir + from .core.utils.imports import import_dir return import_dir(rpath) import my.config.repos.external as m # type: ignore diff --git a/tests/extra/polar.py b/tests/extra/polar.py index b2bc562..b5858b6 100644 --- a/tests/extra/polar.py +++ b/tests/extra/polar.py @@ -15,7 +15,7 @@ def test_hpi(prepare: str) -> None: assert len(list(get_entries())) > 1 def test_orger(prepare: str, tmp_path: Path) -> None: - from my.core.common import import_from, import_file + from my.core.utils.imports import import_from, import_file om = import_file(ROOT / 'orger/modules/polar.py') # reload(om) From 06084a8787786bacc76705fd9e2177023a52d9fb Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 14 Aug 2024 12:56:48 +0300 Subject: [PATCH 238/302] my.core.common: move warn_if_empty to my.core.utils.itertools, cleanup and add more tests --- my/core/__init__.py | 2 +- my/core/common.py | 64 ++------------------- my/core/tests/test_common.py | 54 ------------------ my/core/utils/itertools.py | 105 ++++++++++++++++++++++++++++++++++- my/ip/all.py | 2 +- 5 files changed, 112 insertions(+), 115 deletions(-) delete mode 100644 my/core/tests/test_common.py diff --git a/my/core/__init__.py b/my/core/__init__.py index c79e36e..0ba8bda 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -1,10 +1,10 @@ # this file only keeps the most common & critical types/utility functions from .common import get_files, PathIsh, Paths from .common import Json -from .common import warn_if_empty from .common import stat, Stats from .common import datetime_naive, datetime_aware from .compat import assert_never +from .utils.itertools import warn_if_empty from .cfg import make_config from .error import Res, unwrap diff --git a/my/core/common.py b/my/core/common.py index 389dedc..f84a395 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -185,64 +185,6 @@ def get_valid_filename(s: str) -> str: return re.sub(r'(?u)[^-\w.]', '', s) -from typing import Generic, Sized, Callable - - -# X = TypeVar('X') -def _warn_iterator(it, f: Any=None): - emitted = False - for i in it: - yield i - emitted = True - if not emitted: - warnings.warn(f"Function {f} didn't emit any data, make sure your config paths are correct") - - -# TODO ugh, so I want to express something like: -# X = TypeVar('X') -# C = TypeVar('C', bound=Iterable[X]) -# _warn_iterable(it: C) -> C -# but apparently I can't??? ugh. -# https://github.com/python/typing/issues/548 -# I guess for now overloads are fine... - -from typing import overload -X = TypeVar('X') -@overload -def _warn_iterable(it: List[X] , f: Any=None) -> List[X] : ... -@overload -def _warn_iterable(it: Iterable[X], f: Any=None) -> Iterable[X]: ... -def _warn_iterable(it, f=None): - if isinstance(it, Sized): - sz = len(it) - if sz == 0: - warnings.warn(f"Function {f} returned empty container, make sure your config paths are correct") - return it - else: - return _warn_iterator(it, f=f) - - -# ok, this seems to work... -# https://github.com/python/mypy/issues/1927#issue-167100413 -FL = TypeVar('FL', bound=Callable[..., List]) -FI = TypeVar('FI', bound=Callable[..., Iterable]) - -@overload -def warn_if_empty(f: FL) -> FL: ... -@overload -def warn_if_empty(f: FI) -> FI: ... 
-
-
-def warn_if_empty(f):
-    from functools import wraps
-
-    @wraps(f)
-    def wrapped(*args, **kwargs):
-        res = f(*args, **kwargs)
-        return _warn_iterable(res, f=f)
-    return wrapped
-
-
 # global state that turns on/off quick stats
 # can use the 'quick_stats' contextmanager
 # to enable/disable this in cli so that module 'stats'
@@ -586,6 +528,12 @@ if not TYPE_CHECKING:
 
         return UI.listify(*args, **kwargs)
 
+    @deprecated('use my.core.warn_if_empty instead')
+    def warn_if_empty(*args, **kwargs):
+        from .utils import itertools as UI
+
+        return UI.warn_if_empty(*args, **kwargs)
+
     # todo wrap these in deprecated decorator as well?
     from .cachew import mcachew  # noqa: F401
 
diff --git a/my/core/tests/test_common.py b/my/core/tests/test_common.py
deleted file mode 100644
index a2019e4..0000000
--- a/my/core/tests/test_common.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from typing import Iterable, List
-import warnings
-
-from ..common import (
-    warn_if_empty,
-    _warn_iterable,
-)
-
-
-def test_warn_if_empty() -> None:
-    @warn_if_empty
-    def nonempty() -> Iterable[str]:
-        yield 'a'
-        yield 'aba'
-
-    @warn_if_empty
-    def empty() -> List[int]:
-        return []
-
-    # should be rejected by mypy!
-    # todo how to actually test it?
-    # @warn_if_empty
-    # def baad() -> float:
-    #     return 0.00
-
-    # reveal_type(nonempty)
-    # reveal_type(empty)
-
-    with warnings.catch_warnings(record=True) as w:
-        assert list(nonempty()) == ['a', 'aba']
-        assert len(w) == 0
-
-        eee = empty()
-        assert eee == []
-        assert len(w) == 1
-
-
-def test_warn_iterable() -> None:
-    i1: List[str] = ['a', 'b']
-    i2: Iterable[int] = iter([1, 2, 3])
-    # reveal_type(i1)
-    # reveal_type(i2)
-    x1 = _warn_iterable(i1)
-    x2 = _warn_iterable(i2)
-    # vvvv this should be flagged by mypy
-    # _warn_iterable(123)
-    # reveal_type(x1)
-    # reveal_type(x2)
-    with warnings.catch_warnings(record=True) as w:
-        assert x1 is i1  # should be unchanged!
-        assert len(w) == 0
-
-        assert list(x2) == [1, 2, 3]
-        assert len(w) == 0
diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py
index 7046acf..3997310 100644
--- a/my/core/utils/itertools.py
+++ b/my/core/utils/itertools.py
@@ -4,7 +4,8 @@ Various helpers/transforms of iterators
 Ideally this should be as small as possible and we should rely on stdlib itertools or more_itertools
 """
 
-from typing import Callable, Dict, Iterable, Iterator, TypeVar, List, cast, TYPE_CHECKING
+from typing import Callable, Dict, Iterable, Iterator, Sized, TypeVar, List, cast, TYPE_CHECKING
+import warnings
 
 from ..compat import ParamSpec
 
@@ -115,3 +116,105 @@ def test_listify() -> None:
     res = it()
     assert_type(res, List[int])
     assert res == [1, 2]
+
+
+@decorator
+def _warn_if_empty(func, *args, **kwargs):
+    # so there is a more_itertools.peekable which could work nicely for these purposes
+    # the downside is that it would start advancing the generator right after it's created
+    # , which can be somewhat confusing
+    iterable = func(*args, **kwargs)
+
+    if isinstance(iterable, Sized):
+        sz = len(iterable)
+        if sz == 0:
+            # todo use hpi warnings here?
+            warnings.warn(f"Function {func} returned empty container, make sure your config paths are correct")
+        return iterable
+    else:  # must be an iterator
+
+        def wit():
+            empty = True
+            for i in iterable:
+                yield i
+                empty = False
+            if empty:
+                warnings.warn(f"Function {func} didn't emit any data, make sure your config paths are correct")
+
+        return wit()
+
+
+if TYPE_CHECKING:
+    FF = TypeVar('FF', bound=Callable[..., Iterable])
+
+    def warn_if_empty(f: FF) -> FF: ...
+ +else: + warn_if_empty = _warn_if_empty + + +def test_warn_if_empty_iterator() -> None: + from ..compat import assert_type + + @warn_if_empty + def nonempty() -> Iterator[str]: + yield 'a' + yield 'aba' + + with warnings.catch_warnings(record=True) as w: + res1 = nonempty() + assert len(w) == 0 # warning isn't emitted until iterator is consumed + assert_type(res1, Iterator[str]) + # assert isinstance(res1, generator) # FIXME ??? how + assert list(res1) == ['a', 'aba'] + assert len(w) == 0 + + @warn_if_empty + def empty() -> Iterator[int]: + yield from [] + + with warnings.catch_warnings(record=True) as w: + res2 = empty() + assert len(w) == 0 # warning isn't emitted until iterator is consumed + assert_type(res2, Iterator[int]) + # assert isinstance(res1, generator) # FIXME ??? how + assert list(res2) == [] + assert len(w) == 1 + + +def test_warn_if_empty_list() -> None: + from ..compat import assert_type + + ll = [1, 2, 3] + + @warn_if_empty + def nonempty() -> List[int]: + return ll + + + with warnings.catch_warnings(record=True) as w: + res1 = nonempty() + assert len(w) == 0 + assert_type(res1, List[int]) + assert isinstance(res1, list) + assert res1 is ll # object should be unchanged! + + + @warn_if_empty + def empty() -> List[str]: + return [] + + + with warnings.catch_warnings(record=True) as w: + res2 = empty() + assert len(w) == 1 + assert_type(res2, List[str]) + assert isinstance(res2, list) + assert res2 == [] + + +def test_warn_if_empty_unsupported() -> None: + # these should be rejected by mypy! (will show "unused type: ignore" if we break it) + @warn_if_empty # type: ignore[type-var] + def bad_return_type() -> float: + return 0.00 diff --git a/my/ip/all.py b/my/ip/all.py index f4cdb37..46c1fec 100644 --- a/my/ip/all.py +++ b/my/ip/all.py @@ -11,7 +11,7 @@ REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] from typing import Iterator -from my.core.common import Stats, warn_if_empty +from my.core import Stats, warn_if_empty from my.ip.common import IP From bcc4c1530432754f1f5a9bca759ee48ea6535046 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 15 Aug 2024 13:28:45 +0300 Subject: [PATCH 239/302] core: cleanup my.core.common.unique_everseen - move to my.core.utils.itertools - more robust check for hashable types -- now checks in runtime (since the one based on types purely isn't necessarily sound) - add more testing --- my/core/common.py | 59 +------------ my/core/tests/common.py | 19 +++++ my/core/utils/itertools.py | 168 +++++++++++++++++++++++++++++++++++-- 3 files changed, 183 insertions(+), 63 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index f84a395..bfc3505 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -6,13 +6,11 @@ import functools from contextlib import contextmanager import os import sys -import types from typing import ( Any, Callable, Dict, Iterable, - Iterator, List, Optional, Sequence, @@ -21,9 +19,6 @@ from typing import ( TypeVar, Union, cast, - get_args, - get_type_hints, - get_origin, ) import warnings @@ -426,58 +421,8 @@ class DummyExecutor(Executor): self._shutdown = True -def _check_all_hashable(fun): - # TODO ok, take callable? - hints = get_type_hints(fun) - # TODO needs to be defensive like in cachew? - return_type = hints.get('return') - # TODO check if None - origin = get_origin(return_type) # Iterator etc? 
- (arg,) = get_args(return_type) - # options we wanna handle are simple type on the top level or union - arg_origin = get_origin(arg) - - if sys.version_info[:2] >= (3, 10): - is_uniontype = arg_origin is types.UnionType - else: - is_uniontype = False - - is_union = arg_origin is Union or is_uniontype - if is_union: - to_check = get_args(arg) - else: - to_check = (arg,) - - no_hash = [ - t - for t in to_check - # seems that objects that have not overridden hash have the attribute but it's set to None - if getattr(t, '__hash__', None) is None - ] - assert len(no_hash) == 0, f'Types {no_hash} are not hashable, this will result in significant performance downgrade for unique_everseen' - - -_UET = TypeVar('_UET') -_UEU = TypeVar('_UEU') - - -def unique_everseen( - fun: Callable[[], Iterable[_UET]], - key: Optional[Callable[[_UET], _UEU]] = None, -) -> Iterator[_UET]: - # TODO support normal iterable as well? - import more_itertools - - # NOTE: it has to take original callable, because otherwise we don't have access to generator type annotations - iterable = fun() - - if key is None: - # todo check key return type as well? but it's more likely to be hashable - if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None: - # TODO return better error here, e.g. if there is no return type it crashes - _check_all_hashable(fun) - - return more_itertools.unique_everseen(iterable=iterable, key=key) +# TODO deprecate and suggest to use one from my.core directly? not sure +from .utils.itertools import unique_everseen ### legacy imports, keeping them here for backwards compatibility diff --git a/my/core/tests/common.py b/my/core/tests/common.py index d6fb71e..a102ad3 100644 --- a/my/core/tests/common.py +++ b/my/core/tests/common.py @@ -1,4 +1,6 @@ +from contextlib import contextmanager import os +from typing import Iterator, Optional import pytest @@ -10,3 +12,20 @@ skip_if_uses_optional_deps = pytest.mark.skipif( V not in os.environ, reason=f'test only works when optional dependencies are installed. Set env variable {V}=true to override.', ) + + +# TODO maybe move to hpi core? +@contextmanager +def tmp_environ_set(key: str, value: Optional[str]) -> Iterator[None]: + prev_value = os.environ.get(key) + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + try: + yield + finally: + if prev_value is None: + os.environ.pop(key, None) + else: + os.environ[key] = prev_value diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index 3997310..e8802bb 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -4,12 +4,26 @@ Various helpers/transforms of iterators Ideally this should be as small as possible and we should rely on stdlib itertools or more_itertools """ -from typing import Callable, Dict, Iterable, Iterator, Sized, TypeVar, List, cast, TYPE_CHECKING +from collections.abc import Hashable +from typing import ( + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Sized, + Union, + TypeVar, + cast, + TYPE_CHECKING, +) import warnings from ..compat import ParamSpec from decorator import decorator +import more_itertools T = TypeVar('T') K = TypeVar('K') @@ -165,7 +179,6 @@ def test_warn_if_empty_iterator() -> None: res1 = nonempty() assert len(w) == 0 # warning isn't emitted until iterator is consumed assert_type(res1, Iterator[str]) - # assert isinstance(res1, generator) # FIXME ??? 
how
        assert list(res1) == ['a', 'aba']
        assert len(w) == 0
 
@@ -177,7 +190,6 @@ def test_warn_if_empty_iterator() -> None:
         res2 = empty()
         assert len(w) == 0  # warning isn't emitted until iterator is consumed
         assert_type(res2, Iterator[int])
-        # assert isinstance(res1, generator)  # FIXME ??? how
         assert list(res2) == []
         assert len(w) == 1
 
@@ -191,7 +203,6 @@ def test_warn_if_empty_list() -> None:
     def nonempty() -> List[int]:
         return ll
 
-
     with warnings.catch_warnings(record=True) as w:
         res1 = nonempty()
         assert len(w) == 0
@@ -199,12 +210,10 @@ def test_warn_if_empty_list() -> None:
         assert isinstance(res1, list)
         assert res1 is ll  # object should be unchanged!
 
-
     @warn_if_empty
     def empty() -> List[str]:
         return []
 
-
     with warnings.catch_warnings(record=True) as w:
         res2 = empty()
         assert len(w) == 1
@@ -218,3 +227,150 @@ def test_warn_if_empty_unsupported() -> None:
     @warn_if_empty  # type: ignore[type-var]
     def bad_return_type() -> float:
         return 0.00
+
+
+_HT = TypeVar('_HT', bound=Hashable)
+
+
+# NOTE: ideally we'd do It = TypeVar('It', bound=Iterable[_HT]), and function would be It -> It
+# Sadly this doesn't work in mypy, doesn't look like we can have double bound TypeVar
+# Not a huge deal, since this function is for unique_everseen and
+# we need to pass iterator to unique_everseen anyway
+# TODO maybe contribute to more_itertools? https://github.com/more-itertools/more-itertools/issues/898
+def check_if_hashable(iterable: Iterable[_HT]) -> Iterable[_HT]:
+    """
+    NOTE: Despite Hashable bound, typing annotation doesn't guarantee runtime safety
+    Consider hashable type X, and Y that inherits from X, but not hashable
+    Then l: List[X] = [Y(...)] is a valid expression, and type checks against Hashable,
+    but isn't runtime hashable
+    """
+    # Sadly this doesn't work 100% correctly with dataclasses atm...
+    # they all are considered hashable: https://github.com/python/mypy/issues/11463
+
+    if isinstance(iterable, Iterator):
+
+        def res() -> Iterator[_HT]:
+            for i in iterable:
+                assert isinstance(i, Hashable), i
+                # ugh. need a cast due to https://github.com/python/mypy/issues/10817
+                yield cast(_HT, i)
+
+        return res()
+    else:
+        # hopefully, iterable that can be iterated over multiple times?
+        # not sure if should have 'allowlist' of types that don't have to be transformed instead?
+        for i in iterable:
+            assert isinstance(i, Hashable), i
+        return iterable
+
+
+# TODO different policies -- error/warn/ignore?
+def test_check_if_hashable() -> None:
+    from dataclasses import dataclass
+    from typing import Set, Tuple
+    import pytest
+    from ..compat import assert_type
+
+    x1: List[int] = [1, 2]
+    r1 = check_if_hashable(x1)
+    # tgype: ignore[comparison-overlap]  # object should be unchanged
+    assert r1 is x1
+    assert_type(r1, Iterable[int])
+
+    x2: Iterator[Union[int, str]] = iter((123, 'aba'))
+    r2 = check_if_hashable(x2)
+    assert list(r2) == [123, 'aba']
+    assert_type(r2, Iterable[Union[int, str]])
+
+    x3: Tuple[object, ...]
= (789, 'aba')
+    r3 = check_if_hashable(x3)
+    # ttype: ignore[comparison-overlap]  # object should be unchanged
+    assert r3 is x3
+    assert_type(r3, Iterable[object])
+
+    x4: List[Set[int]] = [{1, 2, 3}, {4, 5, 6}]
+    with pytest.raises(Exception):
+        # should be rejected by mypy since set isn't Hashable, but also throw at runtime
+        r4 = check_if_hashable(x4)  # type: ignore[type-var]
+
+    x5: Iterator[object] = iter([{1, 2}, {3, 4}])
+    # here, we hide behind object, which is hashable
+    # so mypy can't really help us anything
+    r5 = check_if_hashable(x5)
+    with pytest.raises(Exception):
+        # note: this only throws when iterator is advanced
+        list(r5)
+
+    # dataclass is unhashable by default! unless frozen=True and eq=True, or unsafe_hash=True
+    @dataclass(unsafe_hash=True)
+    class X:
+        a: int
+
+    x6: List[X] = [X(a=123)]
+    r6 = check_if_hashable(x6)
+    assert x6 is r6
+
+    # inherited dataclass will not be hashable!
+    @dataclass
+    class Y(X):
+        b: str
+
+    x7: List[Y] = [Y(a=123, b='aba')]
+    with pytest.raises(Exception):
+        # ideally that would also be rejected by mypy, but currently there is a bug
+        # which treats all dataclasses as hashable: https://github.com/python/mypy/issues/11463
+        check_if_hashable(x7)
+
+
+_UET = TypeVar('_UET')
+_UEU = TypeVar('_UEU')
+
+
+# NOTE: for historic reasons, this function had to accept Callable that returns iterator
+# instead of just iterator
+# TODO maybe deprecate Callable support? not sure
+def unique_everseen(
+    fun: Union[
+        Callable[[], Iterable[_UET]],
+        Iterable[_UET]
+    ],
+    key: Optional[Callable[[_UET], _UEU]] = None,
+) -> Iterator[_UET]:
+    import os
+
+    if callable(fun):
+        iterable = fun()
+    else:
+        iterable = fun
+
+    if key is None:
+        # todo check key return type as well? but it's more likely to be hashable
+        if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
+            iterable = check_if_hashable(iterable)
+
+    return more_itertools.unique_everseen(iterable=iterable, key=key)
+
+
+def test_unique_everseen() -> None:
+    import pytest
+    from ..tests.common import tmp_environ_set
+
+    def fun_good() -> Iterator[int]:
+        yield 123
+
+    def fun_bad():
+        return [{1, 2}, {1, 2}, {1, 3}]
+
+    with tmp_environ_set('HPI_CHECK_UNIQUE_EVERSEEN', 'yes'):
+        assert list(unique_everseen(fun_good)) == [123]
+
+        with pytest.raises(Exception):
+            # since function returns a list rather than iterator, check happens immediately
+            # , even without advancing the iterator
+            unique_everseen(fun_bad)
+
+    good_list = [4, 3, 2, 1, 2, 3, 4]
+    assert list(unique_everseen(good_list)) == [4, 3, 2, 1]
+
+    with tmp_environ_set('HPI_CHECK_UNIQUE_EVERSEEN', None):
+        assert list(unique_everseen(fun_bad)) == [{1, 2}, {1, 3}]

From 66c08a6c8067bcee00876f1cd726ff2a3290567f Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Thu, 15 Aug 2024 14:15:23 +0300
Subject: [PATCH 240/302] core.common: move DummyExecutor to core.utils.concurrent

without backwards compat, unlikely it's been used by anyone
---
 my/core/common.py           | 40 -----------------------------
 my/core/utils/concurrent.py | 51 +++++++++++++++++++++++++++++++++++++
 my/pdfs.py                  |  2 +-
 3 files changed, 52 insertions(+), 41 deletions(-)
 create mode 100644 my/core/utils/concurrent.py

diff --git a/my/core/common.py b/my/core/common.py
index bfc3505..739971c 100644
--- a/my/core/common.py
+++ b/my/core/common.py
@@ -5,7 +5,6 @@ from dataclasses import is_dataclass, asdict as dataclasses_asdict
 import functools
 from contextlib import contextmanager
 import os
-import sys
 from typing import (
     Any,
     Callable,
@@ -382,45 +381,6 @@ def
assert_subpackage(name: str) -> None: assert name == '__main__' or 'my.core' in name, f'Expected module __name__ ({name}) to be __main__ or start with my.core' -from .compat import ParamSpec -_P = ParamSpec('_P') -_T = TypeVar('_T') - -# https://stackoverflow.com/a/10436851/706389 -from concurrent.futures import Future, Executor -class DummyExecutor(Executor): - def __init__(self, max_workers: Optional[int]=1) -> None: - self._shutdown = False - self._max_workers = max_workers - - if TYPE_CHECKING: - if sys.version_info[:2] <= (3, 8): - # 3.8 doesn't support ParamSpec as Callable arg :( - # and any attempt to type results in incompatible supertype.. so whatever - def submit(self, fn, *args, **kwargs): ... - else: - def submit(self, fn: Callable[_P, _T], /, *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]: ... - else: - def submit(self, fn, *args, **kwargs): - if self._shutdown: - raise RuntimeError('cannot schedule new futures after shutdown') - - f: Future[Any] = Future() - try: - result = fn(*args, **kwargs) - except KeyboardInterrupt: - raise - except BaseException as e: - f.set_exception(e) - else: - f.set_result(result) - - return f - - def shutdown(self, wait: bool=True, **kwargs) -> None: - self._shutdown = True - - # TODO deprecate and suggest to use one from my.core directly? not sure from .utils.itertools import unique_everseen diff --git a/my/core/utils/concurrent.py b/my/core/utils/concurrent.py new file mode 100644 index 0000000..cc17cda --- /dev/null +++ b/my/core/utils/concurrent.py @@ -0,0 +1,51 @@ +from concurrent.futures import Future, Executor +import sys +from typing import Any, Callable, Optional, TypeVar, TYPE_CHECKING + +from ..compat import ParamSpec + + +_P = ParamSpec('_P') +_T = TypeVar('_T') + + +# https://stackoverflow.com/a/10436851/706389 +class DummyExecutor(Executor): + """ + This is useful if you're already using Executor for parallelising, + but also want to provide an option to run the code serially (e.g. for debugging) + """ + def __init__(self, max_workers: Optional[int] = 1) -> None: + self._shutdown = False + self._max_workers = max_workers + + if TYPE_CHECKING: + if sys.version_info[:2] <= (3, 8): + # 3.8 doesn't support ParamSpec as Callable arg :( + # and any attempt to type results in incompatible supertype.. so whatever + def submit(self, fn, *args, **kwargs): ... + + else: + + def submit(self, fn: Callable[_P, _T], /, *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]: ... + + else: + + def submit(self, fn, *args, **kwargs): + if self._shutdown: + raise RuntimeError('cannot schedule new futures after shutdown') + + f: Future[Any] = Future() + try: + result = fn(*args, **kwargs) + except KeyboardInterrupt: + raise + except BaseException as e: + f.set_exception(e) + else: + f.set_result(result) + + return f + + def shutdown(self, wait: bool = True, **kwargs) -> None: + self._shutdown = True diff --git a/my/pdfs.py b/my/pdfs.py index b3ef85d..0ab4af3 100644 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -121,7 +121,7 @@ def _iter_annotations(pdfs: Sequence[Path]) -> Iterator[Res[Annotation]]: # todo how to print to stdout synchronously? # todo global config option not to use pools? useful for debugging.. 
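The pdfs.py hunk below keeps the executor-swapping pattern DummyExecutor exists for; a standalone sketch with a toy workload (not part of the patch):

    from concurrent.futures import ProcessPoolExecutor

    from my.core.utils.concurrent import DummyExecutor

    def compute(x: int) -> int:
        return x * x

    workers = 0  # set to 0 to run everything serially, e.g. while debugging
    Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor
    with Pool(workers) as pool:
        futures = [pool.submit(compute, i) for i in range(3)]
        assert [f.result() for f in futures] == [0, 1, 4]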
from concurrent.futures import ProcessPoolExecutor - from my.core.common import DummyExecutor + from my.core.utils.concurrent import DummyExecutor workers = None # use 0 for debugging Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor with Pool(workers) as pool: From c45c51af22aa9346bfd1613fccefa0617cec5bf6 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 15 Aug 2024 17:51:46 +0300 Subject: [PATCH 241/302] core.common: move stats-related stuff to my.core.stats and add more thorough tests/docs deprecate core.common.stat and core.common.Stats with backwards compatibility --- my/calendar/holidays.py | 4 +- my/core/__init__.py | 2 +- my/core/__main__.py | 13 +- my/core/common.py | 193 +------------- my/core/stats.py | 333 +++++++++++++++++++++++-- my/core/util.py | 12 - my/google/takeout/parser.py | 6 +- my/location/google.py | 2 +- my/location/google_takeout.py | 2 +- my/location/google_takeout_semantic.py | 3 +- my/reddit/all.py | 3 +- my/reddit/pushshift.py | 4 +- my/rescuetime.py | 3 +- my/smscalls.py | 9 +- 14 files changed, 343 insertions(+), 246 deletions(-) diff --git a/my/calendar/holidays.py b/my/calendar/holidays.py index 6fa3560..f73bf70 100644 --- a/my/calendar/holidays.py +++ b/my/calendar/holidays.py @@ -9,7 +9,8 @@ from datetime import date, datetime, timedelta from functools import lru_cache from typing import Union -from ..core.time import zone_to_countrycode +from my.core import Stats +from my.core.time import zone_to_countrycode @lru_cache(1) @@ -46,7 +47,6 @@ def is_workday(d: DateIsh) -> bool: return not is_holiday(d) -from ..core.common import Stats def stats() -> Stats: # meh, but not sure what would be a better test? res = {} diff --git a/my/core/__init__.py b/my/core/__init__.py index 0ba8bda..3881485 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -1,7 +1,7 @@ # this file only keeps the most common & critical types/utility functions from .common import get_files, PathIsh, Paths from .common import Json -from .common import stat, Stats +from .stats import stat, Stats from .common import datetime_naive, datetime_aware from .compat import assert_never from .utils.itertools import warn_if_empty diff --git a/my/core/__main__.py b/my/core/__main__.py index ca88513..c9a4945 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -243,9 +243,8 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li import contextlib - from .common import quick_stats - from .util import get_stats, HPIModule - from .stats import guess_stats + from .util import HPIModule + from .stats import get_stats, quick_stats from .error import warn_my_config_import_error mods: Iterable[HPIModule] @@ -276,11 +275,8 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li continue info(f'{click.style("OK", fg="green")} : {m:<50}') - # first try explicitly defined stats function: - stats = get_stats(m) - if stats is None: - # then try guessing.. not sure if should log somehow? - stats = guess_stats(m, quick=quick) + # TODO add hpi 'stats'? instead of doctor? not sure + stats = get_stats(m, guess=True) if stats is None: eprint(" - no 'stats' function, can't check the data") @@ -291,6 +287,7 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li try: kwargs = {} + # todo hmm why wouldn't they be callable?? 
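The doctor code here runs module stats in 'quick' mode; a sketch of the two ways the new my.core.stats exposes that (the quick= parameter and the quick_stats contextmanager, both defined below):

    from my.core.stats import quick_stats, stat

    def items():
        yield from range(10_000)

    print(stat(items, quick=True))  # samples only the first ~100 items
    with quick_stats():  # or enable globally, the way the cli does
        print(stat(items))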
if callable(stats) and 'quick' in inspect.signature(stats).parameters: kwargs['quick'] = quick with quick_context: diff --git a/my/core/common.py b/my/core/common.py index 739971c..926861d 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -3,7 +3,6 @@ from pathlib import Path from datetime import datetime from dataclasses import is_dataclass, asdict as dataclasses_asdict import functools -from contextlib import contextmanager import os from typing import ( Any, @@ -11,13 +10,11 @@ from typing import ( Dict, Iterable, List, - Optional, Sequence, TYPE_CHECKING, Tuple, TypeVar, Union, - cast, ) import warnings @@ -179,183 +176,6 @@ def get_valid_filename(s: str) -> str: return re.sub(r'(?u)[^-\w.]', '', s) -# global state that turns on/off quick stats -# can use the 'quick_stats' contextmanager -# to enable/disable this in cli so that module 'stats' -# functions don't have to implement custom 'quick' logic -QUICK_STATS = False - - -# in case user wants to use the stats functions/quick option -# elsewhere -- can use this decorator instead of editing -# the global state directly -@contextmanager -def quick_stats(): - global QUICK_STATS - prev = QUICK_STATS - try: - QUICK_STATS = True - yield - finally: - QUICK_STATS = prev - - -C = TypeVar('C') -Stats = Dict[str, Any] -StatsFun = Callable[[], Stats] -# todo not sure about return type... -def stat( - func: Union[Callable[[], Iterable[C]], Iterable[C]], - *, - quick: bool = False, - name: Optional[str] = None, -) -> Stats: - if callable(func): - fr = func() - if hasattr(fr, '__enter__') and hasattr(fr, '__exit__'): - # context managers has Iterable type, but they aren't data providers - # sadly doesn't look like there is a way to tell from typing annotations - return {} - fname = func.__name__ - else: - # meh. means it's just a list.. not sure how to generate a name then - fr = func - fname = f'unnamed_{id(fr)}' - type_name = type(fr).__name__ - if type_name == 'DataFrame': - # dynamic, because pandas is an optional dependency.. - df = cast(Any, fr) # todo ugh, not sure how to annotate properly - res = dict( - dtypes=df.dtypes.to_dict(), - rows=len(df), - ) - else: - res = _stat_iterable(fr, quick=quick) - - stat_name = name if name is not None else fname - return { - stat_name: res, - } - - -def _stat_iterable(it: Iterable[C], quick: bool = False) -> Any: - from more_itertools import ilen, take, first - - # todo not sure if there is something in more_itertools to compute this? - total = 0 - errors = 0 - first_item = None - last_item = None - - def funcit(): - nonlocal errors, first_item, last_item, total - for x in it: - total += 1 - if isinstance(x, Exception): - errors += 1 - else: - last_item = x - if first_item is None: - first_item = x - yield x - - eit = funcit() - count: Any - if quick or QUICK_STATS: - initial = take(100, eit) - count = len(initial) - if first(eit, None) is not None: # todo can actually be none... - # haven't exhausted - count = f'{count}+' - else: - count = ilen(eit) - - res = { - 'count': count, - } - - if total == 0: - # not sure but I guess a good balance? wouldn't want to throw early here? 
- res['warning'] = 'THE ITERABLE RETURNED NO DATA' - - if errors > 0: - res['errors'] = errors - - def stat_item(item): - if item is None: - return None - if isinstance(item, Path): - return str(item) - return guess_datetime(item) - - if (stat_first := stat_item(first_item)) is not None: - res['first'] = stat_first - - if (stat_last := stat_item(last_item)) is not None: - res['last'] = stat_last - - return res - - -def test_stat_iterable() -> None: - from datetime import datetime, timedelta, timezone - from typing import NamedTuple - - dd = datetime.fromtimestamp(123, tz=timezone.utc) - day = timedelta(days=3) - - X = NamedTuple('X', [('x', int), ('d', datetime)]) - - def it(): - yield RuntimeError('oops!') - for i in range(2): - yield X(x=i, d=dd + day * i) - yield RuntimeError('bad!') - for i in range(3): - yield X(x=i * 10, d=dd + day * (i * 10)) - yield X(x=123, d=dd + day * 50) - - res = _stat_iterable(it()) - assert res['count'] == 1 + 2 + 1 + 3 + 1 - assert res['errors'] == 1 + 1 - assert res['last'] == dd + day * 50 - - -# experimental, not sure about it.. -def guess_datetime(x: Any) -> Optional[datetime]: - # todo hmm implement withoutexception.. - try: - d = asdict(x) - except: # noqa: E722 bare except - return None - for k, v in d.items(): - if isinstance(v, datetime): - return v - return None - -def test_guess_datetime() -> None: - from datetime import datetime - from dataclasses import dataclass - from typing import NamedTuple - - dd = compat.fromisoformat('2021-02-01T12:34:56Z') - - # ugh.. https://github.com/python/mypy/issues/7281 - A = NamedTuple('A', [('x', int)]) - B = NamedTuple('B', [('x', int), ('created', datetime)]) - - assert guess_datetime(A(x=4)) is None - assert guess_datetime(B(x=4, created=dd)) == dd - - @dataclass - class C: - a: datetime - x: int - assert guess_datetime(C(a=dd, x=435)) == dd - # TODO not sure what to return when multiple datetime fields? - # TODO test @property? - - def is_namedtuple(thing: Any) -> bool: # basic check to see if this is namedtuple-like _asdict = getattr(thing, '_asdict', None) @@ -389,6 +209,9 @@ from .utils.itertools import unique_everseen ## hiding behind TYPE_CHECKING so it works in runtime ## in principle, warnings.deprecated decorator should cooperate with mypy, but doesn't look like it works atm? ## perhaps it doesn't work when it's used from typing_extensions + +from .compat import Never + if not TYPE_CHECKING: @deprecated('use my.core.compat.assert_never instead') @@ -439,6 +262,12 @@ if not TYPE_CHECKING: return UI.listify(*args, **kwargs) + @deprecated('use my.core.stat instead') + def stat(*args, **kwargs): + from . import stats + + return stats.stat(*args, **kwargs) + # todo wrap these in deprecated decorator as well? from .cachew import mcachew # noqa: F401 @@ -447,7 +276,7 @@ if not TYPE_CHECKING: # TODO hmm how to deprecate it in runtime? tricky cause it's actually a class? tzdatetime = datetime_aware else: - from .compat import Never - tzdatetime = Never # makes it invalid as a type while working in runtime + +Stats = Never ### diff --git a/my/core/stats.py b/my/core/stats.py index 44735b8..d5a43c3 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -1,23 +1,181 @@ ''' Helpers for hpi doctor/stats functionality. 
'''
+
 import collections
+from contextlib import contextmanager
+from datetime import datetime
 import importlib
 import inspect
+from pathlib import Path
+from types import ModuleType
 import typing
-from typing import Optional, Callable, Any, Iterator, Sequence, Dict, List
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Protocol,
+    Sequence,
+    Union,
+    cast,
+)
-from .common import StatsFun, Stats, stat
+
+Stats = Dict[str, Any]
+
+
+class StatsFun(Protocol):
+    def __call__(self, quick: bool = False) -> Stats: ...
+
+
+# global state that turns on/off quick stats
+# can use the 'quick_stats' contextmanager
+# to enable/disable this in cli so that module 'stats'
+# functions don't have to implement custom 'quick' logic
+QUICK_STATS = False
+
+
+# in case user wants to use the stats functions/quick option
+# elsewhere -- can use this decorator instead of editing
+# the global state directly
+@contextmanager
+def quick_stats():
+    global QUICK_STATS
+    prev = QUICK_STATS
+    try:
+        QUICK_STATS = True
+        yield
+    finally:
+        QUICK_STATS = prev
+
+
+def stat(
+    func: Union[Callable[[], Iterable[Any]], Iterable[Any]],
+    *,
+    quick: bool = False,
+    name: Optional[str] = None,
+) -> Stats:
+    """
+    Extracts various statistics from a passed iterable/callable, e.g.:
+    - number of items
+    - first/last item
+    - timestamps associated with first/last item
+
+    If quick is set, then only the first 100 items of the iterable will be processed
+    """
+    if callable(func):
+        fr = func()
+        if hasattr(fr, '__enter__') and hasattr(fr, '__exit__'):
+            # context managers have Iterable type, but they aren't data providers
+            # sadly doesn't look like there is a way to tell from typing annotations
+            # Ideally we'd detect this in is_data_provider...
+            # but there is no way of knowing without actually calling it first :(
+            return {}
+        fname = func.__name__
+    else:
+        # meh. means it's just a list.. not sure how to generate a name then
+        fr = func
+        fname = f'unnamed_{id(fr)}'
+    type_name = type(fr).__name__
+    extras = {}
+    if type_name == 'DataFrame':
+        # dynamic, because pandas is an optional dependency..
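+        # (note: the type is checked by name rather than via isinstance, so that
+        # pandas only gets imported when a dataframe was actually passed in;
+        # the dataframe is then flattened to records and reuses the generic
+        # _stat_iterable codepath below)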
+        df = cast(Any, fr)  # todo ugh, not sure how to annotate properly
+        df = df.reset_index()
+
+        fr = df.to_dict(orient='records')
+
+        dtypes = df.dtypes.to_dict()
+        extras['dtypes'] = dtypes
+
+    res = _stat_iterable(fr, quick=quick)
+    res.update(extras)
+
+    stat_name = name if name is not None else fname
+    return {
+        stat_name: res,
+    }
+
+
+def test_stat() -> None:
+    # the bulk of testing is in test_stat_iterable
+
+    # works with 'anonymous' lists
+    res = stat([1, 2, 3])
+    [(name, v)] = res.items()
+    # note: name will be a little funny since anonymous list doesn't have one
+    assert v == {'count': 3}
+    #
+
+    # works with functions:
+    def fun():
+        return [4, 5, 6]
+
+    assert stat(fun) == {'fun': {'count': 3}}
+    #
+
+    # context managers are technically iterable
+    # , but usually we wouldn't want to compute stats for them
+    # this is mainly intended for guess_stats,
+    # since it can't tell whether the function is a ctx manager without calling it
+    @contextmanager
+    def cm():
+        yield 1
+        yield 3
+
+    assert stat(cm) == {}  # type: ignore[arg-type]
+    #
+
+    # works with pandas dataframes
+    import pandas as pd
+    import numpy as np
+
+    def df() -> pd.DataFrame:
+        dates = pd.date_range(start='2024-02-10 08:00', end='2024-02-11 16:00', freq='5h')
+        return pd.DataFrame([f'value{i}' for i, _ in enumerate(dates)], index=dates, columns=['value'])
+
+    assert stat(df) == {
+        'df': {
+            'count': 7,
+            'dtypes': {
+                'index': np.dtype('<M8[ns]'),
+                'value': np.dtype('O'),
+            },
+            'first': pd.Timestamp('2024-02-10 08:00'),
+            'last': pd.Timestamp('2024-02-11 14:00'),
+        },
+    }
+    #
+
+
+def get_stats(module_name: str) -> Optional[StatsFun]:
+    stats: Optional[StatsFun] = None
+    try:
+        module = importlib.import_module(module_name)
+    except Exception:
+        return None
+    stats = getattr(module, 'stats', None)
+    if stats is None:
+        stats = guess_stats(module)
+    return stats


 # TODO maybe could be enough to annotate OUTPUTS or something like that?
 # then stats could just use them as hints?
-def guess_stats(module_name: str, quick: bool = False) -> Optional[StatsFun]:
-    providers = guess_data_providers(module_name)
+def guess_stats(module: ModuleType) -> Optional[StatsFun]:
+    """
+    If the module doesn't have an explicitly defined 'stat' function,
+    this is used to try to guess what could be included in stats automatically
+    """
+    providers = _guess_data_providers(module)
     if len(providers) == 0:
         return None

-    def auto_stats() -> Stats:
+    def auto_stats(quick: bool = False) -> Stats:
         res = {}
         for k, v in providers.items():
             res.update(stat(v, quick=quick, name=k))
@@ -27,12 +185,11 @@ def guess_stats(module_name: str, quick: bool = False) -> Optional[StatsFun]:


 def test_guess_stats() -> None:
-    from datetime import datetime
     import my.core.tests.auto_stats as M

-    auto_stats = guess_stats(M.__name__)
+    auto_stats = guess_stats(M)
     assert auto_stats is not None
-    res = auto_stats()
+    res = auto_stats(quick=False)

     assert res == {
         'inputs': {
@@ -48,15 +205,15 @@ def test_guess_stats() -> None:
     }


-def guess_data_providers(module_name: str) -> Dict[str, Callable]:
-    module = importlib.import_module(module_name)
+def _guess_data_providers(module: ModuleType) -> Dict[str, Callable]:
     mfunctions = inspect.getmembers(module, inspect.isfunction)
     return {k: v for k, v in mfunctions if is_data_provider(v)}


-# todo how to exclude deprecated stuff?
+# todo how to exclude deprecated data providers?
 def is_data_provider(fun: Any) -> bool:
     """
+    Criteria for being a "data provider":
     1. returns iterable or something like that
     2. takes no arguments? (otherwise not callable by stats anyway?)
     3. doesn't start with an underscore (those are probably helper functions?)
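+    (for example, a hypothetical 'def messages() -> Iterator[Message]' taking no
+    required arguments would qualify, while '_helper()' or 'fun(arg)' would not --
+    see test_is_data_provider below)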
@@ -72,7 +229,7 @@ def is_data_provider(fun: Any) -> bool: return False # has at least one argument without default values - if len(list(sig_required_params(sig))) > 0: + if len(list(_sig_required_params(sig))) > 0: return False if hasattr(fun, '__name__'): @@ -88,7 +245,7 @@ def is_data_provider(fun: Any) -> bool: if return_type is None: return False - return type_is_iterable(return_type) + return _type_is_iterable(return_type) def test_is_data_provider() -> None: @@ -99,6 +256,7 @@ def test_is_data_provider() -> None: def no_return_type(): return [1, 2, 3] + assert not idp(no_return_type) lam = lambda: [1, 2] @@ -106,27 +264,34 @@ def test_is_data_provider() -> None: def has_extra_args(count) -> List[int]: return list(range(count)) + assert not idp(has_extra_args) def has_return_type() -> Sequence[str]: return ['a', 'b', 'c'] + assert idp(has_return_type) def _helper_func() -> Iterator[Any]: yield 1 + assert not idp(_helper_func) def inputs() -> Iterator[Any]: yield 1 + assert idp(inputs) def producer_inputs() -> Iterator[Any]: yield 1 + assert idp(producer_inputs) -# return any parameters the user is required to provide - those which don't have default values -def sig_required_params(sig: inspect.Signature) -> Iterator[inspect.Parameter]: +def _sig_required_params(sig: inspect.Signature) -> Iterator[inspect.Parameter]: + """ + Returns parameters the user is required to provide - e.g. ones that don't have default values + """ for param in sig.parameters.values(): if param.default == inspect.Parameter.empty: yield param @@ -136,21 +301,24 @@ def test_sig_required_params() -> None: def x() -> int: return 5 - assert len(list(sig_required_params(inspect.signature(x)))) == 0 + + assert len(list(_sig_required_params(inspect.signature(x)))) == 0 def y(arg: int) -> int: return arg - assert len(list(sig_required_params(inspect.signature(y)))) == 1 + + assert len(list(_sig_required_params(inspect.signature(y)))) == 1 # from stats perspective, this should be treated as a data provider as well # could be that the default value to the data provider is the 'default' # path to use for inputs/a function to provide input data def z(arg: int = 5) -> int: return arg - assert len(list(sig_required_params(inspect.signature(z)))) == 0 + + assert len(list(_sig_required_params(inspect.signature(z)))) == 0 -def type_is_iterable(type_spec) -> bool: +def _type_is_iterable(type_spec) -> bool: origin = typing.get_origin(type_spec) if origin is None: return False @@ -167,9 +335,7 @@ def type_is_iterable(type_spec) -> bool: # todo docstring test? def test_type_is_iterable() -> None: - from typing import List, Sequence, Iterable, Dict, Any - - fun = type_is_iterable + fun = _type_is_iterable assert not fun(None) assert not fun(int) assert not fun(Any) @@ -178,3 +344,126 @@ def test_type_is_iterable() -> None: assert fun(List[int]) assert fun(Sequence[Dict[str, str]]) assert fun(Iterable[Any]) + + +def _stat_item(item): + if item is None: + return None + if isinstance(item, Path): + return str(item) + return _guess_datetime(item) + + +def _stat_iterable(it: Iterable[Any], quick: bool = False) -> Stats: + from more_itertools import ilen, take, first + + # todo not sure if there is something in more_itertools to compute this? 
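+    # (the nested generator below makes a single lazy pass over the iterable:
+    # it counts items and errors and remembers the first/last values while
+    # the items stream through to the counting logic)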
+ total = 0 + errors = 0 + first_item = None + last_item = None + + def funcit(): + nonlocal errors, first_item, last_item, total + for x in it: + total += 1 + if isinstance(x, Exception): + errors += 1 + else: + last_item = x + if first_item is None: + first_item = x + yield x + + eit = funcit() + count: Any + if quick or QUICK_STATS: + initial = take(100, eit) + count = len(initial) + if first(eit, None) is not None: # todo can actually be none... + # haven't exhausted + count = f'{count}+' + else: + count = ilen(eit) + + res = { + 'count': count, + } + + if total == 0: + # not sure but I guess a good balance? wouldn't want to throw early here? + res['warning'] = 'THE ITERABLE RETURNED NO DATA' + + if errors > 0: + res['errors'] = errors + + if (stat_first := _stat_item(first_item)) is not None: + res['first'] = stat_first + + if (stat_last := _stat_item(last_item)) is not None: + res['last'] = stat_last + + return res + + +def test_stat_iterable() -> None: + from datetime import datetime, timedelta, timezone + from typing import NamedTuple + + dd = datetime.fromtimestamp(123, tz=timezone.utc) + day = timedelta(days=3) + + X = NamedTuple('X', [('x', int), ('d', datetime)]) + + def it(): + yield RuntimeError('oops!') + for i in range(2): + yield X(x=i, d=dd + day * i) + yield RuntimeError('bad!') + for i in range(3): + yield X(x=i * 10, d=dd + day * (i * 10)) + yield X(x=123, d=dd + day * 50) + + res = _stat_iterable(it()) + assert res['count'] == 1 + 2 + 1 + 3 + 1 + assert res['errors'] == 1 + 1 + assert res['last'] == dd + day * 50 + + +# experimental, not sure about it.. +def _guess_datetime(x: Any) -> Optional[datetime]: + from .common import asdict # avoid circular imports + + # todo hmm implement without exception.. + try: + d = asdict(x) + except: # noqa: E722 bare except + return None + for k, v in d.items(): + if isinstance(v, datetime): + return v + return None + + +def test_guess_datetime() -> None: + from dataclasses import dataclass + from typing import NamedTuple + from .compat import fromisoformat + + dd = fromisoformat('2021-02-01T12:34:56Z') + + # ugh.. https://github.com/python/mypy/issues/7281 + A = NamedTuple('A', [('x', int)]) + B = NamedTuple('B', [('x', int), ('created', datetime)]) + + assert _guess_datetime(A(x=4)) is None + assert _guess_datetime(B(x=4, created=dd)) == dd + + @dataclass + class C: + a: datetime + x: int + + assert _guess_datetime(C(a=dd, x=435)) == dd + # TODO not sure what to return when multiple datetime fields? + # TODO test @property? diff --git a/my/core/util.py b/my/core/util.py index 1ca2de1..57e41d4 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -1,6 +1,5 @@ from pathlib import Path from itertools import chain -from importlib import import_module import os import pkgutil import sys @@ -15,17 +14,6 @@ def modules() -> Iterable[HPIModule]: yield m -from .common import StatsFun -def get_stats(module: str) -> Optional[StatsFun]: - # todo detect via ast? 
- try: - mod = import_module(module) - except Exception: - return None - - return getattr(mod, 'stats', None) - - __NOT_HPI_MODULE__ = 'Import this to mark a python file as a helper, not an actual HPI module' from .discovery_pure import NOT_HPI_MODULE_VAR assert NOT_HPI_MODULE_VAR in globals() # check name consistency diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 952c9b6..3ddd99d 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -18,9 +18,9 @@ from contextlib import ExitStack import os from typing import List, Sequence, cast from pathlib import Path -from my.core import make_config, dataclass +from my.core import make_config, dataclass, stat, Stats from my.core.cachew import mcachew -from my.core.common import Stats, LazyLogger, get_files, Paths +from my.core.common import LazyLogger, get_files, Paths from my.core.error import ErrorPolicy from my.core.structure import match_structure @@ -133,8 +133,6 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: def stats() -> Stats: - from my.core import stat - return { **stat(events), } diff --git a/my/location/google.py b/my/location/google.py index c1539e7..11cb576 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -19,6 +19,7 @@ import re # pip3 install geopy import geopy # type: ignore +from my.core import stat, Stats from my.core.common import LazyLogger from my.core.cachew import cache_dir, mcachew @@ -164,7 +165,6 @@ def locations(**kwargs) -> Iterable[Location]: return _iter_locations(path=last_takeout, **kwargs) -from ..core.common import stat, Stats def stats() -> Stats: return stat(locations) diff --git a/my/location/google_takeout.py b/my/location/google_takeout.py index 2fac270..eb757ce 100644 --- a/my/location/google_takeout.py +++ b/my/location/google_takeout.py @@ -9,8 +9,8 @@ from typing import Iterator from my.google.takeout.parser import events, _cachew_depends_on from google_takeout_parser.models import Location as GoogleLocation +from my.core import stat, Stats, LazyLogger from my.core.cachew import mcachew -from my.core.common import LazyLogger, stat, Stats from .common import Location logger = LazyLogger(__name__) diff --git a/my/location/google_takeout_semantic.py b/my/location/google_takeout_semantic.py index 014959c..4257f81 100644 --- a/my/location/google_takeout_semantic.py +++ b/my/location/google_takeout_semantic.py @@ -12,9 +12,8 @@ from typing import Iterator, List from my.google.takeout.parser import events, _cachew_depends_on as _parser_cachew_depends_on from google_takeout_parser.models import PlaceVisit as SemanticLocation -from my.core import dataclass, make_config +from my.core import dataclass, make_config, stat, LazyLogger, Stats from my.core.cachew import mcachew -from my.core.common import LazyLogger, Stats, stat from my.core.error import Res from .common import Location diff --git a/my/reddit/all.py b/my/reddit/all.py index a668081..daedba1 100644 --- a/my/reddit/all.py +++ b/my/reddit/all.py @@ -1,5 +1,5 @@ from typing import Iterator -from my.core.common import Stats +from my.core import stat, Stats from my.core.source import import_source from .common import Save, Upvote, Comment, Submission, _merge_comments @@ -58,7 +58,6 @@ def upvoted() -> Iterator[Upvote]: yield from upvoted() def stats() -> Stats: - from my.core import stat return { **stat(saved), **stat(comments), diff --git a/my/reddit/pushshift.py b/my/reddit/pushshift.py index 1c7ec8d..9580005 100644 --- a/my/reddit/pushshift.py +++ 
b/my/reddit/pushshift.py @@ -8,8 +8,9 @@ REQUIRES = [ "git+https://github.com/seanbreckenridge/pushshift_comment_export", ] -from my.core.common import Paths, Stats from dataclasses import dataclass + +from my.core import Paths, Stats, stat from my.core.cfg import make_config # note: keeping pushshift import before config import, so it's handled gracefully by import_source @@ -43,7 +44,6 @@ def comments() -> Iterator[PComment]: yield from read_file(f) def stats() -> Stats: - from my.core import stat return { **stat(comments) } diff --git a/my/rescuetime.py b/my/rescuetime.py index 774b587..c493e8e 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -9,7 +9,7 @@ from pathlib import Path from datetime import timedelta from typing import Sequence, Iterable -from my.core import get_files, make_logger +from my.core import get_files, make_logger, stat, Stats from my.core.cachew import mcachew from my.core.error import Res, split_errors @@ -47,7 +47,6 @@ def dataframe() -> DataFrameT: return as_dataframe(entries()) -from .core import stat, Stats def stats() -> Stats: return { **stat(groups), diff --git a/my/smscalls.py b/my/smscalls.py index 23fb5cc..f436709 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -7,7 +7,9 @@ Exported using https://play.google.com/store/apps/details?id=com.riteshsahu.SMSB REQUIRES = ['lxml'] -from .core import Paths, dataclass +from dataclasses import dataclass + +from my.core import get_files, stat, Paths, Stats from my.config import smscalls as user_config @dataclass @@ -15,7 +17,7 @@ class smscalls(user_config): # path[s] that SMSBackupRestore syncs XML files to export_path: Paths -from .core.cfg import make_config +from my.core.cfg import make_config config = make_config(smscalls) from datetime import datetime, timezone @@ -24,7 +26,6 @@ from typing import NamedTuple, Iterator, Set, Tuple, Optional, Any, Dict, List from lxml import etree -from my.core.common import get_files, Stats from my.core.error import Res @@ -316,8 +317,6 @@ def _parse_dt_ms(d: str) -> datetime: def stats() -> Stats: - from .core import stat - return { **stat(calls), **stat(messages), From 88f3c17c2726d9c2449043dc8599413892b0325d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 15 Aug 2024 18:44:12 +0300 Subject: [PATCH 242/302] core.common: move mime-related stuff to my.core.mime no backward compat, unlikely it was used by anyone else --- my/core/common.py | 22 ---------------------- my/core/mime.py | 34 ++++++++++++++++++++++++++++++++++ my/photos/main.py | 3 ++- 3 files changed, 36 insertions(+), 23 deletions(-) create mode 100644 my/core/mime.py diff --git a/my/core/common.py b/my/core/common.py index 926861d..32e277b 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -2,7 +2,6 @@ from glob import glob as do_glob from pathlib import Path from datetime import datetime from dataclasses import is_dataclass, asdict as dataclasses_asdict -import functools import os from typing import ( Any, @@ -117,27 +116,6 @@ def get_files( return tuple(paths) -@functools.lru_cache(1) -def _magic(): - import magic # type: ignore - return magic.Magic(mime=True) - - -# TODO could reuse in pdf module? -import mimetypes # todo do I need init()? -# todo wtf? fastermime thinks it's mime is application/json even if the extension is xz?? 
-# whereas magic detects correctly: application/x-zstd and application/x-xz -def fastermime(path: PathIsh) -> str: - paths = str(path) - # mimetypes is faster - (mime, _) = mimetypes.guess_type(paths) - if mime is not None: - return mime - # magic is slower but returns more stuff - # TODO Result type?; it's kinda racey, but perhaps better to let the caller decide? - return _magic().from_file(paths) - - Json = Dict[str, Any] diff --git a/my/core/mime.py b/my/core/mime.py new file mode 100644 index 0000000..c8abc7e --- /dev/null +++ b/my/core/mime.py @@ -0,0 +1,34 @@ +""" +Utils for mime/filetype handling +""" + +from .common import assert_subpackage; assert_subpackage(__name__) + +import functools + +from .common import PathIsh + + +@functools.lru_cache(1) +def _magic(): + import magic # type: ignore + + # TODO also has uncompess=True? could be useful + return magic.Magic(mime=True) + + +# TODO could reuse in pdf module? +import mimetypes # todo do I need init()? + + +# todo wtf? fastermime thinks it's mime is application/json even if the extension is xz?? +# whereas magic detects correctly: application/x-zstd and application/x-xz +def fastermime(path: PathIsh) -> str: + paths = str(path) + # mimetypes is faster, so try it first + (mime, _) = mimetypes.guess_type(paths) + if mime is not None: + return mime + # magic is slower but handles more types + # TODO Result type?; it's kinda racey, but perhaps better to let the caller decide? + return _magic().from_file(paths) diff --git a/my/photos/main.py b/my/photos/main.py index 622d475..6262eac 100644 --- a/my/photos/main.py +++ b/my/photos/main.py @@ -15,9 +15,10 @@ from typing import Optional, NamedTuple, Iterator, Iterable, List from geopy.geocoders import Nominatim # type: ignore -from my.core.common import LazyLogger, fastermime +from my.core import LazyLogger from my.core.error import Res, sort_res_by from my.core.cachew import cache_dir, mcachew +from my.core.mime import fastermime from my.config import photos as config # type: ignore[attr-defined] From 7f8a5023107adb21947d0b35031c0367f859f471 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 15 Aug 2024 18:49:19 +0300 Subject: [PATCH 243/302] core.common: move assert_subpackage to my.core.internal --- my/core/cachew.py | 2 +- my/core/common.py | 6 ------ my/core/freezer.py | 2 +- my/core/influxdb.py | 3 ++- my/core/internal.py | 9 +++++++++ my/core/kompress.py | 2 +- my/core/mime.py | 2 +- my/core/pytest.py | 2 +- my/core/sqlite.py | 2 +- 9 files changed, 17 insertions(+), 13 deletions(-) create mode 100644 my/core/internal.py diff --git a/my/core/cachew.py b/my/core/cachew.py index cc5a95b..bcd838d 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -1,4 +1,4 @@ -from .common import assert_subpackage; assert_subpackage(__name__) +from .internal import assert_subpackage; assert_subpackage(__name__) from contextlib import contextmanager import logging diff --git a/my/core/common.py b/my/core/common.py index 32e277b..f23ffbf 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -173,12 +173,6 @@ def asdict(thing: Any) -> Json: raise TypeError(f'Could not convert object {thing} to dict') -def assert_subpackage(name: str) -> None: - # can lead to some unexpected issues if you 'import cachew' which being in my/core directory.. so let's protect against it - # NOTE: if we use overlay, name can be smth like my.origg.my.core.cachew ... 
- assert name == '__main__' or 'my.core' in name, f'Expected module __name__ ({name}) to be __main__ or start with my.core' - - # TODO deprecate and suggest to use one from my.core directly? not sure from .utils.itertools import unique_everseen diff --git a/my/core/freezer.py b/my/core/freezer.py index 649a2b7..09ba032 100644 --- a/my/core/freezer.py +++ b/my/core/freezer.py @@ -1,4 +1,4 @@ -from .common import assert_subpackage; assert_subpackage(__name__) +from .internal import assert_subpackage; assert_subpackage(__name__) import dataclasses as dcl import inspect diff --git a/my/core/influxdb.py b/my/core/influxdb.py index 8407264..4f0e4c4 100644 --- a/my/core/influxdb.py +++ b/my/core/influxdb.py @@ -1,7 +1,8 @@ ''' TODO doesn't really belong to 'core' morally, but can think of moving out later ''' -from .common import assert_subpackage; assert_subpackage(__name__) + +from .internal import assert_subpackage; assert_subpackage(__name__) from typing import Iterable, Any, Optional, Dict diff --git a/my/core/internal.py b/my/core/internal.py new file mode 100644 index 0000000..8b9882b --- /dev/null +++ b/my/core/internal.py @@ -0,0 +1,9 @@ +""" +Utils specific to hpi core, shouldn't really be used by HPI modules +""" + + +def assert_subpackage(name: str) -> None: + # can lead to some unexpected issues if you 'import cachew' which being in my/core directory.. so let's protect against it + # NOTE: if we use overlay, name can be smth like my.origg.my.core.cachew ... + assert name == '__main__' or 'my.core' in name, f'Expected module __name__ ({name}) to be __main__ or start with my.core' diff --git a/my/core/kompress.py b/my/core/kompress.py index 25dba8c..6ab3228 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -1,4 +1,4 @@ -from .common import assert_subpackage; assert_subpackage(__name__) +from .internal import assert_subpackage; assert_subpackage(__name__) from . import warnings # do this later -- for now need to transition modules to avoid using kompress directly (e.g. 
ZipPath) diff --git a/my/core/mime.py b/my/core/mime.py index c8abc7e..cf5bdf5 100644 --- a/my/core/mime.py +++ b/my/core/mime.py @@ -2,7 +2,7 @@ Utils for mime/filetype handling """ -from .common import assert_subpackage; assert_subpackage(__name__) +from .internal import assert_subpackage; assert_subpackage(__name__) import functools diff --git a/my/core/pytest.py b/my/core/pytest.py index a2596fb..c73c71a 100644 --- a/my/core/pytest.py +++ b/my/core/pytest.py @@ -2,7 +2,7 @@ Helpers to prevent depending on pytest in runtime """ -from .common import assert_subpackage; assert_subpackage(__name__) +from .internal import assert_subpackage; assert_subpackage(__name__) import sys import typing diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 2580e15..4a471a9 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -1,4 +1,4 @@ -from .common import assert_subpackage; assert_subpackage(__name__) +from .internal import assert_subpackage; assert_subpackage(__name__) from contextlib import contextmanager From 2b0f92c88334e343520fb3a2ee2bceb68725b04c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 16 Aug 2024 00:14:44 +0300 Subject: [PATCH 244/302] my.core: deprecate Path/dataclass imports from my.core during type checking runtime still works for backwards compatibility --- my/arbtt.py | 6 ++++-- my/browser/active_browser.py | 3 ++- my/core/__init__.py | 14 +++++++++----- my/core/core_config.py | 6 ++++-- my/google/takeout/parser.py | 3 ++- my/lastfm.py | 3 ++- my/location/fallback/via_ip.py | 3 ++- my/location/google_takeout_semantic.py | 3 ++- my/location/gpslogger.py | 5 ++++- my/stackexchange/gdpr.py | 6 +++--- 10 files changed, 34 insertions(+), 18 deletions(-) diff --git a/my/arbtt.py b/my/arbtt.py index 941a05f..6de8cb2 100644 --- a/my/arbtt.py +++ b/my/arbtt.py @@ -6,6 +6,7 @@ REQUIRES = ['ijson', 'cffi'] # NOTE likely also needs libyajl2 from apt or elsewhere? +from dataclasses import dataclass from pathlib import Path from typing import Sequence, Iterable, List, Optional @@ -22,8 +23,9 @@ def inputs() -> Sequence[Path]: return get_files(user_config.logfiles) -from .core import dataclass, Json, PathIsh, datetime_aware -from .core.compat import fromisoformat + +from my.core import Json, PathIsh, datetime_aware +from my.core.compat import fromisoformat @dataclass diff --git a/my/browser/active_browser.py b/my/browser/active_browser.py index 601182a..6f335bd 100644 --- a/my/browser/active_browser.py +++ b/my/browser/active_browser.py @@ -4,9 +4,10 @@ Parses active browser history by backing it up with [[http://github.com/seanbrec REQUIRES = ["browserexport", "sqlite_backup"] +from dataclasses import dataclass from my.config import browser as user_config -from my.core import Paths, dataclass +from my.core import Paths @dataclass diff --git a/my/core/__init__.py b/my/core/__init__.py index 3881485..fa50413 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -1,4 +1,6 @@ # this file only keeps the most common & critical types/utility functions +from typing import TYPE_CHECKING + from .common import get_files, PathIsh, Paths from .common import Json from .stats import stat, Stats @@ -12,10 +14,12 @@ from .logging import make_logger, LazyLogger from .util import __NOT_HPI_MODULE__ -# just for brevity in modules -# todo not sure about these.. maybe best to rely on regular imports.. perhaps compare? -from dataclasses import dataclass -from pathlib import Path +if not TYPE_CHECKING: + # we used to keep these here for brevity, but feels like it only adds confusion, + # e.g. 
suggest that we perhaps somehow modify builtin behaviour or whatever + # so best to prefer explicit behaviour + from dataclasses import dataclass + from pathlib import Path __all__ = [ @@ -34,7 +38,7 @@ __all__ = [ 'Res', 'unwrap', - 'dataclass', 'Path', # TODO deprecate these from use in my.core + 'dataclass', 'Path', ] diff --git a/my/core/core_config.py b/my/core/core_config.py index e70dc05..889dbf9 100644 --- a/my/core/core_config.py +++ b/my/core/core_config.py @@ -1,10 +1,13 @@ ''' Bindings for the 'core' HPI configuration ''' + +from dataclasses import dataclass +from pathlib import Path import re from typing import Sequence, Optional -from . import warnings, PathIsh, Path +from . import warnings, PathIsh try: from my.config import core as user_config # type: ignore[attr-defined] @@ -21,7 +24,6 @@ except Exception as e: _HPI_CACHE_DIR_DEFAULT = '' -from dataclasses import dataclass @dataclass class Config(user_config): ''' diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 3ddd99d..173f99a 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -15,10 +15,11 @@ the cachew cache REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] from contextlib import ExitStack +from dataclasses import dataclass import os from typing import List, Sequence, cast from pathlib import Path -from my.core import make_config, dataclass, stat, Stats +from my.core import make_config, stat, Stats from my.core.cachew import mcachew from my.core.common import LazyLogger, get_files, Paths from my.core.error import ErrorPolicy diff --git a/my/lastfm.py b/my/lastfm.py index 64ef1b3..37cec50 100644 --- a/my/lastfm.py +++ b/my/lastfm.py @@ -2,7 +2,8 @@ Last.fm scrobbles ''' -from my.core import Paths, dataclass, make_logger +from dataclasses import dataclass +from my.core import Paths, make_logger from my.config import lastfm as user_config diff --git a/my/location/fallback/via_ip.py b/my/location/fallback/via_ip.py index f637552..87802e7 100644 --- a/my/location/fallback/via_ip.py +++ b/my/location/fallback/via_ip.py @@ -4,9 +4,10 @@ Converts IP addresses provided by my.location.ip to estimated locations REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] +from dataclasses import dataclass from datetime import timedelta -from my.core import dataclass, Stats, make_config +from my.core import Stats, make_config from my.config import location from my.core.warnings import medium diff --git a/my/location/google_takeout_semantic.py b/my/location/google_takeout_semantic.py index 4257f81..5f2c055 100644 --- a/my/location/google_takeout_semantic.py +++ b/my/location/google_takeout_semantic.py @@ -7,12 +7,13 @@ Extracts semantic location history using google_takeout_parser REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] +from dataclasses import dataclass from typing import Iterator, List from my.google.takeout.parser import events, _cachew_depends_on as _parser_cachew_depends_on from google_takeout_parser.models import PlaceVisit as SemanticLocation -from my.core import dataclass, make_config, stat, LazyLogger, Stats +from my.core import make_config, stat, LazyLogger, Stats from my.core.cachew import mcachew from my.core.error import Res from .common import Location diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py index 8fb59d0..6d158a0 100644 --- a/my/location/gpslogger.py +++ b/my/location/gpslogger.py @@ -4,8 +4,11 @@ Parse [[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (xml) files 
REQUIRES = ["gpxpy"] + +from dataclasses import dataclass + from my.config import location -from my.core import Paths, dataclass +from my.core import Paths @dataclass diff --git a/my/stackexchange/gdpr.py b/my/stackexchange/gdpr.py index 2f3b98d..5292bef 100644 --- a/my/stackexchange/gdpr.py +++ b/my/stackexchange/gdpr.py @@ -5,8 +5,9 @@ Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][officia # TODO need to merge gdpr and stexport ### config +from dataclasses import dataclass from my.config import stackexchange as user_config -from ..core import dataclass, PathIsh, make_config, get_files +from my.core import PathIsh, make_config, get_files, Json @dataclass class stackexchange(user_config): gdpr_path: PathIsh # path to GDPR zip file @@ -16,8 +17,7 @@ config = make_config(stackexchange) # TODO just merge all of them and then filter?.. not sure -from ..core.common import Json -from ..core.compat import fromisoformat +from my.core.compat import fromisoformat from typing import NamedTuple, Iterable from datetime import datetime class Vote(NamedTuple): From 614c929f95e79ad000c45da72813564cc2b6689e Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 16 Aug 2024 11:46:29 +0300 Subject: [PATCH 245/302] core.common: move Json, datetime_aware, datetime_naive, is_namedtuple, asdict to my.core.types --- my/core/__init__.py | 7 ++++-- my/core/common.py | 53 ++++++++++++++------------------------------ my/core/error.py | 3 ++- my/core/influxdb.py | 3 ++- my/core/pandas.py | 3 ++- my/core/query.py | 4 ++-- my/core/serialize.py | 2 +- my/core/stats.py | 4 ++-- my/core/time.py | 2 +- my/core/types.py | 36 ++++++++++++++++++++++++++++++ my/goodreads.py | 3 +-- my/lastfm.py | 3 +-- my/runnerup.py | 5 ++--- my/time/tz/common.py | 2 +- my/time/tz/main.py | 4 +++- 15 files changed, 78 insertions(+), 56 deletions(-) create mode 100644 my/core/types.py diff --git a/my/core/__init__.py b/my/core/__init__.py index fa50413..c65991a 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -2,9 +2,12 @@ from typing import TYPE_CHECKING from .common import get_files, PathIsh, Paths -from .common import Json from .stats import stat, Stats -from .common import datetime_naive, datetime_aware +from .types import ( + Json, + datetime_aware, + datetime_naive, +) from .compat import assert_never from .utils.itertools import warn_if_empty diff --git a/my/core/common.py b/my/core/common.py index f23ffbf..efd6f48 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -1,12 +1,8 @@ from glob import glob as do_glob from pathlib import Path -from datetime import datetime -from dataclasses import is_dataclass, asdict as dataclasses_asdict import os from typing import ( - Any, Callable, - Dict, Iterable, List, Sequence, @@ -116,9 +112,6 @@ def get_files( return tuple(paths) -Json = Dict[str, Any] - - from typing import TypeVar, Callable, Generic _R = TypeVar('_R') @@ -141,11 +134,6 @@ class classproperty(Generic[_R]): # def __get__(self) -> _R: # return self.f() -# for now just serves documentation purposes... but one day might make it statically verifiable where possible? -# TODO e.g. maybe use opaque mypy alias? 
-datetime_naive = datetime -datetime_aware = datetime - import re # https://stackoverflow.com/a/295466/706389 @@ -154,25 +142,6 @@ def get_valid_filename(s: str) -> str: return re.sub(r'(?u)[^-\w.]', '', s) -def is_namedtuple(thing: Any) -> bool: - # basic check to see if this is namedtuple-like - _asdict = getattr(thing, '_asdict', None) - return (_asdict is not None) and callable(_asdict) - - -def asdict(thing: Any) -> Json: - # todo primitive? - # todo exception? - if isinstance(thing, dict): - return thing - if is_dataclass(thing): - assert not isinstance(thing, type) # to help mypy - return dataclasses_asdict(thing) - if is_namedtuple(thing): - return thing._asdict() - raise TypeError(f'Could not convert object {thing} to dict') - - # TODO deprecate and suggest to use one from my.core directly? not sure from .utils.itertools import unique_everseen @@ -182,8 +151,6 @@ from .utils.itertools import unique_everseen ## in principle, warnings.deprecated decorator should cooperate with mypy, but doesn't look like it works atm? ## perhaps it doesn't work when it's used from typing_extensions -from .compat import Never - if not TYPE_CHECKING: @deprecated('use my.core.compat.assert_never instead') @@ -243,12 +210,26 @@ if not TYPE_CHECKING: # todo wrap these in deprecated decorator as well? from .cachew import mcachew # noqa: F401 + # TODO hmm how to deprecate these in runtime? + # tricky cause they are actually classes/types + from typing import Literal # noqa: F401 - # TODO hmm how to deprecate it in runtime? tricky cause it's actually a class? + from .stats import Stats + from .types import ( + Json, + datetime_naive, + datetime_aware, + ) + tzdatetime = datetime_aware else: - tzdatetime = Never # makes it invalid as a type while working in runtime + from .compat import Never -Stats = Never + # make these invalid during type check while working in runtime + Stats = Never + tzdatetime = Never + Json = Never + datetime_naive = Never + datetime_aware = Never ### diff --git a/my/core/error.py b/my/core/error.py index fa59137..2432a5d 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -6,6 +6,8 @@ See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail from itertools import tee from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast, Iterator, Literal +from .types import Json + T = TypeVar('T') E = TypeVar('E', bound=Exception) # TODO make covariant? @@ -176,7 +178,6 @@ def extract_error_datetime(e: Exception) -> Optional[datetime]: import traceback -from .common import Json def error_to_json(e: Exception) -> Json: estr = ''.join(traceback.format_exception(Exception, e, e.__traceback__)) return {'error': estr} diff --git a/my/core/influxdb.py b/my/core/influxdb.py index 4f0e4c4..2ac2c79 100644 --- a/my/core/influxdb.py +++ b/my/core/influxdb.py @@ -6,7 +6,8 @@ from .internal import assert_subpackage; assert_subpackage(__name__) from typing import Iterable, Any, Optional, Dict -from .common import LazyLogger, asdict, Json +from .common import LazyLogger +from .types import asdict, Json logger = LazyLogger(__name__) diff --git a/my/core/pandas.py b/my/core/pandas.py index 621682f..2b34b23 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -13,7 +13,8 @@ from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, from decorator import decorator from . 
import warnings, Res -from .common import LazyLogger, Json, asdict +from .common import LazyLogger +from .types import Json, asdict from .error import error_to_json, extract_error_datetime diff --git a/my/core/query.py b/my/core/query.py index 071f7e0..4d7363e 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -14,8 +14,8 @@ from typing import TypeVar, Tuple, Optional, Union, Callable, Iterable, Iterator import more_itertools -import my.core.error as err -from .common import is_namedtuple +from . import error as err +from .types import is_namedtuple from .error import Res, unwrap from .warnings import low diff --git a/my/core/serialize.py b/my/core/serialize.py index b5b1b3a..e38bca5 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -5,8 +5,8 @@ from decimal import Decimal from typing import Any, Optional, Callable, NamedTuple from functools import lru_cache -from .common import is_namedtuple from .error import error_to_json +from .types import is_namedtuple from .pytest import parametrize # note: it would be nice to combine the 'asdict' and _default_encode to some function diff --git a/my/core/stats.py b/my/core/stats.py index d5a43c3..d724068 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -24,6 +24,8 @@ from typing import ( cast, ) +from .types import asdict + Stats = Dict[str, Any] @@ -432,8 +434,6 @@ def test_stat_iterable() -> None: # experimental, not sure about it.. def _guess_datetime(x: Any) -> Optional[datetime]: - from .common import asdict # avoid circular imports - # todo hmm implement without exception.. try: d = asdict(x) diff --git a/my/core/time.py b/my/core/time.py index 7698332..430b082 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -3,7 +3,7 @@ from typing import Sequence, Dict import pytz -from .common import datetime_aware, datetime_naive +from .types import datetime_aware, datetime_naive def user_forced() -> Sequence[str]: diff --git a/my/core/types.py b/my/core/types.py new file mode 100644 index 0000000..c1b0add --- /dev/null +++ b/my/core/types.py @@ -0,0 +1,36 @@ +from .internal import assert_subpackage; assert_subpackage(__name__) + +from dataclasses import is_dataclass, asdict as dataclasses_asdict +from datetime import datetime +from typing import ( + Any, + Dict, +) + + +Json = Dict[str, Any] + + +# for now just serves documentation purposes... but one day might make it statically verifiable where possible? +# TODO e.g. maybe use opaque mypy alias? +datetime_naive = datetime +datetime_aware = datetime + + +def is_namedtuple(thing: Any) -> bool: + # basic check to see if this is namedtuple-like + _asdict = getattr(thing, '_asdict', None) + return (_asdict is not None) and callable(_asdict) + + +def asdict(thing: Any) -> Json: + # todo primitive? + # todo exception? + if isinstance(thing, dict): + return thing + if is_dataclass(thing): + assert not isinstance(thing, type) # to help mypy + return dataclasses_asdict(thing) + if is_namedtuple(thing): + return thing._asdict() + raise TypeError(f'Could not convert object {thing} to dict') diff --git a/my/goodreads.py b/my/goodreads.py index acf2bb9..864bd64 100644 --- a/my/goodreads.py +++ b/my/goodreads.py @@ -7,7 +7,7 @@ REQUIRES = [ from dataclasses import dataclass -from my.core import Paths +from my.core import datetime_aware, Paths from my.config import goodreads as user_config @dataclass @@ -61,7 +61,6 @@ def books() -> Iterator[dal.Book]: ####### # todo ok, not sure these really belong here... 
-from my.core.common import datetime_aware @dataclass class Event: dt: datetime_aware diff --git a/my/lastfm.py b/my/lastfm.py index 37cec50..6618738 100644 --- a/my/lastfm.py +++ b/my/lastfm.py @@ -3,7 +3,7 @@ Last.fm scrobbles ''' from dataclasses import dataclass -from my.core import Paths, make_logger +from my.core import Paths, Json, make_logger, get_files from my.config import lastfm as user_config @@ -28,7 +28,6 @@ from pathlib import Path from typing import NamedTuple, Sequence, Iterable from my.core.cachew import mcachew -from my.core.common import Json, get_files def inputs() -> Sequence[Path]: diff --git a/my/runnerup.py b/my/runnerup.py index ca09466..a21075a 100644 --- a/my/runnerup.py +++ b/my/runnerup.py @@ -10,9 +10,8 @@ from datetime import timedelta from pathlib import Path from typing import Iterable -from .core import Res, get_files -from .core.common import Json -from .core.compat import fromisoformat +from my.core import Res, get_files, Json +from my.core.compat import fromisoformat import tcxparser # type: ignore[import-untyped] diff --git a/my/time/tz/common.py b/my/time/tz/common.py index 107410a..89150c7 100644 --- a/my/time/tz/common.py +++ b/my/time/tz/common.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import Callable, Literal, cast -from my.core.common import datetime_aware +from my.core import datetime_aware ''' diff --git a/my/time/tz/main.py b/my/time/tz/main.py index 6180160..fafc5fe 100644 --- a/my/time/tz/main.py +++ b/my/time/tz/main.py @@ -1,8 +1,10 @@ ''' Timezone data provider, used to localize timezone-unaware timestamps for other modules ''' + from datetime import datetime -from my.core.common import datetime_aware + +from my.core import datetime_aware # todo hmm, kwargs isn't mypy friendly.. but specifying types would require duplicating default args. uhoh def localize(dt: datetime, **kwargs) -> datetime_aware: From 7023088d13f05a2b2987c3b7c43744b5869b4a25 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 16 Aug 2024 12:12:40 +0300 Subject: [PATCH 246/302] core.common: deprecate outdated LazyLogger alias --- my/core/__init__.py | 7 ++++++- my/core/__main__.py | 4 ++-- my/core/common.py | 15 +++++++++++---- my/core/influxdb.py | 4 ++-- my/core/logging.py | 16 +++++++++++++--- my/core/pandas.py | 4 ++-- my/core/structure.py | 4 ++-- my/foursquare.py | 4 ++-- my/google/takeout/parser.py | 5 ++--- my/jawbone/__init__.py | 4 ++-- my/location/fallback/via_ip.py | 4 ++-- my/location/google.py | 5 ++--- my/rtm.py | 4 ++-- 13 files changed, 50 insertions(+), 30 deletions(-) diff --git a/my/core/__init__.py b/my/core/__init__.py index c65991a..247aab0 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -13,10 +13,15 @@ from .utils.itertools import warn_if_empty from .cfg import make_config from .error import Res, unwrap -from .logging import make_logger, LazyLogger +from .logging import ( + make_logger, +) from .util import __NOT_HPI_MODULE__ +LazyLogger = make_logger # TODO deprecate this in favor of make_logger + + if not TYPE_CHECKING: # we used to keep these here for brevity, but feels like it only adds confusion, # e.g. 
suggest that we perhaps somehow modify builtin behaviour or whatever diff --git a/my/core/__main__.py b/my/core/__main__.py index c9a4945..e0d5c12 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -485,8 +485,8 @@ def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True) def _warn_exceptions(exc: Exception) -> None: - from my.core.common import LazyLogger - logger = LazyLogger('CLI', level='warning') + from my.core import make_logger + logger = make_logger('CLI', level='warning') logger.exception(f'hpi query: {exc}') diff --git a/my/core/common.py b/my/core/common.py index efd6f48..f20e1d4 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -15,14 +15,11 @@ import warnings from . import warnings as core_warnings from . import compat -from .compat import deprecated # some helper functions +# TODO start deprecating this? soon we'd be able to use Path | str syntax which is shorter and more explicit PathIsh = Union[Path, str] -from .logging import setup_logger, LazyLogger - - Paths = Union[Sequence[PathIsh], PathIsh] @@ -152,6 +149,7 @@ from .utils.itertools import unique_everseen ## perhaps it doesn't work when it's used from typing_extensions if not TYPE_CHECKING: + from .compat import deprecated @deprecated('use my.core.compat.assert_never instead') def assert_never(*args, **kwargs): @@ -207,9 +205,18 @@ if not TYPE_CHECKING: return stats.stat(*args, **kwargs) + @deprecated('use my.core.make_logger instead') + def LazyLogger(*args, **kwargs): + from . import logging + + return logging.LazyLogger(*args, **kwargs) + # todo wrap these in deprecated decorator as well? from .cachew import mcachew # noqa: F401 + # this is kinda internal, should just use my.core.logging.setup_logger if necessary + from .logging import setup_logger + # TODO hmm how to deprecate these in runtime? # tricky cause they are actually classes/types diff --git a/my/core/influxdb.py b/my/core/influxdb.py index 2ac2c79..c4b6409 100644 --- a/my/core/influxdb.py +++ b/my/core/influxdb.py @@ -6,11 +6,11 @@ from .internal import assert_subpackage; assert_subpackage(__name__) from typing import Iterable, Any, Optional, Dict -from .common import LazyLogger +from .logging import make_logger from .types import asdict, Json -logger = LazyLogger(__name__) +logger = make_logger(__name__) class config: diff --git a/my/core/logging.py b/my/core/logging.py index 5d2af99..882ab12 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -4,7 +4,7 @@ from functools import lru_cache import logging import os import sys -from typing import Union +from typing import Union, TYPE_CHECKING import warnings @@ -249,6 +249,16 @@ if __name__ == '__main__': ## legacy/deprecated methods for backwards compatilibity -LazyLogger = make_logger -logger = make_logger +if not TYPE_CHECKING: + from .compat import deprecated + + @deprecated('use make_logger instead') + def LazyLogger(*args, **kwargs): + return make_logger(*args, **kwargs) + + @deprecated('use make_logger instead') + def logger(*args, **kwargs): + return make_logger(*args, **kwargs) + + ## diff --git a/my/core/pandas.py b/my/core/pandas.py index 2b34b23..5688aa3 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -13,12 +13,12 @@ from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, from decorator import decorator from . 
import warnings, Res -from .common import LazyLogger +from .logging import make_logger from .types import Json, asdict from .error import error_to_json, extract_error_datetime -logger = LazyLogger(__name__) +logger = make_logger(__name__) if TYPE_CHECKING: diff --git a/my/core/structure.py b/my/core/structure.py index 7a0c2a2..458440e 100644 --- a/my/core/structure.py +++ b/my/core/structure.py @@ -8,10 +8,10 @@ from typing import Sequence, Generator, List, Union, Tuple from contextlib import contextmanager from pathlib import Path -from .common import LazyLogger +from .logging import make_logger -logger = LazyLogger(__name__, level="info") +logger = make_logger(__name__, level="info") def _structure_exists(base_dir: Path, paths: Sequence[str], partial: bool = False) -> bool: diff --git a/my/foursquare.py b/my/foursquare.py index b50ab0e..63e1837 100644 --- a/my/foursquare.py +++ b/my/foursquare.py @@ -9,11 +9,11 @@ import json # TODO pytz for timezone??? -from .core.common import get_files, LazyLogger +from my.core import get_files, make_logger from my.config import foursquare as config -logger = LazyLogger(__name__) +logger = make_logger(__name__) def inputs(): diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 173f99a..c4e5682 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -19,9 +19,8 @@ from dataclasses import dataclass import os from typing import List, Sequence, cast from pathlib import Path -from my.core import make_config, stat, Stats +from my.core import make_config, stat, Stats, get_files, Paths, make_logger from my.core.cachew import mcachew -from my.core.common import LazyLogger, get_files, Paths from my.core.error import ErrorPolicy from my.core.structure import match_structure @@ -52,7 +51,7 @@ class google(user_config): config = make_config(google) -logger = LazyLogger(__name__, level="warning") +logger = make_logger(__name__, level="warning") # patch the takeout parser logger to match the computed loglevel from google_takeout_parser.log import setup as setup_takeout_logger diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 0659bc6..5d43296 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -8,9 +8,9 @@ from pathlib import Path import pytz -from ..core.common import LazyLogger +from my.core import make_logger -logger = LazyLogger(__name__) +logger = make_logger(__name__) from my.config import jawbone as config # type: ignore[attr-defined] diff --git a/my/location/fallback/via_ip.py b/my/location/fallback/via_ip.py index 87802e7..db03c7c 100644 --- a/my/location/fallback/via_ip.py +++ b/my/location/fallback/via_ip.py @@ -27,13 +27,13 @@ config = make_config(ip_config) from functools import lru_cache from typing import Iterator, List -from my.core.common import LazyLogger +from my.core import make_logger from my.core.compat import bisect_left from my.ip.all import ips from my.location.common import Location from my.location.fallback.common import FallbackLocation, DateExact, _datetime_timestamp -logger = LazyLogger(__name__, level="warning") +logger = make_logger(__name__, level="warning") def fallback_locations() -> Iterator[FallbackLocation]: diff --git a/my/location/google.py b/my/location/google.py index 11cb576..a7a92d3 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -19,8 +19,7 @@ import re # pip3 install geopy import geopy # type: ignore -from my.core import stat, Stats -from my.core.common import LazyLogger +from my.core import stat, Stats, make_logger from 
my.core.cachew import cache_dir, mcachew from my.core.warnings import high @@ -33,7 +32,7 @@ high("Please set up my.google.takeout.parser module for better takeout support") USE_GREP = False -logger = LazyLogger(__name__) +logger = make_logger(__name__) class Location(NamedTuple): diff --git a/my/rtm.py b/my/rtm.py index 56f4d07..22752fe 100644 --- a/my/rtm.py +++ b/my/rtm.py @@ -11,7 +11,7 @@ from functools import cached_property import re from typing import Dict, List, Iterator -from my.core.common import LazyLogger, get_files +from my.core import make_logger, get_files from my.core.utils.itertools import make_dict from my.config import rtm as config @@ -22,7 +22,7 @@ import icalendar # type: ignore from icalendar.cal import Todo # type: ignore -logger = LazyLogger(__name__) +logger = make_logger(__name__) # TODO extract in a module to parse RTM's ical? From 7bfce72b7c72f470e0dcb2847c0abaf437a90a0a Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 16 Aug 2024 13:25:19 +0300 Subject: [PATCH 247/302] core: cleanup/sort imports according to `ruff check --select I` --- my/core/__init__.py | 20 +++++++++---------- my/core/__main__.py | 21 ++++++++++---------- my/core/_cpu_pool.py | 5 ++--- my/core/_deprecated/kompress.py | 15 +++++++------- my/core/cachew.py | 17 +++++++++++++--- my/core/cfg.py | 15 +++++++------- my/core/common.py | 17 ++++++++-------- my/core/compat.py | 7 +++---- my/core/core_config.py | 13 +++++++----- my/core/denylist.py | 12 +++++------ my/core/discovery_pure.py | 8 ++++---- my/core/error.py | 35 +++++++++++++++++++++++++-------- my/core/experimental.py | 2 +- my/core/freezer.py | 4 +++- my/core/hpi_compat.py | 2 +- my/core/influxdb.py | 12 +++++------ my/core/init.py | 2 +- my/core/konsume.py | 5 +++++ my/core/logging.py | 8 +++++--- my/core/orgmode.py | 8 +++++++- my/core/pandas.py | 7 +++---- my/core/preinit.py | 4 +++- my/core/query.py | 20 +++++++++++++++---- my/core/query_range.py | 14 ++++++------- my/core/serialize.py | 9 +++++---- my/core/source.py | 4 ++-- my/core/sqlite.py | 8 +++----- my/core/stats.py | 12 +++++------ my/core/structure.py | 6 ++---- my/core/tests/auto_stats.py | 2 +- my/core/tests/common.py | 3 +-- my/core/tests/denylist.py | 6 +++--- my/core/tests/sqlite.py | 4 ++-- my/core/tests/structure.py | 3 +-- my/core/tests/test_cachew.py | 3 +-- my/core/tests/test_cli.py | 2 +- my/core/tests/test_get_files.py | 8 ++++---- my/core/time.py | 2 +- my/core/types.py | 4 ++-- my/core/util.py | 15 +++++++++----- my/core/utils/concurrent.py | 6 +++--- my/core/utils/imports.py | 4 ++-- my/core/utils/itertools.py | 15 ++++++++------ my/core/warnings.py | 12 ++++++++--- my/location/common.py | 4 ++-- 45 files changed, 235 insertions(+), 170 deletions(-) diff --git a/my/core/__init__.py b/my/core/__init__.py index 247aab0..19be7fe 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -1,23 +1,21 @@ # this file only keeps the most common & critical types/utility functions from typing import TYPE_CHECKING -from .common import get_files, PathIsh, Paths -from .stats import stat, Stats +from .cfg import make_config +from .common import PathIsh, Paths, get_files +from .compat import assert_never +from .error import Res, unwrap +from .logging import ( + make_logger, +) +from .stats import Stats, stat from .types import ( Json, datetime_aware, datetime_naive, ) -from .compat import assert_never -from .utils.itertools import warn_if_empty - -from .cfg import make_config -from .error import Res, unwrap -from .logging import ( - make_logger, -) from .util 
import __NOT_HPI_MODULE__ - +from .utils.itertools import warn_if_empty LazyLogger = make_logger # TODO deprecate this in favor of make_logger diff --git a/my/core/__main__.py b/my/core/__main__.py index e0d5c12..276de26 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -1,17 +1,17 @@ -from contextlib import ExitStack import functools import importlib import inspect -from itertools import chain import os import shlex import shutil import sys import tempfile import traceback -from typing import Optional, Sequence, Iterable, List, Type, Any, Callable +from contextlib import ExitStack +from itertools import chain from pathlib import Path -from subprocess import check_call, run, PIPE, CompletedProcess, Popen +from subprocess import PIPE, CompletedProcess, Popen, check_call, run +from typing import Any, Callable, Iterable, List, Optional, Sequence, Type import click @@ -221,6 +221,8 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module from .util import HPIModule, modules + + def _modules(*, all: bool=False) -> Iterable[HPIModule]: skipped = [] for m in modules(): @@ -243,9 +245,9 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li import contextlib - from .util import HPIModule - from .stats import get_stats, quick_stats from .error import warn_my_config_import_error + from .stats import get_stats, quick_stats + from .util import HPIModule mods: Iterable[HPIModule] if len(for_modules) == 0: @@ -437,7 +439,7 @@ def _ui_getchar_pick(choices: Sequence[str], prompt: str = 'Select from: ') -> i def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True) -> Iterable[Callable[..., Any]]: - from .query import locate_qualified_function, QueryException + from .query import QueryException, locate_qualified_function from .stats import is_data_provider # if not connected to a terminal, can't prompt @@ -511,8 +513,7 @@ def query_hpi_functions( raise_exceptions: bool, drop_exceptions: bool, ) -> None: - from .query_range import select_range, RangeTuple - import my.core.error as err + from .query_range import RangeTuple, select_range # chain list of functions from user, in the order they wrote them on the CLI input_src = chain(*(f() for f in _locate_functions_or_prompt(qualified_names))) @@ -825,7 +826,7 @@ def query_cmd( hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments ''' - from datetime import datetime, date + from datetime import date, datetime chosen_order_type: Optional[Type] if order_type == "datetime": diff --git a/my/core/_cpu_pool.py b/my/core/_cpu_pool.py index 5ac66de..2369075 100644 --- a/my/core/_cpu_pool.py +++ b/my/core/_cpu_pool.py @@ -10,10 +10,9 @@ how many cores we want to dedicate to the DAL. Enabled by the env variable, specifying how many cores to dedicate e.g. "HPI_CPU_POOL=4 hpi query ..." 
""" -from concurrent.futures import ProcessPoolExecutor import os -from typing import cast, Optional - +from concurrent.futures import ProcessPoolExecutor +from typing import Optional, cast _NOT_SET = cast(ProcessPoolExecutor, object()) _INSTANCE: Optional[ProcessPoolExecutor] = _NOT_SET diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index 25b8a20..7eb9b37 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -4,13 +4,13 @@ Various helpers for compression # fmt: off from __future__ import annotations -from datetime import datetime -from functools import total_ordering import io import pathlib -from pathlib import Path import sys -from typing import Union, IO, Sequence, Any, Iterator +from datetime import datetime +from functools import total_ordering +from pathlib import Path +from typing import IO, Any, Iterator, Sequence, Union PathIsh = Union[Path, str] @@ -31,7 +31,7 @@ def is_compressed(p: Path) -> bool: def _zstd_open(path: Path, *args, **kwargs) -> IO: - import zstandard as zstd # type: ignore + import zstandard as zstd # type: ignore fh = path.open('rb') dctx = zstd.ZstdDecompressor() reader = dctx.stream_reader(fh) @@ -85,7 +85,7 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: # todo 'expected "BinaryIO"'?? return io.TextIOWrapper(ifile, encoding=encoding) elif name.endswith(Ext.lz4): - import lz4.frame # type: ignore + import lz4.frame # type: ignore return lz4.frame.open(str(pp), mode, *args, **kwargs) elif name.endswith(Ext.zstd) or name.endswith(Ext.zst): kwargs['mode'] = mode @@ -101,8 +101,8 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: return pp.open(mode, *args, **kwargs) -import typing import os +import typing if typing.TYPE_CHECKING: # otherwise mypy can't figure out that BasePath is a type alias.. @@ -147,6 +147,7 @@ def kexists(path: PathIsh, subpath: str) -> bool: import zipfile + if sys.version_info[:2] >= (3, 8): # meh... 
zipfile.Path is not available on 3.7 zipfile_Path = zipfile.Path diff --git a/my/core/cachew.py b/my/core/cachew.py index bcd838d..e0e7adf 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -1,11 +1,22 @@ from .internal import assert_subpackage; assert_subpackage(__name__) -from contextlib import contextmanager import logging -from pathlib import Path import sys -from typing import Optional, Iterator, cast, TYPE_CHECKING, TypeVar, Callable, overload, Union, Any, Type import warnings +from contextlib import contextmanager +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Iterator, + Optional, + Type, + TypeVar, + Union, + cast, + overload, +) import appdirs # type: ignore[import-untyped] diff --git a/my/core/cfg.py b/my/core/cfg.py index 0b59537..a71a7e3 100644 --- a/my/core/cfg.py +++ b/my/core/cfg.py @@ -1,6 +1,10 @@ from __future__ import annotations -from typing import TypeVar, Type, Callable, Dict, Any +import importlib +import re +import sys +from contextlib import ExitStack, contextmanager +from typing import Any, Callable, Dict, Iterator, Optional, Type, TypeVar Attrs = Dict[str, Any] @@ -27,8 +31,8 @@ def make_config(cls: Type[C], migration: Callable[[Attrs], Attrs]=lambda x: x) - F = TypeVar('F') -from contextlib import contextmanager -from typing import Iterator + + @contextmanager def _override_config(config: F) -> Iterator[F]: ''' @@ -46,9 +50,6 @@ def _override_config(config: F) -> Iterator[F]: delattr(config, k) -import importlib -import sys -from typing import Optional ModuleRegex = str @contextmanager def _reload_modules(modules: ModuleRegex) -> Iterator[None]: @@ -79,8 +80,6 @@ def _reload_modules(modules: ModuleRegex) -> Iterator[None]: sys.modules.pop(m, None) -from contextlib import ExitStack -import re @contextmanager def tmp_config(*, modules: Optional[ModuleRegex]=None, config=None): if modules is None: diff --git a/my/core/common.py b/my/core/common.py index f20e1d4..9a58caf 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -1,20 +1,20 @@ +import os +import warnings from glob import glob as do_glob from pathlib import Path -import os from typing import ( + TYPE_CHECKING, Callable, Iterable, List, Sequence, - TYPE_CHECKING, Tuple, TypeVar, Union, ) -import warnings -from . import warnings as core_warnings from . import compat +from . import warnings as core_warnings # some helper functions # TODO start deprecating this? soon we'd be able to use Path | str syntax which is shorter and more explicit @@ -92,7 +92,7 @@ def get_files( traceback.print_stack() if guess_compression: - from .kompress import CPath, is_compressed, ZipPath + from .kompress import CPath, ZipPath, is_compressed # NOTE: wrap is just for backwards compat with vendorized kompress # with kompress library, only is_compressed check and Cpath should be enough @@ -109,7 +109,7 @@ def get_files( return tuple(paths) -from typing import TypeVar, Callable, Generic +from typing import Callable, Generic, TypeVar _R = TypeVar('_R') @@ -133,6 +133,8 @@ class classproperty(Generic[_R]): import re + + # https://stackoverflow.com/a/295466/706389 def get_valid_filename(s: str) -> str: s = str(s).strip().replace(' ', '_') @@ -142,7 +144,6 @@ def get_valid_filename(s: str) -> str: # TODO deprecate and suggest to use one from my.core directly? 
not sure from .utils.itertools import unique_everseen - ### legacy imports, keeping them here for backwards compatibility ## hiding behind TYPE_CHECKING so it works in runtime ## in principle, warnings.deprecated decorator should cooperate with mypy, but doesn't look like it works atm? @@ -225,8 +226,8 @@ if not TYPE_CHECKING: from .stats import Stats from .types import ( Json, - datetime_naive, datetime_aware, + datetime_naive, ) tzdatetime = datetime_aware diff --git a/my/core/compat.py b/my/core/compat.py index 7bbe509..4372a01 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -6,7 +6,6 @@ If something is relevant to HPI itself, please put it in .hpi_compat instead import sys from typing import TYPE_CHECKING - if sys.version_info[:2] >= (3, 13): from warnings import deprecated else: @@ -48,7 +47,7 @@ else: # bisect_left doesn't have a 'key' parameter (which we use) # till python3.10 if sys.version_info[:2] <= (3, 9): - from typing import List, TypeVar, Any, Optional, Callable + from typing import Any, Callable, List, Optional, TypeVar X = TypeVar('X') @@ -131,6 +130,6 @@ else: if sys.version_info[:2] >= (3, 11): - from typing import assert_never, assert_type, Never + from typing import Never, assert_never, assert_type else: - from typing_extensions import assert_never, assert_type, Never + from typing_extensions import Never, assert_never, assert_type diff --git a/my/core/core_config.py b/my/core/core_config.py index 889dbf9..9036971 100644 --- a/my/core/core_config.py +++ b/my/core/core_config.py @@ -2,18 +2,18 @@ Bindings for the 'core' HPI configuration ''' +import re from dataclasses import dataclass from pathlib import Path -import re -from typing import Sequence, Optional +from typing import Optional, Sequence -from . import warnings, PathIsh +from . import PathIsh, warnings try: from my.config import core as user_config # type: ignore[attr-defined] except Exception as e: try: - from my.config import common as user_config # type: ignore[attr-defined] + from my.config import common as user_config # type: ignore[attr-defined] warnings.high("'common' config section is deprecated. Please rename it to 'core'.") except Exception as e2: # make it defensive, because it's pretty commonly used and would be annoying if it breaks hpi doctor etc. @@ -116,12 +116,15 @@ class Config(user_config): from .cfg import make_config + config = make_config(Config) ### tests start -from typing import Iterator from contextlib import contextmanager as ctx +from typing import Iterator + + @ctx def _reset_config() -> Iterator[Config]: # todo maybe have this decorator for the whole of my.config? 
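
All the hunks in this patch apply the same isort-style grouping that ruff's `I` rules enforce: standard library imports first, then third-party packages, then first-party/local modules, each group alphabetized and separated by a blank line, with relocated import blocks typically followed by two blank lines before the first definition. Running `ruff check --select I --fix` produces these edits automatically. A minimal before/after sketch of the convention (module names here are just illustrative, not taken from the patch):

# before: stdlib, third-party and local imports interleaved
from pathlib import Path
import click
import os
from .common import PathIsh

# after: grouped stdlib / third-party / local, alphabetized within each group
import os
from pathlib import Path

import click

from .common import PathIsh
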
diff --git a/my/core/denylist.py b/my/core/denylist.py index 8c18e06..7ca0ddf 100644 --- a/my/core/denylist.py +++ b/my/core/denylist.py @@ -5,19 +5,19 @@ A helper module for defining denylists for sources programmatically For docs, see doc/DENYLIST.md """ -import sys -import json import functools +import json +import sys from collections import defaultdict -from typing import TypeVar, Set, Any, Mapping, Iterator, Dict, List from pathlib import Path +from typing import Any, Dict, Iterator, List, Mapping, Set, TypeVar import click from more_itertools import seekable -from my.core.serialize import dumps -from my.core.common import PathIsh -from my.core.warnings import medium +from my.core.common import PathIsh +from my.core.serialize import dumps +from my.core.warnings import medium T = TypeVar("T") diff --git a/my/core/discovery_pure.py b/my/core/discovery_pure.py index 85b75ab..63d9922 100644 --- a/my/core/discovery_pure.py +++ b/my/core/discovery_pure.py @@ -16,11 +16,11 @@ NOT_HPI_MODULE_VAR = '__NOT_HPI_MODULE__' ### import ast -import os -from typing import Optional, Sequence, List, NamedTuple, Iterable, cast, Any -from pathlib import Path -import re import logging +import os +import re +from pathlib import Path +from typing import Any, Iterable, List, NamedTuple, Optional, Sequence, cast ''' None means that requirements weren't defined (different from empty requirements) diff --git a/my/core/error.py b/my/core/error.py index 2432a5d..c4dff07 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -3,14 +3,28 @@ Various error handling helpers See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail """ +import traceback +from datetime import datetime from itertools import tee -from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast, Iterator, Literal +from typing import ( + Any, + Callable, + Iterable, + Iterator, + List, + Literal, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) from .types import Json - T = TypeVar('T') -E = TypeVar('E', bound=Exception) # TODO make covariant? +E = TypeVar('E', bound=Exception) # TODO make covariant? ResT = Union[T, E] @@ -18,6 +32,7 @@ Res = ResT[T, Exception] ErrorPolicy = Literal["yield", "raise", "drop"] + def notnone(x: Optional[T]) -> T: assert x is not None return x @@ -29,6 +44,7 @@ def unwrap(res: Res[T]) -> T: else: return res + def drop_exceptions(itr: Iterator[Res[T]]) -> Iterator[T]: """Return non-errors from the iterable""" for o in itr: @@ -146,23 +162,23 @@ def test_sort_res_by() -> None: # helpers to associate timestamps with the errors (so something meaningful could be displayed on the plots, for example) # todo document it under 'patterns' somewhere... - # todo proper typevar? -from datetime import datetime def set_error_datetime(e: Exception, dt: Optional[datetime]) -> None: if dt is None: return e.args = e.args + (dt,) # todo not sure if should return new exception? 
+ def attach_dt(e: Exception, *, dt: Optional[datetime]) -> Exception: set_error_datetime(e, dt) return e + # todo it might be problematic because might mess with timezones (when it's converted to string, it's converted to a shift) def extract_error_datetime(e: Exception) -> Optional[datetime]: import re - from datetime import datetime + for x in reversed(e.args): if isinstance(x, datetime): return x @@ -177,7 +193,6 @@ def extract_error_datetime(e: Exception) -> Optional[datetime]: return None -import traceback def error_to_json(e: Exception) -> Json: estr = ''.join(traceback.format_exception(Exception, e, e.__traceback__)) return {'error': estr} @@ -185,6 +200,7 @@ def error_to_json(e: Exception) -> Json: MODULE_SETUP_URL = 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#private-configuration-myconfig' + def warn_my_config_import_error(err: Union[ImportError, AttributeError], help_url: Optional[str] = None) -> bool: """ If the user tried to import something from my.config but it failed, @@ -193,7 +209,9 @@ def warn_my_config_import_error(err: Union[ImportError, AttributeError], help_ur Returns True if it matched a possible config error """ import re + import click + if help_url is None: help_url = MODULE_SETUP_URL if type(err) is ImportError: @@ -226,7 +244,8 @@ See {help_url} or check the corresponding module.py file for an example\ def test_datetime_errors() -> None: - import pytz + import pytz # noqa: I001 + dt_notz = datetime.now() dt_tz = datetime.now(tz=pytz.timezone('Europe/Amsterdam')) for dt in [dt_tz, dt_notz]: diff --git a/my/core/experimental.py b/my/core/experimental.py index c10ba71..1a78272 100644 --- a/my/core/experimental.py +++ b/my/core/experimental.py @@ -1,6 +1,6 @@ import sys -from typing import Any, Dict, Optional import types +from typing import Any, Dict, Optional # The idea behind this one is to support accessing "overlaid/shadowed" modules from namespace packages diff --git a/my/core/freezer.py b/my/core/freezer.py index 09ba032..e46525b 100644 --- a/my/core/freezer.py +++ b/my/core/freezer.py @@ -2,7 +2,7 @@ from .internal import assert_subpackage; assert_subpackage(__name__) import dataclasses as dcl import inspect -from typing import TypeVar, Type, Any +from typing import Any, Type, TypeVar D = TypeVar('D') @@ -22,6 +22,8 @@ def _freeze_dataclass(Orig: Type[D]): # todo need some decorator thingie? from typing import Generic + + class Freezer(Generic[D]): ''' Some magic which converts dataclass properties into fields. diff --git a/my/core/hpi_compat.py b/my/core/hpi_compat.py index 3c567d9..bad0b17 100644 --- a/my/core/hpi_compat.py +++ b/my/core/hpi_compat.py @@ -2,8 +2,8 @@ Contains various backwards compatibility/deprecation helpers relevant to HPI itself. 
(as opposed to .compat module which implements compatibility between python versions) """ -import os import inspect +import os import re from types import ModuleType from typing import Iterator, List, Optional, TypeVar diff --git a/my/core/influxdb.py b/my/core/influxdb.py index c4b6409..c39f6af 100644 --- a/my/core/influxdb.py +++ b/my/core/influxdb.py @@ -4,11 +4,12 @@ TODO doesn't really belong to 'core' morally, but can think of moving out later from .internal import assert_subpackage; assert_subpackage(__name__) -from typing import Iterable, Any, Optional, Dict +from typing import Any, Dict, Iterable, Optional + +import click from .logging import make_logger -from .types import asdict, Json - +from .types import Json, asdict logger = make_logger(__name__) @@ -28,7 +29,7 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c db = config.db - from influxdb import InfluxDBClient # type: ignore + from influxdb import InfluxDBClient # type: ignore client = InfluxDBClient() # todo maybe create if not exists? # client.create_database(db) @@ -106,6 +107,7 @@ def magic_fill(it, *, name: Optional[str]=None, reset: bool=RESET_DEFAULT) -> No it = it() from itertools import tee + from more_itertools import first, one it, x = tee(it) f = first(x, default=None) @@ -125,8 +127,6 @@ def magic_fill(it, *, name: Optional[str]=None, reset: bool=RESET_DEFAULT) -> No fill(it, measurement=name, reset=reset, dt_col=dtf) -import click - @click.group() def main() -> None: pass diff --git a/my/core/init.py b/my/core/init.py index bec3a9a..49148de 100644 --- a/my/core/init.py +++ b/my/core/init.py @@ -14,9 +14,9 @@ Please let me know if you are aware of a better way of dealing with this! # separate function to present namespace pollution def setup_config() -> None: - from pathlib import Path import sys import warnings + from pathlib import Path from .preinit import get_mycfg_dir mycfg_dir = get_mycfg_dir() diff --git a/my/core/konsume.py b/my/core/konsume.py index 10bea8d..ac1b100 100644 --- a/my/core/konsume.py +++ b/my/core/konsume.py @@ -94,6 +94,8 @@ class Wvalue(Zoomable): from typing import Tuple + + def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]: res: Zoomable cc: List[Zoomable] @@ -123,6 +125,7 @@ def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]: from contextlib import contextmanager from typing import Iterator + class UnconsumedError(Exception): pass @@ -146,6 +149,8 @@ Expected {c} to be fully consumed by the parser. from typing import cast + + def test_unconsumed() -> None: import pytest with pytest.raises(UnconsumedError): diff --git a/my/core/logging.py b/my/core/logging.py index 882ab12..734c1e0 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -1,11 +1,11 @@ from __future__ import annotations -from functools import lru_cache import logging import os import sys -from typing import Union, TYPE_CHECKING import warnings +from functools import lru_cache +from typing import TYPE_CHECKING, Union def test() -> None: @@ -222,7 +222,9 @@ def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger: # OK, when stdout is not a tty, enlighten doesn't log anything, good def get_enlighten(): # TODO could add env variable to disable enlighten for a module? 
- from unittest.mock import Mock # Mock to return stub so cients don't have to think about it + from unittest.mock import ( + Mock, # Mock to return stub so cients don't have to think about it + ) # for now hidden behind the flag since it's a little experimental if os.environ.get('ENLIGHTEN_ENABLE', None) is None: diff --git a/my/core/orgmode.py b/my/core/orgmode.py index 5894b23..d9a254c 100644 --- a/my/core/orgmode.py +++ b/my/core/orgmode.py @@ -2,6 +2,8 @@ Various helpers for reading org-mode data """ from datetime import datetime + + def parse_org_datetime(s: str) -> datetime: s = s.strip('[]') for fmt, cl in [ @@ -21,8 +23,10 @@ def parse_org_datetime(s: str) -> datetime: # TODO I guess want to borrow inspiration from bs4? element type <-> tag; and similar logic for find_one, find_all +from typing import Callable, Iterable, TypeVar + from orgparse import OrgNode -from typing import Iterable, TypeVar, Callable + V = TypeVar('V') def collect(n: OrgNode, cfun: Callable[[OrgNode], Iterable[V]]) -> Iterable[V]: @@ -32,6 +36,8 @@ def collect(n: OrgNode, cfun: Callable[[OrgNode], Iterable[V]]) -> Iterable[V]: from more_itertools import one from orgparse.extra import Table + + def one_table(o: OrgNode) -> Table: return one(collect(o, lambda n: (x for x in n.body_rich if isinstance(x, Table)))) diff --git a/my/core/pandas.py b/my/core/pandas.py index 5688aa3..8abbb1f 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -8,15 +8,14 @@ from __future__ import annotations import dataclasses from datetime import datetime, timezone from pprint import pformat -from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Literal, Type, TypeVar from decorator import decorator -from . import warnings, Res +from . import Res, warnings +from .error import error_to_json, extract_error_datetime from .logging import make_logger from .types import Json, asdict -from .error import error_to_json, extract_error_datetime - logger = make_logger(__name__) diff --git a/my/core/preinit.py b/my/core/preinit.py index 8c0f6a4..be5477b 100644 --- a/my/core/preinit.py +++ b/my/core/preinit.py @@ -1,11 +1,13 @@ from pathlib import Path + # todo preinit isn't really a good name? it's only in a separate file because # - it's imported from my.core.init (so we wan't to keep this file as small/reliable as possible, hence not common or something) # - we still need this function in __main__, so has to be separate from my/core/init.py def get_mycfg_dir() -> Path: - import appdirs # type: ignore[import-untyped] import os + + import appdirs # type: ignore[import-untyped] # not sure if that's necessary, i.e. could rely on PYTHONPATH instead # on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path? mvar = os.environ.get('MY_CONFIG') diff --git a/my/core/query.py b/my/core/query.py index 4d7363e..cf85b1b 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -10,16 +10,27 @@ import importlib import inspect import itertools from datetime import datetime -from typing import TypeVar, Tuple, Optional, Union, Callable, Iterable, Iterator, Dict, Any, NamedTuple, List +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + NamedTuple, + Optional, + Tuple, + TypeVar, + Union, +) import more_itertools from . 
import error as err -from .types import is_namedtuple from .error import Res, unwrap +from .types import is_namedtuple from .warnings import low - T = TypeVar("T") ET = Res[T] @@ -687,9 +698,10 @@ def test_raise_exceptions() -> None: def test_wrap_unsortable_with_error_and_warning() -> None: - import pytest from collections import Counter + import pytest + # by default should wrap unsortable (error) with pytest.warns(UserWarning, match=r"encountered exception"): res = list(select(_mixed_iter_errors(), order_value=lambda o: isinstance(o, datetime))) diff --git a/my/core/query_range.py b/my/core/query_range.py index 2b3a3d3..d077225 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -9,24 +9,22 @@ See the select_range function below import re import time +from datetime import date, datetime, timedelta from functools import lru_cache -from datetime import datetime, timedelta, date -from typing import Callable, Iterator, NamedTuple, Optional, Any, Type +from typing import Any, Callable, Iterator, NamedTuple, Optional, Type import more_itertools +from .compat import fromisoformat from .query import ( - QueryException, - select, + ET, OrderFunc, + QueryException, Where, _handle_generate_order_by, - ET, + select, ) -from .compat import fromisoformat - - timedelta_regex = re.compile(r"^((?P[\.\d]+?)w)?((?P[\.\d]+?)d)?((?P[\.\d]+?)h)?((?P[\.\d]+?)m)?((?P[\.\d]+?)s)?$") diff --git a/my/core/serialize.py b/my/core/serialize.py index e38bca5..b196d47 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -1,13 +1,13 @@ import datetime -from dataclasses import is_dataclass, asdict -from pathlib import Path +from dataclasses import asdict, is_dataclass from decimal import Decimal -from typing import Any, Optional, Callable, NamedTuple from functools import lru_cache +from pathlib import Path +from typing import Any, Callable, NamedTuple, Optional from .error import error_to_json -from .types import is_namedtuple from .pytest import parametrize +from .types import is_namedtuple # note: it would be nice to combine the 'asdict' and _default_encode to some function # that takes a complex python object and returns JSON-compatible fields, while still @@ -117,6 +117,7 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: def stdlib_factory() -> Optional[Dumps]: import json + from .warnings import high high( diff --git a/my/core/source.py b/my/core/source.py index 6d0f0fd..9488ae2 100644 --- a/my/core/source.py +++ b/my/core/source.py @@ -3,9 +3,9 @@ Decorator to gracefully handle importing a data source, or warning and yielding nothing (or a default) when its not available """ -from functools import wraps -from typing import Any, Iterator, TypeVar, Callable, Optional, Iterable import warnings +from functools import wraps +from typing import Any, Callable, Iterable, Iterator, Optional, TypeVar from .warnings import medium diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 4a471a9..47bd78b 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -1,13 +1,12 @@ from .internal import assert_subpackage; assert_subpackage(__name__) -from contextlib import contextmanager -from pathlib import Path import shutil import sqlite3 +from contextlib import contextmanager +from pathlib import Path from tempfile import TemporaryDirectory -from typing import Tuple, Any, Iterator, Callable, Optional, Union, Literal - +from typing import Any, Callable, Iterator, Literal, Optional, Tuple, Union, overload from .common import PathIsh from .compat import assert_never @@ -98,7 +97,6 @@ def 
sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: # and then the return type ends up as Iterator[Tuple[str, ...]], which isn't desirable :( # a bit annoying to have this copy-pasting, but hopefully not a big issue -from typing import overload @overload def select(cols: Tuple[str ], rest: str, *, db: sqlite3.Connection) -> \ Iterator[Tuple[Any ]]: ... diff --git a/my/core/stats.py b/my/core/stats.py index d724068..08821a2 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -3,13 +3,13 @@ Helpers for hpi doctor/stats functionality. ''' import collections -from contextlib import contextmanager -from datetime import datetime import importlib import inspect +import typing +from contextlib import contextmanager +from datetime import datetime from pathlib import Path from types import ModuleType -import typing from typing import ( Any, Callable, @@ -26,7 +26,6 @@ from typing import ( from .types import asdict - Stats = Dict[str, Any] @@ -133,8 +132,8 @@ def test_stat() -> None: # # works with pandas dataframes - import pandas as pd import numpy as np + import pandas as pd def df() -> pd.DataFrame: dates = pd.date_range(start='2024-02-10 08:00', end='2024-02-11 16:00', freq='5h') @@ -357,7 +356,7 @@ def _stat_item(item): def _stat_iterable(it: Iterable[Any], quick: bool = False) -> Stats: - from more_itertools import ilen, take, first + from more_itertools import first, ilen, take # todo not sure if there is something in more_itertools to compute this? total = 0 @@ -448,6 +447,7 @@ def _guess_datetime(x: Any) -> Optional[datetime]: def test_guess_datetime() -> None: from dataclasses import dataclass from typing import NamedTuple + from .compat import fromisoformat dd = fromisoformat('2021-02-01T12:34:56Z') diff --git a/my/core/structure.py b/my/core/structure.py index 458440e..df25e37 100644 --- a/my/core/structure.py +++ b/my/core/structure.py @@ -1,16 +1,14 @@ +import atexit import os import shutil import tempfile import zipfile -import atexit - -from typing import Sequence, Generator, List, Union, Tuple from contextlib import contextmanager from pathlib import Path +from typing import Generator, List, Sequence, Tuple, Union from .logging import make_logger - logger = make_logger(__name__, level="info") diff --git a/my/core/tests/auto_stats.py b/my/core/tests/auto_stats.py index bf4764c..d10d4c4 100644 --- a/my/core/tests/auto_stats.py +++ b/my/core/tests/auto_stats.py @@ -6,7 +6,7 @@ from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime, timedelta from pathlib import Path -from typing import Iterable, Sequence, Iterator +from typing import Iterable, Iterator, Sequence @dataclass diff --git a/my/core/tests/common.py b/my/core/tests/common.py index a102ad3..22a74d7 100644 --- a/my/core/tests/common.py +++ b/my/core/tests/common.py @@ -1,10 +1,9 @@ -from contextlib import contextmanager import os +from contextlib import contextmanager from typing import Iterator, Optional import pytest - V = 'HPI_TESTS_USES_OPTIONAL_DEPS' # TODO use it for serialize tests that are using simplejson/orjson? 
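
The flip side of the same rule set: where an import order is intentional, the patch opts out per line rather than reordering -- e.g. the error.py hunk above keeps `import pytz  # noqa: I001` (I001 is ruff's unsorted-imports code). A hypothetical sketch of that escape hatch, assuming a module that must be imported before another for its side effects:

# plugin_base must be imported first for its registration side effects,
# so this line is excluded from ruff's import sorting (I001 = unsorted-imports)
import plugin_base  # noqa: I001
import consumer_of_plugin_base
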
diff --git a/my/core/tests/denylist.py b/my/core/tests/denylist.py index cca757d..8016282 100644 --- a/my/core/tests/denylist.py +++ b/my/core/tests/denylist.py @@ -1,8 +1,8 @@ -from datetime import datetime import json -from pathlib import Path -from typing import NamedTuple, Iterator import warnings +from datetime import datetime +from pathlib import Path +from typing import Iterator, NamedTuple from ..denylist import DenyList diff --git a/my/core/tests/sqlite.py b/my/core/tests/sqlite.py index b3ecffe..1ad0748 100644 --- a/my/core/tests/sqlite.py +++ b/my/core/tests/sqlite.py @@ -1,7 +1,7 @@ -from concurrent.futures import ProcessPoolExecutor -from pathlib import Path import shutil import sqlite3 +from concurrent.futures import ProcessPoolExecutor +from pathlib import Path from tempfile import TemporaryDirectory from ..sqlite import sqlite_connect_immutable, sqlite_copy_and_open diff --git a/my/core/tests/structure.py b/my/core/tests/structure.py index beb8e7f..6a94fc4 100644 --- a/my/core/tests/structure.py +++ b/my/core/tests/structure.py @@ -1,9 +1,8 @@ from pathlib import Path -from ..structure import match_structure - import pytest +from ..structure import match_structure structure_data: Path = Path(__file__).parent / "structure_data" diff --git a/my/core/tests/test_cachew.py b/my/core/tests/test_cachew.py index 5f7dd65..70ac76f 100644 --- a/my/core/tests/test_cachew.py +++ b/my/core/tests/test_cachew.py @@ -35,8 +35,7 @@ def test_cachew_dir_none() -> None: settings.ENABLE = True # by default it's off in tests (see conftest.py) - from my.core.cachew import cache_dir - from my.core.cachew import mcachew + from my.core.cachew import cache_dir, mcachew from my.core.core_config import _reset_config as reset with reset() as cc: diff --git a/my/core/tests/test_cli.py b/my/core/tests/test_cli.py index 4d847ae..1838e84 100644 --- a/my/core/tests/test_cli.py +++ b/my/core/tests/test_cli.py @@ -1,6 +1,6 @@ import os -from subprocess import check_call import sys +from subprocess import check_call def test_lists_modules() -> None: diff --git a/my/core/tests/test_get_files.py b/my/core/tests/test_get_files.py index 52e43f8..68be4d9 100644 --- a/my/core/tests/test_get_files.py +++ b/my/core/tests/test_get_files.py @@ -1,15 +1,15 @@ import os -from pathlib import Path import shutil import tempfile -from typing import TYPE_CHECKING import zipfile +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest from ..common import get_files from ..kompress import CPath, ZipPath -import pytest - # hack to replace all /tmp with 'real' tmp dir # not ideal, but makes tests more concise diff --git a/my/core/time.py b/my/core/time.py index 430b082..83a407b 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -1,5 +1,5 @@ from functools import lru_cache -from typing import Sequence, Dict +from typing import Dict, Sequence import pytz diff --git a/my/core/types.py b/my/core/types.py index c1b0add..b1cf103 100644 --- a/my/core/types.py +++ b/my/core/types.py @@ -1,13 +1,13 @@ from .internal import assert_subpackage; assert_subpackage(__name__) -from dataclasses import is_dataclass, asdict as dataclasses_asdict +from dataclasses import asdict as dataclasses_asdict +from dataclasses import is_dataclass from datetime import datetime from typing import ( Any, Dict, ) - Json = Dict[str, Any] diff --git a/my/core/util.py b/my/core/util.py index 57e41d4..b48a450 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -1,21 +1,24 @@ -from pathlib import Path -from itertools import chain import os import 
pkgutil import sys -from typing import List, Iterable, Optional +from itertools import chain +from pathlib import Path +from types import ModuleType +from typing import Iterable, List, Optional -from .discovery_pure import HPIModule, ignored, _is_not_module_src, has_stats +from .discovery_pure import HPIModule, _is_not_module_src, has_stats, ignored def modules() -> Iterable[HPIModule]: import my + for m in _iter_all_importables(my): yield m __NOT_HPI_MODULE__ = 'Import this to mark a python file as a helper, not an actual HPI module' from .discovery_pure import NOT_HPI_MODULE_VAR + assert NOT_HPI_MODULE_VAR in globals() # check name consistency def is_not_hpi_module(module: str) -> Optional[str]: @@ -23,6 +26,7 @@ def is_not_hpi_module(module: str) -> Optional[str]: None if a module, otherwise returns reason ''' import importlib + path: Optional[str] = None try: # TODO annoying, this can cause import of the parent module? @@ -41,7 +45,6 @@ def is_not_hpi_module(module: str) -> Optional[str]: return None -from types import ModuleType # todo reuse in readme/blog post # borrowed from https://github.com/sanitizers/octomachinery/blob/24288774d6dcf977c5033ae11311dbff89394c89/tests/circular_imports_test.py#L22-L55 def _iter_all_importables(pkg: ModuleType) -> Iterable[HPIModule]: @@ -192,6 +195,7 @@ from my.core import __NOT_HPI_MODULE__ ''') import sys + orig_path = list(sys.path) try: sys.path.insert(0, str(badp)) @@ -226,6 +230,7 @@ def stats(): ''') import sys + orig_path = list(sys.path) try: sys.path.insert(0, str(badp)) diff --git a/my/core/utils/concurrent.py b/my/core/utils/concurrent.py index cc17cda..3553cd9 100644 --- a/my/core/utils/concurrent.py +++ b/my/core/utils/concurrent.py @@ -1,10 +1,9 @@ -from concurrent.futures import Future, Executor import sys -from typing import Any, Callable, Optional, TypeVar, TYPE_CHECKING +from concurrent.futures import Executor, Future +from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar from ..compat import ParamSpec - _P = ParamSpec('_P') _T = TypeVar('_T') @@ -15,6 +14,7 @@ class DummyExecutor(Executor): This is useful if you're already using Executor for parallelising, but also want to provide an option to run the code serially (e.g. 
for debugging) """ + def __init__(self, max_workers: Optional[int] = 1) -> None: self._shutdown = False self._max_workers = max_workers diff --git a/my/core/utils/imports.py b/my/core/utils/imports.py index efd8e9a..4666a5e 100644 --- a/my/core/utils/imports.py +++ b/my/core/utils/imports.py @@ -1,9 +1,9 @@ import importlib import importlib.util -from pathlib import Path import sys -from typing import Optional +from pathlib import Path from types import ModuleType +from typing import Optional from ..common import PathIsh diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index e8802bb..023484d 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -4,8 +4,10 @@ Various helpers/transforms of iterators Ideally this should be as small as possible and we should rely on stdlib itertools or more_itertools """ +import warnings from collections.abc import Hashable from typing import ( + TYPE_CHECKING, Callable, Dict, Iterable, @@ -13,18 +15,16 @@ from typing import ( List, Optional, Sized, - Union, TypeVar, + Union, cast, - TYPE_CHECKING, ) -import warnings + +import more_itertools +from decorator import decorator from ..compat import ParamSpec -from decorator import decorator -import more_itertools - T = TypeVar('T') K = TypeVar('K') V = TypeVar('V') @@ -268,7 +268,9 @@ def check_if_hashable(iterable: Iterable[_HT]) -> Iterable[_HT]: def test_check_if_hashable() -> None: from dataclasses import dataclass from typing import Set, Tuple + import pytest + from ..compat import assert_type x1: List[int] = [1, 2] @@ -353,6 +355,7 @@ def unique_everseen( def test_unique_everseen() -> None: import pytest + from ..tests.common import tmp_environ_set def fun_good() -> Iterator[int]: diff --git a/my/core/warnings.py b/my/core/warnings.py index 7051f34..82e539b 100644 --- a/my/core/warnings.py +++ b/my/core/warnings.py @@ -6,8 +6,8 @@ E.g. would be nice to propagate the warnings in the UI (it's even a subclass of ''' import sys -from typing import Optional import warnings +from typing import TYPE_CHECKING, Optional import click @@ -48,5 +48,11 @@ def high(message: str, *args, **kwargs) -> None: _warn(message, *args, **kwargs) -# NOTE: deprecated -- legacy import -from warnings import warn \ No newline at end of file +if not TYPE_CHECKING: + from .compat import deprecated + + @deprecated('use warnings.warn directly instead') + def warn(*args, **kwargs): + import warnings + + return warnings.warn(*args, **kwargs) diff --git a/my/location/common.py b/my/location/common.py index 7824bef..510e005 100644 --- a/my/location/common.py +++ b/my/location/common.py @@ -41,9 +41,9 @@ def locations_to_gpx(locations: Iterable[LocationProtocol], buffer: TextIO) -> I try: import gpxpy.gpx except ImportError as ie: - from my.core.warnings import warn + from my.core.warnings import high - warn("gpxpy not installed, cannot write to gpx. 'pip install gpxpy'") + high("gpxpy not installed, cannot write to gpx. 
'pip install gpxpy'") raise ie gpx = gpxpy.gpx.GPX() From 245ad220578a3f3a7db1414bb7d5379f77fb1fd3 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 17 Aug 2024 12:56:13 +0100 Subject: [PATCH 248/302] core.common: bring back asdict backwards compat -- was used in orger --- my/core/common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/my/core/common.py b/my/core/common.py index 9a58caf..225ff2c 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -212,6 +212,12 @@ if not TYPE_CHECKING: return logging.LazyLogger(*args, **kwargs) + @deprecated('use my.core.types.asdict instead') + def asdict(*args, **kwargs): + from . import types + + return types.asdict(*args, **kwargs) + # todo wrap these in deprecated decorator as well? from .cachew import mcachew # noqa: F401 From 5ec357915bbe13a664392ca245d996d00e27acb8 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 17 Aug 2024 12:59:03 +0100 Subject: [PATCH 249/302] core.common: add test for classproperty --- my/core/common.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index 225ff2c..dcd1074 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import ( TYPE_CHECKING, Callable, + Generic, Iterable, List, Sequence, @@ -109,11 +110,12 @@ def get_files( return tuple(paths) -from typing import Callable, Generic, TypeVar - _R = TypeVar('_R') + # https://stackoverflow.com/a/5192374/706389 +# NOTE: it was added to stdlib in 3.9 and then deprecated in 3.11 +# seems that the suggested solution is to use custom decorator? class classproperty(Generic[_R]): def __init__(self, f: Callable[..., _R]) -> None: self.f = f @@ -122,6 +124,19 @@ class classproperty(Generic[_R]): return self.f(cls) +def test_classproperty() -> None: + from .compat import assert_type + + class C: + @classproperty + def prop(cls) -> str: + return 'hello' + + res = C.prop + assert res == 'hello' + assert_type(res, str) + + # hmm, this doesn't really work with mypy well.. # https://github.com/python/mypy/issues/6244 # class staticproperty(Generic[_R]): From 9f017fb29be53866e19ffdf9054b6ac6f011a9cd Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 19 Aug 2024 23:50:12 +0100 Subject: [PATCH 250/302] my.core.pandas: add more tests --- my/core/pandas.py | 126 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 110 insertions(+), 16 deletions(-) diff --git a/my/core/pandas.py b/my/core/pandas.py index 8abbb1f..8ad93cb 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -1,6 +1,7 @@ ''' Various pandas helpers and convenience functions ''' + from __future__ import annotations # todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential @@ -8,12 +9,22 @@ from __future__ import annotations import dataclasses from datetime import datetime, timezone from pprint import pformat -from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Literal, Type, TypeVar +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterable, + Iterator, + Literal, + Type, + TypeVar, +) from decorator import decorator -from . import Res, warnings -from .error import error_to_json, extract_error_datetime +from . 
import warnings
+from .error import Res, error_to_json, extract_error_datetime
 from .logging import make_logger
 from .types import Json, asdict
@@ -38,7 +49,7 @@ else:
     S1 = Any


-def check_dateish(s: SeriesT[S1]) -> Iterable[str]:
+def _check_dateish(s: SeriesT[S1]) -> Iterable[str]:
     import pandas as pd  # noqa: F811 not actually a redefinition

     ctype = s.dtype
@@ -62,9 +73,37 @@ def test_check_dateish() -> None:
     import pandas as pd

-    # todo just a dummy test to check it doesn't crash, need something meaningful
-    s1 = pd.Series([1, 2, 3])
-    list(check_dateish(s1))
+    from .compat import fromisoformat
+
+    # empty series shouldn't warn
+    assert list(_check_dateish(pd.Series([]))) == []
+
+    # if no datetimes, shouldn't return any warnings
+    assert list(_check_dateish(pd.Series([1, 2, 3]))) == []
+
+    # all values are datetimes, shouldn't warn
+    # fmt: off
+    assert list(_check_dateish(pd.Series([
+        fromisoformat('2024-08-19T01:02:03'),
+        fromisoformat('2024-08-19T03:04:05'),
+    ]))) == []
+    # fmt: on
+
+    # mixture of timezones -- should warn
+    # fmt: off
+    assert len(list(_check_dateish(pd.Series([
+        fromisoformat('2024-08-19T01:02:03'),
+        fromisoformat('2024-08-19T03:04:05Z'),
+    ])))) == 1
+    # fmt: on
+
+    # TODO hmm. maybe this should actually warn?
+    # fmt: off
+    assert len(list(_check_dateish(pd.Series([
+        'whatever',
+        fromisoformat('2024-08-19T01:02:03'),
+    ])))) == 0
+    # fmt: on


 # fmt: off
@@ -102,7 +141,7 @@ def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing
     # makes sense to keep super defensive
     try:
         for col, data in df.reset_index().items():
-            for w in check_dateish(data):
+            for w in _check_dateish(data):
                 warnings.low(f"{tag}, column '{col}': {w}")
     except Exception as e:
         logger.exception(e)
@@ -126,8 +165,7 @@ def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None
     return err_dict


-# todo not sure about naming
-def to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
+def _to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
     for r in it:
         if isinstance(r, Exception):
             yield error_to_row(r)
@@ -162,7 +200,7 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFr
     import pandas as pd  # noqa: F811 not actually a redefinition

     columns = None if schema is None else list(_as_columns(schema).keys())
-    return pd.DataFrame(to_jsons(it), columns=columns)
+    return pd.DataFrame(_to_jsons(it), columns=columns)


 # ugh. in principle this could be inside the test
@@ -172,20 +210,76 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFr
 # see https://github.com/pytest-dev/pytest/issues/7856
 @dataclasses.dataclass
 class _X:
+    # FIXME try moving inside?
x: int def test_as_dataframe() -> None: + import numpy as np + import pandas as pd import pytest + from pandas.testing import assert_frame_equal - it = (dict(i=i, s=f'str{i}') for i in range(10)) + from .compat import fromisoformat + + it = (dict(i=i, s=f'str{i}') for i in range(5)) with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings: # noqa: F841 df: DataFrameT = as_dataframe(it) # todo test other error col policies - assert list(df.columns) == ['i', 's', 'error'] - assert len(as_dataframe([])) == 0 + # fmt: off + assert_frame_equal( + df, + pd.DataFrame({ + 'i' : [0 , 1 , 2 , 3 , 4 ], + 's' : ['str0', 'str1', 'str2', 'str3', 'str4'], + # NOTE: error column is always added + 'error': [None , None , None , None , None ], + }), + ) + # fmt: on + assert_frame_equal(as_dataframe([]), pd.DataFrame(columns=['error'])) - # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable df2: DataFrameT = as_dataframe([], schema=_X) - assert list(df2.columns) == ['x', 'error'] + assert_frame_equal( + df2, + # FIXME hmm. x column type should be an int?? and error should be string (or object??) + pd.DataFrame(columns=['x', 'error']), + ) + + @dataclasses.dataclass + class S: + value: str + + def it2() -> Iterator[Res[S]]: + yield S(value='test') + yield RuntimeError('i failed') + + df = as_dataframe(it2()) + # fmt: off + assert_frame_equal( + df, + pd.DataFrame(data={ + 'value': ['test', np.nan ], + 'error': [np.nan, 'RuntimeError: i failed\n'], + 'dt' : [np.nan, np.nan ], + }).astype(dtype={'dt': 'float'}), # FIXME should be datetime64 as below + ) + # fmt: on + + def it3() -> Iterator[Res[S]]: + yield S(value='aba') + yield RuntimeError('whoops') + yield S(value='cde') + yield RuntimeError('exception with datetime', fromisoformat('2024-08-19T22:47:01Z')) + + df = as_dataframe(it3()) + + # fmt: off + assert_frame_equal(df, pd.DataFrame(data={ + 'value': ['aba' , np.nan , 'cde' , np.nan ], + 'error': [np.nan, 'RuntimeError: whoops\n', np.nan, "RuntimeError: ('exception with datetime', datetime.datetime(2024, 8, 19, 22, 47, 1, tzinfo=datetime.timezone.utc))\n"], + # note: dt column is added even if errors don't have an associated datetime + 'dt' : [np.nan, np.nan , np.nan, '2024-08-19 22:47:01+00:00'], + }).astype(dtype={'dt': 'datetime64[ns, UTC]'})) + # fmt: on From d154825591b2b2c074b42852cea9ddd510928fca Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 23 Aug 2024 00:00:16 +0100 Subject: [PATCH 251/302] my.bluemaestro: make config construction lazy following the discussions here: https://github.com/karlicoss/HPI/issues/46#issuecomment-2295464073 --- my/bluemaestro.py | 51 +++++++++++++++++++++++++++++++------------- tests/bluemaestro.py | 8 ++----- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index 3e25cae..12c114f 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -4,36 +4,58 @@ """ # todo most of it belongs to DAL... 
but considering so few people use it I didn't bother for now

+import re
+import sqlite3
+from abc import abstractmethod
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from pathlib import Path
-import re
-import sqlite3
-from typing import Iterable, Sequence, Set, Optional
+from typing import Iterable, Optional, Protocol, Sequence, Set

 import pytz

 from my.core import (
+    Paths,
+    Res,
+    Stats,
     get_files,
     make_logger,
-    Res,
     stat,
-    Stats,
-    influxdb,
+    unwrap,
 )
 from my.core.cachew import mcachew
-from my.core.error import unwrap
 from my.core.pandas import DataFrameT, as_dataframe
 from my.core.sqlite import sqlite_connect_immutable

-from my.config import bluemaestro as config
+
+class config(Protocol):
+    @property
+    @abstractmethod
+    def export_path(self) -> Paths:
+        raise NotImplementedError
+
+    @property
+    def tz(self) -> pytz.BaseTzInfo:
+        # fixme: later, rely on the timezone provider
+        # NOTE: the timezone should be set with respect to the export date!!!
+        return pytz.timezone('Europe/London')
+        # TODO when I change tz, check the diff
+
+
+def make_config() -> config:
+    from my.config import bluemaestro as user_config
+
+    class combined_config(user_config, config): ...
+
+    return combined_config()


 logger = make_logger(__name__)


 def inputs() -> Sequence[Path]:
-    return get_files(config.export_path)
+    cfg = make_config()
+    return get_files(cfg.export_path)


 Celsius = float
@@ -50,12 +72,6 @@ class Measurement:
     dewpoint: Celsius


-# fixme: later, rely on the timezone provider
-# NOTE: the timezone should be set with respect to the export date!!!
-tz = pytz.timezone('Europe/London')
-# TODO when I change tz, check the diff
-
-
 def is_bad_table(name: str) -> bool:
     # todo hmm would be nice to have a hook that can patch any module up to
     delegate = getattr(config, 'is_bad_table', None)
@@ -64,6 +80,9 @@ def is_bad_table(name: str) -> bool:

 @mcachew(depends_on=inputs)
 def measurements() -> Iterable[Res[Measurement]]:
+    cfg = make_config()
+    tz = cfg.tz
+
     # todo ideally this would be via arguments... but needs to be lazy
     paths = inputs()
     total = len(paths)
@@ -211,6 +230,8 @@ def dataframe() -> DataFrameT:


 def fill_influxdb() -> None:
+    from my.core import influxdb
+
     influxdb.fill(measurements(), measurement=__name__)

diff --git a/tests/bluemaestro.py b/tests/bluemaestro.py
index 84d3eb0..63ce589 100644
--- a/tests/bluemaestro.py
+++ b/tests/bluemaestro.py
@@ -1,19 +1,15 @@
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterator, Any
+from typing import Iterator

 from more_itertools import one
 import pytest

-if TYPE_CHECKING:
-    from my.bluemaestro import Measurement
-else:
-    Measurement = Any
+from my.bluemaestro import measurements, Measurement


 def ok_measurements() -> Iterator[Measurement]:
-    from my.bluemaestro import measurements
     for m in measurements():
         assert not isinstance(m, Exception)
         yield m

From 5a67f0bafe354514b3490385430a73ad0f55b970 Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Fri, 23 Aug 2024 00:47:00 +0100
Subject: [PATCH 252/302] pdfs: migrate config to Protocol with properties

allows to remove a whole bunch of hacky crap from tests!
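
This is the same recipe as the bluemaestro change above: the module describes its config as a Protocol -- required attributes are abstract properties, optional ones are properties with defaults -- and a make_config() helper mixes the user's my.config section in at call time instead of import time. Because the user config comes first in the bases, its attributes shadow the Protocol defaults, and importing the module can no longer blow up on a missing or broken config; errors only surface once data is actually requested. A condensed sketch of the pattern (module and attribute names are illustrative):

from abc import abstractmethod
from typing import Protocol

class config(Protocol):
    @property
    @abstractmethod
    def export_path(self) -> str:  # required: no default, must come from the user config
        raise NotImplementedError

    @property
    def timezone_name(self) -> str:  # optional: used unless the user config overrides it
        return 'Europe/London'

def make_config() -> config:
    # deferred import: my.config is only touched when data is actually accessed
    from my.config import some_module as user_config

    # MRO puts user_config first, so its attributes win over the Protocol defaults
    class combined_config(user_config, config): ...

    return combined_config()
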
---
 my/pdfs.py         | 86 +++++++++++++++++++++++-----------------------
 my/tests/common.py |  4 +++
 tests/pdfs.py      | 30 +++++-----------
 3 files changed, 56 insertions(+), 64 deletions(-)

diff --git a/my/pdfs.py b/my/pdfs.py
index 0ab4af3..524c68b 100644
--- a/my/pdfs.py
+++ b/my/pdfs.py
@@ -1,64 +1,64 @@
 '''
 PDF documents and annotations on your filesystem
 '''
+
 REQUIRES = [
     'git+https://github.com/0xabu/pdfannots',  # todo not sure if should use pypi version?
 ]

-from datetime import datetime
-from dataclasses import dataclass
-import io
-from pathlib import Path
 import time
-from typing import NamedTuple, List, Optional, Iterator, Sequence
+from datetime import datetime
+from pathlib import Path
+from typing import Iterator, List, NamedTuple, Optional, Protocol, Sequence

+import pdfannots
+from more_itertools import bucket

-from my.core import LazyLogger, get_files, Paths, PathIsh
+from my.core import PathIsh, Paths, Stats, get_files, make_logger, stat
 from my.core.cachew import mcachew
-from my.core.cfg import Attrs, make_config
 from my.core.error import Res, split_errors

-from more_itertools import bucket
-import pdfannots
-
-
-from my.config import pdfs as user_config
-
-@dataclass
-class pdfs(user_config):
-    paths: Paths = ()  # allowed to be empty for 'filelist' logic
+class config(Protocol):
+    @property
+    def paths(self) -> Paths:
+        return ()  # allowed to be empty for 'filelist' logic

     def is_ignored(self, p: Path) -> bool:
         """
-        Used to ignore some extremely heavy files
-        is_ignored function taken either from config,
-        or if not defined, it's a function that returns False
+        You can override this in user config if you want to ignore some files that are too heavy
         """
-        user_ignore = getattr(user_config, 'is_ignored', None)
-        if user_ignore is not None:
-            return user_ignore(p)
-        return False
-
-    @staticmethod
-    def _migration(attrs: Attrs) -> Attrs:
-        roots = 'roots'
-        if roots in attrs:  # legacy name
-            attrs['paths'] = attrs[roots]
-            from my.core.warnings import high
-            high(f'"{roots}" is deprecated! Use "paths" instead.')
-        return attrs
+
+
+def make_config() -> config:
+    from my.config import pdfs as user_config
+
+    class migration:
+        @property
+        def paths(self) -> Paths:
+            roots = getattr(user_config, 'roots', None)
+            if roots is not None:
+                from my.core.warnings import high
+
+                high('"roots" is deprecated! Use "paths" instead.')
+                return roots
+            else:
+                return ()
+
+    class combined_config(user_config, migration, config): ...
+
+    return combined_config()

-config = make_config(pdfs, migration=pdfs._migration)
+logger = make_logger(__name__)

-logger = LazyLogger(__name__)

 def inputs() -> Sequence[Path]:
-    all_files = get_files(config.paths, glob='**/*.pdf')
-    return [p for p in all_files if not config.is_ignored(p)]
+    cfg = make_config()
+    all_files = get_files(cfg.paths, glob='**/*.pdf')
+    return [p for p in all_files if not cfg.is_ignored(p)]


 # TODO canonical names/fingerprinting?
@@ -121,14 +121,13 @@ def _iter_annotations(pdfs: Sequence[Path]) -> Iterator[Res[Annotation]]:
     # todo how to print to stdout synchronously?
     # todo global config option not to use pools? useful for debugging..
from concurrent.futures import ProcessPoolExecutor + from my.core.utils.concurrent import DummyExecutor + workers = None # use 0 for debugging Pool = DummyExecutor if workers == 0 else ProcessPoolExecutor with Pool(workers) as pool: - futures = [ - pool.submit(get_annots, pdf) - for pdf in pdfs - ] + futures = [pool.submit(get_annots, pdf) for pdf in pdfs] for f, pdf in zip(futures, pdfs): try: yield from f.result() @@ -161,11 +160,13 @@ class Pdf(NamedTuple): return self.created -def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Res[Pdf]]: +def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]] = None) -> Iterator[Res[Pdf]]: if filelist is not None: # hacky... keeping it backwards compatible # https://github.com/karlicoss/HPI/pull/74 - config.paths = filelist + from my.config import pdfs as user_config + + user_config.paths = filelist ait = annotations() vit, eit = split_errors(ait, ET=Exception) @@ -176,10 +177,9 @@ def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]]=None) -> Iterator[Re yield from eit -from my.core import stat, Stats def stats() -> Stats: return { - **stat(annotations) , + **stat(annotations), **stat(annotated_pdfs), } diff --git a/my/tests/common.py b/my/tests/common.py index e3060e1..f8b645d 100644 --- a/my/tests/common.py +++ b/my/tests/common.py @@ -20,6 +20,10 @@ def reset_modules() -> None: ''' to_unload = [m for m in sys.modules if re.match(r'my[.]?', m)] for m in to_unload: + if 'my.pdfs' in m: + # temporary hack -- since my.pdfs migrated to a 'lazy' config, this isn't necessary anymore + # but if we reset module anyway, it confuses the ProcessPool inside my.pdfs + continue del sys.modules[m] diff --git a/tests/pdfs.py b/tests/pdfs.py index 63b1319..6db669f 100644 --- a/tests/pdfs.py +++ b/tests/pdfs.py @@ -1,17 +1,16 @@ +import inspect from pathlib import Path +import pytest from more_itertools import ilen -import pytest - +from my.core.cfg import tmp_config from my.tests.common import testdata +from my.pdfs import annotated_pdfs, annotations, get_annots + def test_module(with_config) -> None: - # TODO crap. if module is imported too early (on the top level, it makes it super hard to override config) - # need to at least detect it... - from my.pdfs import annotations, annotated_pdfs - # todo check types etc as well assert ilen(annotations()) >= 3 assert ilen(annotated_pdfs()) >= 1 @@ -22,12 +21,13 @@ def test_with_error(with_config, tmp_path: Path) -> None: root = tmp_path g = root / 'garbage.pdf' g.write_text('garbage') + from my.config import pdfs + # meh. otherwise legacy config value 'wins' del pdfs.roots # type: ignore[attr-defined] pdfs.paths = (root,) - from my.pdfs import annotations annots = list(annotations()) [annot] = annots assert isinstance(annot, Exception) @@ -35,9 +35,6 @@ def test_with_error(with_config, tmp_path: Path) -> None: @pytest.fixture def with_config(): - from my.tests.common import reset_modules - reset_modules() # todo ugh.. getting boilerplaty.. need to make it a bit more automatic.. - # extra_data = Path(__file__).absolute().parent / 'extra/data/polar' # assert extra_data.exists(), extra_data # todo hmm, turned out no annotations in these ones.. 
whatever @@ -47,13 +44,9 @@ def with_config(): testdata(), ] - import my.core.cfg as C - with C.tmp_config() as config: + with tmp_config() as config: config.pdfs = user_config - try: - yield - finally: - reset_modules() + yield EXPECTED_HIGHLIGHTS = { @@ -68,8 +61,6 @@ def test_get_annots() -> None: Test get_annots, with a real PDF file get_annots should return a list of three Annotation objects """ - from my.pdfs import get_annots - annotations = get_annots(testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf') assert len(annotations) == 3 assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS @@ -80,12 +71,9 @@ def test_annotated_pdfs_with_filelist() -> None: Test annotated_pdfs, with a real PDF file annotated_pdfs should return a list of one Pdf object, with three Annotations """ - from my.pdfs import annotated_pdfs - filelist = [testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf'] annotations_generator = annotated_pdfs(filelist=filelist) - import inspect assert inspect.isgeneratorfunction(annotated_pdfs) highlights_from_pdfs = [] From 1215181af533ba49bbb7658de66b259333e90310 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 25 Aug 2024 16:36:34 +0100 Subject: [PATCH 253/302] core: move stuff from tests/demo.py to my/core/tests/test_config.py also clean all this up a bit --- my/core/__init__.py | 4 +- my/core/tests/test_config.py | 132 +++++++++++++++++++++++++++++++++++ my/demo.py | 53 ++++++++------ tests/demo.py | 118 ------------------------------- 4 files changed, 164 insertions(+), 143 deletions(-) create mode 100644 my/core/tests/test_config.py delete mode 100644 tests/demo.py diff --git a/my/core/__init__.py b/my/core/__init__.py index 19be7fe..ba633f6 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING from .cfg import make_config from .common import PathIsh, Paths, get_files from .compat import assert_never -from .error import Res, unwrap +from .error import Res, unwrap, notnone from .logging import ( make_logger, ) @@ -42,7 +42,7 @@ __all__ = [ '__NOT_HPI_MODULE__', - 'Res', 'unwrap', + 'Res', 'unwrap', 'notnone', 'dataclass', 'Path', ] diff --git a/my/core/tests/test_config.py b/my/core/tests/test_config.py new file mode 100644 index 0000000..c76f5d9 --- /dev/null +++ b/my/core/tests/test_config.py @@ -0,0 +1,132 @@ +""" +Various tests that are checking behaviour of user config wrt to various things +""" + +import sys +from pathlib import Path + +import pytest +import pytz +from more_itertools import ilen + +import my.config +from my.core import notnone +from my.demo import items, make_config + + +# run the same test multiple times to make sure there are not issues with import order etc +@pytest.mark.parametrize('run_id', ['1', '2']) +def test_override_config(tmp_path: Path, run_id: str) -> None: + class user_config: + username = f'user_{run_id}' + data_path = f'{tmp_path}/*.json' + + my.config.demo = user_config # type: ignore[misc, assignment] + + [item1, item2] = items() + assert item1.username == f'user_{run_id}' + assert item2.username == f'user_{run_id}' + + +@pytest.mark.skip(reason="won't work at the moment because of inheritance") +def test_dynamic_config_simplenamespace(tmp_path: Path) -> None: + from types import SimpleNamespace + + user_config = SimpleNamespace( + username='user3', + data_path=f'{tmp_path}/*.json', + ) + my.config.demo = user_config # type: ignore[misc, assignment] + + cfg = make_config() + + assert cfg.username == 'user3' 
+ + +def test_mixin_attribute_handling(tmp_path: Path) -> None: + """ + Tests that arbitrary mixin attributes work with our config handling pattern + """ + + nytz = pytz.timezone('America/New_York') + + class user_config: + # check that override is taken into the account + timezone = nytz + + irrelevant = 'hello' + + username = 'UUU' + data_path = f'{tmp_path}/*.json' + + my.config.demo = user_config # type: ignore[misc, assignment] + + cfg = make_config() + + assert cfg.username == 'UUU' + + # mypy doesn't know about it, but the attribute is there + assert getattr(cfg, 'irrelevant') == 'hello' + + # check that overridden default attribute is actually getting overridden + assert cfg.timezone == nytz + + [item1, item2] = items() + assert item1.username == 'UUU' + assert notnone(item1.dt.tzinfo).zone == nytz.zone # type: ignore[attr-defined] + assert item2.username == 'UUU' + assert notnone(item2.dt.tzinfo).zone == nytz.zone # type: ignore[attr-defined] + + +# use multiple identical tests to make sure there are no issues with cached imports etc +@pytest.mark.parametrize('run_id', ['1', '2']) +def test_dynamic_module_import(tmp_path: Path, run_id: str) -> None: + """ + Test for dynamic hackery in config properties + e.g. importing some external modules + """ + + ext = tmp_path / 'external' + ext.mkdir() + (ext / '__init__.py').write_text( + ''' +def transform(x): + from .submodule import do_transform + return do_transform(x) + +''' + ) + (ext / 'submodule.py').write_text( + f''' +def do_transform(x): + return {{"total_{run_id}": sum(x.values())}} +''' + ) + + class user_config: + username = 'someuser' + data_path = f'{tmp_path}/*.json' + external = f'{ext}' + + my.config.demo = user_config # type: ignore[misc, assignment] + + [item1, item2] = items() + assert item1.raw == {f'total_{run_id}': 1 + 123}, item1 + assert item2.raw == {f'total_{run_id}': 2 + 456}, item2 + + # need to reset these modules, otherwise they get cached + # kind of relevant to my.core.cfg.tmp_config + sys.modules.pop('external', None) + sys.modules.pop('external.submodule', None) + + +@pytest.fixture(autouse=True) +def prepare_data(tmp_path: Path): + (tmp_path / 'data.json').write_text( + ''' +[ + {"key": 1, "value": 123}, + {"key": 2, "value": 456} +] +''' + ) diff --git a/my/demo.py b/my/demo.py index 645be4f..e27b5dd 100644 --- a/my/demo.py +++ b/my/demo.py @@ -2,19 +2,23 @@ Just a demo module for testing and documentation purposes ''' -from .core import Paths, PathIsh - -from typing import Optional -from datetime import tzinfo, timezone - -from my.config import demo as user_config +import json +from abc import abstractmethod from dataclasses import dataclass +from datetime import datetime, timezone, tzinfo +from pathlib import Path +from typing import Iterable, Optional, Protocol, Sequence + +from my.core import Json, PathIsh, Paths, get_files -@dataclass -class demo(user_config): +class config(Protocol): data_path: Paths + + # this is to check required attribute handling username: str + + # this is to check optional attribute handling timezone: tzinfo = timezone.utc external: Optional[PathIsh] = None @@ -23,47 +27,50 @@ class demo(user_config): def external_module(self): rpath = self.external if rpath is not None: - from .core.utils.imports import import_dir + from my.core.utils.imports import import_dir + return import_dir(rpath) - import my.config.repos.external as m # type: ignore + import my.config.repos.external as m # type: ignore + return m -from .core import make_config -config = make_config(demo) +def 
make_config() -> config: + from my.config import demo as user_config -# TODO not sure about type checking? -external = config.external_module + class combined_config(user_config, config): ... + return combined_config() -from pathlib import Path -from typing import Sequence, Iterable -from datetime import datetime -from .core import Json, get_files @dataclass class Item: ''' Some completely arbitrary artificial stuff, just for testing ''' + username: str raw: Json dt: datetime def inputs() -> Sequence[Path]: - return get_files(config.data_path) + cfg = make_config() + return get_files(cfg.data_path) -import json def items() -> Iterable[Item]: + cfg = make_config() + + transform = (lambda i: i) if cfg.external is None else cfg.external_module.transform + for f in inputs(): - dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config.timezone) + dt = datetime.fromtimestamp(f.stat().st_mtime, tz=cfg.timezone) j = json.loads(f.read_text()) for raw in j: yield Item( - username=config.username, - raw=external.identity(raw), + username=cfg.username, + raw=transform(raw), dt=dt, ) diff --git a/tests/demo.py b/tests/demo.py deleted file mode 100644 index 73a6c65..0000000 --- a/tests/demo.py +++ /dev/null @@ -1,118 +0,0 @@ -import sys -from pathlib import Path -from more_itertools import ilen - -# TODO NOTE: this wouldn't work because of an early my.config.demo import -# from my.demo import items - -def test_dynamic_config_1(tmp_path: Path) -> None: - import my.config - - class user_config: - username = 'user' - data_path = f'{tmp_path}/*.json' - external = f'{tmp_path}/external' - my.config.demo = user_config # type: ignore[misc, assignment] - - from my.demo import items - [item1, item2] = items() - assert item1.username == 'user' - - -# exactly the same test, but using a different config, to test out the behaviour w.r.t. import order -def test_dynamic_config_2(tmp_path: Path) -> None: - # doesn't work without it! - # because the config from test_dybamic_config_1 is cached in my.demo.demo - del sys.modules['my.demo'] - - import my.config - - class user_config: - username = 'user2' - data_path = f'{tmp_path}/*.json' - external = f'{tmp_path}/external' - my.config.demo = user_config # type: ignore[misc, assignment] - - from my.demo import items - [item1, item2] = items() - assert item1.username == 'user2' - - -import pytest - -@pytest.mark.skip(reason="won't work at the moment because of inheritance") -def test_dynamic_config_simplenamespace(tmp_path: Path) -> None: - # doesn't work without it! - # because the config from test_dybamic_config_1 is cached in my.demo.demo - del sys.modules['my.demo'] - - import my.config - from types import SimpleNamespace - - user_config = SimpleNamespace( - username='user3', - data_path=f'{tmp_path}/*.json', - ) - my.config.demo = user_config # type: ignore[misc, assignment] - - from my.demo import config - assert config.username == 'user3' - - -# make sure our config handling pattern does it as expected -def test_attribute_handling(tmp_path: Path) -> None: - # doesn't work without it! 
- # because the config from test_dybamic_config_1 is cached in my.demo.demo - del sys.modules['my.demo'] - - import pytz - nytz = pytz.timezone('America/New_York') - - import my.config - class user_config: - # check that override is taken into the account - timezone = nytz - - irrelevant = 'hello' - - username = 'UUU' - data_path = f'{tmp_path}/*.json' - external = f'{tmp_path}/external' - - - my.config.demo = user_config # type: ignore[misc, assignment] - - from my.demo import config - - assert config.username == 'UUU' - - # mypy doesn't know about it, but the attribute is there - assert getattr(config, 'irrelevant') == 'hello' - - # check that overridden default attribute is actually getting overridden - assert config.timezone == nytz - - - -@pytest.fixture(autouse=True) -def prepare(tmp_path: Path): - (tmp_path / 'data.json').write_text(''' -[ - {"key1": 1}, - {"key2": 2} -] -''') - ext = tmp_path / 'external' - ext.mkdir() - (ext / '__init__.py').write_text(''' -def identity(x): - from .submodule import hello - hello(x) - return x - -''') - (ext / 'submodule.py').write_text('hello = lambda x: print("hello " + str(x))') - yield - ex = 'my.config.repos.external' - if ex in sys.modules: - del sys.modules[ex] From 2ff2dcfc003d2ff0e0440d91b2778eee3c2e851c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 25 Aug 2024 18:51:34 +0100 Subject: [PATCH 254/302] tests: move test checkign for my_config handling to core/tests/test_config.py allows to remove the hacky reset_modules thing from setup fixture --- my/core/tests/test_config.py | 47 ++++++++++++++++++++++++++++++++++++ tests/config.py | 41 ++----------------------------- 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/my/core/tests/test_config.py b/my/core/tests/test_config.py index c76f5d9..a318a95 100644 --- a/my/core/tests/test_config.py +++ b/my/core/tests/test_config.py @@ -3,6 +3,7 @@ Various tests that are checking behaviour of user config wrt to various things """ import sys +import os from pathlib import Path import pytest @@ -13,6 +14,10 @@ import my.config from my.core import notnone from my.demo import items, make_config +from .common import tmp_environ_set + +# TODO would be nice to randomize test order here to catch various config issues + # run the same test multiple times to make sure there are not issues with import order etc @pytest.mark.parametrize('run_id', ['1', '2']) @@ -120,6 +125,48 @@ def do_transform(x): sys.modules.pop('external.submodule', None) +@pytest.mark.parametrize('run_id', ['1', '2']) +def test_my_config_env_variable(tmp_path: Path, run_id: str) -> None: + """ + Tests handling of MY_CONFIG variable + """ + + # ugh. so by this point, my.config is already loaded (default stub), so we need to unload it + sys.modules.pop('my.config', None) + # but my.config itself relies on my.core.init hook, so unless it's reloaded too it wouldn't help + sys.modules.pop('my.core', None) + sys.modules.pop('my.core.init', None) + # it's a bit of a mouthful of course, but in most cases MY_CONFIG would be set once + # , and before hpi runs, so hopefully it's not a huge deal + cfg_dir = tmp_path / 'my' + cfg_file = cfg_dir / 'config.py' + cfg_dir.mkdir() + + cfg_file.write_text( + f''' +# print("IMPORTING CONFIG {run_id}") +class demo: + username = 'xxx_{run_id}' + data_path = r'{tmp_path}{os.sep}*.json' # need raw string for windows... 
+''' + ) + + with tmp_environ_set('MY_CONFIG', str(tmp_path)): + [item1, item2] = items() + assert item1.username == f'xxx_{run_id}' + assert item2.username == f'xxx_{run_id}' + + # sigh.. so this is cached in sys.path + # so it takes precedence later during next import, not giving the MY_CONFIG hook + # (imported from builtin my.config) to kick in + sys.path.remove(str(tmp_path)) + + # FIXME ideally this shouldn't be necessary? + # remove this after we fixup my.tests.reddit and my.tests.commits + # (they were failing ci when running all tests) + sys.modules.pop('my.config', None) + + @pytest.fixture(autouse=True) def prepare_data(tmp_path: Path): (tmp_path / 'data.json').write_text( diff --git a/tests/config.py b/tests/config.py index 101f7df..acfe1f1 100644 --- a/tests/config.py +++ b/tests/config.py @@ -1,6 +1,7 @@ from pathlib import Path +# TODO move this somewhere else -- there are more specific tests covering this now def test_dynamic_configuration(notes: Path) -> None: import pytz from types import SimpleNamespace as NS @@ -26,42 +27,11 @@ def test_dynamic_configuration(notes: Path) -> None: import pytest -def test_environment_variable(tmp_path: Path) -> None: - cfg_dir = tmp_path / 'my' - cfg_file = cfg_dir / 'config.py' - cfg_dir.mkdir() - cfg_file.write_text(''' -class feedly: - pass -class just_for_test: - pass -''') - - import os - oenv = dict(os.environ) - try: - os.environ['MY_CONFIG'] = str(tmp_path) - # should not raise at least - import my.rss.feedly - - import my.config as c - assert hasattr(c, 'just_for_test') - finally: - os.environ.clear() - os.environ.update(oenv) - - import sys - # TODO wtf??? doesn't work without unlink... is it caching something? - cfg_file.unlink() - del sys.modules['my.config'] # meh.. - - import my.config as c - assert not hasattr(c, 'just_for_test') - from dataclasses import dataclass +# TODO this test should probs be deprecated? it's more of a documentation? 
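tmp_environ_set is imported from .common in the new test above but its body isn't part of this patch; a plausible implementation (a sketch under that assumption, the real helper may differ) is a context manager that sets the variable and restores the previous state on exit:

import os
from contextlib import contextmanager
from typing import Iterator, Optional

@contextmanager
def tmp_environ_set(key: str, value: Optional[str]) -> Iterator[None]:
    prev = os.environ.get(key)
    if value is None:
        os.environ.pop(key, None)
    else:
        os.environ[key] = value
    try:
        yield
    finally:
        # restore whatever was there before, including 'not set at all'
        if prev is None:
            os.environ.pop(key, None)
        else:
            os.environ[key] = prev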
def test_user_config() -> None: from my.core.common import classproperty class user_config: @@ -117,10 +87,3 @@ Some misc stuff yield ndir finally: pass - - -@pytest.fixture(autouse=True) -def prepare(): - from my.tests.common import reset_modules - reset_modules() - yield From 7cae9d5bf367f0c5d2a1cc158cb5782a523666b1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 25 Aug 2024 22:36:54 +0100 Subject: [PATCH 255/302] my.google.takeout.paths: migrate to new style lazy config also clean up tests a little and move into my.tests.location.google --- my/google/takeout/paths.py | 45 ++++++++++++++++++------------ my/tests/common.py | 4 +++ my/tests/location/google.py | 55 +++++++++++++++++++++++++++++++++++++ tests/location.py | 28 ------------------- 4 files changed, 86 insertions(+), 46 deletions(-) create mode 100644 my/tests/location/google.py delete mode 100644 tests/location.py diff --git a/my/google/takeout/paths.py b/my/google/takeout/paths.py index 5b53149..948cf2e 100644 --- a/my/google/takeout/paths.py +++ b/my/google/takeout/paths.py @@ -2,44 +2,53 @@ Module for locating and accessing [[https://takeout.google.com][Google Takeout]] data ''' -from dataclasses import dataclass -from ...core.common import Paths, get_files -from ...core.util import __NOT_HPI_MODULE__ - -from my.config import google as user_config +from abc import abstractmethod +from pathlib import Path +from typing import Iterable, Optional, Protocol from more_itertools import last -@dataclass -class google(user_config): - takeout_path: Paths # path/paths/glob for the takeout zips -### +from my.core import __NOT_HPI_MODULE__, Paths, get_files + + +class config: + """ + path/paths/glob for the takeout zips + """ + + @property + @abstractmethod + def takeout_path(self) -> Paths: + raise NotImplementedError + # TODO rename 'google' to 'takeout'? not sure -from ...core.cfg import make_config -config = make_config(google) -from pathlib import Path -from typing import Optional, Iterable +def make_config() -> config: + from my.config import google as user_config + + class combined_config(user_config, config): ... + + return combined_config() -def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]: +def get_takeouts(*, path: Optional[str] = None) -> Iterable[Path]: """ Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need """ - # TODO FIXME zip is not great.. + # TODO zip is not great.. # allow a lambda expression? that way the user could restrict it - for takeout in get_files(config.takeout_path, glob='*.zip'): + cfg = make_config() + for takeout in get_files(cfg.takeout_path, glob='*.zip'): if path is None or (takeout / path).exists(): yield takeout -def get_last_takeout(*, path: Optional[str]=None) -> Optional[Path]: +def get_last_takeout(*, path: Optional[str] = None) -> Optional[Path]: return last(get_takeouts(path=path), default=None) # TODO might be a good idea to merge across multiple takeouts... # perhaps even a special takeout module that deals with all of this automatically? # e.g. accumulate, filter and maybe report useless takeouts? 
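A quick usage sketch for the takeout helpers above, assuming a my.config.google block with takeout_path set; the path argument filters archives down to the ones that actually contain that entry:

from my.google.takeout.paths import get_last_takeout, get_takeouts

# every archive that contains the location history file
for takeout in get_takeouts(path='Takeout/Location History/Location History.json'):
    print(takeout)

# or just the most recent matching archive (None if nothing matched)
latest = get_last_takeout(path='Takeout/Location History/Location History.json')
print(latest)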
- diff --git a/my/tests/common.py b/my/tests/common.py index f8b645d..962ee46 100644 --- a/my/tests/common.py +++ b/my/tests/common.py @@ -31,3 +31,7 @@ def testdata() -> Path: d = Path(__file__).absolute().parent.parent.parent / 'testdata' assert d.exists(), d return d + + +# prevent pytest from treating this as test +testdata.__test__ = False # type: ignore[attr-defined] diff --git a/my/tests/location/google.py b/my/tests/location/google.py new file mode 100644 index 0000000..612522b --- /dev/null +++ b/my/tests/location/google.py @@ -0,0 +1,55 @@ +""" +Tests for LEGACY location provider + +Keeping for now for backwards compatibility +""" + +from pathlib import Path + +import pytest +from more_itertools import one + +from my.core.cfg import tmp_config +from my.location.google import locations + + +def test_google_locations() -> None: + locs = list(locations()) + assert len(locs) == 3810, len(locs) + + last = locs[-1] + assert last.dt.strftime('%Y%m%d %H:%M:%S') == '20170802 13:01:56' # should be utc + # todo approx + assert last.lat == 46.5515350 + assert last.lon == 16.4742742 + # todo check altitude + + +@pytest.fixture(autouse=True) +def prepare(tmp_path: Path): + + # TODO could just pick a part of shared config? not sure + _takeout_path = _prepare_takeouts_dir(tmp_path) + + class google: + takeout_path = _takeout_path + + with tmp_config() as config: + config.google = google + yield + + +def _prepare_takeouts_dir(tmp_path: Path) -> Path: + from ..common import testdata + + try: + track = one(testdata().rglob('italy-slovenia-2017-07-29.json')) + except ValueError: + raise RuntimeError('testdata not found, setup git submodules?') + + # todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs + import zipfile + + with zipfile.ZipFile(tmp_path / 'takeout.zip', 'w') as zf: + zf.writestr('Takeout/Location History/Location History.json', track.read_bytes()) + return tmp_path diff --git a/tests/location.py b/tests/location.py deleted file mode 100644 index 2597d5e..0000000 --- a/tests/location.py +++ /dev/null @@ -1,28 +0,0 @@ -from pathlib import Path - -import pytest - - -def test() -> None: - from my.location.google import locations - locs = list(locations()) - assert len(locs) == 3810 - - last = locs[-1] - assert last.dt.strftime('%Y%m%d %H:%M:%S') == '20170802 13:01:56' # should be utc - # todo approx - assert last.lat == 46.5515350 - assert last.lon == 16.4742742 - # todo check altitude - - -@pytest.fixture(autouse=True) -def prepare(tmp_path: Path): - from .shared_config import temp_config - user_config = temp_config(tmp_path) - - import my.core.cfg as C - with C.tmp_config() as config: - config.google = user_config.google - yield - From 094519acafc681d15a0e395e1dbe9b62641846ce Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 25 Aug 2024 23:06:26 +0100 Subject: [PATCH 256/302] tests: disable cachew in my.tests subpackage --- my/tests/conftest.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 my/tests/conftest.py diff --git a/my/tests/conftest.py b/my/tests/conftest.py new file mode 100644 index 0000000..4e67f71 --- /dev/null +++ b/my/tests/conftest.py @@ -0,0 +1,8 @@ +import pytest + +# I guess makes sense by default +@pytest.fixture(autouse=True) +def without_cachew(): + from my.core.cachew import disabled_cachew + with disabled_cachew(): + yield From 270080bd562aec0489a9b3573e00882756736342 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 26 Aug 2024 00:08:09 +0100 Subject: [PATCH 257/302] core.error: better 
defensive handling for my.core.source when parts of config are missing --- my/core/error.py | 27 +++++++++++++++++++++++---- my/core/source.py | 2 +- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/my/core/error.py b/my/core/error.py index c4dff07..7489f69 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -201,7 +201,12 @@ def error_to_json(e: Exception) -> Json: MODULE_SETUP_URL = 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#private-configuration-myconfig' -def warn_my_config_import_error(err: Union[ImportError, AttributeError], help_url: Optional[str] = None) -> bool: +def warn_my_config_import_error( + err: Union[ImportError, AttributeError], + *, + help_url: Optional[str] = None, + module_name: Optional[str] = None, +) -> bool: """ If the user tried to import something from my.config but it failed, possibly due to missing the config block in my.config? @@ -233,10 +238,24 @@ See {help_url}\ config_obj = cast(object, getattr(err, 'obj')) # the object that caused the attribute error # e.g. active_browser for my.browser nested_block_name = err.name - if config_obj.__module__ == 'my.config': - click.secho(f"""You're likely missing the nested config block for '{getattr(config_obj, '__name__', str(config_obj))}.{nested_block_name}'. + errmsg = f"""You're likely missing the nested config block for '{getattr(config_obj, '__name__', str(config_obj))}.{nested_block_name}'. See {help_url} or check the corresponding module.py file for an example\ -""", fg='yellow', err=True) +""" + if config_obj.__module__ == 'my.config': + click.secho(errmsg, fg='yellow', err=True) + return True + if module_name is not None and nested_block_name == module_name.split('.')[-1]: + # this tries to cover cases like these + # user config: + # class location: + # class via_ip: + # accuracy = 10_000 + # then when we import it, we do something like + # from my.config import location + # user_config = location.via_ip + # so if location is present, but via_ip is not, we get + # AttributeError: type object 'location' has no attribute 'via_ip' + click.secho(errmsg, fg='yellow', err=True) return True else: click.echo(f"Unexpected error... 
{err}", err=True) diff --git a/my/core/source.py b/my/core/source.py index 9488ae2..6e0a78a 100644 --- a/my/core/source.py +++ b/my/core/source.py @@ -65,7 +65,7 @@ class core: """) # try to check if this is a config error or based on dependencies not being installed if isinstance(err, (ImportError, AttributeError)): - matched_config_err = warn_my_config_import_error(err, help_url=help_url) + matched_config_err = warn_my_config_import_error(err, module_name=module_name, help_url=help_url) # if we determined this wasn't a config error, and it was an attribute error # it could be *any* attribute error -- we should raise this since its otherwise a fatal error # from some code in the module failing From a5643206a0dc2735bc383ef700afd71439d8b192 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 26 Aug 2024 02:00:51 +0100 Subject: [PATCH 258/302] general: make time.tz.via_location user config lazy, move tests to my.tests package also gets rid of the problematic reset_modules thingie --- my/calendar/holidays.py | 2 +- my/core/compat.py | 2 + my/location/fallback/via_ip.py | 4 +- my/tests/calendar.py | 9 ++ my/tests/common.py | 16 --- my/tests/conftest.py | 2 + .../tests/location/fallback.py | 54 +++++---- .../tests/shared_tz_config.py | 75 ++++++------ my/tests/tz.py | 107 ++++++++++++++++++ my/time/tz/via_location.py | 107 ++++++++++++------ tests/calendar.py | 19 ---- tests/orgmode.py | 7 +- tests/takeout.py | 2 +- tests/tz.py | 95 ---------------- tox.ini | 1 - 15 files changed, 269 insertions(+), 233 deletions(-) create mode 100644 my/tests/calendar.py rename tests/location_fallback.py => my/tests/location/fallback.py (86%) rename tests/shared_config.py => my/tests/shared_tz_config.py (55%) create mode 100644 my/tests/tz.py delete mode 100644 tests/calendar.py delete mode 100644 tests/tz.py diff --git a/my/calendar/holidays.py b/my/calendar/holidays.py index f73bf70..af51696 100644 --- a/my/calendar/holidays.py +++ b/my/calendar/holidays.py @@ -19,7 +19,7 @@ def _calendar(): # todo switch to using time.tz.main once _get_tz stabilizes? from ..time.tz import via_location as LTZ # TODO would be nice to do it dynamically depending on the past timezones... 
- tz = LTZ._get_tz(datetime.now()) + tz = LTZ.get_tz(datetime.now()) assert tz is not None zone = tz.zone; assert zone is not None code = zone_to_countrycode(zone) diff --git a/my/core/compat.py b/my/core/compat.py index 4372a01..eccaf07 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -125,8 +125,10 @@ def test_fromisoformat() -> None: if sys.version_info[:2] >= (3, 10): from types import NoneType + from typing import TypeAlias else: NoneType = type(None) + from typing_extensions import TypeAlias if sys.version_info[:2] >= (3, 11): diff --git a/my/location/fallback/via_ip.py b/my/location/fallback/via_ip.py index db03c7c..79a452c 100644 --- a/my/location/fallback/via_ip.py +++ b/my/location/fallback/via_ip.py @@ -29,7 +29,6 @@ from typing import Iterator, List from my.core import make_logger from my.core.compat import bisect_left -from my.ip.all import ips from my.location.common import Location from my.location.fallback.common import FallbackLocation, DateExact, _datetime_timestamp @@ -37,6 +36,9 @@ logger = make_logger(__name__, level="warning") def fallback_locations() -> Iterator[FallbackLocation]: + # prefer late import since ips get overridden in tests + from my.ip.all import ips + dur = config.for_duration.total_seconds() for ip in ips(): lat, lon = ip.latlon diff --git a/my/tests/calendar.py b/my/tests/calendar.py new file mode 100644 index 0000000..b5f856c --- /dev/null +++ b/my/tests/calendar.py @@ -0,0 +1,9 @@ +from my.calendar.holidays import is_holiday + +from .shared_tz_config import config # autoused fixture + + +def test_is_holiday() -> None: + assert is_holiday('20190101') + assert not is_holiday('20180601') + assert is_holiday('20200906') # national holiday in Bulgaria diff --git a/my/tests/common.py b/my/tests/common.py index 962ee46..cf5c632 100644 --- a/my/tests/common.py +++ b/my/tests/common.py @@ -1,7 +1,5 @@ import os from pathlib import Path -import re -import sys import pytest @@ -13,20 +11,6 @@ skip_if_not_karlicoss = pytest.mark.skipif( ) -def reset_modules() -> None: - ''' - A hack to 'unload' HPI modules, otherwise some modules might cache the config - TODO: a bit crap, need a better way.. 
- ''' - to_unload = [m for m in sys.modules if re.match(r'my[.]?', m)] - for m in to_unload: - if 'my.pdfs' in m: - # temporary hack -- since my.pdfs migrated to a 'lazy' config, this isn't necessary anymore - # but if we reset module anyway, it confuses the ProcessPool inside my.pdfs - continue - del sys.modules[m] - - def testdata() -> Path: d = Path(__file__).absolute().parent.parent.parent / 'testdata' assert d.exists(), d diff --git a/my/tests/conftest.py b/my/tests/conftest.py index 4e67f71..cc7bb7e 100644 --- a/my/tests/conftest.py +++ b/my/tests/conftest.py @@ -1,8 +1,10 @@ import pytest + # I guess makes sense by default @pytest.fixture(autouse=True) def without_cachew(): from my.core.cachew import disabled_cachew + with disabled_cachew(): yield diff --git a/tests/location_fallback.py b/my/tests/location/fallback.py similarity index 86% rename from tests/location_fallback.py rename to my/tests/location/fallback.py index aad33ee..10a4e5b 100644 --- a/tests/location_fallback.py +++ b/my/tests/location/fallback.py @@ -2,32 +2,23 @@ To test my.location.fallback_location.all """ +from datetime import datetime, timedelta, timezone from typing import Iterator -from datetime import datetime, timezone, timedelta +import pytest from more_itertools import ilen -from my.ip.common import IP - -def data() -> Iterator[IP]: - # random IP addresses - yield IP(addr="67.98.113.0", dt=datetime(2020, 1, 1, 12, 0, 0, tzinfo=timezone.utc)) - yield IP(addr="67.98.112.0", dt=datetime(2020, 1, 15, 12, 0, 0, tzinfo=timezone.utc)) - yield IP(addr="59.40.113.87", dt=datetime(2020, 2, 1, 12, 0, 0, tzinfo=timezone.utc)) - yield IP(addr="59.40.139.87", dt=datetime(2020, 2, 1, 16, 0, 0, tzinfo=timezone.utc)) - yield IP(addr="161.235.192.228", dt=datetime(2020, 3, 1, 12, 0, 0, tzinfo=timezone.utc)) - -# redefine the my.ip.all function using data for testing import my.ip.all as ip_module -ip_module.ips = data - +from my.ip.common import IP from my.location.fallback import via_ip +from ..shared_tz_config import config # autoused fixture + + # these are all tests for the bisect algorithm defined in via_ip.py # to make sure we can correctly find IPs that are within the 'for_duration' of a given datetime - def test_ip_fallback() -> None: - # make sure that the data override works + # precondition, make sure that the data override works assert ilen(ip_module.ips()) == ilen(data()) assert ilen(ip_module.ips()) == ilen(via_ip.fallback_locations()) assert ilen(via_ip.fallback_locations()) == 5 @@ -47,7 +38,9 @@ def test_ip_fallback() -> None: assert len(est) == 1 # right after the 'for_duration' for an IP - est = list(via_ip.estimate_location(datetime(2020, 1, 1, 12, 0, 0, tzinfo=timezone.utc) + via_ip.config.for_duration + timedelta(seconds=1))) + est = list( + via_ip.estimate_location(datetime(2020, 1, 1, 12, 0, 0, tzinfo=timezone.utc) + via_ip.config.for_duration + timedelta(seconds=1)) + ) assert len(est) == 0 # on 2/1/2020, threes one IP if before 16:30 @@ -75,8 +68,8 @@ def test_ip_fallback() -> None: # # redefine fallback_estimators to prevent possible namespace packages the user # may have installed from having side effects testing this - from my.location.fallback import all - from my.location.fallback import via_home + from my.location.fallback import all, via_home + def _fe() -> Iterator[all.LocationEstimator]: yield via_ip.estimate_location yield via_home.estimate_location @@ -88,6 +81,7 @@ def test_ip_fallback() -> None: # # just passing via_ip should give one IP from my.location.fallback.common import 
_iter_estimate_from + raw_est = list(_iter_estimate_from(use_dt, (via_ip.estimate_location,))) assert len(raw_est) == 1 assert raw_est[0].datasource == "via_ip" @@ -110,7 +104,7 @@ def test_ip_fallback() -> None: # should have used the IP from via_ip since it was more accurate assert all_est.datasource == "via_ip" - # test that a home defined in shared_config.py is used if no IP is found + # test that a home defined in shared_tz_config.py is used if no IP is found loc = all.estimate_location(datetime(2021, 1, 1, 12, 30, 0, tzinfo=timezone.utc)) assert loc.datasource == "via_home" @@ -121,5 +115,21 @@ def test_ip_fallback() -> None: assert (loc.lat, loc.lon) != (bulgaria.lat, bulgaria.lon) -# re-use prepare fixture for overriding config from shared_config.py -from .tz import prepare +def data() -> Iterator[IP]: + # random IP addresses + yield IP(addr="67.98.113.0", dt=datetime(2020, 1, 1, 12, 0, 0, tzinfo=timezone.utc)) + yield IP(addr="67.98.112.0", dt=datetime(2020, 1, 15, 12, 0, 0, tzinfo=timezone.utc)) + yield IP(addr="59.40.113.87", dt=datetime(2020, 2, 1, 12, 0, 0, tzinfo=timezone.utc)) + yield IP(addr="59.40.139.87", dt=datetime(2020, 2, 1, 16, 0, 0, tzinfo=timezone.utc)) + yield IP(addr="161.235.192.228", dt=datetime(2020, 3, 1, 12, 0, 0, tzinfo=timezone.utc)) + + +@pytest.fixture(autouse=True) +def prepare(config): + before = ip_module.ips + # redefine the my.ip.all function using data for testing + ip_module.ips = data + try: + yield + finally: + ip_module.ips = before diff --git a/tests/shared_config.py b/my/tests/shared_tz_config.py similarity index 55% rename from tests/shared_config.py rename to my/tests/shared_tz_config.py index c2f6973..3d95a9e 100644 --- a/tests/shared_config.py +++ b/my/tests/shared_tz_config.py @@ -1,47 +1,26 @@ -# Defines some shared config for tests +""" +Helper to test various timezone/location dependent things +""" -from datetime import datetime, date, timezone +from datetime import date, datetime, timezone from pathlib import Path -from typing import Any, NamedTuple -import my.time.tz.via_location as LTZ +import pytest from more_itertools import one - -class SharedConfig(NamedTuple): - google: Any - location: Any - time: Any +from my.core.cfg import tmp_config -def _prepare_google_config(tmp_path: Path): - from my.tests.common import testdata - try: - track = one(testdata().rglob('italy-slovenia-2017-07-29.json')) - except ValueError: - raise RuntimeError('testdata not found, setup git submodules?') +@pytest.fixture(autouse=True) +def config(tmp_path: Path): + # TODO could just pick a part of shared config? not sure + _takeout_path = _prepare_takeouts_dir(tmp_path) - - # todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs - import zipfile - with zipfile.ZipFile(tmp_path / 'takeout.zip', 'w') as zf: - zf.writestr('Takeout/Location History/Location History.json', track.read_bytes()) - - class google_config: - takeout_path = tmp_path - return google_config - - -# pass tmp_path from pytest to this helper function -# see tests/tz.py as an example -def temp_config(temp_path: Path) -> Any: - from my.tests.common import reset_modules - reset_modules() - - LTZ.config.fast = True + class google: + takeout_path = _takeout_path class location: - home_accuracy = 30_000 + # fmt: off home = ( # supports ISO strings ('2005-12-04' , (42.697842, 23.325973)), # Bulgaria, Sofia @@ -50,16 +29,32 @@ def temp_config(temp_path: Path) -> Any: # check tz handling.. 
(datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558 , 37.6173 )), # Moscow, Russia ) + # fmt: on # note: order doesn't matter, will be sorted in the data provider - class via_ip: - accuracy = 15_000 - class gpslogger: - pass class time: class tz: class via_location: - pass # just rely on the defaults... + fast = True # some tests rely on it + + with tmp_config() as cfg: + cfg.google = google + cfg.location = location + cfg.time = time + yield cfg - return SharedConfig(google=_prepare_google_config(temp_path), location=location, time=time) +def _prepare_takeouts_dir(tmp_path: Path) -> Path: + from .common import testdata + + try: + track = one(testdata().rglob('italy-slovenia-2017-07-29.json')) + except ValueError: + raise RuntimeError('testdata not found, setup git submodules?') + + # todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs + import zipfile + + with zipfile.ZipFile(tmp_path / 'takeout.zip', 'w') as zf: + zf.writestr('Takeout/Location History/Location History.json', track.read_bytes()) + return tmp_path diff --git a/my/tests/tz.py b/my/tests/tz.py new file mode 100644 index 0000000..db88278 --- /dev/null +++ b/my/tests/tz.py @@ -0,0 +1,107 @@ +import sys +from datetime import datetime, timedelta + +import pytest +import pytz + +import my.time.tz.main as tz_main +import my.time.tz.via_location as tz_via_location +from my.core import notnone +from my.core.compat import fromisoformat + +from .shared_tz_config import config # autoused fixture + + +def getzone(dt: datetime) -> str: + tz = notnone(dt.tzinfo) + return getattr(tz, 'zone') + + +@pytest.mark.parametrize('fast', [False, True]) +def test_iter_tzs(fast: bool, config) -> None: + # TODO hmm.. maybe need to make sure we start with empty config? + config.time.tz.via_location.fast = fast + + ll = list(tz_via_location._iter_tzs()) + zones = [x.zone for x in ll] + + if fast: + assert zones == [ + 'Europe/Rome', + 'Europe/Rome', + 'Europe/Vienna', + 'Europe/Vienna', + 'Europe/Vienna', + ] + else: + assert zones == [ + 'Europe/Rome', + 'Europe/Rome', + 'Europe/Ljubljana', + 'Europe/Ljubljana', + 'Europe/Ljubljana', + ] + + +def test_past() -> None: + """ + Should fallback to the 'home' location provider + """ + dt = fromisoformat('2000-01-01 12:34:45') + dt = tz_main.localize(dt) + assert getzone(dt) == 'America/New_York' + + +def test_future() -> None: + """ + For locations in the future should rely on 'home' location + """ + fut = datetime.now() + timedelta(days=100) + fut = tz_main.localize(fut) + assert getzone(fut) == 'Europe/Moscow' + + +def test_get_tz(config) -> None: + # todo hmm, the way it's implemented at the moment, never returns None? 
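The fast flag exercised by test_iter_tzs above switches between timezonefinder's two finder classes, and near country borders the coarser one can return a different zone, which is why the two expected lists differ. A sketch, assuming fast mode corresponds to TimezoneFinderL (actual results depend on the installed data version):

from timezonefinder import TimezoneFinder, TimezoneFinderL

lat, lng = 46.5515350, 16.4742742  # a border point from the test track
print(TimezoneFinderL().timezone_at(lat=lat, lng=lng))  # coarse/fast, e.g. Europe/Vienna
print(TimezoneFinder().timezone_at(lat=lat, lng=lng))   # precise, e.g. Europe/Ljubljana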
+ get_tz = tz_via_location.get_tz + + # not present in the test data + tz = get_tz(fromisoformat('2020-01-01 10:00:00')) + assert notnone(tz).zone == 'Europe/Sofia' + + tz = get_tz(fromisoformat('2017-08-01 11:00:00')) + assert notnone(tz).zone == 'Europe/Vienna' + + tz = get_tz(fromisoformat('2017-07-30 10:00:00')) + assert notnone(tz).zone == 'Europe/Rome' + + tz = get_tz(fromisoformat('2020-10-01 14:15:16')) + assert tz is not None + + on_windows = sys.platform == 'win32' + if not on_windows: + tz = get_tz(datetime.min) + assert tz is not None + else: + # seems this fails because windows doesn't support same date ranges + # https://stackoverflow.com/a/41400321/ + with pytest.raises(OSError): + get_tz(datetime.min) + + +def test_policies() -> None: + naive = fromisoformat('2017-07-30 10:00:00') + assert naive.tzinfo is None # just in case + + # actual timezone at the time + assert getzone(tz_main.localize(naive)) == 'Europe/Rome' + + z = pytz.timezone('America/New_York') + aware = z.localize(naive) + + assert getzone(tz_main.localize(aware)) == 'America/New_York' + + assert getzone(tz_main.localize(aware, policy='convert')) == 'Europe/Rome' + + with pytest.raises(RuntimeError): + assert tz_main.localize(aware, policy='throw') diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index b66ff8a..329e330 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -1,52 +1,43 @@ ''' Timezone data provider, guesses timezone based on location data (e.g. GPS) ''' + REQUIRES = [ # for determining timezone by coordinate 'timezonefinder', ] +import heapq +import os from collections import Counter from dataclasses import dataclass from datetime import date, datetime from functools import lru_cache -import heapq from itertools import groupby -import os -from typing import Iterator, Optional, Tuple, Any, List, Iterable, Set, Dict +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Iterable, + Iterator, + List, + Optional, + Protocol, + Set, + Tuple, +) import pytz +from my.core import Stats, datetime_aware, make_logger, stat from my.core.cachew import mcachew -from my.core import make_logger, stat, Stats, datetime_aware +from my.core.compat import TypeAlias from my.core.source import import_source from my.core.warnings import high - from my.location.common import LatLon -## user might not have tz config section, so makes sense to be more defensive about it -# todo might be useful to extract a helper for this -try: - from my.config import time -except ImportError as ie: - if ie.name != 'time': - raise ie -else: - try: - user_config = time.tz.via_location - except AttributeError as ae: - if not ("'tz'" in str(ae) or "'via_location'"): - raise ae - -# deliberately dynamic to prevent confusing mypy -if 'user_config' not in globals(): - globals()['user_config'] = object -## - - -@dataclass -class config(user_config): +class config(Protocol): # less precise, but faster fast: bool = True @@ -62,6 +53,43 @@ class config(user_config): _iter_tz_refresh_time: int = 6 +def _get_user_config(): + ## user might not have tz config section, so makes sense to be more defensive about it + + class empty_config: ... 
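    # note: the two fallbacks below are deliberate: the first except catches
    # my.config itself being absent, the second catches a config that exists
    # but lacks the nested time.tz.via_location block; both degrade to
    # empty_config so the module keeps working on defaults alone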
+ + try: + from my.config import time + except ImportError as ie: + if "'time'" not in str(ie): + raise ie + else: + return empty_config + + try: + user_config = time.tz.via_location + except AttributeError as ae: + if not ("'tz'" in str(ae) or "'via_location'" in str(ae)): + raise ae + else: + return empty_config + + return user_config + + +def make_config() -> config: + if TYPE_CHECKING: + import my.config + + user_config: TypeAlias = my.config.time.tz.via_location + else: + user_config = _get_user_config() + + class combined_config(user_config, config): ... + + return combined_config() + + logger = make_logger(__name__) @@ -78,6 +106,7 @@ def _timezone_finder(fast: bool) -> Any: # for backwards compatibility def _locations() -> Iterator[Tuple[LatLon, datetime_aware]]: try: + raise RuntimeError import my.location.all for loc in my.location.all.locations(): @@ -140,13 +169,14 @@ def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> I # Note: this takes a while, as the upstream since _locations isn't sorted, so this # has to do an iterative sort of the entire my.locations.all list def _iter_local_dates() -> Iterator[DayWithZone]: - finder = _timezone_finder(fast=config.fast) # rely on the default + cfg = make_config() + finder = _timezone_finder(fast=cfg.fast) # rely on the default # pdt = None # TODO: warnings doesn't actually warn? # warnings = [] locs: Iterable[Tuple[LatLon, datetime]] - locs = _sorted_locations() if config.sort_locations else _locations() + locs = _sorted_locations() if cfg.sort_locations else _locations() yield from _find_tz_for_locs(finder, locs) @@ -158,11 +188,13 @@ def _iter_local_dates() -> Iterator[DayWithZone]: def _iter_local_dates_fallback() -> Iterator[DayWithZone]: from my.location.fallback.all import fallback_locations as flocs + cfg = make_config() + def _fallback_locations() -> Iterator[Tuple[LatLon, datetime]]: for loc in sorted(flocs(), key=lambda x: x.dt): yield ((loc.lat, loc.lon), loc.dt) - yield from _find_tz_for_locs(_timezone_finder(fast=config.fast), _fallback_locations()) + yield from _find_tz_for_locs(_timezone_finder(fast=cfg.fast), _fallback_locations()) def most_common(lst: Iterator[DayWithZone]) -> DayWithZone: @@ -180,7 +212,8 @@ def _iter_tz_depends_on() -> str: 2022-04-26_12 2022-04-26_18 """ - mod = config._iter_tz_refresh_time + cfg = make_config() + mod = cfg._iter_tz_refresh_time assert mod >= 1 day = str(date.today()) hr = datetime.now().hour @@ -293,5 +326,13 @@ def stats(quick: bool = False) -> Stats: return stat(localized_years) -# deprecated -- still used in some other modules so need to keep -_get_tz = get_tz +## deprecated -- keeping for now as might be used in other modules? +if not TYPE_CHECKING: + from my.core.compat import deprecated + + @deprecated('use get_tz function instead') + def _get_tz(*args, **kwargs): + return get_tz(*args, **kwargs) + + +## diff --git a/tests/calendar.py b/tests/calendar.py deleted file mode 100644 index 3435da3..0000000 --- a/tests/calendar.py +++ /dev/null @@ -1,19 +0,0 @@ -from pathlib import Path - -import pytest - -from my.calendar.holidays import is_holiday - - -def test() -> None: - assert is_holiday('20190101') - assert not is_holiday('20180601') - assert is_holiday('20200906') # national holiday in Bulgaria - - -@pytest.fixture(autouse=True) -def prepare(tmp_path: Path): - from . import tz - # todo meh. fixtures can't be called directly? 
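A worked example of the cache key produced by _iter_tz_depends_on above (assuming the hour is truncated as hour // mod * mod, which is what the docstring's sample values suggest):

mod = 6  # the default _iter_tz_refresh_time
day = '2022-04-26'
for hr in (0, 5, 6, 11, 12, 17, 18):
    print(f'{day}_{hr // mod * mod}')
# hours 12..17 all print '2022-04-26_12', so the cachew dependency -- and with
# it the cached timezone mapping -- only refreshes every 6 hours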
- orig = tz.prepare.__wrapped__ # type: ignore - yield from orig(tmp_path) diff --git a/tests/orgmode.py b/tests/orgmode.py index 37d783e..9b5cc59 100644 --- a/tests/orgmode.py +++ b/tests/orgmode.py @@ -1,10 +1,9 @@ from my.tests.common import skip_if_not_karlicoss as pytestmark -from my import orgmode -from my.core.orgmode import collect - - def test() -> None: + from my import orgmode + from my.core.orgmode import collect + # meh results = list(orgmode.query().collect_all(lambda n: [n] if 'python' in n.tags else [])) assert len(results) > 5 diff --git a/tests/takeout.py b/tests/takeout.py index cddc684..47d405b 100644 --- a/tests/takeout.py +++ b/tests/takeout.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +from my.tests.common import skip_if_not_karlicoss as pytestmark from datetime import datetime, timezone from itertools import islice import pytz diff --git a/tests/tz.py b/tests/tz.py deleted file mode 100644 index d86c5cb..0000000 --- a/tests/tz.py +++ /dev/null @@ -1,95 +0,0 @@ -import sys -from datetime import datetime, timedelta -from pathlib import Path - -import pytest -import pytz - -from my.core.error import notnone - -import my.time.tz.main as TZ -import my.time.tz.via_location as LTZ - - -def test_iter_tzs() -> None: - ll = list(LTZ._iter_tzs()) - assert len(ll) > 3 - - -def test_past() -> None: - # should fallback to the home location provider - dt = D('20000101 12:34:45') - dt = TZ.localize(dt) - tz = dt.tzinfo - assert tz is not None - assert getattr(tz, 'zone') == 'America/New_York' - - -def test_future() -> None: - fut = datetime.now() + timedelta(days=100) - # shouldn't crash at least - assert TZ.localize(fut) is not None - - -def test_tz() -> None: - # todo hmm, the way it's implemented at the moment, never returns None? - - # not present in the test data - tz = LTZ._get_tz(D('20200101 10:00:00')) - assert notnone(tz).zone == 'Europe/Sofia' - - tz = LTZ._get_tz(D('20170801 11:00:00')) - assert notnone(tz).zone == 'Europe/Vienna' - - tz = LTZ._get_tz(D('20170730 10:00:00')) - assert notnone(tz).zone == 'Europe/Rome' - - tz = LTZ._get_tz(D('20201001 14:15:16')) - assert tz is not None - - on_windows = sys.platform == 'win32' - if not on_windows: - tz = LTZ._get_tz(datetime.min) - assert tz is not None - else: - # seems this fails because windows doesn't support same date ranges - # https://stackoverflow.com/a/41400321/ - with pytest.raises(OSError): - LTZ._get_tz(datetime.min) - - -def test_policies() -> None: - getzone = lambda dt: getattr(dt.tzinfo, 'zone') - - naive = D('20170730 10:00:00') - # actual timezone at the time - assert getzone(TZ.localize(naive)) == 'Europe/Rome' - - z = pytz.timezone('America/New_York') - aware = z.localize(naive) - - assert getzone(TZ.localize(aware)) == 'America/New_York' - - assert getzone(TZ.localize(aware, policy='convert')) == 'Europe/Rome' - - - with pytest.raises(RuntimeError): - assert TZ.localize(aware, policy='throw') - - -def D(dstr: str) -> datetime: - return datetime.strptime(dstr, '%Y%m%d %H:%M:%S') - - - -@pytest.fixture(autouse=True) -def prepare(tmp_path: Path): - from .shared_config import temp_config - conf = temp_config(tmp_path) - - import my.core.cfg as C - with C.tmp_config() as config: - config.google = conf.google - config.time = conf.time - config.location = conf.location - yield diff --git a/tox.ini b/tox.ini index 248469e..20f730b 100644 --- a/tox.ini +++ b/tox.ini @@ -88,7 +88,6 @@ commands = {envpython} -m pytest tests \ # ignore some tests which might take a while to run on ci.. 
- --ignore tests/takeout.py \ --ignore tests/extra/polar.py {posargs} From b87d1c970a870eda11cc48f45d074c286dae15e6 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 26 Aug 2024 03:34:45 +0100 Subject: [PATCH 259/302] tests: move remaining tests from tests/ to my.tests, cleanup corresponding modules --- my/body/weight.py | 35 ++++++++---- my/orgmode.py | 36 ++++++++---- my/pdfs.py | 8 +-- {tests => my/tests}/bluemaestro.py | 12 ++-- my/tests/body/weight.py | 57 +++++++++++++++++++ {tests => my/tests}/pdfs.py | 3 +- my/tests/reddit.py | 9 +-- tests/config.py | 89 ------------------------------ tox.ini | 5 -- 9 files changed, 120 insertions(+), 134 deletions(-) rename {tests => my/tests}/bluemaestro.py (86%) create mode 100644 my/tests/body/weight.py rename {tests => my/tests}/pdfs.py (99%) delete mode 100644 tests/config.py diff --git a/my/body/weight.py b/my/body/weight.py index def3e87..277b4d1 100644 --- a/my/body/weight.py +++ b/my/body/weight.py @@ -2,21 +2,29 @@ Weight data (manually logged) ''' +from dataclasses import dataclass from datetime import datetime -from typing import NamedTuple, Iterator +from typing import Any, Iterator -from ..core import LazyLogger -from ..core.error import Res, set_error_datetime, extract_error_datetime +from my.core import make_logger +from my.core.error import Res, extract_error_datetime, set_error_datetime -from .. import orgmode +from my import orgmode -from my.config import weight as config # type: ignore[attr-defined] +config = Any -log = LazyLogger('my.body.weight') +def make_config() -> config: + from my.config import weight as user_config # type: ignore[attr-defined] + + return user_config() -class Entry(NamedTuple): +log = make_logger(__name__) + + +@dataclass +class Entry: dt: datetime value: float # TODO comment?? @@ -26,6 +34,8 @@ Result = Res[Entry] def from_orgmode() -> Iterator[Result]: + cfg = make_config() + orgs = orgmode.query() for o in orgmode.query().all(): if 'weight' not in o.tags: @@ -46,8 +56,8 @@ def from_orgmode() -> Iterator[Result]: yield e continue # FIXME use timezone provider - created = config.default_timezone.localize(created) - assert created is not None #??? somehow mypy wasn't happy? + created = cfg.default_timezone.localize(created) + assert created is not None # ??? somehow mypy wasn't happy? yield Entry( dt=created, value=w, @@ -57,19 +67,21 @@ def from_orgmode() -> Iterator[Result]: def make_dataframe(data: Iterator[Result]): import pandas as pd + def it(): for e in data: if isinstance(e, Exception): dt = extract_error_datetime(e) yield { - 'dt' : dt, + 'dt': dt, 'error': str(e), } else: yield { - 'dt' : e.dt, + 'dt': e.dt, 'weight': e.value, } + df = pd.DataFrame(it()) df.set_index('dt', inplace=True) # TODO not sure about UTC?? @@ -81,6 +93,7 @@ def dataframe(): entries = from_orgmode() return make_dataframe(entries) + # TODO move to a submodule? e.g. my.body.weight.orgmode? 
# so there could be more sources # not sure about my.body thing though diff --git a/my/orgmode.py b/my/orgmode.py index c27f5a7..cf14e43 100644 --- a/my/orgmode.py +++ b/my/orgmode.py @@ -6,18 +6,28 @@ REQUIRES = [ 'orgparse', ] +import re from datetime import datetime from pathlib import Path -import re -from typing import List, Sequence, Iterable, NamedTuple, Optional, Tuple +from typing import Iterable, List, NamedTuple, Optional, Sequence, Tuple -from my.core import get_files +import orgparse + +from my.core import Paths, Stats, get_files, stat from my.core.cachew import cache_dir, mcachew from my.core.orgmode import collect -from my.config import orgmode as user_config -import orgparse +class config: + paths: Paths + + +def make_config() -> config: + from my.config import orgmode as user_config + + class combined_config(user_config, config): ... + + return combined_config() # temporary? hack to cache org-mode notes @@ -28,10 +38,13 @@ class OrgNote(NamedTuple): def inputs() -> Sequence[Path]: - return get_files(user_config.paths) + cfg = make_config() + return get_files(cfg.paths) _rgx = re.compile(orgparse.date.gene_timestamp_regex(brtype='inactive'), re.VERBOSE) + + def _created(n: orgparse.OrgNode) -> Tuple[Optional[datetime], str]: heading = n.heading # meh.. support in orgparse? @@ -41,7 +54,7 @@ def _created(n: orgparse.OrgNode) -> Tuple[Optional[datetime], str]: # try to guess from heading m = _rgx.search(heading) if m is not None: - createds = m.group(0) # could be None + createds = m.group(0) # could be None if createds is None: return (None, heading) assert isinstance(createds, str) @@ -67,7 +80,7 @@ def to_note(x: orgparse.OrgNode) -> OrgNote: created = None return OrgNote( created=created, - heading=heading, # todo include the body? + heading=heading, # todo include the body? tags=list(x.tags), ) @@ -84,14 +97,15 @@ def _cachew_cache_path(_self, f: Path) -> Path: def _cachew_depends_on(_self, f: Path): return (f, f.stat().st_mtime) - + class Query: def __init__(self, files: Sequence[Path]) -> None: self.files = files # TODO yield errors? 
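    # (sketch of what follows) the decorator below caches parsed notes per
    # org file: _cachew_cache_path derives a stable cache file name from the
    # source path, and _cachew_depends_on invalidates the entry whenever the
    # file's mtime changes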
@mcachew( - cache_path=_cachew_cache_path, force_file=True, + cache_path=_cachew_cache_path, + force_file=True, depends_on=_cachew_depends_on, ) def _iterate(self, f: Path) -> Iterable[OrgNote]: @@ -114,8 +128,8 @@ def query() -> Query: return Query(files=inputs()) -from my.core import Stats, stat def stats() -> Stats: def outlines(): return query().all() + return stat(outlines) diff --git a/my/pdfs.py b/my/pdfs.py index 524c68b..1cedfd5 100644 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -10,7 +10,7 @@ REQUIRES = [ import time from datetime import datetime from pathlib import Path -from typing import Iterator, List, NamedTuple, Optional, Protocol, Sequence +from typing import Iterator, List, NamedTuple, Optional, Protocol, Sequence, TYPE_CHECKING import pdfannots from more_itertools import bucket @@ -185,8 +185,6 @@ def stats() -> Stats: ### legacy/misc stuff -iter_annotations = annotations # for backwards compatibility +if not TYPE_CHECKING: + iter_annotations = annotations ### - -# can use 'hpi query my.pdfs.annotations -o pprint' to test -# diff --git a/tests/bluemaestro.py b/my/tests/bluemaestro.py similarity index 86% rename from tests/bluemaestro.py rename to my/tests/bluemaestro.py index 63ce589..2d7c81e 100644 --- a/tests/bluemaestro.py +++ b/my/tests/bluemaestro.py @@ -1,12 +1,12 @@ -from pathlib import Path from typing import Iterator +import pytest from more_itertools import one -import pytest +from my.bluemaestro import Measurement, measurements +from my.core.cfg import tmp_config - -from my.bluemaestro import measurements, Measurement +from .common import testdata def ok_measurements() -> Iterator[Measurement]: @@ -26,7 +26,7 @@ def test() -> None: # check that timezone is set properly assert dts == '20200824 22' - assert len(tp) == 1 # should be unique + assert len(tp) == 1 # should be unique # 2.5 K + 4 K datapoints, somewhat overlapping assert len(res2020) < 6000 @@ -46,14 +46,12 @@ def test_old_db() -> None: @pytest.fixture(autouse=True) def prepare(): - from my.tests.common import testdata bmdata = testdata() / 'hpi-testdata' / 'bluemaestro' assert bmdata.exists(), bmdata class bluemaestro: export_path = bmdata - from my.core.cfg import tmp_config with tmp_config() as config: config.bluemaestro = bluemaestro yield diff --git a/my/tests/body/weight.py b/my/tests/body/weight.py new file mode 100644 index 0000000..069e940 --- /dev/null +++ b/my/tests/body/weight.py @@ -0,0 +1,57 @@ +from pathlib import Path +import pytz +from my.core.cfg import tmp_config +import pytest +from my.body.weight import from_orgmode + + +def test_body_weight() -> None: + weights = [0.0 if isinstance(x, Exception) else x.value for x in from_orgmode()] + + assert weights == [ + 0.0, + 62.0, + 0.0, + 61.0, + 62.0, + 0.0, + ] + + +@pytest.fixture(autouse=True) +def prepare(tmp_path: Path): + ndir = tmp_path / 'notes' + ndir.mkdir() + logs = ndir / 'logs.org' + logs.write_text( + ''' +#+TITLE: Stuff I'm logging + +* Weight (org-capture) :weight: +** [2020-05-01 Fri 09:00] 62 +** 63 + this should be ignored, got no timestamp +** [2020-05-03 Sun 08:00] 61 +** [2020-05-04 Mon 10:00] 62 +''' + ) + misc = ndir / 'misc.org' + misc.write_text( + ''' +Some misc stuff + +* unrelated note :weight:whatever: +''' + ) + + class orgmode: + paths = [ndir] + + class weight: + # TODO ugh. 
this belongs to tz provider or global config or something + default_timezone = pytz.timezone('Europe/London') + + with tmp_config() as cfg: + cfg.orgmode = orgmode + cfg.weight = weight + yield diff --git a/tests/pdfs.py b/my/tests/pdfs.py similarity index 99% rename from tests/pdfs.py rename to my/tests/pdfs.py index 6db669f..bd1e93a 100644 --- a/tests/pdfs.py +++ b/my/tests/pdfs.py @@ -5,9 +5,8 @@ import pytest from more_itertools import ilen from my.core.cfg import tmp_config -from my.tests.common import testdata - from my.pdfs import annotated_pdfs, annotations, get_annots +from my.tests.common import testdata def test_module(with_config) -> None: diff --git a/my/tests/reddit.py b/my/tests/reddit.py index fb8d6d2..4f1ec51 100644 --- a/my/tests/reddit.py +++ b/my/tests/reddit.py @@ -1,15 +1,16 @@ +import pytest +from more_itertools import consume + from my.core.cfg import tmp_config from my.core.utils.itertools import ensure_unique -# todo ugh, it's discovered as a test??? from .common import testdata -from more_itertools import consume -import pytest # deliberately use mixed style imports on the top level and inside the methods to test tmp_config stuff -import my.reddit.rexport as my_reddit_rexport +# todo won't really be necessary once we migrate to lazy user config import my.reddit.all as my_reddit_all +import my.reddit.rexport as my_reddit_rexport def test_basic_1() -> None: diff --git a/tests/config.py b/tests/config.py deleted file mode 100644 index acfe1f1..0000000 --- a/tests/config.py +++ /dev/null @@ -1,89 +0,0 @@ -from pathlib import Path - - -# TODO move this somewhere else -- there are more specific tests covering this now -def test_dynamic_configuration(notes: Path) -> None: - import pytz - from types import SimpleNamespace as NS - - from my.core.cfg import tmp_config - with tmp_config() as C: - C.orgmode = NS(paths=[notes]) - # TODO ugh. this belongs to tz provider or global config or something - C.weight = NS(default_timezone=pytz.timezone('Europe/London')) - - from my.body.weight import from_orgmode - weights = [0.0 if isinstance(x, Exception) else x.value for x in from_orgmode()] - - assert weights == [ - 0.0, - 62.0, - 0.0, - 61.0, - 62.0, - 0.0, - ] - -import pytest - - - -from dataclasses import dataclass - - -# TODO this test should probs be deprecated? it's more of a documentation? -def test_user_config() -> None: - from my.core.common import classproperty - class user_config: - param1 = 'abacaba' - # TODO fuck. properties don't work here??? - @classproperty - def param2(cls) -> int: - return 456 - - extra = 'extra!' - - @dataclass - class test_config(user_config): - param1: str - param2: int # type: ignore[assignment] # TODO need to figure out how to trick mypy for @classproperty - param3: str = 'default' - - assert test_config.param1 == 'abacaba' - assert test_config.param2 == 456 - assert test_config.param3 == 'default' - assert test_config.extra == 'extra!' - - from my.core.cfg import make_config - c = make_config(test_config) - assert c.param1 == 'abacaba' - assert c.param2 == 456 - assert c.param3 == 'default' - assert c.extra == 'extra!' 
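The removed test_user_config above doubled as documentation for the classproperty helper; its essential behaviour, kept here in brief (classproperty comes from my.core.common, as in the removed test):

from my.core.common import classproperty

class user_config:
    param1 = 'abacaba'

    @classproperty
    def param2(cls) -> int:
        return 456

# unlike a regular property, the value is also computed on class access
assert user_config.param1 == 'abacaba'
assert user_config.param2 == 456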
- - -@pytest.fixture -def notes(tmp_path: Path): - ndir = tmp_path / 'notes' - ndir.mkdir() - logs = ndir / 'logs.org' - logs.write_text(''' -#+TITLE: Stuff I'm logging - -* Weight (org-capture) :weight: -** [2020-05-01 Fri 09:00] 62 -** 63 - this should be ignored, got no timestamp -** [2020-05-03 Sun 08:00] 61 -** [2020-05-04 Mon 10:00] 62 - ''') - misc = ndir / 'misc.org' - misc.write_text(''' -Some misc stuff - -* unrelated note :weight:whatever: - ''') - try: - yield ndir - finally: - pass diff --git a/tox.ini b/tox.ini index 20f730b..6b95088 100644 --- a/tox.ini +++ b/tox.ini @@ -86,11 +86,6 @@ commands = --pyargs {[testenv]package_name}.core {[testenv]package_name}.tests \ {posargs} - {envpython} -m pytest tests \ - # ignore some tests which might take a while to run on ci.. - --ignore tests/extra/polar.py - {posargs} - [testenv:demo] commands = From b1fe23b8d0d2ede92c4c17c7199f537c0523317b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 26 Aug 2024 03:50:32 +0100 Subject: [PATCH 260/302] my.rss.feedly/my.twittr.talon -- migrate to use lazy user configs --- my/rss/feedly.py | 28 ++++++++++++++++++++++------ my/twitter/talon.py | 34 ++++++++++++++++++++++++---------- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/my/rss/feedly.py b/my/rss/feedly.py index 55bcf9b..127ef61 100644 --- a/my/rss/feedly.py +++ b/my/rss/feedly.py @@ -2,19 +2,35 @@ Feedly RSS reader """ -from my.config import feedly as config - -from datetime import datetime, timezone import json +from abc import abstractmethod +from datetime import datetime, timezone from pathlib import Path -from typing import Iterator, Sequence +from typing import Iterator, Protocol, Sequence + +from my.core import Paths, get_files -from my.core import get_files from .common import Subscription, SubscriptionState +class config(Protocol): + @property + @abstractmethod + def export_path(self) -> Paths: + raise NotImplementedError + + +def make_config() -> config: + from my.config import feedly as user_config + + class combined_config(user_config, config): ... 
+ + return combined_config() + + def inputs() -> Sequence[Path]: - return get_files(config.export_path) + cfg = make_config() + return get_files(cfg.export_path) def parse_file(f: Path) -> Iterator[Subscription]: diff --git a/my/twitter/talon.py b/my/twitter/talon.py index 306a735..1b79727 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -1,12 +1,15 @@ """ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter_l/databases/=) """ + from __future__ import annotations -from dataclasses import dataclass -from datetime import datetime, timezone import re import sqlite3 +from abc import abstractmethod +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path from typing import Iterator, Sequence, Union from my.core import Paths, Res, datetime_aware, get_files @@ -15,18 +18,25 @@ from my.core.sqlite import sqlite_connection from .common import TweetId, permalink -from my.config import twitter as user_config + +class config: + @property + @abstractmethod + def export_path(self) -> Paths: + raise NotImplementedError -@dataclass -class config(user_config.talon): - # paths[s]/glob to the exported sqlite databases - export_path: Paths +def make_config() -> config: + from my.config import twitter as user_config + + class combined_config(user_config.talon, config): + pass + + return combined_config() -from pathlib import Path def inputs() -> Sequence[Path]: - return get_files(config.export_path) + return get_files(make_config().export_path) @dataclass(unsafe_hash=True) @@ -46,12 +56,16 @@ class Tweet: @dataclass(unsafe_hash=True) class _IsTweet: tweet: Tweet + + @dataclass(unsafe_hash=True) class _IsFavorire: tweet: Tweet Entity = Union[_IsTweet, _IsFavorire] + + def _entities() -> Iterator[Res[Entity]]: for f in inputs(): yield from _process_one(f) @@ -59,7 +73,7 @@ def _entities() -> Iterator[Res[Entity]]: def _process_one(f: Path) -> Iterator[Res[Entity]]: handlers = { - 'user_tweets.db' : _process_user_tweets, + 'user_tweets.db': _process_user_tweets, 'favorite_tweets.db': _process_favorite_tweets, } fname = f.name From c08ddbc781b4cc79441e9d0d856957a94791fb7f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 27 Aug 2024 21:02:39 +0100 Subject: [PATCH 261/302] general: small updates for typing while trying out pyright --- my/core/common.py | 2 +- my/core/stats.py | 2 +- my/core/util.py | 2 +- my/core/utils/itertools.py | 8 +++----- my/google/takeout/html.py | 5 ++--- my/hypothesis.py | 5 +++-- my/smscalls.py | 2 +- 7 files changed, 12 insertions(+), 14 deletions(-) diff --git a/my/core/common.py b/my/core/common.py index dcd1074..d0b737e 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -133,8 +133,8 @@ def test_classproperty() -> None: return 'hello' res = C.prop - assert res == 'hello' assert_type(res, str) + assert res == 'hello' # hmm, this doesn't really work with mypy well.. diff --git a/my/core/stats.py b/my/core/stats.py index 08821a2..bfedbd2 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -2,7 +2,7 @@ Helpers for hpi doctor/stats functionality. 
''' -import collections +import collections.abc import importlib import inspect import typing diff --git a/my/core/util.py b/my/core/util.py index b48a450..0c596fa 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -25,7 +25,7 @@ def is_not_hpi_module(module: str) -> Optional[str]: ''' None if a module, otherwise returns reason ''' - import importlib + import importlib.util path: Optional[str] = None try: diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index 023484d..66f82bd 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -275,20 +275,18 @@ def test_check_if_hashable() -> None: x1: List[int] = [1, 2] r1 = check_if_hashable(x1) - # tgype: ignore[comparison-overlap] # object should be unchanged - assert r1 is x1 assert_type(r1, Iterable[int]) + assert r1 is x1 x2: Iterator[Union[int, str]] = iter((123, 'aba')) r2 = check_if_hashable(x2) - assert list(r2) == [123, 'aba'] assert_type(r2, Iterable[Union[int, str]]) + assert list(r2) == [123, 'aba'] x3: Tuple[object, ...] = (789, 'aba') r3 = check_if_hashable(x3) - # ttype: ignore[comparison-overlap] # object should be unchanged - assert r3 is x3 assert_type(r3, Iterable[object]) + assert r3 is x3 # object should be unchanged x4: List[Set[int]] = [{1, 2, 3}, {4, 5, 6}] with pytest.raises(Exception): diff --git a/my/google/takeout/html.py b/my/google/takeout/html.py index d393957..3ce692c 100644 --- a/my/google/takeout/html.py +++ b/my/google/takeout/html.py @@ -8,7 +8,6 @@ from pathlib import Path from datetime import datetime from html.parser import HTMLParser from typing import List, Optional, Any, Callable, Iterable, Tuple -from collections import OrderedDict from urllib.parse import unquote import pytz @@ -94,8 +93,8 @@ class TakeoutHTMLParser(HTMLParser): def handle_starttag(self, tag, attrs): if self.state == State.INSIDE and tag == 'a': self.state = State.PARSING_LINK - attrs = OrderedDict(attrs) - hr = attrs['href'] + [hr] = (v for k, v in attrs if k == 'href') + assert hr is not None # sometimes it's starts with this prefix, it's apparently clicks from google search? or visits from chrome address line? who knows... # TODO handle http? diff --git a/my/hypothesis.py b/my/hypothesis.py index 55fff64..82104cd 100644 --- a/my/hypothesis.py +++ b/my/hypothesis.py @@ -41,6 +41,7 @@ except ModuleNotFoundError as e: dal = pre_pip_dal_handler('hypexport', e, config, requires=REQUIRES) +DAL = dal.DAL Highlight = dal.Highlight Page = dal.Page @@ -49,8 +50,8 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) -def _dal() -> dal.DAL: - return dal.DAL(inputs()) +def _dal() -> DAL: + return DAL(inputs()) # TODO they are in reverse chronological order... 
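(The single-match destructuring from the takeout/html.py hunk above, shown in isolation with made-up data -- unpacking a generator into one target extracts exactly one value and raises ValueError on zero or multiple matches, rather than silently picking one the way a dict lookup would:)

    attrs = [('class', 'link'), ('href', 'https://example.com')]
    [hr] = (v for k, v in attrs if k == 'href')  # ValueError unless exactly one 'href'
    assert hr == 'https://example.com'
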
diff --git a/my/smscalls.py b/my/smscalls.py index f436709..b56026d 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -24,7 +24,7 @@ from datetime import datetime, timezone from pathlib import Path from typing import NamedTuple, Iterator, Set, Tuple, Optional, Any, Dict, List -from lxml import etree +import lxml.etree as etree from my.core.error import Res From d244c7cc4e17bc9010829c34dd4b88aaba5e5981 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 27 Aug 2024 22:50:37 +0100 Subject: [PATCH 262/302] ruff: enable and fix C4 ruleset --- my/coding/commits.py | 4 ++-- my/core/_deprecated/kompress.py | 2 +- my/core/common.py | 2 +- my/core/freezer.py | 6 ++++-- my/core/influxdb.py | 12 ++++++------ my/core/pandas.py | 2 +- my/core/query.py | 10 +++++----- my/core/sqlite.py | 2 +- my/emfit/__init__.py | 2 +- my/github/gdpr.py | 4 ++-- my/lastfm.py | 9 +++++---- my/location/fallback/via_home.py | 2 +- my/media/imdb.py | 2 +- my/pdfs.py | 2 +- my/rescuetime.py | 13 +++++++------ my/tests/pdfs.py | 2 +- my/twitter/archive.py | 2 +- my/youtube/takeout.py | 2 +- ruff.toml | 6 ++++++ 19 files changed, 48 insertions(+), 38 deletions(-) diff --git a/my/coding/commits.py b/my/coding/commits.py index dac3b1f..20b66a0 100644 --- a/my/coding/commits.py +++ b/my/coding/commits.py @@ -161,14 +161,14 @@ def git_repos_in(roots: List[Path]) -> List[Path]: *roots, ]).decode('utf8').splitlines() - candidates = set(Path(o).resolve().absolute().parent for o in outputs) + candidates = {Path(o).resolve().absolute().parent for o in outputs} # exclude stuff within .git dirs (can happen for submodules?) candidates = {c for c in candidates if '.git' not in c.parts[:-1]} candidates = {c for c in candidates if is_git_dir(c)} - repos = list(sorted(map(_git_root, candidates))) + repos = sorted(map(_git_root, candidates)) return repos diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index 7eb9b37..89e5745 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -244,7 +244,7 @@ class ZipPath(zipfile_Path): # see https://en.wikipedia.org/wiki/ZIP_(file_format)#Structure dt = datetime(*self.root.getinfo(self.at).date_time) ts = int(dt.timestamp()) - params = dict( + params = dict( # noqa: C408 st_mode=0, st_ino=0, st_dev=0, diff --git a/my/core/common.py b/my/core/common.py index d0b737e..ba4ce6b 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -80,7 +80,7 @@ def get_files( paths.append(src) if sort: - paths = list(sorted(paths)) + paths = sorted(paths) if len(paths) == 0: # todo make it conditionally defensive based on some global settings diff --git a/my/core/freezer.py b/my/core/freezer.py index e46525b..93bceb7 100644 --- a/my/core/freezer.py +++ b/my/core/freezer.py @@ -60,8 +60,10 @@ class _A: def test_freezer() -> None: - - val = _A(x=dict(an_int=123, an_any=[1, 2, 3])) + val = _A(x={ + 'an_int': 123, + 'an_any': [1, 2, 3], + }) af = Freezer(_A) fval = af.freeze(val) diff --git a/my/core/influxdb.py b/my/core/influxdb.py index c39f6af..b1d6b9b 100644 --- a/my/core/influxdb.py +++ b/my/core/influxdb.py @@ -72,16 +72,16 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c fields = filter_dict(d) - yield dict( - measurement=measurement, + yield { + 'measurement': measurement, # TODO maybe good idea to tag with database file/name? to inspect inconsistencies etc.. # hmm, so tags are autoindexed and might be faster? 
# not sure what's the big difference though # "fields are data and tags are metadata" - tags=tags, - time=dt, - fields=fields, - ) + 'tags': tags, + 'time': dt, + 'fields': fields, + } from more_itertools import chunked # "The optimal batch size is 5000 lines of line protocol." diff --git a/my/core/pandas.py b/my/core/pandas.py index 8ad93cb..d38465a 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -222,7 +222,7 @@ def test_as_dataframe() -> None: from .compat import fromisoformat - it = (dict(i=i, s=f'str{i}') for i in range(5)) + it = ({'i': i, 's': f'str{i}'} for i in range(5)) with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings: # noqa: F841 df: DataFrameT = as_dataframe(it) # todo test other error col policies diff --git a/my/core/query.py b/my/core/query.py index cf85b1b..54cd9db 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -655,7 +655,7 @@ def test_wrap_unsortable() -> None: # by default, wrap unsortable res = list(select(_mixed_iter(), order_key="z")) - assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4, "Unsortable": 2}) + assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "Unsortable": 2}) def test_disabled_wrap_unsorted() -> None: @@ -674,7 +674,7 @@ def test_drop_unsorted() -> None: # test drop unsortable, should remove them before the 'sorted' call res = list(select(_mixed_iter(), order_key="z", wrap_unsorted=False, drop_unsorted=True)) assert len(res) == 4 - assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4}) + assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4}) def test_drop_exceptions() -> None: @@ -705,7 +705,7 @@ def test_wrap_unsortable_with_error_and_warning() -> None: # by default should wrap unsortable (error) with pytest.warns(UserWarning, match=r"encountered exception"): res = list(select(_mixed_iter_errors(), order_value=lambda o: isinstance(o, datetime))) - assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4, "_B": 2, "Unsortable": 1}) + assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "_B": 2, "Unsortable": 1}) # compare the returned error wrapped in the Unsortable returned_error = next((o for o in res if isinstance(o, Unsortable))).obj assert "Unhandled error!" 
== str(returned_error) @@ -717,7 +717,7 @@ def test_order_key_unsortable() -> None: # both unsortable and items which dont match the order_by (order_key) in this case should be classified unsorted res = list(select(_mixed_iter_errors(), order_key="z")) - assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4, "Unsortable": 3}) + assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "Unsortable": 3}) def test_order_default_param() -> None: @@ -737,7 +737,7 @@ def test_no_recursive_unsortables() -> None: # select to select as input, wrapping unsortables the first time, second should drop them # reverse=True to send errors to the end, so the below order_key works res = list(select(_mixed_iter_errors(), order_key="z", reverse=True)) - assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4, "Unsortable": 3}) + assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "Unsortable": 3}) # drop_unsorted dropped = list(select(res, order_key="z", drop_unsorted=True)) diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 47bd78b..08a80e5 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -35,7 +35,7 @@ SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any] def dict_factory(cursor, row): fields = [column[0] for column in cursor.description] - return {key: value for key, value in zip(fields, row)} + return dict(zip(fields, row)) Factory = Union[SqliteRowFactory, Literal['row', 'dict']] diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index 7fae8ea..71a483f 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -189,7 +189,7 @@ def fake_data(nights: int = 500) -> Iterator: # TODO remove/deprecate it? I think used by timeline def get_datas() -> List[Emfit]: # todo ugh. run lint properly - return list(sorted(datas(), key=lambda e: e.start)) # type: ignore + return sorted(datas(), key=lambda e: e.start) # type: ignore # TODO move away old entries if there is a diff?? diff --git a/my/github/gdpr.py b/my/github/gdpr.py index 1fde7c9..4ca8e84 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -51,12 +51,12 @@ def events() -> Iterable[Res[Event]]: # a bit naughty and ad-hoc, but we will generify reading from tar.gz. 
once we have more examples # another one is zulip archive if last.is_dir(): - files = list(sorted(last.glob('*.json'))) # looks like all files are in the root + files = sorted(last.glob('*.json')) # looks like all files are in the root open_file = lambda f: f.open() else: # treat as .tar.gz tfile = tarfile.open(last) - files = list(sorted(map(Path, tfile.getnames()))) + files = sorted(map(Path, tfile.getnames())) files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json'] open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./ diff --git a/my/lastfm.py b/my/lastfm.py index 6618738..d20ebf3 100644 --- a/my/lastfm.py +++ b/my/lastfm.py @@ -83,9 +83,10 @@ def stats() -> Stats: def fill_influxdb() -> None: from my.core import influxdb + # todo needs to be more automatic - sd = (dict( - dt=x.dt, - track=x.track, - ) for x in scrobbles()) + sd = ({ + 'dt': x.dt, + 'track': x.track, + } for x in scrobbles()) influxdb.fill(sd, measurement=__name__) diff --git a/my/location/fallback/via_home.py b/my/location/fallback/via_home.py index 259dcaa..199ebb0 100644 --- a/my/location/fallback/via_home.py +++ b/my/location/fallback/via_home.py @@ -55,7 +55,7 @@ class Config(user_config): if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) res.append((dt, loc)) - res = list(sorted(res, key=lambda p: p[0])) + res = sorted(res, key=lambda p: p[0]) return res diff --git a/my/media/imdb.py b/my/media/imdb.py index b7ecbde..df6d62d 100644 --- a/my/media/imdb.py +++ b/my/media/imdb.py @@ -33,7 +33,7 @@ def iter_movies() -> Iterator[Movie]: def get_movies() -> List[Movie]: - return list(sorted(iter_movies(), key=lambda m: m.created)) + return sorted(iter_movies(), key=lambda m: m.created) def test(): diff --git a/my/pdfs.py b/my/pdfs.py index 1cedfd5..db49c0e 100644 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -97,7 +97,7 @@ def get_annots(p: Path) -> List[Annotation]: b = time.time() with p.open('rb') as fo: doc = pdfannots.process_file(fo, emit_progress_to=None) - annots = [a for a in doc.iter_annots()] + annots = list(doc.iter_annots()) # also has outlines are kinda like TOC, I don't really need them a = time.time() took = a - b diff --git a/my/rescuetime.py b/my/rescuetime.py index c493e8e..76a0d4c 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -82,12 +82,13 @@ def fake_data(rows: int=1000) -> Iterator: def fill_influxdb() -> None: - from .core import influxdb - it = (dict( - dt=e.dt, - duration_d=e.duration_s, - tags=dict(activity=e.activity), - ) for e in entries() if isinstance(e, Entry)) # TODO handle errors in core.influxdb + from my.core import influxdb + + it = ({ + 'dt': e.dt, + 'duration_d': e.duration_s, + 'tags': {'activity': e.activity}, + } for e in entries() if isinstance(e, Entry)) # TODO handle errors in core.influxdb influxdb.fill(it, measurement=__name__) diff --git a/my/tests/pdfs.py b/my/tests/pdfs.py index bd1e93a..3702424 100644 --- a/my/tests/pdfs.py +++ b/my/tests/pdfs.py @@ -62,7 +62,7 @@ def test_get_annots() -> None: """ annotations = get_annots(testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf') assert len(annotations) == 3 - assert set([a.highlight for a in annotations]) == EXPECTED_HIGHLIGHTS + assert {a.highlight for a in annotations} == EXPECTED_HIGHLIGHTS def test_annotated_pdfs_with_filelist() -> None: diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 0ea6b24..685f7fc 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -105,7 +105,7 @@ class Tweet: repls.append((fr, 
to, me['display_url'])) # todo not sure, maybe use media_url_https instead? # for now doing this for compatibility with twint - repls = list(sorted(repls)) + repls = sorted(repls) parts = [] idx = 0 for fr, to, what in repls: diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index 8fe8f2c..99d65d9 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -120,4 +120,4 @@ def _watched_legacy() -> Iterable[Watched]: watches.append(Watched(url=url, title=title, when=dt)) # todo hmm they already come sorted.. wonder if should just rely on it.. - return list(sorted(watches, key=lambda e: e.when)) + return sorted(watches, key=lambda e: e.when) diff --git a/ruff.toml b/ruff.toml index 54f621c..db926da 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,3 +1,9 @@ +lint.extend-select = [ + "F", # flakes rules -- default, but extend just in case + "E", # pycodestyle -- default, but extend just in case + "C4", # flake8-comprehensions -- unnecessary list/map/dict calls +] + lint.ignore = [ ### too opinionated style checks "E501", # too long lines From 118c2d44849b4df661cf07a91d690e91ec1a6069 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 27 Aug 2024 23:12:57 +0100 Subject: [PATCH 263/302] ruff: enable UP ruleset for detecting python deprecations --- my/core/__main__.py | 4 ++-- my/core/_deprecated/kompress.py | 10 ++-------- my/core/query.py | 2 +- my/core/query_range.py | 3 +-- my/core/stats.py | 13 +++++++++---- my/core/tests/denylist.py | 3 +-- my/core/util.py | 3 +-- my/time/tz/via_location.py | 2 +- ruff.toml | 9 +++++++++ 9 files changed, 27 insertions(+), 22 deletions(-) diff --git a/my/core/__main__.py b/my/core/__main__.py index 276de26..986b05f 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -16,7 +16,7 @@ from typing import Any, Callable, Iterable, List, Optional, Sequence, Type import click -@functools.lru_cache() +@functools.lru_cache def mypy_cmd() -> Optional[Sequence[str]]: try: # preferably, use mypy from current python env @@ -43,7 +43,7 @@ def run_mypy(cfg_path: Path) -> Optional[CompletedProcess]: cmd = mypy_cmd() if cmd is None: return None - mres = run([ + mres = run([ # noqa: UP022 *cmd, '--namespace-packages', '--color-output', # not sure if works?? diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index 89e5745..803e515 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -148,14 +148,8 @@ def kexists(path: PathIsh, subpath: str) -> bool: import zipfile -if sys.version_info[:2] >= (3, 8): - # meh... zipfile.Path is not available on 3.7 - zipfile_Path = zipfile.Path -else: - if typing.TYPE_CHECKING: - zipfile_Path = Any - else: - zipfile_Path = object +# meh... zipfile.Path is not available on 3.7 +zipfile_Path = zipfile.Path @total_ordering diff --git a/my/core/query.py b/my/core/query.py index 54cd9db..bc7e222 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -707,7 +707,7 @@ def test_wrap_unsortable_with_error_and_warning() -> None: res = list(select(_mixed_iter_errors(), order_value=lambda o: isinstance(o, datetime))) assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "_B": 2, "Unsortable": 1}) # compare the returned error wrapped in the Unsortable - returned_error = next((o for o in res if isinstance(o, Unsortable))).obj + returned_error = next(o for o in res if isinstance(o, Unsortable)).obj assert "Unhandled error!" 
== str(returned_error) diff --git a/my/core/query_range.py b/my/core/query_range.py index d077225..761b045 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -526,9 +526,8 @@ def test_parse_timedelta_string() -> None: def test_parse_datetime_float() -> None: - pnow = parse_datetime_float("now") - sec_diff = abs((pnow - datetime.now().timestamp())) + sec_diff = abs(pnow - datetime.now().timestamp()) # should probably never fail? could mock time.time # but there seems to be issues with doing that use C-libraries (as time.time) does # https://docs.python.org/3/library/unittest.mock-examples.html#partial-mocking diff --git a/my/core/stats.py b/my/core/stats.py index bfedbd2..85c2a99 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -414,7 +414,9 @@ def test_stat_iterable() -> None: dd = datetime.fromtimestamp(123, tz=timezone.utc) day = timedelta(days=3) - X = NamedTuple('X', [('x', int), ('d', datetime)]) + class X(NamedTuple): + x: int + d: datetime def it(): yield RuntimeError('oops!') @@ -452,9 +454,12 @@ def test_guess_datetime() -> None: dd = fromisoformat('2021-02-01T12:34:56Z') - # ugh.. https://github.com/python/mypy/issues/7281 - A = NamedTuple('A', [('x', int)]) - B = NamedTuple('B', [('x', int), ('created', datetime)]) + class A(NamedTuple): + x: int + + class B(NamedTuple): + x: int + created: datetime assert _guess_datetime(A(x=4)) is None assert _guess_datetime(B(x=4, created=dd)) == dd diff --git a/my/core/tests/denylist.py b/my/core/tests/denylist.py index 8016282..2688319 100644 --- a/my/core/tests/denylist.py +++ b/my/core/tests/denylist.py @@ -91,8 +91,7 @@ def test_denylist(tmp_path: Path) -> None: assert "59.40.113.87" not in [i.addr for i in filtered] - with open(tf, "r") as f: - data_json = json.loads(f.read()) + data_json = json.loads(tf.read_text()) assert data_json == [ { diff --git a/my/core/util.py b/my/core/util.py index 0c596fa..fdd10f9 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -12,8 +12,7 @@ from .discovery_pure import HPIModule, _is_not_module_src, has_stats, ignored def modules() -> Iterable[HPIModule]: import my - for m in _iter_all_importables(my): - yield m + yield from _iter_all_importables(my) __NOT_HPI_MODULE__ = 'Import this to mark a python file as a helper, not an actual HPI module' diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 329e330..d74bdc3 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -218,7 +218,7 @@ def _iter_tz_depends_on() -> str: day = str(date.today()) hr = datetime.now().hour hr_truncated = hr // mod * mod - return "{}_{}".format(day, hr_truncated) + return f"{day}_{hr_truncated}" # refresh _iter_tzs every few hours -- don't think a better depends_on is possible dynamically diff --git a/ruff.toml b/ruff.toml index db926da..dda279b 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,7 +1,10 @@ +target-version = "py38" # NOTE: inferred from pyproject.toml if present + lint.extend-select = [ "F", # flakes rules -- default, but extend just in case "E", # pycodestyle -- default, but extend just in case "C4", # flake8-comprehensions -- unnecessary list/map/dict calls + "UP", # detect deprecated python stdlib stuff ] lint.ignore = [ @@ -28,4 +31,10 @@ lint.ignore = [ "F841", # Local variable `count` is assigned to but never used "F401", # imported but unused ### + +### TODO should be fine to use these with from __future__ import annotations? +### there was some issue with cachew though... double check this? 
+ "UP006", # use type instead of Type + "UP007", # use X | Y instead of Union +### ] From 664c40e3e8a590b63e76354ad907fb6d4567494b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 27 Aug 2024 23:37:17 +0100 Subject: [PATCH 264/302] ruff: enable FBT rules to detect boolean arguments use without kwargs --- my/body/sleep/common.py | 2 +- my/core/__main__.py | 13 +++++++------ my/core/common.py | 9 +++++---- my/core/denylist.py | 5 +++-- my/core/influxdb.py | 2 +- my/core/konsume.py | 2 +- my/core/query.py | 19 +++++++++++++------ my/core/stats.py | 6 +++--- my/core/structure.py | 2 +- my/core/utils/concurrent.py | 2 +- my/endomondo.py | 2 +- my/google/takeout/parser.py | 2 +- my/jawbone/__init__.py | 2 +- my/location/fallback/all.py | 2 +- my/location/fallback/common.py | 2 +- my/tests/tz.py | 2 +- my/time/tz/via_location.py | 4 ++-- my/topcoder.py | 2 +- ruff.toml | 9 +++++---- 19 files changed, 50 insertions(+), 39 deletions(-) diff --git a/my/body/sleep/common.py b/my/body/sleep/common.py index e84c8d5..1100814 100644 --- a/my/body/sleep/common.py +++ b/my/body/sleep/common.py @@ -7,7 +7,7 @@ class Combine: self.modules = modules @cdf - def dataframe(self, with_temperature: bool=True) -> DataFrameT: + def dataframe(self, *, with_temperature: bool=True) -> DataFrameT: import pandas as pd # todo include 'source'? df = pd.concat([m.dataframe() for m in self.modules]) diff --git a/my/core/__main__.py b/my/core/__main__.py index 986b05f..d3c0cc7 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -438,7 +438,7 @@ def _ui_getchar_pick(choices: Sequence[str], prompt: str = 'Select from: ') -> i return result_map[ch] -def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True) -> Iterable[Callable[..., Any]]: +def _locate_functions_or_prompt(qualified_names: List[str], *, prompt: bool = True) -> Iterable[Callable[..., Any]]: from .query import QueryException, locate_qualified_function from .stats import is_data_provider @@ -588,7 +588,7 @@ def query_hpi_functions( @click.group() @click.option("--debug", is_flag=True, default=False, help="Show debug logs") -def main(debug: bool) -> None: +def main(*, debug: bool) -> None: ''' Human Programming Interface @@ -637,7 +637,7 @@ def _module_autocomplete(ctx: click.Context, args: Sequence[str], incomplete: st @click.option('-q', '--quick', is_flag=True, help='Only run partial checks (first 100 items)') @click.option('-S', '--skip-config-check', 'skip_conf', is_flag=True, help='Skip configuration check') @click.argument('MODULE', nargs=-1, required=False, shell_complete=_module_autocomplete) -def doctor_cmd(verbose: bool, list_all: bool, quick: bool, skip_conf: bool, module: Sequence[str]) -> None: +def doctor_cmd(*, verbose: bool, list_all: bool, quick: bool, skip_conf: bool, module: Sequence[str]) -> None: ''' Run various checks @@ -671,7 +671,7 @@ def config_create_cmd() -> None: @main.command(name='modules', short_help='list available modules') @click.option('--all', 'list_all', is_flag=True, help='List all modules, including disabled') -def module_cmd(list_all: bool) -> None: +def module_cmd(*, list_all: bool) -> None: '''List available modules''' list_modules(list_all=list_all) @@ -684,7 +684,7 @@ def module_grp() -> None: @module_grp.command(name='requires', short_help='print module reqs') @click.argument('MODULES', shell_complete=_module_autocomplete, nargs=-1, required=True) -def module_requires_cmd(modules: Sequence[str]) -> None: +def module_requires_cmd(*, modules: Sequence[str]) -> None: ''' Print MODULES 
requirements @@ -701,7 +701,7 @@ def module_requires_cmd(modules: Sequence[str]) -> None: is_flag=True, help='Bypass PEP 668 and install dependencies into the system-wide python package directory.') @click.argument('MODULES', shell_complete=_module_autocomplete, nargs=-1, required=True) -def module_install_cmd(user: bool, parallel: bool, break_system_packages: bool, modules: Sequence[str]) -> None: +def module_install_cmd(*, user: bool, parallel: bool, break_system_packages: bool, modules: Sequence[str]) -> None: ''' Install dependencies for modules using pip @@ -782,6 +782,7 @@ def module_install_cmd(user: bool, parallel: bool, break_system_packages: bool, help='ignore any errors returned as objects from the functions') @click.argument('FUNCTION_NAME', nargs=-1, required=True, shell_complete=_module_autocomplete) def query_cmd( + *, function_name: Sequence[str], output: str, stream: bool, diff --git a/my/core/common.py b/my/core/common.py index ba4ce6b..b97866f 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -26,10 +26,11 @@ Paths = Union[Sequence[PathIsh], PathIsh] DEFAULT_GLOB = '*' def get_files( - pp: Paths, - glob: str=DEFAULT_GLOB, - sort: bool=True, - guess_compression: bool=True, + pp: Paths, + glob: str=DEFAULT_GLOB, + *, + sort: bool=True, + guess_compression: bool=True, ) -> Tuple[Path, ...]: """ Helper function to avoid boilerplate. diff --git a/my/core/denylist.py b/my/core/denylist.py index 7ca0ddf..92faf2c 100644 --- a/my/core/denylist.py +++ b/my/core/denylist.py @@ -96,6 +96,7 @@ class DenyList: def filter( self, itr: Iterator[T], + *, invert: bool = False, ) -> Iterator[T]: denyf = functools.partial(self._allow, deny_map=self.load()) @@ -103,7 +104,7 @@ class DenyList: return filter(lambda x: not denyf(x), itr) return filter(denyf, itr) - def deny(self, key: str, value: Any, write: bool = False) -> None: + def deny(self, key: str, value: Any, *, write: bool = False) -> None: ''' add a key/value pair to the denylist ''' @@ -111,7 +112,7 @@ class DenyList: self._load() self._deny_raw({key: self._stringify_value(value)}, write=write) - def _deny_raw(self, data: Dict[str, Any], write: bool = False) -> None: + def _deny_raw(self, data: Dict[str, Any], *, write: bool = False) -> None: self._deny_raw_list.append(data) if write: self.write() diff --git a/my/core/influxdb.py b/my/core/influxdb.py index b1d6b9b..25eeba1 100644 --- a/my/core/influxdb.py +++ b/my/core/influxdb.py @@ -135,7 +135,7 @@ def main() -> None: @main.command(name='populate', short_help='populate influxdb') @click.option('--reset', is_flag=True, help='Reset Influx measurements before inserting', show_default=True) @click.argument('FUNCTION_NAME', type=str, required=True) -def populate(function_name: str, reset: bool) -> None: +def populate(*, function_name: str, reset: bool) -> None: from .__main__ import _locate_functions_or_prompt [provider] = list(_locate_functions_or_prompt([function_name])) # todo could have a non-interactive version which populates from all data sources for the provider? diff --git a/my/core/konsume.py b/my/core/konsume.py index ac1b100..0e4a2fe 100644 --- a/my/core/konsume.py +++ b/my/core/konsume.py @@ -131,7 +131,7 @@ class UnconsumedError(Exception): # TODO think about error policy later... 
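# (aside, not part of the patch: what the bare `*` introduced across these
#  hunks buys -- boolean arguments can no longer be passed positionally,
#  so call sites always spell out the intent. A throwaway example:
#
#      def wrap(j, *, throw=True): ...
#
#      wrap(data, throw=False)   # ok, explicit
#      wrap(data, False)         # TypeError: too many positional arguments
#  )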
@contextmanager -def wrap(j, throw=True) -> Iterator[Zoomable]: +def wrap(j, *, throw=True) -> Iterator[Zoomable]: w, children = _wrap(j) yield w diff --git a/my/core/query.py b/my/core/query.py index bc7e222..daf702d 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -131,11 +131,12 @@ def attribute_func(obj: T, where: Where, default: Optional[U] = None) -> Optiona def _generate_order_by_func( - obj_res: Res[T], - key: Optional[str] = None, - where_function: Optional[Where] = None, - default: Optional[U] = None, - force_unsortable: bool = False, + obj_res: Res[T], + *, + key: Optional[str] = None, + where_function: Optional[Where] = None, + default: Optional[U] = None, + force_unsortable: bool = False, ) -> Optional[OrderFunc]: """ Accepts an object Res[T] (Instance of some class or Exception) @@ -274,6 +275,7 @@ def _wrap_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> Tuple[Iterator[Un # the second being items for which orderfunc returned a non-none value def _handle_unsorted( itr: Iterator[ET], + *, orderfunc: OrderFunc, drop_unsorted: bool, wrap_unsorted: bool @@ -503,7 +505,12 @@ Will attempt to call iter() on the value""") # note: can't just attach sort unsortable values in the same iterable as the # other items because they don't have any lookups for order_key or functions # to handle items in the order_by_lookup dictionary - unsortable, itr = _handle_unsorted(itr, order_by_chosen, drop_unsorted, wrap_unsorted) + unsortable, itr = _handle_unsorted( + itr, + orderfunc=order_by_chosen, + drop_unsorted=drop_unsorted, + wrap_unsorted=wrap_unsorted, + ) # run the sort, with the computed order by function itr = iter(sorted(itr, key=order_by_chosen, reverse=reverse)) # type: ignore[arg-type] diff --git a/my/core/stats.py b/my/core/stats.py index 85c2a99..4c9fb0c 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -30,7 +30,7 @@ Stats = Dict[str, Any] class StatsFun(Protocol): - def __call__(self, quick: bool = False) -> Stats: ... + def __call__(self, *, quick: bool = False) -> Stats: ... # global state that turns on/off quick stats @@ -176,7 +176,7 @@ def guess_stats(module: ModuleType) -> Optional[StatsFun]: if len(providers) == 0: return None - def auto_stats(quick: bool = False) -> Stats: + def auto_stats(*, quick: bool = False) -> Stats: res = {} for k, v in providers.items(): res.update(stat(v, quick=quick, name=k)) @@ -355,7 +355,7 @@ def _stat_item(item): return _guess_datetime(item) -def _stat_iterable(it: Iterable[Any], quick: bool = False) -> Stats: +def _stat_iterable(it: Iterable[Any], *, quick: bool = False) -> Stats: from more_itertools import first, ilen, take # todo not sure if there is something in more_itertools to compute this? 
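(How the StatsFun protocol above is satisfied in practice, as a minimal self-contained sketch -- `my_stats` is a stand-in provider, not a real module's function:)

    from typing import Any, Dict, Protocol

    Stats = Dict[str, Any]

    class StatsFun(Protocol):
        def __call__(self, *, quick: bool = False) -> Stats: ...

    def my_stats(*, quick: bool = False) -> Stats:
        return {'rows': 100 if quick else 10_000}

    f: StatsFun = my_stats  # structural match, no inheritance needed
    assert f(quick=True) == {'rows': 100}
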
diff --git a/my/core/structure.py b/my/core/structure.py index df25e37..149a22a 100644 --- a/my/core/structure.py +++ b/my/core/structure.py @@ -12,7 +12,7 @@ from .logging import make_logger logger = make_logger(__name__, level="info") -def _structure_exists(base_dir: Path, paths: Sequence[str], partial: bool = False) -> bool: +def _structure_exists(base_dir: Path, paths: Sequence[str], *, partial: bool = False) -> bool: """ Helper function for match_structure to check if all subpaths exist at some base directory diff --git a/my/core/utils/concurrent.py b/my/core/utils/concurrent.py index 3553cd9..5f11ab0 100644 --- a/my/core/utils/concurrent.py +++ b/my/core/utils/concurrent.py @@ -47,5 +47,5 @@ class DummyExecutor(Executor): return f - def shutdown(self, wait: bool = True, **kwargs) -> None: + def shutdown(self, wait: bool = True, **kwargs) -> None: # noqa: FBT001,FBT002 self._shutdown = True diff --git a/my/endomondo.py b/my/endomondo.py index d314e97..1d7acc2 100644 --- a/my/endomondo.py +++ b/my/endomondo.py @@ -44,7 +44,7 @@ def workouts() -> Iterable[Res[Workout]]: from .core.pandas import check_dataframe, DataFrameT @check_dataframe -def dataframe(defensive: bool=True) -> DataFrameT: +def dataframe(*, defensive: bool=True) -> DataFrameT: def it(): for w in workouts(): if isinstance(w, Exception): diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index c4e5682..258ab96 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -91,7 +91,7 @@ def _cachew_depends_on() -> List[str]: # ResultsType is a Union of all of the models in google_takeout_parser @mcachew(depends_on=_cachew_depends_on, logger=logger, force_file=True) -def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: +def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: # noqa: FBT001 error_policy = config.error_policy count = 0 emitted = GoogleEventSet() diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 5d43296..1706a54 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -174,7 +174,7 @@ def hhmm(time: datetime): # return fromstart / tick -def plot_one(sleep: SleepEntry, fig, axes, xlims=None, showtext=True): +def plot_one(sleep: SleepEntry, fig, axes, xlims=None, *, showtext=True): import matplotlib.dates as mdates # type: ignore[import-not-found] span = sleep.completed - sleep.created diff --git a/my/location/fallback/all.py b/my/location/fallback/all.py index 0c7b8cd..a5daa05 100644 --- a/my/location/fallback/all.py +++ b/my/location/fallback/all.py @@ -24,7 +24,7 @@ def fallback_estimators() -> Iterator[LocationEstimator]: yield _home_estimate -def estimate_location(dt: DateExact, first_match: bool=False, under_accuracy: Optional[int] = None) -> FallbackLocation: +def estimate_location(dt: DateExact, *, first_match: bool=False, under_accuracy: Optional[int] = None) -> FallbackLocation: loc = estimate_from(dt, estimators=list(fallback_estimators()), first_match=first_match, under_accuracy=under_accuracy) # should never happen if the user has home configured if loc is None: diff --git a/my/location/fallback/common.py b/my/location/fallback/common.py index fd508c6..13bc603 100644 --- a/my/location/fallback/common.py +++ b/my/location/fallback/common.py @@ -18,7 +18,7 @@ class FallbackLocation(LocationProtocol): elevation: Optional[float] = None datasource: Optional[str] = None # which module provided this, useful for debugging - def to_location(self, end: bool = False) -> Location: + def 
to_location(self, *, end: bool = False) -> Location: ''' by default the start date is used for the location If end is True, the start date + duration is used diff --git a/my/tests/tz.py b/my/tests/tz.py index db88278..92d8f3b 100644 --- a/my/tests/tz.py +++ b/my/tests/tz.py @@ -18,7 +18,7 @@ def getzone(dt: datetime) -> str: @pytest.mark.parametrize('fast', [False, True]) -def test_iter_tzs(fast: bool, config) -> None: +def test_iter_tzs(*, fast: bool, config) -> None: # TODO hmm.. maybe need to make sure we start with empty config? config.time.tz.via_location.fast = fast diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index d74bdc3..156a5db 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -94,7 +94,7 @@ logger = make_logger(__name__) @lru_cache(None) -def _timezone_finder(fast: bool) -> Any: +def _timezone_finder(*, fast: bool) -> Any: if fast: # less precise, but faster from timezonefinder import TimezoneFinderL as Finder @@ -304,7 +304,7 @@ def localize(dt: datetime) -> datetime_aware: return tz.localize(dt) -def stats(quick: bool = False) -> Stats: +def stats(*, quick: bool = False) -> Stats: if quick: prev, config.sort_locations = config.sort_locations, False res = {'first': next(_iter_local_dates())} diff --git a/my/topcoder.py b/my/topcoder.py index 8e39252..07f71be 100644 --- a/my/topcoder.py +++ b/my/topcoder.py @@ -58,7 +58,7 @@ def _parse_one(p: Path) -> Iterator[Res[Competition]]: h.pop_if_primitive('version', 'id') h = h.zoom('result') - h.check('success', True) + h.check('success', expected=True) h.check('status', 200) h.pop_if_primitive('metadata') diff --git a/ruff.toml b/ruff.toml index dda279b..a8af399 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,10 +1,11 @@ target-version = "py38" # NOTE: inferred from pyproject.toml if present lint.extend-select = [ - "F", # flakes rules -- default, but extend just in case - "E", # pycodestyle -- default, but extend just in case - "C4", # flake8-comprehensions -- unnecessary list/map/dict calls - "UP", # detect deprecated python stdlib stuff + "F", # flakes rules -- default, but extend just in case + "E", # pycodestyle -- default, but extend just in case + "C4", # flake8-comprehensions -- unnecessary list/map/dict calls + "UP", # detect deprecated python stdlib stuff + "FBT", # detect use of boolean arguments ] lint.ignore = [ From b594377a599acbc14eb9a2b17e88ca87fcc1a727 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 00:00:54 +0100 Subject: [PATCH 265/302] ruff: enable RUF ruleset --- my/arbtt.py | 2 +- my/bluemaestro.py | 4 ++-- my/core/__main__.py | 2 +- my/core/error.py | 4 ++-- my/core/source.py | 2 +- my/core/structure.py | 22 +++++++++++----------- my/core/time.py | 2 +- my/core/util.py | 2 +- my/experimental/destructive_parsing.py | 2 +- my/google/takeout/html.py | 2 +- my/location/common.py | 2 +- my/photos/main.py | 2 +- my/polar.py | 2 +- my/twitter/android.py | 5 +++-- ruff.toml | 3 +++ 15 files changed, 31 insertions(+), 27 deletions(-) diff --git a/my/arbtt.py b/my/arbtt.py index 6de8cb2..2bcf291 100644 --- a/my/arbtt.py +++ b/my/arbtt.py @@ -81,7 +81,7 @@ def entries() -> Iterable[Entry]: cmds = [base] # rely on default else: # otherwise, 'merge' them - cmds = [base + ['--logfile', f] for f in inps] + cmds = [[*base, '--logfile', f] for f in inps] import ijson.backends.yajl2_cffi as ijson # type: ignore from subprocess import Popen, PIPE diff --git a/my/bluemaestro.py b/my/bluemaestro.py index 12c114f..50338bb 100644 --- a/my/bluemaestro.py +++ 
b/my/bluemaestro.py @@ -104,7 +104,7 @@ def measurements() -> Iterable[Res[Measurement]]: f'SELECT "{path.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index' ) oldfmt = True - db_dts = list(db.execute('SELECT last_download FROM info'))[0][0] + [(db_dts,)] = db.execute('SELECT last_download FROM info') if db_dts == 'N/A': # ??? happens for 20180923-20180928 continue @@ -137,7 +137,7 @@ def measurements() -> Iterable[Res[Measurement]]: processed_tables |= set(log_tables) # todo use later? - frequencies = [list(db.execute(f'SELECT interval from {t.replace("_log", "_meta")}'))[0][0] for t in log_tables] + frequencies = [list(db.execute(f'SELECT interval from {t.replace("_log", "_meta")}'))[0][0] for t in log_tables] # noqa: RUF015 # todo could just filter out the older datapoints?? dunno. diff --git a/my/core/__main__.py b/my/core/__main__.py index d3c0cc7..3af8e08 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -392,7 +392,7 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False, b # I think it only helps for pypi artifacts (not git!), # and only if they weren't cached for r in requirements: - cmds.append(pre_cmd + [r]) + cmds.append([*pre_cmd, r]) else: if parallel: warning('parallel install is not supported on this platform, installing sequentially...') diff --git a/my/core/error.py b/my/core/error.py index 7489f69..cd8d093 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -153,7 +153,7 @@ def test_sort_res_by() -> None: Exc('last'), ] - results2 = sort_res_by(ress + [0], lambda x: int(x)) + results2 = sort_res_by([*ress, 0], lambda x: int(x)) assert results2 == [Exc('last'), 0] + results[:-1] assert sort_res_by(['caba', 'a', 'aba', 'daba'], key=lambda x: len(x)) == ['a', 'aba', 'caba', 'daba'] @@ -166,7 +166,7 @@ def test_sort_res_by() -> None: def set_error_datetime(e: Exception, dt: Optional[datetime]) -> None: if dt is None: return - e.args = e.args + (dt,) + e.args = (*e.args, dt) # todo not sure if should return new exception? 
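(The unpacking rewrites in the error.py hunk above, restated with throwaway values -- the spread form is what the RUF ruleset nudges towards; semantics are identical to `+` concatenation here:)

    e_args = ('some error',)
    assert (*e_args, 123) == e_args + (123,)           # set_error_datetime pattern

    base = ['pip', 'install', '--user']
    assert [*base, 'requests'] == base + ['requests']  # module_install pattern
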
diff --git a/my/core/source.py b/my/core/source.py index 6e0a78a..4510ef0 100644 --- a/my/core/source.py +++ b/my/core/source.py @@ -61,7 +61,7 @@ def import_source( warnings.warn(f"""If you don't want to use this module, to hide this message, add '{module_name}' to your core config disabled_modules in your config, like: class core: - disabled_modules = [{repr(module_name)}] + disabled_modules = [{module_name!r}] """) # try to check if this is a config error or based on dependencies not being installed if isinstance(err, (ImportError, AttributeError)): diff --git a/my/core/structure.py b/my/core/structure.py index 149a22a..be5b307 100644 --- a/my/core/structure.py +++ b/my/core/structure.py @@ -67,21 +67,21 @@ def match_structure( export_dir ├── exp_2020 - │   ├── channel_data - │   │   ├── data1 - │   │   └── data2 - │   ├── index.json - │   ├── messages - │   │   └── messages.csv - │   └── profile - │   └── settings.json + │ ├── channel_data + │ │ ├── data1 + │ │ └── data2 + │ ├── index.json + │ ├── messages + │ │ └── messages.csv + │ └── profile + │ └── settings.json └── exp_2021 ├── channel_data - │   ├── data1 - │   └── data2 + │ ├── data1 + │ └── data2 ├── index.json ├── messages - │   └── messages.csv + │ └── messages.csv └── profile └── settings.json diff --git a/my/core/time.py b/my/core/time.py index 83a407b..5a47c3d 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -21,7 +21,7 @@ def user_forced() -> Sequence[str]: def _abbr_to_timezone_map() -> Dict[str, pytz.BaseTzInfo]: # also force UTC to always correspond to utc # this makes more sense than Zulu it ends up by default - timezones = pytz.all_timezones + ['UTC'] + list(user_forced()) + timezones = [*pytz.all_timezones, 'UTC', *user_forced()] res: Dict[str, pytz.BaseTzInfo] = {} for tzname in timezones: diff --git a/my/core/util.py b/my/core/util.py index fdd10f9..b49acf6 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -74,7 +74,7 @@ def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModu continue rel_pt = pkg_dir_path.relative_to(pkg_pth) - pkg_pref = '.'.join((pkg_name, ) + rel_pt.parts) + pkg_pref = '.'.join((pkg_name, *rel_pt.parts)) yield from _walk_packages( (str(pkg_dir_path), ), prefix=f'{pkg_pref}.', diff --git a/my/experimental/destructive_parsing.py b/my/experimental/destructive_parsing.py index 05c5920..056cc0b 100644 --- a/my/experimental/destructive_parsing.py +++ b/my/experimental/destructive_parsing.py @@ -26,7 +26,7 @@ class Helper: assert actual == expected, (key, actual, expected) def zoom(self, key: str) -> 'Helper': - return self.manager.helper(item=self.item.pop(key), path=self.path + (key,)) + return self.manager.helper(item=self.item.pop(key), path=(*self.path, key)) def is_empty(x) -> bool: diff --git a/my/google/takeout/html.py b/my/google/takeout/html.py index 3ce692c..750beac 100644 --- a/my/google/takeout/html.py +++ b/my/google/takeout/html.py @@ -122,7 +122,7 @@ class TakeoutHTMLParser(HTMLParser): # JamiexxVEVO # Jun 21, 2018, 5:48:34 AM # Products: - #  YouTube + # YouTube def handle_data(self, data): if self.state == State.OUTSIDE: if data[:-1].strip() in ("Watched", "Visited"): diff --git a/my/location/common.py b/my/location/common.py index 510e005..f406370 100644 --- a/my/location/common.py +++ b/my/location/common.py @@ -70,7 +70,7 @@ def locations_to_gpx(locations: Iterable[LocationProtocol], buffer: TextIO) -> I ) except AttributeError: yield TypeError( - f"Expected a Location or Location-like object, got {type(location)} {repr(location)}" + f"Expected a 
Location or Location-like object, got {type(location)} {location!r}" ) continue gpx_segment.points.append(point) diff --git a/my/photos/main.py b/my/photos/main.py index 6262eac..63a6fea 100644 --- a/my/photos/main.py +++ b/my/photos/main.py @@ -209,7 +209,7 @@ def print_all() -> None: if isinstance(p, Exception): print('ERROR!', p) else: - print(f"{str(p.dt):25} {p.path} {p.geo}") + print(f"{p.dt!s:25} {p.path} {p.geo}") # todo cachew -- improve AttributeError: type object 'tuple' has no attribute '__annotations__' -- improve errors? # todo cachew -- invalidate if function code changed? diff --git a/my/polar.py b/my/polar.py index cd2c719..197de18 100644 --- a/my/polar.py +++ b/my/polar.py @@ -27,7 +27,7 @@ class polar(user_config): ''' Polar config is optional, you only need it if you want to specify custom 'polar_dir' ''' - polar_dir: PathIsh = Path('~/.polar').expanduser() + polar_dir: PathIsh = Path('~/.polar').expanduser() # noqa: RUF009 defensive: bool = True # pass False if you want it to fail faster on errors (useful for debugging) diff --git a/my/twitter/android.py b/my/twitter/android.py index f40ad0e..7adfeb6 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -155,7 +155,7 @@ _SELECT_OWN_TWEETS = '_SELECT_OWN_TWEETS' def get_own_user_id(conn) -> str: # unclear what's the reliable way to query it, so we use multiple different ones and arbitrate # NOTE: 'SELECT DISTINCT ev_owner_id FROM lists' doesn't work, might include lists from other people? - res = set() + res: Set[str] = set() for q in [ 'SELECT DISTINCT list_mapping_user_id FROM list_mapping', 'SELECT DISTINCT owner_id FROM cursors', @@ -164,7 +164,8 @@ def get_own_user_id(conn) -> str: for (r,) in conn.execute(q): res.add(r) assert len(res) == 1, res - return str(list(res)[0]) + [r] = res + return r # NOTE: diff --git a/ruff.toml b/ruff.toml index a8af399..2b77622 100644 --- a/ruff.toml +++ b/ruff.toml @@ -6,6 +6,7 @@ lint.extend-select = [ "C4", # flake8-comprehensions -- unnecessary list/map/dict calls "UP", # detect deprecated python stdlib stuff "FBT", # detect use of boolean arguments + "RUF", # various ruff-specific rules ] lint.ignore = [ @@ -38,4 +39,6 @@ lint.ignore = [ "UP006", # use type instead of Type "UP007", # use X | Y instead of Union ### + "RUF100", # unused noqa -- handle later + "RUF012", # mutable class attrs should be annotated with ClassVar... 
ugh pretty annoying for user configs ] From d0df8e8f2db2dd0a1776fa7bad54f2bbbba1bde1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 00:29:06 +0100 Subject: [PATCH 266/302] ruff: enable PLR rules and fix bug in my.github.gdpr._is_bot --- my/github/gdpr.py | 2 +- ruff.toml | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/my/github/gdpr.py b/my/github/gdpr.py index 4ca8e84..acbeb8f 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -145,7 +145,7 @@ def _parse_repository(d: Dict) -> Event: def _is_bot(user: Optional[str]) -> bool: if user is None: return False - return "[bot]" in "user" + return "[bot]" in user def _parse_issue_comment(d: Dict) -> Event: diff --git a/ruff.toml b/ruff.toml index 2b77622..69af75a 100644 --- a/ruff.toml +++ b/ruff.toml @@ -7,6 +7,10 @@ lint.extend-select = [ "UP", # detect deprecated python stdlib stuff "FBT", # detect use of boolean arguments "RUF", # various ruff-specific rules + + "PLR", + # "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks + # "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives ] lint.ignore = [ @@ -41,4 +45,16 @@ lint.ignore = [ ### "RUF100", # unused noqa -- handle later "RUF012", # mutable class attrs should be annotated with ClassVar... ugh pretty annoying for user configs + +### these are just nitpicky, we usually know better + "PLR0911", # too many return statements + "PLR0912", # too many branches + "PLR0913", # too many function arguments + "PLR0915", # too many statements + "PLR1714", # consider merging multiple comparisons + "PLR2044", # line with empty comment + "PLR5501", # use elif instead of else if + "PLR2004", # magic value in comparison -- super annoying in tests +### + "PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check ] From 72cc8ff3acae427232c28ca32080f22c9164f7d7 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 01:13:18 +0100 Subject: [PATCH 267/302] ruff: enable B warnings (mainly suppressed exceptions and unused variables) --- my/bluemaestro.py | 2 +- my/body/exercise/cross_trainer.py | 2 +- my/coding/github.py | 13 ++++++++----- my/core/__main__.py | 4 ++-- my/core/cachew.py | 6 ++++-- my/core/common.py | 7 +++---- my/core/error.py | 2 +- my/core/hpi_compat.py | 4 ++-- my/core/init.py | 6 +++--- my/core/logging.py | 6 +++--- my/core/orgmode.py | 12 ++++++------ my/core/query.py | 4 ++-- my/core/query_range.py | 2 +- my/core/source.py | 2 +- my/core/stats.py | 2 +- my/core/util.py | 4 ++-- my/core/utils/itertools.py | 7 ++++--- my/core/warnings.py | 10 +++++----- my/jawbone/__init__.py | 2 +- my/location/google.py | 11 ++++++----- my/media/imdb.py | 2 +- my/polar.py | 4 ++-- my/reddit/rexport.py | 6 +++--- my/rss/common.py | 2 +- my/tests/location/google.py | 4 ++-- my/tests/shared_tz_config.py | 4 ++-- my/time/tz/common.py | 2 +- my/twitter/archive.py | 4 ++-- ruff.toml | 10 +++++++++- tests/github.py | 4 +++- 30 files changed, 83 insertions(+), 67 deletions(-) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index 50338bb..5d0968b 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -153,7 +153,7 @@ def measurements() -> Iterable[Res[Measurement]]: oldfmt = False db_dt = None - for i, (name, tsc, temp, hum, pres, dewp) in enumerate(datas): + for (name, tsc, temp, hum, pres, dewp) in datas: if is_bad_table(name): continue diff --git a/my/body/exercise/cross_trainer.py b/my/body/exercise/cross_trainer.py index d073f43..edbb557 100644 --- 
a/my/body/exercise/cross_trainer.py +++ b/my/body/exercise/cross_trainer.py @@ -105,7 +105,7 @@ def dataframe() -> DataFrameT: rows = [] idxs = [] # type: ignore[var-annotated] NO_ENDOMONDO = 'no endomondo matches' - for i, row in mdf.iterrows(): + for _i, row in mdf.iterrows(): rd = row.to_dict() mdate = row['date'] if pd.isna(mdate): diff --git a/my/coding/github.py b/my/coding/github.py index 9358b04..de64f05 100644 --- a/my/coding/github.py +++ b/my/coding/github.py @@ -1,9 +1,12 @@ -import warnings +from typing import TYPE_CHECKING -warnings.warn('my.coding.github is deprecated! Please use my.github.all instead!') +from my.core import warnings + +warnings.high('my.coding.github is deprecated! Please use my.github.all instead!') # todo why aren't DeprecationWarning shown by default?? -from ..github.all import events, get_events +if not TYPE_CHECKING: + from ..github.all import events, get_events -# todo deprecate properly -iter_events = events + # todo deprecate properly + iter_events = events diff --git a/my/core/__main__.py b/my/core/__main__.py index 3af8e08..c5e4552 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -456,9 +456,9 @@ def _locate_functions_or_prompt(qualified_names: List[str], *, prompt: bool = Tr # user to select a 'data provider' like function try: mod = importlib.import_module(qualname) - except Exception: + except Exception as ie: eprint(f"During fallback, importing '{qualname}' as module failed") - raise qr_err + raise qr_err from ie # find data providers in this module data_providers = [f for _, f in inspect.getmembers(mod, inspect.isfunction) if is_data_provider(f)] diff --git a/my/core/cachew.py b/my/core/cachew.py index e0e7adf..dc6ed79 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -2,7 +2,6 @@ from .internal import assert_subpackage; assert_subpackage(__name__) import logging import sys -import warnings from contextlib import contextmanager from pathlib import Path from typing import ( @@ -20,6 +19,9 @@ from typing import ( import appdirs # type: ignore[import-untyped] +from . import warnings + + PathIsh = Union[str, Path] # avoid circular import from .common @@ -116,7 +118,7 @@ def _mcachew_impl(cache_path=_cache_path_dflt, **kwargs): try: import cachew except ModuleNotFoundError: - warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew') + warnings.high('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew') return lambda orig_func: orig_func else: kwargs['cache_path'] = cache_path diff --git a/my/core/common.py b/my/core/common.py index b97866f..5f8d03a 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -1,5 +1,4 @@ import os -import warnings from glob import glob as do_glob from pathlib import Path from typing import ( @@ -15,7 +14,7 @@ from typing import ( ) from . import compat -from . import warnings as core_warnings +from . import warnings as warnings # some helper functions # TODO start deprecating this? soon we'd be able to use Path | str syntax which is shorter and more explicit @@ -63,7 +62,7 @@ def get_files( gs = str(src) if '*' in gs: if glob != DEFAULT_GLOB: - warnings.warn(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!") + warnings.medium(f"{caller()}: treating {gs} as glob path. 
Explicit glob={glob} argument is ignored!") paths.extend(map(Path, do_glob(gs))) elif os.path.isdir(str(src)): # NOTE: we're using os.path here on purpose instead of src.is_dir @@ -85,7 +84,7 @@ def get_files( if len(paths) == 0: # todo make it conditionally defensive based on some global settings - core_warnings.high(f''' + warnings.high(f''' {caller()}: no paths were matched against {pp}. This might result in missing data. Likely, the directory you passed is empty. '''.strip()) # traceback is useful to figure out what config caused it? diff --git a/my/core/error.py b/my/core/error.py index cd8d093..e869614 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -119,7 +119,7 @@ def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> List[Res[T] group = [] results: List[Res[T]] = [] - for v, grp in sorted(groups, key=lambda p: p[0]): # type: ignore[return-value, arg-type] # TODO SupportsLessThan?? + for _v, grp in sorted(groups, key=lambda p: p[0]): # type: ignore[return-value, arg-type] # TODO SupportsLessThan?? results.extend(grp) results.extend(group) # handle last group (it will always be errors only) diff --git a/my/core/hpi_compat.py b/my/core/hpi_compat.py index bad0b17..9330e49 100644 --- a/my/core/hpi_compat.py +++ b/my/core/hpi_compat.py @@ -6,7 +6,7 @@ import inspect import os import re from types import ModuleType -from typing import Iterator, List, Optional, TypeVar +from typing import Iterator, List, Optional, Sequence, TypeVar from . import warnings @@ -71,7 +71,7 @@ def pre_pip_dal_handler( name: str, e: ModuleNotFoundError, cfg, - requires=[], + requires: Sequence[str] = (), ) -> ModuleType: ''' https://github.com/karlicoss/HPI/issues/79 diff --git a/my/core/init.py b/my/core/init.py index 49148de..7a30955 100644 --- a/my/core/init.py +++ b/my/core/init.py @@ -25,7 +25,7 @@ def setup_config() -> None: warnings.warn(f""" 'my.config' package isn't found! (expected at '{mycfg_dir}'). This is likely to result in issues. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. -""".strip()) +""".strip(), stacklevel=1) return mpath = str(mycfg_dir) @@ -47,7 +47,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-mo warnings.warn(f""" Importing 'my.config' failed! (error: {ex}). This is likely to result in issues. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. -""") +""", stacklevel=1) else: # defensive just in case -- __file__ may not be present if there is some dynamic magic involved used_config_file = getattr(my.config, '__file__', None) @@ -63,7 +63,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-mo Expected my.config to be located at {mycfg_dir}, but instead its path is {used_config_path}. This will likely cause issues down the line -- double check {mycfg_dir} structure. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. 
-""", +""", stacklevel=1 ) diff --git a/my/core/logging.py b/my/core/logging.py index 734c1e0..bdee9aa 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -15,7 +15,7 @@ def test() -> None: ## prepare exception for later try: - None.whatever # type: ignore[attr-defined] + None.whatever # type: ignore[attr-defined] # noqa: B018 except Exception as e: ex = e ## @@ -146,7 +146,7 @@ def _setup_handlers_and_formatters(name: str) -> None: # try colorlog first, so user gets nice colored logs import colorlog except ModuleNotFoundError: - warnings.warn("You might want to 'pip install colorlog' for nice colored logs") + warnings.warn("You might want to 'pip install colorlog' for nice colored logs", stacklevel=1) formatter = logging.Formatter(FORMAT_NOCOLOR) else: # log_color/reset are specific to colorlog @@ -233,7 +233,7 @@ def get_enlighten(): try: import enlighten # type: ignore[import-untyped] except ModuleNotFoundError: - warnings.warn("You might want to 'pip install enlighten' for a nice progress bar") + warnings.warn("You might want to 'pip install enlighten' for a nice progress bar", stacklevel=1) return Mock() diff --git a/my/core/orgmode.py b/my/core/orgmode.py index d9a254c..c70ded6 100644 --- a/my/core/orgmode.py +++ b/my/core/orgmode.py @@ -6,12 +6,12 @@ from datetime import datetime def parse_org_datetime(s: str) -> datetime: s = s.strip('[]') - for fmt, cl in [ - ("%Y-%m-%d %a %H:%M", datetime), - ("%Y-%m-%d %H:%M" , datetime), - # todo not sure about these... fallback on 00:00? - # ("%Y-%m-%d %a" , date), - # ("%Y-%m-%d" , date), + for fmt, _cls in [ + ("%Y-%m-%d %a %H:%M", datetime), + ("%Y-%m-%d %H:%M" , datetime), + # todo not sure about these... fallback on 00:00? + # ("%Y-%m-%d %a" , date), + # ("%Y-%m-%d" , date), ]: try: return datetime.strptime(s, fmt) diff --git a/my/core/query.py b/my/core/query.py index daf702d..c337e5c 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -72,7 +72,7 @@ def locate_function(module_name: str, function_name: str) -> Callable[[], Iterab if func is not None and callable(func): return func except Exception as e: - raise QueryException(str(e)) + raise QueryException(str(e)) # noqa: B904 raise QueryException(f"Could not find function '{function_name}' in '{module_name}'") @@ -468,7 +468,7 @@ Will attempt to call iter() on the value""") try: itr: Iterator[ET] = iter(it) except TypeError as t: - raise QueryException("Could not convert input src to an Iterator: " + str(t)) + raise QueryException("Could not convert input src to an Iterator: " + str(t)) # noqa: B904 # if both drop_exceptions and drop_exceptions are provided for some reason, # should raise exceptions before dropping them diff --git a/my/core/query_range.py b/my/core/query_range.py index 761b045..0a1b321 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -109,7 +109,7 @@ def _datelike_to_float(dl: Any) -> float: try: return parse_datetime_float(dl) except QueryException as q: - raise QueryException(f"While attempting to extract datetime from {dl}, to order by datetime:\n\n" + str(q)) + raise QueryException(f"While attempting to extract datetime from {dl}, to order by datetime:\n\n" + str(q)) # noqa: B904 class RangeTuple(NamedTuple): diff --git a/my/core/source.py b/my/core/source.py index 4510ef0..52c58c1 100644 --- a/my/core/source.py +++ b/my/core/source.py @@ -62,7 +62,7 @@ def import_source( class core: disabled_modules = [{module_name!r}] -""") +""", stacklevel=1) # try to check if this is a config error or based on dependencies not being installed if 
isinstance(err, (ImportError, AttributeError)): matched_config_err = warn_my_config_import_error(err, module_name=module_name, help_url=help_url) diff --git a/my/core/stats.py b/my/core/stats.py index 4c9fb0c..aa05355 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -440,7 +440,7 @@ def _guess_datetime(x: Any) -> Optional[datetime]: d = asdict(x) except: # noqa: E722 bare except return None - for k, v in d.items(): + for _k, v in d.items(): if isinstance(v, datetime): return v return None diff --git a/my/core/util.py b/my/core/util.py index b49acf6..fb3edf8 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -93,11 +93,11 @@ def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModu def _walk_packages(path: Iterable[str], prefix: str='', onerror=None) -> Iterable[HPIModule]: """ Modified version of https://github.com/python/cpython/blob/d50a0700265536a20bcce3fb108c954746d97625/Lib/pkgutil.py#L53, - to alvoid importing modules that are skipped + to avoid importing modules that are skipped """ from .core_config import config - def seen(p, m={}): + def seen(p, m={}): # noqa: B006 if p in m: return True m[p] = True diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index 66f82bd..b945ad8 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -24,6 +24,8 @@ import more_itertools from decorator import decorator from ..compat import ParamSpec +from .. import warnings as core_warnings + T = TypeVar('T') K = TypeVar('K') @@ -142,8 +144,7 @@ def _warn_if_empty(func, *args, **kwargs): if isinstance(iterable, Sized): sz = len(iterable) if sz == 0: - # todo use hpi warnings here? - warnings.warn(f"Function {func} returned empty container, make sure your config paths are correct") + core_warnings.medium(f"Function {func} returned empty container, make sure your config paths are correct") return iterable else: # must be an iterator @@ -153,7 +154,7 @@ def _warn_if_empty(func, *args, **kwargs): yield i empty = False if empty: - warnings.warn(f"Function {func} didn't emit any data, make sure your config paths are correct") + core_warnings.medium(f"Function {func} didn't emit any data, make sure your config paths are correct") return wit() diff --git a/my/core/warnings.py b/my/core/warnings.py index 82e539b..2ffc3e4 100644 --- a/my/core/warnings.py +++ b/my/core/warnings.py @@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Optional import click -def _colorize(x: str, color: Optional[str]=None) -> str: +def _colorize(x: str, color: Optional[str] = None) -> str: if color is None: return x @@ -24,10 +24,10 @@ def _colorize(x: str, color: Optional[str]=None) -> str: return click.style(x, fg=color) -def _warn(message: str, *args, color: Optional[str]=None, **kwargs) -> None: +def _warn(message: str, *args, color: Optional[str] = None, **kwargs) -> None: stacklevel = kwargs.get('stacklevel', 1) - kwargs['stacklevel'] = stacklevel + 2 # +1 for this function, +1 for medium/high wrapper - warnings.warn(_colorize(message, color=color), *args, **kwargs) + kwargs['stacklevel'] = stacklevel + 2 # +1 for this function, +1 for medium/high wrapper + warnings.warn(_colorize(message, color=color), *args, **kwargs) # noqa: B028 def low(message: str, *args, **kwargs) -> None: @@ -55,4 +55,4 @@ if not TYPE_CHECKING: def warn(*args, **kwargs): import warnings - return warnings.warn(*args, **kwargs) + return warnings.warn(*args, **kwargs) # noqa: B028 diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 1706a54..affe230 100644 --- 
a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -274,7 +274,7 @@ def plot() -> None: fig: Figure = plt.figure(figsize=(15, sleeps_count * 1)) axarr = fig.subplots(nrows=len(sleeps)) - for i, (sleep, axes) in enumerate(zip(sleeps, axarr)): + for (sleep, axes) in zip(sleeps, axarr): plot_one(sleep, fig, axes, showtext=True) used = melatonin_data.get(sleep.date_, None) sused: str diff --git a/my/location/google.py b/my/location/google.py index a7a92d3..b966ec6 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -22,9 +22,10 @@ import geopy # type: ignore from my.core import stat, Stats, make_logger from my.core.cachew import cache_dir, mcachew -from my.core.warnings import high +from my.core import warnings -high("Please set up my.google.takeout.parser module for better takeout support") + +warnings.high("Please set up my.google.takeout.parser module for better takeout support") # otherwise uses ijson @@ -52,8 +53,7 @@ def _iter_via_ijson(fo) -> Iterable[TsLatLon]: # pip3 install ijson cffi import ijson.backends.yajl2_cffi as ijson # type: ignore except: - import warnings - warnings.warn("Falling back to default ijson because 'cffi' backend isn't found. It's up to 2x faster, you might want to check it out") + warnings.medium("Falling back to default ijson because 'cffi' backend isn't found. It's up to 2x faster, you might want to check it out") import ijson # type: ignore for d in ijson.items(fo, 'locations.item'): @@ -105,7 +105,8 @@ def _iter_locations_fo(fit) -> Iterable[Location]: errors += 1 if float(errors) / total > 0.01: # todo make defensive? - raise RuntimeError('too many errors! aborting') + # todo exceptiongroup? + raise RuntimeError('too many errors! aborting') # noqa: B904 else: continue diff --git a/my/media/imdb.py b/my/media/imdb.py index df6d62d..c66f5dc 100644 --- a/my/media/imdb.py +++ b/my/media/imdb.py @@ -22,7 +22,7 @@ def iter_movies() -> Iterator[Movie]: with last.open() as fo: reader = csv.DictReader(fo) - for i, line in enumerate(reader): + for line in reader: # TODO extract directors?? title = line['Title'] rating = int(line['You rated']) diff --git a/my/polar.py b/my/polar.py index 197de18..9125f17 100644 --- a/my/polar.py +++ b/my/polar.py @@ -166,7 +166,7 @@ class Loader: htags: List[str] = [] if 'tags' in h: ht = h['tags'].zoom() - for k, v in list(ht.items()): + for _k, v in list(ht.items()): ctag = v.zoom() ctag['id'].consume() ct = ctag['label'].zoom() @@ -199,7 +199,7 @@ class Loader: def load_items(self, metas: Json) -> Iterable[Highlight]: - for p, meta in metas.items(): + for _p, meta in metas.items(): with wrap(meta, throw=not config.defensive) as meta: yield from self.load_item(meta) diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index 6a6be61..5dcd7d9 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -144,9 +144,9 @@ if not TYPE_CHECKING: try: # here we just check that types are available, we don't actually want to import them # fmt: off - dal.Subreddit - dal.Profile - dal.Multireddit + dal.Subreddit # noqa: B018 + dal.Profile # noqa: B018 + dal.Multireddit # noqa: B018 # fmt: on except AttributeError as ae: warnings.high(f'{ae} : please update "rexport" installation') diff --git a/my/rss/common.py b/my/rss/common.py index 54067d6..bb75297 100644 --- a/my/rss/common.py +++ b/my/rss/common.py @@ -32,7 +32,7 @@ def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscri by_url: Dict[str, Subscription] = {} # ah. 
dates are used for sorting - for when, state in sorted(states): + for _when, state in sorted(states): # TODO use 'when'? for feed in state: if feed.url not in by_url: diff --git a/my/tests/location/google.py b/my/tests/location/google.py index 612522b..43b8646 100644 --- a/my/tests/location/google.py +++ b/my/tests/location/google.py @@ -44,8 +44,8 @@ def _prepare_takeouts_dir(tmp_path: Path) -> Path: try: track = one(testdata().rglob('italy-slovenia-2017-07-29.json')) - except ValueError: - raise RuntimeError('testdata not found, setup git submodules?') + except ValueError as e: + raise RuntimeError('testdata not found, setup git submodules?') from e # todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs import zipfile diff --git a/my/tests/shared_tz_config.py b/my/tests/shared_tz_config.py index 3d95a9e..810d989 100644 --- a/my/tests/shared_tz_config.py +++ b/my/tests/shared_tz_config.py @@ -49,8 +49,8 @@ def _prepare_takeouts_dir(tmp_path: Path) -> Path: try: track = one(testdata().rglob('italy-slovenia-2017-07-29.json')) - except ValueError: - raise RuntimeError('testdata not found, setup git submodules?') + except ValueError as e: + raise RuntimeError('testdata not found, setup git submodules?') from e # todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs import zipfile diff --git a/my/time/tz/common.py b/my/time/tz/common.py index 89150c7..13c8ac0 100644 --- a/my/time/tz/common.py +++ b/my/time/tz/common.py @@ -33,7 +33,7 @@ def default_policy() -> TzPolicy: def localize_with_policy( lfun: Callable[[datetime], datetime_aware], dt: datetime, - policy: TzPolicy=default_policy() + policy: TzPolicy=default_policy() # noqa: B008 ) -> datetime_aware: tz = dt.tzinfo if tz is None: diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 685f7fc..d326d70 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -14,9 +14,9 @@ except ImportError as ie: try: from my.config import twitter as user_config # type: ignore[assignment] except ImportError: - raise ie # raise the original exception.. must be something else + raise ie # raise the original exception.. must be something else # noqa: B904 else: - from ..core import warnings + from my.core import warnings warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config') ## diff --git a/ruff.toml b/ruff.toml index 69af75a..3d97fc9 100644 --- a/ruff.toml +++ b/ruff.toml @@ -7,8 +7,11 @@ lint.extend-select = [ "UP", # detect deprecated python stdlib stuff "FBT", # detect use of boolean arguments "RUF", # various ruff-specific rules + "PLR", # 'refactor' rules + "B", # 'bugbear' set -- various possible bugs + + - "PLR", # "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks # "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives ] @@ -57,4 +60,9 @@ lint.ignore = [ "PLR2004", # magic value in comparison -- super annoying in tests ### "PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check + + "B009", # calling gettattr with constant attribute -- this is useful to convince mypy + "B010", # same as above, but setattr + "B017", # pytest.raises(Exception) + "B023", # seems to result in false positives? 
] diff --git a/tests/github.py b/tests/github.py index 6b7df23..ed89053 100644 --- a/tests/github.py +++ b/tests/github.py @@ -5,11 +5,13 @@ from more_itertools import ilen def test_gdpr() -> None: import my.github.gdpr as gdpr + assert ilen(gdpr.events()) > 100 def test() -> None: - from my.coding.github import get_events + from my.github.all import get_events + events = get_events() assert ilen(events) > 100 for e in events: From 985c0f94e633dcab9f51e0da459ad049d8f8c73b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 01:51:14 +0100 Subject: [PATCH 268/302] ruff: attempt to enable ARG checks, suppress in some places --- my/core/pytest.py | 2 +- my/core/utils/itertools.py | 6 +++--- my/pdfs.py | 2 +- ruff.toml | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/my/core/pytest.py b/my/core/pytest.py index c73c71a..e514957 100644 --- a/my/core/pytest.py +++ b/my/core/pytest.py @@ -15,7 +15,7 @@ if typing.TYPE_CHECKING or under_pytest: parametrize = pytest.mark.parametrize else: - def parametrize(*args, **kwargs): + def parametrize(*_args, **_kwargs): def wrapper(f): return f diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index b945ad8..ae9402d 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -63,7 +63,7 @@ def test_ensure_unique() -> None: list(it) # hacky way to force distinct objects? - list(ensure_unique(dups, key=lambda i: object())) + list(ensure_unique(dups, key=lambda _: object())) def make_dict( @@ -115,7 +115,7 @@ def _listify(func: Callable[LFP, Iterable[LV]], *args: LFP.args, **kwargs: LFP.k # so seems easiest to just use specialize instantiations of decorator instead if TYPE_CHECKING: - def listify(func: Callable[LFP, Iterable[LV]]) -> Callable[LFP, List[LV]]: ... + def listify(func: Callable[LFP, Iterable[LV]]) -> Callable[LFP, List[LV]]: ... # noqa: ARG001 else: listify = _listify @@ -162,7 +162,7 @@ def _warn_if_empty(func, *args, **kwargs): if TYPE_CHECKING: FF = TypeVar('FF', bound=Callable[..., Iterable]) - def warn_if_empty(f: FF) -> FF: ... + def warn_if_empty(func: FF) -> FF: ... # noqa: ARG001 else: warn_if_empty = _warn_if_empty diff --git a/my/pdfs.py b/my/pdfs.py index db49c0e..de9324d 100644 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -25,7 +25,7 @@ class config(Protocol): def paths(self) -> Paths: return () # allowed to be empty for 'filelist' logic - def is_ignored(self, p: Path) -> bool: + def is_ignored(self, p: Path) -> bool: # noqa: ARG002 """ You can override this in user config if you want to ignore some files that are tooheavy """ diff --git a/ruff.toml b/ruff.toml index 3d97fc9..e7c6f07 100644 --- a/ruff.toml +++ b/ruff.toml @@ -10,8 +10,8 @@ lint.extend-select = [ "PLR", # 'refactor' rules "B", # 'bugbear' set -- various possible bugs - - + # "FA", # TODO enable later after we make sure cachew works? + # "ARG", # TODO useful, but results in some false positives in pytest fixtures... 
maybe later # "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks # "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives ] From bd1e5d2f1167232017fcf5aad34c554582bedc69 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 02:16:12 +0100 Subject: [PATCH 269/302] ruff: enable PERF checks set --- my/core/query.py | 2 +- my/core/stats.py | 2 +- my/polar.py | 2 +- ruff.toml | 9 +++++++++ 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/my/core/query.py b/my/core/query.py index c337e5c..45806fb 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -114,7 +114,7 @@ def attribute_func(obj: T, where: Where, default: Optional[U] = None) -> Optiona if where(v): return lambda o: o.get(k, default) # type: ignore[union-attr] elif dataclasses.is_dataclass(obj): - for (field_name, _annotation) in obj.__annotations__.items(): + for field_name in obj.__annotations__.keys(): if where(getattr(obj, field_name)): return lambda o: getattr(o, field_name, default) elif is_namedtuple(obj): diff --git a/my/core/stats.py b/my/core/stats.py index aa05355..674a8d1 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -440,7 +440,7 @@ def _guess_datetime(x: Any) -> Optional[datetime]: d = asdict(x) except: # noqa: E722 bare except return None - for _k, v in d.items(): + for v in d.values(): if isinstance(v, datetime): return v return None diff --git a/my/polar.py b/my/polar.py index 9125f17..e52bb14 100644 --- a/my/polar.py +++ b/my/polar.py @@ -199,7 +199,7 @@ class Loader: def load_items(self, metas: Json) -> Iterable[Highlight]: - for _p, meta in metas.items(): + for _p, meta in metas.items(): # noqa: PERF102 with wrap(meta, throw=not config.defensive) as meta: yield from self.load_item(meta) diff --git a/ruff.toml b/ruff.toml index e7c6f07..2c9c39b 100644 --- a/ruff.toml +++ b/ruff.toml @@ -10,6 +10,7 @@ lint.extend-select = [ "PLR", # 'refactor' rules "B", # 'bugbear' set -- various possible bugs + "PERF", # various potential performance speedups # "FA", # TODO enable later after we make sure cachew works? # "ARG", # TODO useful, but results in some false positives in pytest fixtures... maybe later # "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks @@ -65,4 +66,12 @@ lint.ignore = [ "B010", # same as above, but setattr "B017", # pytest.raises(Exception) "B023", # seems to result in false positives? 
+ + # a bit too annoying, offers to convert for loops to list comprehension + # , which may hurt readability + "PERF401", + + # suggests not using exceptions in for loops + # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost" + "PERF203", ] From 9fd4227abf7fa2619c9f1297e754eead0aee7fc3 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 02:50:05 +0100 Subject: [PATCH 270/302] ruff: enable RET/PIE/PLW --- my/coding/commits.py | 3 +- my/core/__main__.py | 10 ++--- my/core/_deprecated/kompress.py | 2 +- my/core/error.py | 3 +- my/core/orgmode.py | 3 +- my/core/query_range.py | 60 +++++++++++++------------- my/core/serialize.py | 3 +- my/core/util.py | 2 +- my/experimental/destructive_parsing.py | 2 +- my/location/fallback/via_home.py | 19 ++++---- my/photos/main.py | 3 +- my/smscalls.py | 7 ++- my/time/tz/via_location.py | 6 +-- ruff.toml | 32 ++++++++++---- 14 files changed, 80 insertions(+), 75 deletions(-) diff --git a/my/coding/commits.py b/my/coding/commits.py index 20b66a0..d4e05b7 100644 --- a/my/coding/commits.py +++ b/my/coding/commits.py @@ -187,8 +187,7 @@ def _repo_depends_on(_repo: Path) -> int: ff = _repo / pp if ff.exists(): return int(ff.stat().st_mtime) - else: - raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}") + raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}") def _commits(_repos: List[Path]) -> Iterator[Commit]: diff --git a/my/core/__main__.py b/my/core/__main__.py index c5e4552..8553942 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -43,7 +43,7 @@ def run_mypy(cfg_path: Path) -> Optional[CompletedProcess]: cmd = mypy_cmd() if cmd is None: return None - mres = run([ # noqa: UP022 + mres = run([ # noqa: UP022,PLW1510 *cmd, '--namespace-packages', '--color-output', # not sure if works?? 
@@ -214,10 +214,10 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module if len(errors) > 0: error(f'config check: {len(errors)} errors') return False - else: - # note: shouldn't exit here, might run something else - info('config check: success!') - return True + + # note: shouldn't exit here, might run something else + info('config check: success!') + return True from .util import HPIModule, modules diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index 803e515..cd09c06 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -87,7 +87,7 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO: elif name.endswith(Ext.lz4): import lz4.frame # type: ignore return lz4.frame.open(str(pp), mode, *args, **kwargs) - elif name.endswith(Ext.zstd) or name.endswith(Ext.zst): + elif name.endswith(Ext.zstd) or name.endswith(Ext.zst): # noqa: PIE810 kwargs['mode'] = mode return _zstd_open(pp, *args, **kwargs) elif name.endswith(Ext.targz): diff --git a/my/core/error.py b/my/core/error.py index e869614..ed26dda 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -41,8 +41,7 @@ def notnone(x: Optional[T]) -> T: def unwrap(res: Res[T]) -> T: if isinstance(res, Exception): raise res - else: - return res + return res def drop_exceptions(itr: Iterator[Res[T]]) -> Iterator[T]: diff --git a/my/core/orgmode.py b/my/core/orgmode.py index c70ded6..979f288 100644 --- a/my/core/orgmode.py +++ b/my/core/orgmode.py @@ -17,8 +17,7 @@ def parse_org_datetime(s: str) -> datetime: return datetime.strptime(s, fmt) except ValueError: continue - else: - raise RuntimeError(f"Bad datetime string {s}") + raise RuntimeError(f"Bad datetime string {s}") # TODO I guess want to borrow inspiration from bs4? element type <-> tag; and similar logic for find_one, find_all diff --git a/my/core/query_range.py b/my/core/query_range.py index 0a1b321..1f4a7ff 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -341,37 +341,37 @@ def select_range( if order_by_chosen is None: raise QueryException("""Can't order by range if we have no way to order_by! Specify a type or a key to order the value by""") - else: - # force drop_unsorted=True so we can use _create_range_filter - # sort the iterable by the generated order_by_chosen function - itr = select(itr, order_by=order_by_chosen, drop_unsorted=True) - filter_func: Optional[Where] - if order_by_value_type in [datetime, date]: - filter_func = _create_range_filter( - unparsed_range=unparsed_range, - end_parser=parse_datetime_float, - within_parser=parse_timedelta_float, - attr_func=order_by_chosen, # type: ignore[arg-type] - default_before=time.time(), - value_coercion_func=_datelike_to_float) - elif order_by_value_type in [int, float]: - # allow primitives to be converted using the default int(), float() callables - filter_func = _create_range_filter( - unparsed_range=unparsed_range, - end_parser=order_by_value_type, - within_parser=order_by_value_type, - attr_func=order_by_chosen, # type: ignore[arg-type] - default_before=None, - value_coercion_func=order_by_value_type) - else: - # TODO: add additional kwargs to let the user sort by other values, by specifying the parsers? - # would need to allow passing the end_parser, within parser, default before and value_coercion_func... - # (seems like a lot?) 
- raise QueryException("Sorting by custom types is currently unsupported") - # use the created filter function - # we've already applied drop_exceptions and kwargs related to unsortable values above - itr = select(itr, where=filter_func, limit=limit, reverse=reverse) + # force drop_unsorted=True so we can use _create_range_filter + # sort the iterable by the generated order_by_chosen function + itr = select(itr, order_by=order_by_chosen, drop_unsorted=True) + filter_func: Optional[Where] + if order_by_value_type in [datetime, date]: + filter_func = _create_range_filter( + unparsed_range=unparsed_range, + end_parser=parse_datetime_float, + within_parser=parse_timedelta_float, + attr_func=order_by_chosen, # type: ignore[arg-type] + default_before=time.time(), + value_coercion_func=_datelike_to_float) + elif order_by_value_type in [int, float]: + # allow primitives to be converted using the default int(), float() callables + filter_func = _create_range_filter( + unparsed_range=unparsed_range, + end_parser=order_by_value_type, + within_parser=order_by_value_type, + attr_func=order_by_chosen, # type: ignore[arg-type] + default_before=None, + value_coercion_func=order_by_value_type) + else: + # TODO: add additional kwargs to let the user sort by other values, by specifying the parsers? + # would need to allow passing the end_parser, within parser, default before and value_coercion_func... + # (seems like a lot?) + raise QueryException("Sorting by custom types is currently unsupported") + + # use the created filter function + # we've already applied drop_exceptions and kwargs related to unsortable values above + itr = select(itr, where=filter_func, limit=limit, reverse=reverse) else: # wrap_unsorted may be used here if the user specified an order_key, # or manually passed a order_value function diff --git a/my/core/serialize.py b/my/core/serialize.py index b196d47..ab11a20 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -145,8 +145,7 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: res = factory() if res is not None: return res - else: - raise RuntimeError("Should not happen!") + raise RuntimeError("Should not happen!") def dumps( diff --git a/my/core/util.py b/my/core/util.py index fb3edf8..a247f81 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -100,7 +100,7 @@ def _walk_packages(path: Iterable[str], prefix: str='', onerror=None) -> Iterabl def seen(p, m={}): # noqa: B006 if p in m: return True - m[p] = True + m[p] = True # noqa: RET503 for info in pkgutil.iter_modules(path, prefix): mname = info.name diff --git a/my/experimental/destructive_parsing.py b/my/experimental/destructive_parsing.py index 056cc0b..b389f7e 100644 --- a/my/experimental/destructive_parsing.py +++ b/my/experimental/destructive_parsing.py @@ -35,7 +35,7 @@ def is_empty(x) -> bool: elif isinstance(x, list): return all(map(is_empty, x)) else: - assert_never(x) + assert_never(x) # noqa: RET503 class Manager: diff --git a/my/location/fallback/via_home.py b/my/location/fallback/via_home.py index 199ebb0..e44c59d 100644 --- a/my/location/fallback/via_home.py +++ b/my/location/fallback/via_home.py @@ -92,13 +92,12 @@ def estimate_location(dt: DateExact) -> Iterator[FallbackLocation]: dt=datetime.fromtimestamp(d, timezone.utc), datasource='via_home') return - else: - # I guess the most reasonable is to fallback on the first location - lat, lon = hist[-1][1] - yield FallbackLocation( - lat=lat, - lon=lon, - accuracy=config.home_accuracy, - dt=datetime.fromtimestamp(d, timezone.utc), - 
datasource='via_home') - return + + # I guess the most reasonable is to fallback on the first location + lat, lon = hist[-1][1] + yield FallbackLocation( + lat=lat, + lon=lon, + accuracy=config.home_accuracy, + dt=datetime.fromtimestamp(d, timezone.utc), + datasource='via_home') diff --git a/my/photos/main.py b/my/photos/main.py index 63a6fea..c326405 100644 --- a/my/photos/main.py +++ b/my/photos/main.py @@ -43,8 +43,7 @@ class Photo(NamedTuple): for bp in config.paths: if self.path.startswith(bp): return self.path[len(bp):] - else: - raise RuntimeError(f"Weird path {self.path}, can't match against anything") + raise RuntimeError(f"Weird path {self.path}, can't match against anything") @property def name(self) -> str: diff --git a/my/smscalls.py b/my/smscalls.py index b56026d..78bf7ee 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -182,10 +182,9 @@ class MMS(NamedTuple): for (addr, _type) in self.addresses: if _type == 137: return addr - else: - # hmm, maybe return instead? but this probably shouldnt happen, means - # something is very broken - raise RuntimeError(f'No from address matching 137 found in {self.addresses}') + # hmm, maybe return instead? but this probably shouldnt happen, means + # something is very broken + raise RuntimeError(f'No from address matching 137 found in {self.addresses}') @property def from_me(self) -> bool: diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 156a5db..8f521e0 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -63,16 +63,14 @@ def _get_user_config(): except ImportError as ie: if "'time'" not in str(ie): raise ie - else: - return empty_config + return empty_config try: user_config = time.tz.via_location except AttributeError as ae: if not ("'tz'" in str(ae) or "'via_location'" in str(ae)): raise ae - else: - return empty_config + return empty_config return user_config diff --git a/ruff.toml b/ruff.toml index 2c9c39b..3cb9f76 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,18 +1,22 @@ target-version = "py38" # NOTE: inferred from pyproject.toml if present lint.extend-select = [ - "F", # flakes rules -- default, but extend just in case - "E", # pycodestyle -- default, but extend just in case - "C4", # flake8-comprehensions -- unnecessary list/map/dict calls - "UP", # detect deprecated python stdlib stuff - "FBT", # detect use of boolean arguments - "RUF", # various ruff-specific rules - "PLR", # 'refactor' rules - "B", # 'bugbear' set -- various possible bugs - + "F", # flakes rules -- default, but extend just in case + "E", # pycodestyle -- default, but extend just in case + "C4", # flake8-comprehensions -- unnecessary list/map/dict calls + "UP", # detect deprecated python stdlib stuff + "FBT", # detect use of boolean arguments + "RUF", # various ruff-specific rules + "PLR", # 'refactor' rules + "B", # 'bugbear' set -- various possible bugs "PERF", # various potential performance speedups + "RET", # early returns + "PIE", # 'misc' lints + "PLW", # pylint warnings # "FA", # TODO enable later after we make sure cachew works? + # "PTH", # pathlib migration -- TODO enable later # "ARG", # TODO useful, but results in some false positives in pytest fixtures... maybe later + # "A", # TODO builtin shadowing -- handle later # "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks # "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives ] @@ -67,6 +71,10 @@ lint.ignore = [ "B017", # pytest.raises(Exception) "B023", # seems to result in false positives? 
+ # complains about useless pass, but has sort of a false positive if the function has a docstring? # this is common for click entrypoints (e.g. in __main__), so disable + "PIE790", + # a bit too annoying, offers to convert for loops to list comprehension # , which may hurt readability "PERF401", # suggests not using exceptions in for loops # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost" "PERF203", + + "RET504", # unnecessary assignment before returning -- that can be useful for readability + "RET505", # unnecessary else after return -- can hurt readability + + "PLW0603", # global variable update.. we usually know why we are doing this + "PLW2901", # for loop variable overwritten, usually this is intentional ] From ac08af7aabc09d6fd28f44ba0c155c62382cf96f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 02:54:26 +0100 Subject: [PATCH 271/302] ruff: enable PT (pytest) rules --- ruff.toml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ruff.toml b/ruff.toml index 3cb9f76..cb9445f 100644 --- a/ruff.toml +++ b/ruff.toml @@ -13,12 +13,17 @@ lint.extend-select = [ "RET", # early returns "PIE", # 'misc' lints "PLW", # pylint warnings + "PT", # pytest stuff # "FA", # TODO enable later after we make sure cachew works? # "PTH", # pathlib migration -- TODO enable later # "ARG", # TODO useful, but results in some false positives in pytest fixtures... maybe later # "A", # TODO builtin shadowing -- handle later # "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks # "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives + # "EM", # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying + # "FIX", # complains about fixmes/todos -- annoying + # "TD", # complains about todo formatting -- too annoying + # "ALL", ] lint.ignore = [ @@ -88,4 +93,8 @@ lint.ignore = [ "PLW0603", # global variable update.. 
we usually know why we are doing this "PLW2901", # for loop variable overwritten, usually this is intentional + + "PT004", # deprecated rule, will be removed later + "PT011", # pytest raises should is too broad + "PT012", # pytest raises should contain a single statement ] From c5df3ce1284d3730e6650321a4fe0c922960954f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 03:05:24 +0100 Subject: [PATCH 272/302] ruff: enable W, COM, EXE rules --- my/bluemaestro.py | 1 - my/coding/commits.py | 2 +- my/core/discovery_pure.py | 2 +- my/fbmessenger/android.py | 2 +- my/jawbone/__init__.py | 2 +- my/media/imdb.py | 1 - my/rtm.py | 2 +- my/telegram/telegram_backup.py | 2 +- ruff.toml | 19 +++++++++++++------ 9 files changed, 19 insertions(+), 14 deletions(-) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index 5d0968b..4c33fd1 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -1,4 +1,3 @@ -#!/usr/bin/python3 """ [[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor """ diff --git a/my/coding/commits.py b/my/coding/commits.py index d4e05b7..9661ae5 100644 --- a/my/coding/commits.py +++ b/my/coding/commits.py @@ -136,7 +136,7 @@ def canonical_name(repo: Path) -> str: # else: # rname = r.name # if 'backups/github' in repo: - # pass # TODO + # pass # TODO def _fd_path() -> str: diff --git a/my/core/discovery_pure.py b/my/core/discovery_pure.py index 63d9922..b753de8 100644 --- a/my/core/discovery_pure.py +++ b/my/core/discovery_pure.py @@ -242,7 +242,7 @@ def test_pure() -> None: src = Path(__file__).read_text() # 'import my' is allowed, but # dont allow anything other HPI modules - assert re.findall('import ' + r'my\.\S+', src, re.M) == [] + assert re.findall('import ' + r'my\.\S+', src, re.MULTILINE) == [] assert 'from ' + 'my' not in src diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index bc06114..7e48c78 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -228,7 +228,7 @@ def _process_db_threads_db2(db: sqlite3.Connection) -> Iterator[Res[Entity]]: for r in db.execute( ''' - SELECT *, json_extract(sender, "$.user_key") AS user_key FROM messages + SELECT *, json_extract(sender, "$.user_key") AS user_key FROM messages WHERE msg_type NOT IN ( -1, /* these don't have any data at all, likely immediately deleted or something? */ 2 /* these are 'left group' system messages, also a bit annoying since they might reference nonexistent users */ diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index affe230..35112ba 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -239,7 +239,7 @@ def plot_one(sleep: SleepEntry, fig, axes, xlims=None, *, showtext=True): # axes.title.set_size(10) if showtext: - axes.text(xlims[1] - timedelta(hours=1.5), 20, str(sleep),) + axes.text(xlims[1] - timedelta(hours=1.5), 20, str(sleep)) # plt.text(sleep.asleep(), 0, hhmm(sleep.asleep())) diff --git a/my/media/imdb.py b/my/media/imdb.py index c66f5dc..df31032 100644 --- a/my/media/imdb.py +++ b/my/media/imdb.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 import csv from datetime import datetime from typing import Iterator, List, NamedTuple diff --git a/my/rtm.py b/my/rtm.py index 22752fe..b559ba4 100644 --- a/my/rtm.py +++ b/my/rtm.py @@ -58,7 +58,7 @@ class MyTodo: def get_status(self) -> str: if 'STATUS' not in self.todo: return None # type: ignore - # TODO 'COMPLETED'? + # TODO 'COMPLETED'? return str(self.todo['STATUS']) # TODO tz? 
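(An aside on the newly enabled sets, since the commit message is terse: the rule codes below come from ruff's documented W/COM/EXE groups and are an assumption on my part, not something this patch states. A minimal sketch of a file that would trip each group:

    #!/usr/bin/env python3
    # EXE001 fires when a shebang is present but the file lacks the executable
    # bit -- presumably why the stale shebangs get dropped from my/bluemaestro.py
    # and my/media/imdb.py in the hunks above

    nums = [
        1,
        2  # COM812: missing trailing comma before the closing bracket
    ]

    # a code line ending in spaces would be W291 (trailing whitespace),
    # and a "blank" line containing only spaces would be W293

Note that COM812 itself is suppressed in lint.ignore at the end of this commit's ruff.toml hunk below, so in practice it's mostly the W and EXE diagnostics that surface here.)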
diff --git a/my/telegram/telegram_backup.py b/my/telegram/telegram_backup.py index 0617501..ff4f904 100644 --- a/my/telegram/telegram_backup.py +++ b/my/telegram/telegram_backup.py @@ -18,7 +18,7 @@ from my.config import telegram as user_config class config(user_config.telegram_backup): # path to the export database.sqlite export_path: PathIsh - + @dataclass class Chat: diff --git a/ruff.toml b/ruff.toml index cb9445f..8cbc642 100644 --- a/ruff.toml +++ b/ruff.toml @@ -3,17 +3,22 @@ target-version = "py38" # NOTE: inferred from pyproject.toml if present lint.extend-select = [ "F", # flakes rules -- default, but extend just in case "E", # pycodestyle -- default, but extend just in case - "C4", # flake8-comprehensions -- unnecessary list/map/dict calls - "UP", # detect deprecated python stdlib stuff - "FBT", # detect use of boolean arguments - "RUF", # various ruff-specific rules - "PLR", # 'refactor' rules + "W", # various warnings + "B", # 'bugbear' set -- various possible bugs + "C4", # flake8-comprehensions -- unnecessary list/map/dict calls + "COM", # trailing commas + "EXE", # various checks wrt executable files + "FBT", # detect use of boolean arguments + "FURB", # various rules "PERF", # various potential performance speedups - "RET", # early returns "PIE", # 'misc' lints + "PLR", # 'refactor' rules "PLW", # pylint warnings "PT", # pytest stuff + "RET", # early returns + "RUF", # various ruff-specific rules + "UP", # detect deprecated python stdlib stuff # "FA", # TODO enable later after we make sure cachew works? # "PTH", # pathlib migration -- TODO enable later # "ARG", # TODO useful, but results in some false positives in pytest fixtures... maybe later @@ -97,4 +102,6 @@ lint.ignore = [ "PT004", # deprecated rule, will be removed later "PT011", # pytest raises should is too broad "PT012", # pytest raises should contain a single statement + + "COM812", # trailing comma missing -- TODO maybe use this? ] From fc0e0be291795970cec45209e9249d893d5173cc Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 03:09:32 +0100 Subject: [PATCH 273/302] ruff: enable ICN and PD rules --- my/body/weight.py | 2 +- my/emfit/__init__.py | 4 ++-- ruff.toml | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/my/body/weight.py b/my/body/weight.py index 277b4d1..51e6513 100644 --- a/my/body/weight.py +++ b/my/body/weight.py @@ -83,7 +83,7 @@ def make_dataframe(data: Iterator[Result]): } df = pd.DataFrame(it()) - df.set_index('dt', inplace=True) + df = df.set_index('dt') # TODO not sure about UTC?? 
df.index = pd.to_datetime(df.index, utc=True) return df diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index 71a483f..9934903 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -155,9 +155,9 @@ def dataframe() -> DataFrameT: last = s # meh dicts.append(d) - import pandas + import pandas as pd - return pandas.DataFrame(dicts) + return pd.DataFrame(dicts) def stats() -> Stats: diff --git a/ruff.toml b/ruff.toml index 8cbc642..c2c88ef 100644 --- a/ruff.toml +++ b/ruff.toml @@ -9,9 +9,11 @@ lint.extend-select = [ "C4", # flake8-comprehensions -- unnecessary list/map/dict calls "COM", # trailing commas "EXE", # various checks wrt executable files + "ICN", # various import conventions "FBT", # detect use of boolean arguments "FURB", # various rules "PERF", # various potential performance speedups + "PD", # pandas rules "PIE", # 'misc' lints "PLR", # 'refactor' rules "PLW", # pylint warnings @@ -104,4 +106,6 @@ lint.ignore = [ "PT012", # pytest raises should contain a single statement "COM812", # trailing comma missing -- TODO maybe use this? + + "PD901", # generic variable name df ] From affa79ba3ae0aee53eb908dffdeee88800215c45 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 03:14:23 +0100 Subject: [PATCH 274/302] my.time.tz.via_location: fix accidental RuntimeError introduced in previous MR --- my/time/tz/via_location.py | 1 - 1 file changed, 1 deletion(-) diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 8f521e0..4920333 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -104,7 +104,6 @@ def _timezone_finder(*, fast: bool) -> Any: # for backwards compatibility def _locations() -> Iterator[Tuple[LatLon, datetime_aware]]: try: - raise RuntimeError import my.location.all for loc in my.location.all.locations(): From 1c5efc46aa18ebfb8fdb5ccbfde03b731e5355bf Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 03:18:45 +0100 Subject: [PATCH 275/302] ruff: enable TRY rules --- my/core/_deprecated/kompress.py | 3 ++- my/core/time.py | 3 ++- ruff.toml | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index cd09c06..1cd2636 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -141,9 +141,10 @@ open = kopen # TODO deprecate def kexists(path: PathIsh, subpath: str) -> bool: try: kopen(path, subpath) - return True except Exception: return False + else: + return True import zipfile diff --git a/my/core/time.py b/my/core/time.py index 5a47c3d..6de4105 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -11,10 +11,11 @@ def user_forced() -> Sequence[str]: # https://stackoverflow.com/questions/36067621/python-all-possible-timezone-abbreviations-for-given-timezone-name-and-vise-ve try: from my.config import time as user_config - return user_config.tz.force_abbreviations # type: ignore[attr-defined] except: # todo log/apply policy return [] + else: + return user_config.tz.force_abbreviations # type: ignore[attr-defined] @lru_cache(1) diff --git a/ruff.toml b/ruff.toml index c2c88ef..9a932ef 100644 --- a/ruff.toml +++ b/ruff.toml @@ -20,6 +20,7 @@ lint.extend-select = [ "PT", # pytest stuff "RET", # early returns "RUF", # various ruff-specific rules + "TRY", # various exception handling rules "UP", # detect deprecated python stdlib stuff # "FA", # TODO enable later after we make sure cachew works? 
# "PTH", # pathlib migration -- TODO enable later @@ -108,4 +109,10 @@ lint.ignore = [ "COM812", # trailing comma missing -- TODO maybe use this? "PD901", # generic variable name df + + "TRY003", # suggests defining exception messages in exception class -- kinda annoying + "TRY004", # prefer TypeError -- don't see the point + "TRY201", # raise without specifying exception name -- sometimes hurts readability + "TRY400", # TODO double check this, might be useful + "TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging ] From d58453410c34d75715b71c041f7a58a4f0954436 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2024 03:58:28 +0100 Subject: [PATCH 276/302] ruff: process remaining existing checks and suppress the annoying ones --- my/coding/commits.py | 4 ++-- my/core/_deprecated/kompress.py | 2 +- my/core/common.py | 2 +- my/core/hpi_compat.py | 2 +- my/photos/main.py | 2 +- ruff.toml | 34 +++++++++++++++++++++++++++++++-- 6 files changed, 38 insertions(+), 8 deletions(-) diff --git a/my/coding/commits.py b/my/coding/commits.py index 9661ae5..31c366e 100644 --- a/my/coding/commits.py +++ b/my/coding/commits.py @@ -178,12 +178,12 @@ def repos() -> List[Path]: # returns modification time for an index to use as hash function def _repo_depends_on(_repo: Path) -> int: - for pp in { + for pp in [ ".git/FETCH_HEAD", ".git/HEAD", "FETCH_HEAD", # bare "HEAD", # bare - }: + ]: ff = _repo / pp if ff.exists(): return int(ff.stat().st_mtime) diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index 1cd2636..63ce523 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -27,7 +27,7 @@ class Ext: def is_compressed(p: Path) -> bool: # todo kinda lame way for now.. use mime ideally? # should cooperate with kompress.kopen? - return any(p.name.endswith(ext) for ext in {Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz}) + return any(p.name.endswith(ext) for ext in [Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz]) def _zstd_open(path: Path, *args, **kwargs) -> IO: diff --git a/my/core/common.py b/my/core/common.py index 5f8d03a..a2c2ad3 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -14,7 +14,7 @@ from typing import ( ) from . import compat -from . import warnings as warnings +from . import warnings # some helper functions # TODO start deprecating this? soon we'd be able to use Path | str syntax which is shorter and more explicit diff --git a/my/core/hpi_compat.py b/my/core/hpi_compat.py index 9330e49..6261c23 100644 --- a/my/core/hpi_compat.py +++ b/my/core/hpi_compat.py @@ -123,7 +123,7 @@ class always_supports_sequence(Iterator[V]): self.it = it self._list: Optional[List] = None - def __iter__(self) -> Iterator[V]: + def __iter__(self) -> Iterator[V]: # noqa: PYI034 return self.it.__iter__() def __next__(self) -> V: diff --git a/my/photos/main.py b/my/photos/main.py index c326405..bf912e4 100644 --- a/my/photos/main.py +++ b/my/photos/main.py @@ -65,7 +65,7 @@ def _make_photo_aux(*args, **kwargs) -> List[Result]: def _make_photo(photo: Path, mtype: str, *, parent_geo: Optional[LatLon]) -> Iterator[Result]: exif: Exif - if any(x in mtype for x in {'image/png', 'image/x-ms-bmp', 'video'}): + if any(x in mtype for x in ['image/png', 'image/x-ms-bmp', 'video']): # TODO don't remember why.. 
logger.debug(f"skipping exif extraction for {photo} due to mime {mtype}") exif = {} diff --git a/ruff.toml b/ruff.toml index 9a932ef..0d3bb16 100644 --- a/ruff.toml +++ b/ruff.toml @@ -15,11 +15,15 @@ lint.extend-select = [ "PERF", # various potential performance speedups "PD", # pandas rules "PIE", # 'misc' lints - "PLR", # 'refactor' rules + "PLC", # pylint convention rules + "PLR", # pylint refactor rules "PLW", # pylint warnings "PT", # pytest stuff + "PYI", # various type hinting rules "RET", # early returns "RUF", # various ruff-specific rules + "TID", # various imports suggestions + "TCH", # various type checking rules "TRY", # various exception handling rules "UP", # detect deprecated python stdlib stuff # "FA", # TODO enable later after we make sure cachew works? @@ -31,10 +35,15 @@ lint.extend-select = [ # "EM", # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying # "FIX", # complains about fixmes/todos -- annoying # "TD", # complains about todo formatting -- too annoying - # "ALL", + # "ANN", # missing type annotations? seems way to string though + + # "ALL", # uncomment this to check for new rules! ] lint.ignore = [ + "D", # annoying nags about docstrings + "N", # pep naming + ### too opinionated style checks "E501", # too long lines "E702", # Multiple statements on one line (semicolon) @@ -115,4 +124,25 @@ lint.ignore = [ "TRY201", # raise without specifying exception name -- sometimes hurts readability "TRY400", # TODO double check this, might be useful "TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging + + "TCH002", # suggests moving imports into type checking blocks -- too annoying + "TCH003", # suggests moving imports into type checking blocks -- too annoying + + "I001", # unsorted import block TODO consider these? + "PGH", # TODO force error code in mypy instead + + # TODO enable TID? + "TID252", # Prefer absolute imports over relative imports from parent modules + + ## too annoying + "T20", # just complains about prints and pprints + "Q", # flake quotes, too annoying + "C90", # some complexity checking + "G004", # logging statement uses f string + "ERA001", # commented out code + "SLF001", # private member accessed + "BLE001", # do not catch 'blind' Exception + "INP001", # complains about implicit namespace packages + "SIM", # some if statements crap + ## ] From 71fdeca5e10d99526b39d7cbd1eb8bd5aa43cbf9 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 31 Aug 2024 02:03:22 +0100 Subject: [PATCH 277/302] ci: update mypy config and make ruff config more consistent with other projects --- my/coding/github.py | 2 +- my/config.py | 2 +- my/core/__main__.py | 2 +- my/core/_deprecated/kompress.py | 2 +- my/core/tests/test_config.py | 1 - my/core/utils/concurrent.py | 2 +- my/demo.py | 1 - my/endomondo.py | 2 +- my/foursquare.py | 1 - my/hackernews/common.py | 2 +- my/jawbone/plots.py | 1 - mypy.ini | 9 ++----- ruff.toml | 43 ++++++++++++++++++--------------- 13 files changed, 32 insertions(+), 38 deletions(-) diff --git a/my/coding/github.py b/my/coding/github.py index de64f05..c495554 100644 --- a/my/coding/github.py +++ b/my/coding/github.py @@ -6,7 +6,7 @@ warnings.high('my.coding.github is deprecated! Please use my.github.all instead! # todo why aren't DeprecationWarning shown by default?? 
if not TYPE_CHECKING: - from ..github.all import events, get_events + from ..github.all import events, get_events # noqa: F401 # todo deprecate properly iter_events = events diff --git a/my/config.py b/my/config.py index a92b2bc..2dd9cda 100644 --- a/my/config.py +++ b/my/config.py @@ -10,7 +10,7 @@ This file is used for: - for loading the actual user config ''' #### NOTE: you won't need this line VVVV in your personal config -from my.core import init +from my.core import init # noqa: F401 ### diff --git a/my/core/__main__.py b/my/core/__main__.py index 8553942..c675676 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -577,7 +577,7 @@ def query_hpi_functions( # output == 'repl' eprint(f"\nInteract with the results by using the {click.style('res', fg='green')} variable\n") try: - import IPython # type: ignore[import] + import IPython # type: ignore[import,unused-ignore] except ModuleNotFoundError: eprint("'repl' typically uses ipython, install it with 'python3 -m pip install ipython'. falling back to stdlib...") import code diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index 63ce523..b08f04b 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -120,7 +120,7 @@ class CPath(BasePath): Path only has _accessor and _closed slots, so can't directly set .open method _accessor.open has to return file descriptor, doesn't work for compressed stuff. """ - def open(self, *args, **kwargs): + def open(self, *args, **kwargs): # noqa: ARG002 kopen_kwargs = {} mode = kwargs.get('mode') if mode is not None: diff --git a/my/core/tests/test_config.py b/my/core/tests/test_config.py index a318a95..78d1a62 100644 --- a/my/core/tests/test_config.py +++ b/my/core/tests/test_config.py @@ -8,7 +8,6 @@ from pathlib import Path import pytest import pytz -from more_itertools import ilen import my.config from my.core import notnone diff --git a/my/core/utils/concurrent.py b/my/core/utils/concurrent.py index 5f11ab0..146861b 100644 --- a/my/core/utils/concurrent.py +++ b/my/core/utils/concurrent.py @@ -47,5 +47,5 @@ class DummyExecutor(Executor): return f - def shutdown(self, wait: bool = True, **kwargs) -> None: # noqa: FBT001,FBT002 + def shutdown(self, wait: bool = True, **kwargs) -> None: # noqa: FBT001,FBT002,ARG002 self._shutdown = True diff --git a/my/demo.py b/my/demo.py index e27b5dd..0c54792 100644 --- a/my/demo.py +++ b/my/demo.py @@ -3,7 +3,6 @@ Just a demo module for testing and documentation purposes ''' import json -from abc import abstractmethod from dataclasses import dataclass from datetime import datetime, timezone, tzinfo from pathlib import Path diff --git a/my/endomondo.py b/my/endomondo.py index 1d7acc2..293a542 100644 --- a/my/endomondo.py +++ b/my/endomondo.py @@ -31,7 +31,7 @@ def inputs() -> Sequence[Path]: # todo add a doctor check for pip endoexport module import endoexport.dal as dal -from endoexport.dal import Point, Workout +from endoexport.dal import Point, Workout # noqa: F401 from .core import Res diff --git a/my/foursquare.py b/my/foursquare.py index 63e1837..394fdf3 100644 --- a/my/foursquare.py +++ b/my/foursquare.py @@ -4,7 +4,6 @@ Foursquare/Swarm checkins from datetime import datetime, timezone, timedelta from itertools import chain -from pathlib import Path import json # TODO pytz for timezone??? 
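(An aside on the F401 churn in this commit: most of the removed imports were simply unused, while deliberate re-exports -- events/get_events in my/coding/github.py and Point/Workout in my/endomondo.py above -- get a # noqa: F401 marker instead. A hypothetical alternative, sketched here purely for illustration, is to use the spellings that linters and type checkers already treat as intentional re-exports:

    # illustration only, not part of the patch
    # the redundant-alias form is the typing-spec convention for an explicit
    # re-export, which F401 honors in __init__.py-style modules:
    from endoexport.dal import Point as Point, Workout as Workout

    # alternatively, keep the plain imports and declare the public surface,
    # since names listed in __all__ count as used:
    __all__ = ['Point', 'Workout']

The patch goes the other way and adds F401 to lint.ignore wholesale ("TODO nice to have, but annoying with NOT_HPI_MODULE thing"), trading per-name precision for less annotation noise.)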
diff --git a/my/hackernews/common.py b/my/hackernews/common.py index 0c5ff9b..6990987 100644 --- a/my/hackernews/common.py +++ b/my/hackernews/common.py @@ -1,6 +1,6 @@ from typing import Protocol -from my.core import datetime_aware, Json +from my.core import datetime_aware def hackernews_link(id: str) -> str: diff --git a/my/jawbone/plots.py b/my/jawbone/plots.py index 5dcb63d..d26d606 100755 --- a/my/jawbone/plots.py +++ b/my/jawbone/plots.py @@ -3,7 +3,6 @@ from pathlib import Path # from kython.plotting import * from csv import DictReader -from itertools import islice from typing import Dict, Any, NamedTuple diff --git a/mypy.ini b/mypy.ini index ebc81a5..9c34fcc 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,18 +1,13 @@ [mypy] -namespace_packages = True pretty = True show_error_context = True -show_error_codes = True show_column_numbers = True show_error_end = True +warn_redundant_casts = True warn_unused_ignores = True check_untyped_defs = True -enable_error_code = possibly-undefined strict_equality = True - -# a bit annoying, it has optional ipython import which should be ignored in mypy-core configuration.. -[mypy-my.core.__main__] -warn_unused_ignores = False +enable_error_code = possibly-undefined # todo ok, maybe it wasn't such a good idea.. # mainly because then tox picks it up and running against the user config, not the repository config diff --git a/ruff.toml b/ruff.toml index 0d3bb16..5fbd657 100644 --- a/ruff.toml +++ b/ruff.toml @@ -9,6 +9,7 @@ lint.extend-select = [ "C4", # flake8-comprehensions -- unnecessary list/map/dict calls "COM", # trailing commas "EXE", # various checks wrt executable files + # "I", # sort imports "ICN", # various import conventions "FBT", # detect use of boolean arguments "FURB", # various rules @@ -23,26 +24,26 @@ lint.extend-select = [ "RET", # early returns "RUF", # various ruff-specific rules "TID", # various imports suggestions - "TCH", # various type checking rules "TRY", # various exception handling rules "UP", # detect deprecated python stdlib stuff - # "FA", # TODO enable later after we make sure cachew works? + # "FA", # suggest using from __future__ import annotations TODO enable later after we make sure cachew works? # "PTH", # pathlib migration -- TODO enable later - # "ARG", # TODO useful, but results in some false positives in pytest fixtures... maybe later - # "A", # TODO builtin shadowing -- handle later - # "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks - # "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives + "ARG", # unused argument checks + # "A", # builtin shadowing -- TODO handle later # "EM", # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying - # "FIX", # complains about fixmes/todos -- annoying - # "TD", # complains about todo formatting -- too annoying - # "ANN", # missing type annotations? seems way to string though # "ALL", # uncomment this to check for new rules! ] lint.ignore = [ - "D", # annoying nags about docstrings - "N", # pep naming + "D", # annoying nags about docstrings + "N", # pep naming + "TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING + "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks + "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives + "FIX", # complains about fixmes/todos -- annoying + "TD", # complains about todo formatting -- too annoying + "ANN", # missing type annotations? 
seems way to strict though ### too opinionated style checks "E501", # too long lines @@ -62,10 +63,9 @@ lint.ignore = [ "E402", # Module level import not at top of file ### maybe consider these soon -# sometimes it's useful to give a variable a name even if we don't use it as a documentation -# on the other hand, often is a sign of error + # sometimes it's useful to give a variable a name even if we don't use it as a documentation + # on the other hand, often is a sign of error "F841", # Local variable `count` is assigned to but never used - "F401", # imported but unused ### ### TODO should be fine to use these with from __future__ import annotations? @@ -90,8 +90,10 @@ lint.ignore = [ "B009", # calling gettattr with constant attribute -- this is useful to convince mypy "B010", # same as above, but setattr + "B011", # complains about assert False "B017", # pytest.raises(Exception) "B023", # seems to result in false positives? + "B028", # suggest using explicit stacklevel? TODO double check later, but not sure it's useful # complains about useless pass, but has sort of a false positive if the function has a docstring? # this is common for click entrypoints (e.g. in __main__), so disable @@ -115,7 +117,7 @@ lint.ignore = [ "PT011", # pytest raises should is too broad "PT012", # pytest raises should contain a single statement - "COM812", # trailing comma missing -- TODO maybe use this? + "COM812", # trailing comma missing -- mostly just being annoying with long multiline strings "PD901", # generic variable name df @@ -125,15 +127,12 @@ lint.ignore = [ "TRY400", # TODO double check this, might be useful "TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging - "TCH002", # suggests moving imports into type checking blocks -- too annoying - "TCH003", # suggests moving imports into type checking blocks -- too annoying - - "I001", # unsorted import block TODO consider these? "PGH", # TODO force error code in mypy instead - # TODO enable TID? "TID252", # Prefer absolute imports over relative imports from parent modules + "UP038", # suggests using | (union) in isisntance checks.. 
but it results in slower code + ## too annoying "T20", # just complains about prints and pprints "Q", # flake quotes, too annoying @@ -144,5 +143,9 @@ lint.ignore = [ "BLE001", # do not catch 'blind' Exception "INP001", # complains about implicit namespace packages "SIM", # some if statements crap + "RSE102", # complains about missing parens in exceptions ## + + "ARG001", # ugh, kinda annoying when using pytest fixtures + "F401" , # TODO nice to have, but annoying with NOT_HPI_MODULE thing ] From 27178c09398939d01803e27fcc28d0cefa6d1422 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 13 Sep 2024 01:18:40 +0100 Subject: [PATCH 278/302] my.google.takeout.parser: speedup event merging on newer google_takeout_parser versions --- my/google/takeout/parser.py | 19 +++++++++++++++---- my/youtube/takeout.py | 12 ++++++------ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 258ab96..170553a 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -31,6 +31,7 @@ ABBR_TIMEZONES.extend(user_forced()) import google_takeout_parser from google_takeout_parser.path_dispatch import TakeoutParser from google_takeout_parser.merge import GoogleEventSet, CacheResults +from google_takeout_parser.models import BaseEvent # see https://github.com/seanbreckenridge/dotfiles/blob/master/.config/my/my/config/__init__.py for an example from my.config import google as user_config @@ -95,6 +96,17 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: error_policy = config.error_policy count = 0 emitted = GoogleEventSet() + + try: + emitted_add = emitted.add_if_not_present + except AttributeError: + # compat for older versions of google_takeout_parser which didn't have this method + def emitted_add(other: BaseEvent) -> bool: + if other in emitted: + return False + emitted.add(other) + return True + # reversed shouldn't really matter? 
but logic is to use newer # takeouts if they're named according to date, since JSON Activity # is nicer than HTML Activity @@ -123,10 +135,9 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: elif error_policy == 'drop': pass continue - if event in emitted: - continue - emitted.add(event) - yield event # type: ignore[misc] + + if emitted_add(event): + yield event # type: ignore[misc] logger.debug( f"HPI Takeout merge: from a total of {count} events, removed {count - len(emitted)} duplicates" ) diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index 99d65d9..284c082 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -1,10 +1,10 @@ from typing import NamedTuple, List, Iterable, TYPE_CHECKING -from ..core import datetime_aware, Res, LazyLogger -from ..core.compat import removeprefix +from my.core import datetime_aware, make_logger, stat, Res, Stats +from my.core.compat import deprecated, removeprefix -logger = LazyLogger(__name__) +logger = make_logger(__name__) class Watched(NamedTuple): @@ -93,7 +93,6 @@ def watched() -> Iterable[Res[Watched]]: ) -from ..core import stat, Stats def stats() -> Stats: return stat(watched) @@ -101,8 +100,9 @@ def stats() -> Stats: ### deprecated stuff (keep in my.media.youtube) if not TYPE_CHECKING: - # "deprecate" by hiding from mypy - get_watched = watched + @deprecated("use 'watched' instead") + def get_watched(*args, **kwargs): + return watched(*args, **kwargs) def _watched_legacy() -> Iterable[Watched]: From 201ddd4d7c45f63f3e3196f6b9be22402822680d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 16 Sep 2024 23:41:58 +0100 Subject: [PATCH 279/302] my.core.structure: add support for .tar.gz archives this will be useful to migrate .tar.gz processing to kompress in a backwards compatible way, or to run them against unpacked folder structure if user prefers --- my/core/structure.py | 47 +++++++++++------- my/core/tests/structure.py | 7 +-- .../tests/structure_data/gdpr_export.tar.gz | Bin 0 -> 349 bytes 3 files changed, 33 insertions(+), 21 deletions(-) create mode 100644 my/core/tests/structure_data/gdpr_export.tar.gz diff --git a/my/core/structure.py b/my/core/structure.py index be5b307..fa26532 100644 --- a/my/core/structure.py +++ b/my/core/structure.py @@ -1,6 +1,8 @@ import atexit import os import shutil +import sys +import tarfile import tempfile import zipfile from contextlib import contextmanager @@ -34,6 +36,7 @@ def _structure_exists(base_dir: Path, paths: Sequence[str], *, partial: bool = F ZIP_EXT = {".zip"} +TARGZ_EXT = {".tar.gz"} @contextmanager @@ -44,7 +47,7 @@ def match_structure( partial: bool = False, ) -> Generator[Tuple[Path, ...], None, None]: """ - Given a 'base' directory or zipfile, recursively search for one or more paths that match the + Given a 'base' directory or archive (zip/tar.gz), recursively search for one or more paths that match the pattern described in 'expected'. That can be a single string, or a list of relative paths (as strings) you expect at the same directory. @@ -52,12 +55,12 @@ def match_structure( expected be present, not all of them. This reduces the chances of the user misconfiguring gdpr exports, e.g. 
- if they zipped the folders instead of the parent directory or vice-versa + if they archived the folders instead of the parent directory or vice-versa When this finds a matching directory structure, it stops searching in that subdirectory and continues onto other possible subdirectories which could match - If base is a zipfile, this extracts the zipfile into a temporary directory + If base is an archive, this extracts it into a temporary directory (configured by core_config.config.get_tmp_dir), and then searches the extracted folder for matching structures @@ -93,12 +96,12 @@ def match_structure( This doesn't require an exhaustive list of expected values, but it's a good idea to supply a complete picture of the expected structure to avoid false-positives - This does not recursively unzip zipfiles in the subdirectories, - it only unzips into a temporary directory if 'base' is a zipfile + This does not recursively decompress archives in the subdirectories, + it only unpacks into a temporary directory if 'base' is an archive A common pattern for using this might be to use get_files to get a list - of zipfiles or top-level gdpr export directories, and use match_structure - to search the resulting paths for a export structure you're expecting + of archives or top-level gdpr export directories, and use match_structure + to search the resulting paths for an export structure you're expecting """ from . import core_config as CC @@ -108,26 +111,34 @@ def match_structure( expected = (expected,) is_zip: bool = base.suffix in ZIP_EXT + is_targz: bool = any(base.name.endswith(suffix) for suffix in TARGZ_EXT) searchdir: Path = base.absolute() try: - # if the file given by the user is a zipfile, create a temporary - # directory and extract the zipfile to that temporary directory + # if the file given by the user is an archive, create a temporary + # directory and extract it to that temporary directory # # this temporary directory is removed in the finally block - if is_zip: + if is_zip or is_targz: # sanity check before we start creating directories/rm-tree'ing things - assert base.exists(), f"zipfile at {base} doesn't exist" + assert base.exists(), f"archive at {base} doesn't exist" searchdir = Path(tempfile.mkdtemp(dir=tdir)) - # base might already be a ZipPath, and str(base) would end with / - zf = zipfile.ZipFile(str(base).rstrip('/')) - zf.extractall(path=str(searchdir)) - + if is_zip: + # base might already be a ZipPath, and str(base) would end with / + zf = zipfile.ZipFile(str(base).rstrip('/')) + zf.extractall(path=str(searchdir)) + elif is_targz: + with tarfile.open(str(base)) as tar: + # filter is a security feature, will be a required param in later python versions + mfilter = {'filter': 'data'} if sys.version_info[:2] >= (3, 12) else {} + tar.extractall(path=str(searchdir), **mfilter) # type: ignore[arg-type] + else: + raise RuntimeError("can't happen") else: if not searchdir.is_dir(): - raise NotADirectoryError(f"Expected either a zipfile or a directory, received {searchdir}") + raise NotADirectoryError(f"Expected either a zip/tar.gz archive or a directory, received {searchdir}") matches: List[Path] = [] possible_targets: List[Path] = [searchdir] @@ -150,9 +161,9 @@ def match_structure( finally: - if is_zip: + if is_zip or is_targz: # make sure we're not mistakenly deleting data - assert str(searchdir).startswith(str(tdir)), f"Expected the temporary directory for extracting zip to start with the temporary directory prefix ({tdir}), found {searchdir}" + assert 
str(searchdir).startswith(str(tdir)), f"Expected the temporary directory for extracting archive to start with the temporary directory prefix ({tdir}), found {searchdir}" shutil.rmtree(str(searchdir)) diff --git a/my/core/tests/structure.py b/my/core/tests/structure.py index 6a94fc4..741e0ea 100644 --- a/my/core/tests/structure.py +++ b/my/core/tests/structure.py @@ -14,8 +14,9 @@ def test_gdpr_structure_exists() -> None: assert results == (structure_data / "gdpr_subdirs" / "gdpr_export",) -def test_gdpr_unzip() -> None: - with match_structure(structure_data / "gdpr_export.zip", expected=gdpr_expected) as results: +@pytest.mark.parametrize("archive", ["gdpr_export.zip", "gdpr_export.tar.gz"]) +def test_gdpr_unpack(archive: str) -> None: + with match_structure(structure_data / archive, expected=gdpr_expected) as results: assert len(results) == 1 extracted = results[0] index_file = extracted / "messages" / "index.csv" @@ -32,6 +33,6 @@ def test_match_partial() -> None: def test_not_directory() -> None: - with pytest.raises(NotADirectoryError, match=r"Expected either a zipfile or a directory"): + with pytest.raises(NotADirectoryError, match=r"Expected either a zip/tar.gz archive or a directory"): with match_structure(structure_data / "messages/index.csv", expected=gdpr_expected): pass diff --git a/my/core/tests/structure_data/gdpr_export.tar.gz b/my/core/tests/structure_data/gdpr_export.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..4f0597cdc7f3aa062ae896786375c5df87d49ec0 GIT binary patch literal 349 zcmV-j0iymNiwFP!000021MQgYZh|lrhWmS0!4+sf=;`GcgpM$UlC~O?W%s_i%*2^Y zXFZ&OZgnulN@i{zdo#NJi2B!+HOBA;|M`W&T_3Tv6*Z`7g2m&BlMz zRr;_f-9Fy`jr_m#&-tr{*NS^|K6I{W~;~O0r%hkN&(*g^YHJq_f1z1i2s=UmlQDnG5?Wd^FP|(pSb=n v1m^!d{15&6^N0Ko&VTw3+YIx63cPkc`*w{t0fHb1f; Date: Wed, 18 Sep 2024 23:03:03 +0100 Subject: [PATCH 280/302] my.github.gdpr/my.zulip.organization: use kompress support for tar.gz if it's available otherwise fall back onto unpacking into tmp dir via my.core.structure --- my/core/kompress.py | 6 +-- my/github/gdpr.py | 106 +++++++++++++++++++++++---------------- my/zulip/organization.py | 91 ++++++++++++++++++++++----------- setup.py | 14 +++--- 4 files changed, 135 insertions(+), 82 deletions(-) diff --git a/my/core/kompress.py b/my/core/kompress.py index 6ab3228..7cbf310 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -1,4 +1,5 @@ from .internal import assert_subpackage; assert_subpackage(__name__) + from . import warnings # do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath) @@ -8,10 +9,7 @@ try: from kompress import * except ModuleNotFoundError as e: if e.name == 'kompress': - warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.') + warnings.high('Please install kompress (pip3 install kompress). 
Falling onto vendorized kompress for now.') from ._deprecated.kompress import * # type: ignore[assignment] else: raise e - -# this is deprecated in compress, keep here for backwards compatibility -open = kopen # noqa: F405 diff --git a/my/github/gdpr.py b/my/github/gdpr.py index acbeb8f..a56ff46 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -1,36 +1,42 @@ """ Github data (uses [[https://github.com/settings/admin][official GDPR export]]) """ -from dataclasses import dataclass + +from __future__ import annotations + import json +from abc import abstractmethod from pathlib import Path -import tarfile -from typing import Iterable, Any, Sequence, Dict, Optional +from typing import Any, Iterator, Sequence -from my.core import get_files, Res, PathIsh, stat, Stats, make_logger -from my.core.cfg import make_config -from my.core.error import notnone, echain - -from .common import Event, parse_dt, EventIds - -# TODO later, use a separate user config? (github_gdpr) -from my.config import github as user_config - - -@dataclass -class github(user_config): - gdpr_dir: PathIsh # path to unpacked GDPR archive - - -config = make_config(github) +from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings +from my.core.error import echain +from .common import Event, EventIds, parse_dt logger = make_logger(__name__) +class config: + @property + @abstractmethod + def gdpr_dir(self) -> Paths: + raise NotImplementedError + + +def make_config() -> config: + # TODO later, use a separate user config? (github_gdpr) + from my.config import github as user_config + + class combined_config(user_config, config): + pass + + return combined_config() + + def inputs() -> Sequence[Path]: - gdir = config.gdpr_dir - res = get_files(gdir) + gdpr_dir = make_config().gdpr_dir + res = get_files(gdpr_dir) schema_json = [f for f in res if f.name == 'schema.json'] was_unpacked = len(schema_json) > 0 if was_unpacked: @@ -43,22 +49,37 @@ def inputs() -> Sequence[Path]: return res -def events() -> Iterable[Res[Event]]: +def events() -> Iterator[Res[Event]]: last = max(inputs()) logger.info(f'extracting data from {last}') - # a bit naughty and ad-hoc, but we will generify reading from tar.gz. once we have more examples - # another one is zulip archive - if last.is_dir(): - files = sorted(last.glob('*.json')) # looks like all files are in the root - open_file = lambda f: f.open() + root: Path | None = None + + if last.is_dir(): # if it's already CPath, this will match it + root = last else: - # treat as .tar.gz - tfile = tarfile.open(last) - files = sorted(map(Path, tfile.getnames())) - files = [p for p in files if len(p.parts) == 1 and p.suffix == '.json'] - open_file = lambda p: notnone(tfile.extractfile(f'./{p}')) # NOTE odd, doesn't work without ./ + try: + from kompress import CPath + + root = CPath(last) + assert len(list(root.iterdir())) > 0 # trigger to check if we have the kompress version with targz support + except Exception as e: + logger.exception(e) + warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. 
Falling back to unpacking to tmp dir.") + + if root is None: + from my.core.structure import match_structure + + with match_structure(last, expected=()) as res: # expected=() matches it regardless any patterns + [root] = res + yield from _process_one(root) + else: + yield from _process_one(root) + + +def _process_one(root: Path) -> Iterator[Res[Event]]: + files = sorted(root.glob('*.json')) # looks like all files are in the root # fmt: off handler_map = { @@ -100,8 +121,7 @@ def events() -> Iterable[Res[Event]]: # ignored continue - with open_file(f) as fo: - j = json.load(fo) + j = json.loads(f.read_text()) for r in j: try: yield handler(r) @@ -116,7 +136,7 @@ def stats() -> Stats: # TODO typing.TypedDict could be handy here.. -def _parse_common(d: Dict) -> Dict: +def _parse_common(d: dict) -> dict: url = d['url'] body = d.get('body') return { @@ -126,7 +146,7 @@ def _parse_common(d: Dict) -> Dict: } -def _parse_repository(d: Dict) -> Event: +def _parse_repository(d: dict) -> Event: pref = 'https://github.com/' url = d['url'] dts = d['created_at'] @@ -142,13 +162,13 @@ def _parse_repository(d: Dict) -> Event: # user may be None if the user was deleted -def _is_bot(user: Optional[str]) -> bool: +def _is_bot(user: str | None) -> bool: if user is None: return False return "[bot]" in user -def _parse_issue_comment(d: Dict) -> Event: +def _parse_issue_comment(d: dict) -> Event: url = d['url'] return Event( **_parse_common(d), @@ -158,7 +178,7 @@ def _parse_issue_comment(d: Dict) -> Event: ) -def _parse_issue(d: Dict) -> Event: +def _parse_issue(d: dict) -> Event: url = d['url'] title = d['title'] return Event( @@ -169,7 +189,7 @@ def _parse_issue(d: Dict) -> Event: ) -def _parse_pull_request(d: Dict) -> Event: +def _parse_pull_request(d: dict) -> Event: dts = d['created_at'] url = d['url'] title = d['title'] @@ -183,7 +203,7 @@ def _parse_pull_request(d: Dict) -> Event: ) -def _parse_project(d: Dict) -> Event: +def _parse_project(d: dict) -> Event: url = d['url'] title = d['name'] is_bot = "[bot]" in d["creator"] @@ -198,7 +218,7 @@ def _parse_project(d: Dict) -> Event: ) -def _parse_release(d: Dict) -> Event: +def _parse_release(d: dict) -> Event: tag = d['tag_name'] return Event( **_parse_common(d), @@ -207,7 +227,7 @@ def _parse_release(d: Dict) -> Event: ) -def _parse_commit_comment(d: Dict) -> Event: +def _parse_commit_comment(d: dict) -> Event: url = d['url'] return Event( **_parse_common(d), diff --git a/my/zulip/organization.py b/my/zulip/organization.py index 8725411..2e0df4b 100644 --- a/my/zulip/organization.py +++ b/my/zulip/organization.py @@ -1,38 +1,55 @@ """ Zulip data from [[https://memex.zulipchat.com/help/export-your-organization][Organization export]] """ + +from __future__ import annotations + +import json +from abc import abstractmethod from dataclasses import dataclass from datetime import datetime, timezone from itertools import count -import json from pathlib import Path -from typing import Sequence, Iterator, Dict, Union +from typing import Iterator, Sequence from my.core import ( - assert_never, - datetime_aware, - get_files, - stat, Json, Paths, Res, Stats, + assert_never, + datetime_aware, + get_files, + make_logger, + stat, + warnings, ) -from my.core.error import notnone -import my.config + +logger = make_logger(__name__) -@dataclass -class organization(my.config.zulip.organization): - # paths[s]/glob to the exported JSON data - export_path: Paths +class config: + @property + @abstractmethod + def export_path(self) -> Paths: + """paths[s]/glob to the exported 
JSON data""" + raise NotImplementedError + + +def make_config() -> config: + from my.config import zulip as user_config + + class combined_config(user_config.organization, config): + pass + + return combined_config() def inputs() -> Sequence[Path]: # TODO: seems like export ids are kinda random.. # not sure what's the best way to figure out the last without renaming? # could use mtime perhaps? - return get_files(organization.export_path, sort=False) + return get_files(make_config().export_path, sort=False) @dataclass(frozen=True) @@ -85,19 +102,39 @@ class Message: # todo cache it -def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: +def _entities() -> Iterator[Res[Server | Sender | _Message]]: last = max(inputs()) - # todo would be nice to switch it to unpacked dirs as well, similar to ZipPath - # I guess makes sense to have a special implementation for .tar.gz considering how common are they - import tarfile + logger.info(f'extracting data from {last}') - tfile = tarfile.open(last) + root: Path | None = None - subdir = tfile.getnames()[0] # there is a directory inside tar file, first name should be that + if last.is_dir(): # if it's already CPath, this will match it + root = last + else: + try: + from kompress import CPath - with notnone(tfile.extractfile(f'{subdir}/realm.json')) as fo: - rj = json.load(fo) + root = CPath(last) + assert len(list(root.iterdir())) > 0 # trigger to check if we have the kompress version with targz support + except Exception as e: + logger.exception(e) + warnings.high("Upgrade 'kompress' to latest version with native .tar.gz support. Falling back to unpacking to tmp dir.") + + if root is None: + from my.core.structure import match_structure + + with match_structure(last, expected=()) as res: # expected=() matches it regardless any patterns + [root] = res + yield from _process_one(root) + else: + yield from _process_one(root) + + +def _process_one(root: Path) -> Iterator[Res[Server | Sender | _Message]]: + [subdir] = root.iterdir() # there is a directory inside tar file, first name should be that + + rj = json.loads((subdir / 'realm.json').read_text()) [sj] = rj['zerver_realm'] server = Server( @@ -136,12 +173,10 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: for idx in count(start=1, step=1): fname = f'messages-{idx:06}.json' - fpath = f'{subdir}/{fname}' - if fpath not in tfile.getnames(): - # tarfile doesn't have .exists? + fpath = subdir / fname + if not fpath.exists(): break - with notnone(tfile.extractfile(fpath)) as fo: - mj = json.load(fo) + mj = json.loads(fpath.read_text()) # TODO handle zerver_usermessage for j in mj['zerver_message']: try: @@ -151,8 +186,8 @@ def _entities() -> Iterator[Res[Union[Server, Sender, _Message]]]: def messages() -> Iterator[Res[Message]]: - id2sender: Dict[int, Sender] = {} - id2server: Dict[int, Server] = {} + id2sender: dict[int, Sender] = {} + id2server: dict[int, Server] = {} for x in _entities(): if isinstance(x, Exception): yield x diff --git a/setup.py b/setup.py index cf4b79f..8335851 100644 --- a/setup.py +++ b/setup.py @@ -4,13 +4,13 @@ from setuptools import setup, find_namespace_packages # type: ignore INSTALL_REQUIRES = [ - 'pytz', # even though it's not needed by the core, it's so common anyway... - 'typing-extensions', # one of the most common pypi packages, ok to depend for core - 'appdirs', # very common, and makes it portable - 'more-itertools', # it's just too useful and very common anyway - 'decorator' , # less pain in writing correct decorators. 
very mature and stable, so worth keeping in core - 'click>=8.1' , # for the CLI, printing colors, decorator-based - may allow extensions to CLI - 'kompress' , # for transparent access to compressed files via pathlib.Path + 'pytz' , # even though it's not needed by the core, it's so common anyway... + 'typing-extensions' , # one of the most common pypi packages, ok to depend for core + 'appdirs' , # very common, and makes it portable + 'more-itertools' , # it's just too useful and very common anyway + 'decorator' , # less pain in writing correct decorators. very mature and stable, so worth keeping in core + 'click>=8.1' , # for the CLI, printing colors, decorator-based - may allow extensions to CLI + 'kompress>=0.2.20240918' , # for transparent access to compressed files via pathlib.Path ] From 2ca323da8487a2f99d283bd78c8141a35b700cb3 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 21 Sep 2024 23:18:50 +0100 Subject: [PATCH 281/302] my.fbmessenger.android: exclude unsent messages to avoid duplication --- my/fbmessenger/android.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index 7e48c78..effabab 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -168,6 +168,15 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: CAST(sender_id AS TEXT) AS sender_id, reply_source_id FROM messages + WHERE + /* Regular message_id conforms to mid.* regex. However seems that when message is not sent yet it doesn't have this server id yet (happened only once, but could be just luck of course!) We exclude these messages to avoid duplication. However a positive filter (e.g. message_id LIKE 'mid%') feels a bit wrong, e.g. what if message ids change or something. So instead this excludes only such unsent messages. */ + message_id != offline_threading_id ORDER BY timestamp_ms /* they aren't in order in the database, so need to sort */ ''' ): From e036cc9e8523debe92829ec7dc7b3b867860535f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 21 Sep 2024 23:55:06 +0100 Subject: [PATCH 282/302] my.twitter.android: get own user id as string, consistent with rest of module --- my/twitter/android.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/my/twitter/android.py b/my/twitter/android.py index 7adfeb6..ada04ae 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -156,10 +156,11 @@ def get_own_user_id(conn) -> str: # unclear what's the reliable way to query it, so we use multiple different ones and arbitrate # NOTE: 'SELECT DISTINCT ev_owner_id FROM lists' doesn't work, might include lists from other people? 
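    # aside: the CASTs in the queries below matter because sqlite is dynamically typed,
    # so the stdlib sqlite3 driver returns INTEGER columns as python ints unless they're
    # cast in SQL -- roughly (with 12345 as a purely illustrative id):
    #   conn.execute('SELECT user_id FROM users').fetchone()                  # -> (12345,)
    #   conn.execute('SELECT CAST(user_id AS TEXT) FROM users').fetchone()    # -> ('12345',)
    # casting on the SQL side keeps everything str, consistent with the rest of the module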
res: Set[str] = set() + # need to cast as it's int by default for q in [ - 'SELECT DISTINCT list_mapping_user_id FROM list_mapping', - 'SELECT DISTINCT owner_id FROM cursors', - 'SELECT DISTINCT user_id FROM users WHERE _id == 1', + 'SELECT DISTINCT CAST(list_mapping_user_id AS TEXT) FROM list_mapping', + 'SELECT DISTINCT CAST(owner_id AS TEXT) FROM cursors', + 'SELECT DISTINCT CAST(user_id AS TEXT) FROM users WHERE _id == 1', ]: for (r,) in conn.execute(q): res.add(r) From 239e6617fe62f7b14e71092175148255a10f4ac9 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 22 Sep 2024 01:48:12 +0100 Subject: [PATCH 283/302] my.twitter.archive: deduplicate tweets based on id_str/created_at and raw tweet text --- my/twitter/archive.py | 73 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index d326d70..5fa89f4 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -226,11 +226,80 @@ class ZipExport: yield Like(r, screen_name=self.screen_name) +def _cleanup_tweet_json(rj: Json) -> None: + # note: for now this isn't used, was just an attempt to normalise raw data... + + rj.pop('edit_info', None) # useless for downstream processing, but results in dupes, so let's remove it + + ## could probably just take the last one? dunno + rj.pop('retweet_count', None) + rj.pop('favorite_count', None) + ## + + entities = rj.get('entities', {}) + ext_entities = rj.get('extended_entities', {}) + + # TODO shit. unclear how to 'merge' changes to these + # links sometimes change for no apparent reason -- and sometimes old one is still valid but not the new one??? + for m in entities.get('media', {}): + m.pop('media_url', None) + m.pop('media_url_https', None) + for m in ext_entities.get('media', {}): + m.pop('media_url', None) + m.pop('media_url_https', None) + ## + + for m in entities.get('user_mentions', {}): + # changes if user renames themselves... + m.pop('name', None) + + # hmm so can change to -1? maybe if user was deleted? + # but also can change to actually something else?? second example + entities.pop('user_mentions', None) + + # TODO figure out what else is changing there later... + rj.pop('entities', None) + rj.pop('extended_entities', None) + + ## useless attributes which should be fine to exclude + rj.pop('possibly_sensitive', None) # not sure what is this.. sometimes appears with False value?? + rj.pop('withheld_in_countries', None) + rj.pop('lang', None) + ## + + # ugh. might change if the Twitter client was deleted or description renamed?? + rj.pop('source', None) + + ## ugh. sometimes trailing 0 after decimal point is present? + rj.pop('coordinates', None) + rj.get('geo', {}).pop('coordinates', None) + ## + + # ugh. this changes if user changed their name... + # or disappears if account was deleted? + rj.pop('in_reply_to_screen_name', None) + + # todo not sure about list and sorting? although can't hurt considering json is not iterative? def tweets() -> Iterator[Res[Tweet]]: _all = chain.from_iterable(ZipExport(i).tweets() for i in inputs()) - res = unique_everseen(_all, key=json_dumps) - yield from sorted(res, key=lambda t: t.dt) + + # NOTE raw json data in archived tweets changes all the time even for same tweets + # there is an attempt to clean it up... but it's tricky since users rename themselves, twitter stats are changing + # so it's unclear how to pick up + # we should probably 'merge' tweets into a canonical version, e.g. 
+ # - pick latest tweet stats + # - keep history of usernames we were replying to that share the same user id + # - pick 'best' media url somehow?? + # - normalise coordinates data + def key(t: Tweet): + # NOTE: not using t.text, since it actually changes if entities in tweet are changing... + # whereas full_text seems stable + text = t.raw['full_text'] + return (t.created_at, t.id_str, text) + + res = unique_everseen(_all, key=key) + yield from sorted(res, key=lambda t: t.created_at) def likes() -> Iterator[Res[Like]]: From 02dabe9f2b90a472fc878aa31ee7299e094b137d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 22 Sep 2024 02:03:28 +0100 Subject: [PATCH 284/302] my.twitter.archive: cleanup linting and use proper configuration via abstract class --- my/twitter/archive.py | 97 ++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 5fa89f4..1573754 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -2,73 +2,75 @@ Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]]) """ +from __future__ import annotations -# before this config was named 'twitter', doesn't make too much sense for archive -# todo unify with other code like this, e.g. time.tz.via_location -try: - from my.config import twitter_archive as user_config -except ImportError as ie: - if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)): - # must be caused by something else - raise ie - try: - from my.config import twitter as user_config # type: ignore[assignment] - except ImportError: - raise ie # raise the original exception.. must be something else # noqa: B904 - else: - from my.core import warnings - warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config') -## - - +import html +import json # hmm interesting enough, orjson didn't give much speedup here? +from abc import abstractmethod from dataclasses import dataclass from datetime import datetime -from itertools import chain -import json # hmm interesting enough, orjson didn't give much speedup here? -from pathlib import Path from functools import cached_property -import html +from itertools import chain +from pathlib import Path from typing import ( + TYPE_CHECKING, Iterator, - List, - Optional, Sequence, ) from more_itertools import unique_everseen from my.core import ( - datetime_aware, - get_files, - make_logger, - stat, Json, Paths, Res, Stats, + datetime_aware, + get_files, + make_logger, + stat, + warnings, ) -from my.core import warnings -from my.core.cfg import make_config from my.core.serialize import dumps as json_dumps from .common import TweetId, permalink - -@dataclass -class twitter_archive(user_config): - export_path: Paths # path[s]/glob to the twitter archive takeout - - -### - -config = make_config(twitter_archive) - - logger = make_logger(__name__) +class config: + @property + @abstractmethod + def export_path(self) -> Paths: + """path[s]/glob to the twitter archive takeout""" + raise NotImplementedError + + +def make_config() -> config: + # before this config was named 'twitter', doesn't make too much sense for archive + # todo unify with other code like this, e.g. 
time.tz.via_location + try: + from my.config import twitter_archive as user_config + except ImportError as ie: + if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)): + # must be caused by something else + raise ie + try: + from my.config import twitter as user_config # type: ignore[assignment] + except ImportError: + raise ie # raise the original exception.. must be something else # noqa: B904 + else: + warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config') + ## + + class combined_config(user_config, config): + pass + + return combined_config() + + def inputs() -> Sequence[Path]: - return get_files(config.export_path) + return get_files(make_config().export_path) # TODO make sure it's not used anywhere else and simplify interface @@ -121,7 +123,7 @@ class Tweet: return res @property - def urls(self) -> List[str]: + def urls(self) -> list[str]: ents = self.entities us = ents['urls'] return [u['expanded_url'] for u in us] @@ -162,10 +164,10 @@ class Like: return self.raw['tweetId'] @property - def text(self) -> Optional[str]: + def text(self) -> str | None: # NOTE: likes basically don't have anything except text and url # ugh. I think none means that tweet was deleted? - res: Optional[str] = self.raw.get('fullText') + res: str | None = self.raw.get('fullText') if res is None: return None res = html.unescape(res) @@ -186,7 +188,7 @@ class ZipExport: if not (self.zpath / 'Your archive.html').exists(): self.old_format = True - def raw(self, what: str, *, fname: Optional[str] = None) -> Iterator[Json]: + def raw(self, what: str, *, fname: str | None = None) -> Iterator[Json]: logger.info(f'{self.zpath} : processing {what}') path = fname or what @@ -317,4 +319,5 @@ def stats() -> Stats: ## Deprecated stuff -Tid = TweetId +if not TYPE_CHECKING: + Tid = TweetId From 3166109f15c08f8a23e60a384047b7f9125c252b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 22 Sep 2024 04:27:32 +0100 Subject: [PATCH 285/302] my.core: fix list constructor in always_support_sequence and add some tests --- my/core/hpi_compat.py | 132 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 121 insertions(+), 11 deletions(-) diff --git a/my/core/hpi_compat.py b/my/core/hpi_compat.py index 6261c23..949046d 100644 --- a/my/core/hpi_compat.py +++ b/my/core/hpi_compat.py @@ -2,6 +2,7 @@ Contains various backwards compatibility/deprecation helpers relevant to HPI itself. (as opposed to .compat module which implements compatibility between python versions) """ + import inspect import os import re @@ -116,32 +117,141 @@ V = TypeVar('V') # named to be kinda consistent with more_itertools, e.g. 
more_itertools.always_iterable class always_supports_sequence(Iterator[V]): """ - Helper to make migration from Sequence/List to Iterable/Iterator type backwards compatible + Helper to make migration from Sequence/List to Iterable/Iterator type backwards compatible in runtime """ def __init__(self, it: Iterator[V]) -> None: - self.it = it - self._list: Optional[List] = None + self._it = it + self._list: Optional[List[V]] = None + self._lit: Optional[Iterator[V]] = None def __iter__(self) -> Iterator[V]: # noqa: PYI034 - return self.it.__iter__() + if self._list is not None: + self._lit = iter(self._list) + return self def __next__(self) -> V: - return self.it.__next__() + if self._list is not None: + assert self._lit is not None + delegate = self._lit + else: + delegate = self._it + return next(delegate) def __getattr__(self, name): - return getattr(self.it, name) + return getattr(self._it, name) @property - def aslist(self) -> List[V]: + def _aslist(self) -> List[V]: if self._list is None: - qualname = getattr(self.it, '__qualname__', '') # defensive just in case + qualname = getattr(self._it, '__qualname__', '') # defensive just in case warnings.medium(f'Using {qualname} as list is deprecated. Migrate to iterative processing or call list() explicitly.') - self._list = list(self.it) + self._list = list(self._it) + + # this is necessary for list constructor to work correctly + # since it's __iter__ first, then tries to compute length and then starts iterating... + self._lit = iter(self._list) return self._list def __len__(self) -> int: - return len(self.aslist) + return len(self._aslist) def __getitem__(self, i: int) -> V: - return self.aslist[i] + return self._aslist[i] + + +def test_always_supports_sequence_list_constructor() -> None: + exhausted = 0 + + def it() -> Iterator[str]: + nonlocal exhausted + yield from ['a', 'b', 'c'] + exhausted += 1 + + sit = always_supports_sequence(it()) + + # list constructor is a bit special... it's trying to compute length if it's available to optimize memory allocation + # so, what's happening in this case is + # - sit.__iter__ is called + # - sit.__len__ is called + # - sit.__next__ is called + res = list(sit) + assert res == ['a', 'b', 'c'] + assert exhausted == 1 + + res = list(sit) + assert res == ['a', 'b', 'c'] + assert exhausted == 1 # this will iterate over 'cached' list now, so original generator is only exhausted once + + +def test_always_supports_sequence_indexing() -> None: + exhausted = 0 + + def it() -> Iterator[str]: + nonlocal exhausted + yield from ['a', 'b', 'c'] + exhausted += 1 + + sit = always_supports_sequence(it()) + + assert len(sit) == 3 + assert exhausted == 1 + + assert sit[2] == 'c' + assert sit[1] == 'b' + assert sit[0] == 'a' + assert exhausted == 1 + + # a few tests to make sure list-like operations are working.. 
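+        # (rough python-level sketch of why list() ends up calling __len__ here --
+        #  an approximation of CPython's behaviour, not its actual implementation:
+        #      it = iter(sit)                # calls sit.__iter__
+        #      n = operator.length_hint(it)  # tries __len__ first, then __length_hint__
+        #      # preallocate ~n slots, then drain `it` via __next__
+        #  )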
+ assert list(sit) == ['a', 'b', 'c'] + assert [x for x in sit] == ['a', 'b', 'c'] # noqa: C416 + assert list(sit) == ['a', 'b', 'c'] + assert [x for x in sit] == ['a', 'b', 'c'] # noqa: C416 + assert exhausted == 1 + + +def test_always_supports_sequence_next() -> None: + exhausted = 0 + + def it() -> Iterator[str]: + nonlocal exhausted + yield from ['a', 'b', 'c'] + exhausted += 1 + + sit = always_supports_sequence(it()) + + x = next(sit) + assert x == 'a' + assert exhausted == 0 + + x = next(sit) + assert x == 'b' + assert exhausted == 0 + + +def test_always_supports_sequence_iter() -> None: + exhausted = 0 + + def it() -> Iterator[str]: + nonlocal exhausted + yield from ['a', 'b', 'c'] + exhausted += 1 + + sit = always_supports_sequence(it()) + + for x in sit: + assert x == 'a' + break + + x = next(sit) + assert x == 'b' + + assert exhausted == 0 + + x = next(sit) + assert x == 'c' + assert exhausted == 0 + + for _ in sit: + raise RuntimeError # shouldn't trigger, just exhaust the iterator + assert exhausted == 1 From 75639a3d5ec3b07fb7e6b638e9e3c342a23cc1a2 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 22 Sep 2024 19:50:58 +0100 Subject: [PATCH 286/302] tox: some prep for potentially using uv on CI instead of pip see https://github.com/karlicoss/HPI/issues/391 --- setup.py | 10 ++++++++ tox.ini | 78 +++++++++++++++++++++++++++--------------------------- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/setup.py b/setup.py index 8335851..e49eee0 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,16 @@ def main() -> None: 'orjson', # for my.core.serialize and denylist 'simplejson', # for my.core.serialize + + ## + # ideally we'd use --install-types in mypy + # , but looks like it doesn't respect uv venv if it's running in it :( + 'types-pytz' , # for my.core + 'types-decorator' , # for my.core.compat + 'pandas-stubs' , # for my.core.pandas + 'types-dateparser', # for my.core.query_range + 'types-simplejson', # for my.core.serialize + ## ], 'optional': [ # todo document these? diff --git a/tox.ini b/tox.ini index 6b95088..4e5dff6 100644 --- a/tox.ini +++ b/tox.ini @@ -24,16 +24,19 @@ passenv = [testenv:ruff] +install_command = {envpython} -m pip install --use-pep517 {opts} {packages} +deps = + -e .[testing] commands = - {envpython} -m pip install --use-pep517 -e .[testing] {envpython} -m ruff check my/ # just the very core tests with minimal dependencies [testenv:tests-core] +install_command = {envpython} -m pip install --use-pep517 {opts} {packages} +deps = + -e .[testing] commands = - {envpython} -m pip install --use-pep517 -e .[testing] - {envpython} -m pytest \ # importlib is the new suggested import-mode # without it test package names end up as core.tests.* instead of my.core.tests.* @@ -53,31 +56,26 @@ setenv = # TODO not sure if need it? 
MY_CONFIG=nonexistent HPI_TESTS_USES_OPTIONAL_DEPS=true +install_command = {envpython} -m pip install --use-pep517 {opts} {packages} +deps = + -e .[testing] + cachew + ijson # optional dependency for various modules commands = - {envpython} -m pip install --use-pep517 -e .[testing] - - {envpython} -m pip install cachew - - {envpython} -m my.core module install my.location.google - {envpython} -m pip install ijson # optional dependency - - # tz/location - {envpython} -m my.core module install my.time.tz.via_location - {envpython} -m my.core module install my.ip.all - {envpython} -m my.core module install my.location.gpslogger - {envpython} -m my.core module install my.location.fallback.via_ip - {envpython} -m my.core module install my.google.takeout.parser - - {envpython} -m my.core module install my.calendar.holidays - - # my.body.weight dep - {envpython} -m my.core module install my.orgmode - - {envpython} -m my.core module install my.coding.commits - - {envpython} -m my.core module install my.pdfs - - {envpython} -m my.core module install my.reddit.rexport + {envpython} -m my.core module install \ + ## tz/location + my.location.google \ + my.time.tz.via_location \ + my.ip.all \ + my.location.gpslogger \ + my.location.fallback.via_ip \ + my.google.takeout.parser \ + ## + my.calendar.holidays \ + my.orgmode \ # my.body.weight dep + my.coding.commits \ + my.pdfs \ + my.reddit.rexport {envpython} -m pytest \ # importlib is the new suggested import-mode @@ -88,18 +86,20 @@ commands = [testenv:demo] +deps = + git+https://github.com/karlicoss/hypexport commands = - {envpython} -m pip install git+https://github.com/karlicoss/hypexport {envpython} ./demo.py [testenv:mypy-core] +install_command = {envpython} -m pip install --use-pep517 {opts} {packages} +deps = + -e .[testing,optional] + orgparse # for core.orgmode + gpxpy # for hpi query --output gpx commands = - {envpython} -m pip install --use-pep517 -e .[testing,optional] - {envpython} -m pip install orgparse # used it core.orgmode? - {envpython} -m pip install gpxpy # for hpi query --output gpx - - {envpython} -m mypy --install-types --non-interactive \ + {envpython} -m mypy --no-install-types \ -p {[testenv]package_name}.core \ --txt-report .coverage.mypy-core \ --html-report .coverage.mypy-core \ @@ -109,9 +109,13 @@ commands = # specific modules that are known to be mypy compliant (to avoid false negatives) # todo maybe split into separate jobs? 
need to add comment how to run [testenv:mypy-misc] +install_command = {envpython} -m pip install --use-pep517 {opts} {packages} +deps = + -e .[testing,optional] + lxml-stubs # for my.smscalls + types-protobuf # for my.google.maps.android + types-Pillow # for my.photos commands = - {envpython} -m pip install --use-pep517 -e .[testing,optional] - {envpython} -m my.core module install \ my.arbtt \ my.browser.export \ @@ -143,13 +147,13 @@ commands = my.time.tz.via_location - {envpython} -m mypy --install-types --non-interactive \ + {envpython} -m mypy --no-install-types \ -p {[testenv]package_name} \ --txt-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \ {posargs} - {envpython} -m mypy --install-types --non-interactive \ + {envpython} -m mypy --no-install-types \ tests # note: this comment doesn't seem relevant anymore, but keeping it in case the issue happens again From 8ed9e1947ec186b7939e2d8a53d07b6ffdd832ed Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 22 Sep 2024 17:47:05 +0100 Subject: [PATCH 287/302] my.youtube.takeout: deduplicate watched videos and sort out a few minor errors --- my/core/compat.py | 7 ++- my/media/youtube.py | 13 +++-- my/youtube/takeout.py | 115 ++++++++++++++++++++++++++++++++---------- 3 files changed, 102 insertions(+), 33 deletions(-) diff --git a/my/core/compat.py b/my/core/compat.py index eccaf07..29d4f04 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -22,12 +22,17 @@ if not TYPE_CHECKING: source.backup(dest, **kwargs) -# can remove after python3.9 (although need to keep the method itself for bwd compat) +## can remove after python3.9 (although need to keep the method itself for bwd compat) def removeprefix(text: str, prefix: str) -> str: if text.startswith(prefix): return text[len(prefix) :] return text +def removesuffix(text: str, suffix: str) -> str: + if text.endswith(suffix): + return text[:-len(suffix)] + return text +## ## used to have compat function before 3.8 for these, keeping for runtime back compatibility if not TYPE_CHECKING: diff --git a/my/media/youtube.py b/my/media/youtube.py index efaa74b..3ddbc14 100644 --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -1,5 +1,10 @@ -from ..core.warnings import high -high("DEPRECATED! Please use my.youtube.takeout instead.") -from ..core.util import __NOT_HPI_MODULE__ +from my.core import __NOT_HPI_MODULE__ -from ..youtube.takeout import * +from typing import TYPE_CHECKING + +from my.core.warnings import high + +high("DEPRECATED! Please use my.youtube.takeout instead.") + +if not TYPE_CHECKING: + from my.youtube.takeout import * diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index 284c082..bbce46a 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -1,13 +1,16 @@ -from typing import NamedTuple, List, Iterable, TYPE_CHECKING +from __future__ import annotations -from my.core import datetime_aware, make_logger, stat, Res, Stats -from my.core.compat import deprecated, removeprefix +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Iterable, Iterator +from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings +from my.core.compat import deprecated, removeprefix, removesuffix logger = make_logger(__name__) -class Watched(NamedTuple): +@dataclass +class Watched: url: str title: str when: datetime_aware @@ -16,19 +19,57 @@ class Watched(NamedTuple): def eid(self) -> str: return f'{self.url}-{self.when.isoformat()}' + def is_deleted(self) -> bool: + return self.title == self.url + # todo define error policy? 
# although it has one from google takeout module.. so not sure -def watched() -> Iterable[Res[Watched]]: + +def watched() -> Iterator[Res[Watched]]: + emitted: dict[Any, Watched] = {} + for w in _watched(): + if isinstance(w, Exception): + yield w # TODO also make unique? + continue + + # older exports (e.g. html) didn't have microseconds + # whereas newer json ones do have them + # seconds resolution is enough to distinguish watched videos + # also we're processing takeouts in HPI in reverse order, so first seen watch would contain microseconds, resulting in better data + without_microsecond = w.when.replace(microsecond=0) + + key = w.url, without_microsecond + prev = emitted.get(key, None) + if prev is not None: + # NOTE: some video titles start with 'Liked ' for liked videos activity + # but they'd have different timestamp, so fine not to handle them as a special case here + if w.title in prev.title: + # often more stuff added to the title, like 'Official Video' + # in this case not worth emitting the change + # also handles the case when titles match + continue + # otherwise if title changed completely, just emit the change... not sure what else we could do? + # could merge titles in the 'titles' field and update dynamically? but a bit complicated, maybe later.. + + # TODO would also be nice to handle is_deleted here somehow... + # but for that would need to process data in direct order vs reversed.. + # not sure, maybe this could use a special mode or something? + + emitted[key] = w + yield w + + +def _watched() -> Iterator[Res[Watched]]: try: - from ..google.takeout.parser import events from google_takeout_parser.models import Activity + + from ..google.takeout.parser import events except ModuleNotFoundError as ex: logger.exception(ex) - from ..core.warnings import high - high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.") - yield from _watched_legacy() + warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.") + yield from _watched_legacy() # type: ignore[name-defined] return YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v=' @@ -43,12 +84,12 @@ continue url = e.titleUrl - header = e.header - title = e.title if url is None: continue + header = e.header + if header in {'Image Search', 'Search', 'Chrome'}: # sometimes results in youtube links.. but definitely not watch history continue @@ -61,6 +102,8 @@ pass continue + title = e.title + if header == 'youtube.com' and title.startswith('Visited '): continue @@ -76,16 +119,32 @@ # also compatible with legacy titles title = removeprefix(title, 'Watched ') + # watches originating from some activity end with this, remove it for consistency + title = removesuffix(title, ' - YouTube') + if YOUTUBE_VIDEO_LINK not in url: - if e.details == ['From Google Ads']: - # weird, sometimes results in odd + if 'youtube.com/post/' in url: + # some sort of channel updates? continue + if 'youtube.com/playlist' in url: + # 'saved playlist' actions + continue + if 'music.youtube.com' in url: + # todo maybe allow it? 
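+                # (music.youtube.com shares watch?v= video ids with regular youtube,
+                #  e.g. https://music.youtube.com/watch?v=VIDEOID vs https://www.youtube.com/watch?v=VIDEOID
+                #  with VIDEOID as a placeholder -- so presumably these could be normalised
+                #  to ordinary watches rather than skipped)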
+ continue + if any('From Google Ads' in d for d in e.details): + # weird, sometimes results in odd urls + continue + + if title == 'Used YouTube': continue yield RuntimeError(f'Unexpected url: {e}') continue + # TODO contribute to takeout parser? seems that these still might happen in json data + title = title.replace("\xa0", " ") + yield Watched( url=url, title=title, @@ -100,24 +159,24 @@ def stats() -> Stats: ### deprecated stuff (keep in my.media.youtube) if not TYPE_CHECKING: + @deprecated("use 'watched' instead") def get_watched(*args, **kwargs): return watched(*args, **kwargs) + def _watched_legacy() -> Iterable[Watched]: + from ..google.takeout.html import read_html + from ..google.takeout.paths import get_last_takeout -def _watched_legacy() -> Iterable[Watched]: - from ..google.takeout.html import read_html - from ..google.takeout.paths import get_last_takeout + # todo looks like this one doesn't have retention? so enough to use the last + path = 'Takeout/My Activity/YouTube/MyActivity.html' + last = get_last_takeout(path=path) + if last is None: + return [] - # todo looks like this one doesn't have retention? so enough to use the last - path = 'Takeout/My Activity/YouTube/MyActivity.html' - last = get_last_takeout(path=path) - if last is None: - return [] + watches: list[Watched] = [] + for dt, url, title in read_html(last, path): + watches.append(Watched(url=url, title=title, when=dt)) - watches: List[Watched] = [] - for dt, url, title in read_html(last, path): - watches.append(Watched(url=url, title=title, when=dt)) - - # todo hmm they already come sorted.. wonder if should just rely on it.. - return sorted(watches, key=lambda e: e.when) + # todo hmm they already come sorted.. wonder if should just rely on it.. + return sorted(watches, key=lambda e: e.when) From bf8af6c598803b7ad3bee5d5cbade015ce4b88a7 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 22 Sep 2024 22:32:34 +0100 Subject: [PATCH 288/302] tox: try using uv for CI, should result in speedup see https://github.com/karlicoss/HPI/issues/391 --- .ci/run | 13 ++++++++++--- my/core/__main__.py | 3 ++- tox.ini | 10 +++++----- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/.ci/run b/.ci/run index fe2719e..7656575 100755 --- a/.ci/run +++ b/.ci/run @@ -11,6 +11,8 @@ if ! command -v sudo; then } fi +# --parallel-live to show outputs while it's running +tox_cmd='run-parallel --parallel-live' if [ -n "${CI-}" ]; then # install OS specific stuff here case "$OSTYPE" in @@ -20,7 +22,8 @@ if [ -n "${CI-}" ]; then ;; cygwin* | msys* | win*) # windows - : + # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that + tox_cmd='run' ;; *) # must be linux? @@ -37,5 +40,9 @@ if ! command -v python3 &> /dev/null; then PY_BIN="python" fi -"$PY_BIN" -m pip install --user tox -"$PY_BIN" -m tox --parallel --parallel-live "$@" + +# TODO hmm for some reason installing uv with pip and then running +# "$PY_BIN" -m uv tool fails with missing setuptools error?? +# just uvx directly works, but it's not present in PATH... 
+"$PY_BIN" -m pip install --user pipx +"$PY_BIN" -m pipx run uv tool run --with=tox-uv tox $tox_cmd "$@" diff --git a/my/core/__main__.py b/my/core/__main__.py index c675676..9ec637c 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -373,8 +373,9 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False, b warning('requirements list is empty, no need to install anything') return + use_uv = 'HPI_MODULE_INSTALL_USE_UV' in os.environ pre_cmd = [ - sys.executable, '-m', 'pip', + sys.executable, '-m', *(['uv'] if use_uv else []), 'pip', 'install', *(['--user'] if user else []), # todo maybe instead, forward all the remaining args to pip? *(['--break-system-packages'] if break_system_packages else []), # https://peps.python.org/pep-0668/ diff --git a/tox.ini b/tox.ini index 4e5dff6..d202bd2 100644 --- a/tox.ini +++ b/tox.ini @@ -17,6 +17,9 @@ passenv = PYTHONPYCACHEPREFIX MYPY_CACHE_DIR RUFF_CACHE_DIR +setenv = + HPI_MODULE_INSTALL_USE_UV=true +uv_seed = true # seems necessary so uv creates separate venvs per tox env? # note: --use-pep517 below is necessary for tox --parallel flag to work properly @@ -24,7 +27,6 @@ passenv = [testenv:ruff] -install_command = {envpython} -m pip install --use-pep517 {opts} {packages} deps = -e .[testing] commands = @@ -33,7 +35,6 @@ commands = # just the very core tests with minimal dependencies [testenv:tests-core] -install_command = {envpython} -m pip install --use-pep517 {opts} {packages} deps = -e .[testing] commands = @@ -56,9 +57,9 @@ setenv = # TODO not sure if need it? MY_CONFIG=nonexistent HPI_TESTS_USES_OPTIONAL_DEPS=true -install_command = {envpython} -m pip install --use-pep517 {opts} {packages} deps = -e .[testing] + uv # for hpi module install cachew ijson # optional dependency for various modules commands = @@ -93,7 +94,6 @@ commands = [testenv:mypy-core] -install_command = {envpython} -m pip install --use-pep517 {opts} {packages} deps = -e .[testing,optional] orgparse # for core.orgmode @@ -109,9 +109,9 @@ commands = # specific modules that are known to be mypy compliant (to avoid false negatives) # todo maybe split into separate jobs? 
need to add comment how to run [testenv:mypy-misc] deps = -e .[testing,optional] + uv # for hpi module install lxml-stubs # for my.smscalls types-protobuf # for my.google.maps.android types-Pillow # for my.photos commands = From 6a6d15704063c328f9bf66330630a53a0c421915 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 23 Sep 2024 01:14:49 +0100 Subject: [PATCH 289/302] cli: fix minor race condition in creating hpi_temp_dir --- my/core/__main__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/my/core/__main__.py b/my/core/__main__.py index 9ec637c..a80aa52 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -615,9 +615,8 @@ def main(*, debug: bool) -> None: # to run things at the end (would need to use a callback or pass context) # https://click.palletsprojects.com/en/7.x/commands/#nested-handling-and-contexts - tdir: str = os.path.join(tempfile.gettempdir(), 'hpi_temp_dir') - if not os.path.exists(tdir): - os.makedirs(tdir) + tdir = Path(tempfile.gettempdir()) / 'hpi_temp_dir' + tdir.mkdir(exist_ok=True) os.chdir(tdir) From a8f86e32b981aef62890605e12da9cd59c9cc0c8 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 23 Sep 2024 22:01:57 +0100 Subject: [PATCH 290/302] core.time: hotfix for default force_abbreviations attribute --- my/core/time.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/my/core/time.py b/my/core/time.py index 6de4105..fa20a7c 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -11,11 +11,11 @@ def user_forced() -> Sequence[str]: # https://stackoverflow.com/questions/36067621/python-all-possible-timezone-abbreviations-for-given-timezone-name-and-vise-ve try: from my.config import time as user_config + return user_config.tz.force_abbreviations # type: ignore[attr-defined] # noqa: TRY300 + # note: noqa since we're catching case where config doesn't have attribute here as well except: # todo log/apply policy return [] - else: - return user_config.tz.force_abbreviations # type: ignore[attr-defined] @lru_cache(1) From bc7c3ac25355899f2a4ae56382469d85fd2c33bc Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 19 Oct 2024 18:27:35 +0100 Subject: [PATCH 291/302] general: python3.8 reached EOL, switch min version to 3.9 also enable 3.13 on CI --- .github/workflows/main.yml | 11 ++++++---- my/core/__main__.py | 4 +--- my/core/_deprecated/kompress.py | 2 +- my/core/compat.py | 24 ++++++++------------- my/core/pandas.py | 2 +- my/core/utils/concurrent.py | 38 +++++++++++---------------------- my/youtube/takeout.py | 6 +++--- setup.py | 2 +- 8 files changed, 36 insertions(+), 53 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 53d8e53..111d0e9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,19 +21,20 @@ on: jobs: build: strategy: + fail-fast: false matrix: platform: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] exclude: [ # windows runners are pretty scarce, so let's only run lowest and highest python version - {platform: windows-latest, python-version: '3.9' }, {platform: windows-latest, python-version: '3.10'}, {platform: windows-latest, python-version: '3.11'}, + {platform: windows-latest, python-version: '3.12'}, # same, macos is a bit too slow and ubuntu covers python quirks well - {platform: macos-latest , python-version: '3.9' }, {platform: macos-latest , 
python-version: '3.10' }, {platform: macos-latest , python-version: '3.11' }, + {platform: macos-latest , python-version: '3.12' }, ] runs-on: ${{ matrix.platform }} @@ -63,11 +64,13 @@ jobs: - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms uses: actions/upload-artifact@v4 with: + include-hidden-files: true name: .coverage.mypy-misc_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-misc/ - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms uses: actions/upload-artifact@v4 with: + include-hidden-files: true name: .coverage.mypy-core_${{ matrix.platform }}_${{ matrix.python-version }} path: .coverage.mypy-core/ @@ -81,7 +84,7 @@ jobs: - uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.10' - uses: actions/checkout@v4 with: diff --git a/my/core/__main__.py b/my/core/__main__.py index a80aa52..2777008 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -171,8 +171,6 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module # use a temporary directory, useful because # - compileall ignores -B, so always craps with .pyc files (annoyng on RO filesystems) # - compileall isn't following symlinks, just silently ignores them - # note: ugh, annoying that copytree requires a non-existing dir before 3.8. - # once we have min version 3.8, can use dirs_exist_ok=True param tdir = Path(td) / 'cfg' # NOTE: compileall still returns code 0 if the path doesn't exist.. # but in our case hopefully it's not an issue @@ -181,7 +179,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module try: # this will resolve symlinks when copying # should be under try/catch since might fail if some symlinks are missing - shutil.copytree(cfg_path, tdir) + shutil.copytree(cfg_path, tdir, dirs_exist_ok=True) check_call(cmd) info('syntax check: ' + ' '.join(cmd)) except Exception as e: diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index b08f04b..cd27a7f 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -210,7 +210,7 @@ class ZipPath(zipfile_Path): def iterdir(self) -> Iterator[ZipPath]: for s in self._as_dir().iterdir(): - yield ZipPath(s.root, s.at) # type: ignore[attr-defined] + yield ZipPath(s.root, s.at) @property def stem(self) -> str: diff --git a/my/core/compat.py b/my/core/compat.py index 29d4f04..3273ff4 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -21,25 +21,19 @@ if not TYPE_CHECKING: # TODO warn here? 
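The hpi_temp_dir change in PATCH 289 above removes a classic check-then-act race: two concurrent hpi invocations could both observe the directory as missing, and the slower os.makedirs() would then die with FileExistsError. Path.mkdir(exist_ok=True) folds the existence check into the creation itself. A before/after sketch:

    import os
    import tempfile
    from pathlib import Path

    tdir = Path(tempfile.gettempdir()) / 'hpi_temp_dir'

    # racy: another process can create tdir between the check and makedirs()
    # if not os.path.exists(tdir):
    #     os.makedirs(tdir)

    # race-free: creation tolerates the directory already existing, whoever made it
    tdir.mkdir(exist_ok=True)
    os.chdir(tdir)
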
source.backup(dest, **kwargs) + # keeping for runtime backwards compatibility (added in 3.9) + @deprecated('use .removeprefix method on string directly instead') + def removeprefix(text: str, prefix: str) -> str: + return text.removeprefix(prefix) -## can remove after python3.9 (although need to keep the method itself for bwd compat) -def removeprefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix) :] - return text + @deprecated('use .removesuffix method on string directly instead') + def removesuffix(text: str, suffix: str) -> str: + return text.removesuffix(suffix) + ## -def removesuffix(text: str, suffix: str) -> str: - if text.endswith(suffix): - return text[:-len(suffix)] - return text -## - -## used to have compat function before 3.8 for these, keeping for runtime back compatibility -if not TYPE_CHECKING: + ## used to have compat function before 3.8 for these, keeping for runtime back compatibility from functools import cached_property from typing import Literal, Protocol, TypedDict -else: - from typing_extensions import Literal, Protocol, TypedDict ## diff --git a/my/core/pandas.py b/my/core/pandas.py index d38465a..8f5fd29 100644 --- a/my/core/pandas.py +++ b/my/core/pandas.py @@ -181,7 +181,7 @@ Schema = Any def _as_columns(s: Schema) -> Dict[str, Type]: # todo would be nice to extract properties; add tests for this as well if dataclasses.is_dataclass(s): - return {f.name: f.type for f in dataclasses.fields(s)} + return {f.name: f.type for f in dataclasses.fields(s)} # type: ignore[misc] # ugh, why mypy thinks f.type can return str?? # else must be NamedTuple?? # todo assert my.core.common.is_namedtuple? return getattr(s, '_field_types') diff --git a/my/core/utils/concurrent.py b/my/core/utils/concurrent.py index 146861b..73944ec 100644 --- a/my/core/utils/concurrent.py +++ b/my/core/utils/concurrent.py @@ -1,6 +1,6 @@ import sys from concurrent.futures import Executor, Future -from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar +from typing import Any, Callable, Optional, TypeVar from ..compat import ParamSpec @@ -19,33 +19,21 @@ class DummyExecutor(Executor): self._shutdown = False self._max_workers = max_workers - if TYPE_CHECKING: - if sys.version_info[:2] <= (3, 8): - # 3.8 doesn't support ParamSpec as Callable arg :( - # and any attempt to type results in incompatible supertype.. so whatever - def submit(self, fn, *args, **kwargs): ... + def submit(self, fn: Callable[_P, _T], /, *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]: + if self._shutdown: + raise RuntimeError('cannot schedule new futures after shutdown') + f: Future[Any] = Future() + try: + result = fn(*args, **kwargs) + except KeyboardInterrupt: + raise + except BaseException as e: + f.set_exception(e) else: + f.set_result(result) - def submit(self, fn: Callable[_P, _T], /, *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]: ... 
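With python 3.8 support dropped, the TYPE_CHECKING-only overloads for DummyExecutor.submit collapse into the single ParamSpec-annotated runtime definition above. A usage sketch (it behaves like a regular Executor, except the callable runs eagerly in the calling thread):

    from my.core.utils.concurrent import DummyExecutor

    def add(a: int, b: int) -> int:
        return a + b

    with DummyExecutor() as pool:
        fut = pool.submit(add, 1, 2)  # arguments are type checked against ParamSpec _P
        assert fut.result() == 3      # already resolved: the call ran synchronously
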
- - else: - - def submit(self, fn, *args, **kwargs): - if self._shutdown: - raise RuntimeError('cannot schedule new futures after shutdown') - - f: Future[Any] = Future() - try: - result = fn(*args, **kwargs) - except KeyboardInterrupt: - raise - except BaseException as e: - f.set_exception(e) - else: - f.set_result(result) - - return f + return f def shutdown(self, wait: bool = True, **kwargs) -> None: # noqa: FBT001,FBT002,ARG002 self._shutdown = True diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index bbce46a..f29b2e3 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Iterable, Iterator from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings -from my.core.compat import deprecated, removeprefix, removesuffix +from my.core.compat import deprecated logger = make_logger(__name__) @@ -117,10 +117,10 @@ def _watched() -> Iterator[Res[Watched]]: # all titles contain it, so pointless to include 'Watched ' # also compatible with legacy titles - title = removeprefix(title, 'Watched ') + title = title.removeprefix('Watched ') # watches originating from some activity end with this, remove it for consistency - title = removesuffix(title, ' - YouTube') + title = title.removesuffix(' - YouTube') if YOUTUBE_VIDEO_LINK not in url: if 'youtube.com/post/' in url: diff --git a/setup.py b/setup.py index e49eee0..385c810 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ def main() -> None: author_email='karlicoss@gmail.com', description='A Python interface to my life', - python_requires='>=3.8', + python_requires='>=3.9', install_requires=INSTALL_REQUIRES, extras_require={ 'testing': [ From d3f9a8e8b69542361ad0838a1012a1ad10440b5b Mon Sep 17 00:00:00 2001 From: karlicoss Date: Sat, 19 Oct 2024 20:55:09 +0100 Subject: [PATCH 292/302] core: migrate code to benefit from 3.9 stuff (#401) for now keeping ruff on 3.8 target version, need to sort out modules as well --- my/core/__init__.py | 4 +- my/core/__main__.py | 97 +++++++++++++++++++++------------ my/core/_cpu_pool.py | 9 ++- my/core/_deprecated/kompress.py | 5 +- my/core/cachew.py | 29 +++++----- my/core/cfg.py | 20 +++++-- my/core/common.py | 36 ++++++------ my/core/compat.py | 7 ++- my/core/core_config.py | 33 ++++++----- my/core/denylist.py | 28 +++++----- my/core/discovery_pure.py | 19 ++++--- my/core/error.py | 43 ++++++++------- my/core/experimental.py | 6 +- my/core/freezer.py | 29 +++++----- my/core/hpi_compat.py | 13 +++-- my/core/influxdb.py | 29 +++++++--- my/core/init.py | 2 + my/core/kompress.py | 4 +- my/core/konsume.py | 39 +++++++------ my/core/mime.py | 11 ++-- my/core/orgmode.py | 8 ++- my/core/pandas.py | 7 +-- my/core/preinit.py | 1 + my/core/pytest.py | 4 +- my/core/query.py | 76 ++++++++++++-------------- my/core/query_range.py | 68 +++++++++++++---------- my/core/serialize.py | 20 ++++--- my/core/source.py | 12 +++- my/core/sqlite.py | 47 +++++++++------- my/core/stats.py | 34 +++++------- my/core/structure.py | 14 +++-- my/core/tests/auto_stats.py | 2 +- my/core/tests/common.py | 6 +- my/core/tests/denylist.py | 3 +- my/core/tests/test_cachew.py | 8 +-- my/core/tests/test_config.py | 2 +- my/core/time.py | 15 +++-- my/core/types.py | 13 +++-- my/core/util.py | 28 ++++++---- my/core/utils/concurrent.py | 7 ++- my/core/utils/imports.py | 14 ++--- my/core/utils/itertools.py | 59 +++++++++----------- my/core/warnings.py | 8 ++- 43 files changed, 515 insertions(+), 404 deletions(-) diff --git 
a/my/core/__init__.py b/my/core/__init__.py index ba633f6..cc549d5 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING from .cfg import make_config from .common import PathIsh, Paths, get_files from .compat import assert_never -from .error import Res, unwrap, notnone +from .error import Res, notnone, unwrap from .logging import ( make_logger, ) @@ -52,7 +52,7 @@ __all__ = [ # you could put _init_hook.py next to your private my/config # that way you can configure logging/warnings/env variables on every HPI import try: - import my._init_hook # type: ignore[import-not-found] + import my._init_hook # type: ignore[import-not-found] # noqa: F401 except: pass ## diff --git a/my/core/__main__.py b/my/core/__main__.py index 2777008..00ac4ee 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import functools import importlib import inspect @@ -7,17 +9,18 @@ import shutil import sys import tempfile import traceback +from collections.abc import Iterable, Sequence from contextlib import ExitStack from itertools import chain from pathlib import Path from subprocess import PIPE, CompletedProcess, Popen, check_call, run -from typing import Any, Callable, Iterable, List, Optional, Sequence, Type +from typing import Any, Callable import click @functools.lru_cache -def mypy_cmd() -> Optional[Sequence[str]]: +def mypy_cmd() -> Sequence[str] | None: try: # preferably, use mypy from current python env import mypy # noqa: F401 fine not to use it @@ -32,7 +35,7 @@ def mypy_cmd() -> Optional[Sequence[str]]: return None -def run_mypy(cfg_path: Path) -> Optional[CompletedProcess]: +def run_mypy(cfg_path: Path) -> CompletedProcess | None: # todo dunno maybe use the same mypy config in repository? # I'd need to install mypy.ini then?? env = {**os.environ} @@ -63,21 +66,27 @@ def eprint(x: str) -> None: # err=True prints to stderr click.echo(x, err=True) + def indent(x: str) -> str: + # todo use textwrap.indent? return ''.join(' ' + l for l in x.splitlines(keepends=True)) -OK = '✅' +OK = '✅' OFF = '🔲' + def info(x: str) -> None: eprint(OK + ' ' + x) + def error(x: str) -> None: eprint('❌ ' + x) + def warning(x: str) -> None: - eprint('❗ ' + x) # todo yellow? + eprint('❗ ' + x) # todo yellow? + def tb(e: Exception) -> None: tb = ''.join(traceback.format_exception(Exception, e, e.__traceback__)) @@ -86,6 +95,7 @@ def tb(e: Exception) -> None: def config_create() -> None: from .preinit import get_mycfg_dir + mycfg_dir = get_mycfg_dir() created = False @@ -94,7 +104,8 @@ def config_create() -> None: my_config = mycfg_dir / 'my' / 'config' / '__init__.py' my_config.parent.mkdir(parents=True) - my_config.write_text(''' + my_config.write_text( + ''' ### HPI personal config ## see # https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-modules @@ -117,7 +128,8 @@ class example: ### you can insert your own configuration below ### but feel free to delete the stuff above if you don't need ti -'''.lstrip()) +'''.lstrip() + ) info(f'created empty config: {my_config}') created = True else: @@ -130,12 +142,13 @@ class example: # todo return the config as a result? 
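The pattern recurring throughout this patch (from __future__ import annotations plus builtin generics) is what lets signatures like the ones above use PEP 604 unions on python 3.9: with the future import, annotations are stored as strings and never evaluated at runtime. A minimal sketch:

    from __future__ import annotations  # PEP 563: annotations become lazy strings

    from collections.abc import Sequence

    # Sequence[str] | None (PEP 604) would need python 3.10 if evaluated; here it never is
    def mypy_cmd() -> Sequence[str] | None:
        return None

    def check(for_modules: list[str]) -> None:  # PEP 585 builtin generics
        print(for_modules)
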
def config_ok() -> bool: - errors: List[Exception] = [] + errors: list[Exception] = [] # at this point 'my' should already be imported, so doesn't hurt to extract paths from it import my + try: - paths: List[str] = list(my.__path__) + paths: list[str] = list(my.__path__) except Exception as e: errors.append(e) error('failed to determine module import path') @@ -145,19 +158,23 @@ def config_ok() -> bool: # first try doing as much as possible without actually importing my.config from .preinit import get_mycfg_dir + cfg_path = get_mycfg_dir() # alternative is importing my.config and then getting cfg_path from its __file__/__path__ # not sure which is better tbh ## check we're not using stub config import my.core + try: core_pkg_path = str(Path(my.core.__path__[0]).parent) if str(cfg_path).startswith(core_pkg_path): - error(f''' + error( + f''' Seems that the stub config is used ({cfg_path}). This is likely not going to work. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-modules for more information -'''.strip()) +'''.strip() + ) errors.append(RuntimeError('bad config path')) except Exception as e: errors.append(e) @@ -189,7 +206,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module ## check types mypy_res = run_mypy(cfg_path) - if mypy_res is not None: # has mypy + if mypy_res is not None: # has mypy rc = mypy_res.returncode if rc == 0: info('mypy check : success') @@ -221,7 +238,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module from .util import HPIModule, modules -def _modules(*, all: bool=False) -> Iterable[HPIModule]: +def _modules(*, all: bool = False) -> Iterable[HPIModule]: skipped = [] for m in modules(): if not all and m.skip_reason is not None: @@ -232,7 +249,7 @@ def _modules(*, all: bool=False) -> Iterable[HPIModule]: warning(f'Skipped {len(skipped)} modules: {skipped}. Pass --all if you want to see them.') -def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: List[str]) -> None: +def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: list[str]) -> None: if len(for_modules) > 0: # if you're checking specific modules, show errors # hopefully makes sense? @@ -256,7 +273,7 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: Li # todo add a --all argument to disregard is_active check? for mr in mods: skip = mr.skip_reason - m = mr.name + m = mr.name if skip is not None: eprint(f'{OFF} {click.style("SKIP", fg="yellow")}: {m:<50} {skip}') continue @@ -306,8 +323,8 @@ def list_modules(*, list_all: bool) -> None: tabulate_warnings() for mr in _modules(all=list_all): - m = mr.name - sr = mr.skip_reason + m = mr.name + sr = mr.skip_reason if sr is None: pre = OK suf = '' @@ -323,17 +340,20 @@ def tabulate_warnings() -> None: Helper to avoid visual noise in hpi modules/doctor ''' import warnings + orig = warnings.formatwarning def override(*args, **kwargs) -> str: res = orig(*args, **kwargs) return ''.join(' ' + x for x in res.splitlines(keepends=True)) + warnings.formatwarning = override # TODO loggers as well? 
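The formatwarning override above is the entire mechanism behind tabulate_warnings: it wraps the stock formatter and indents each produced line, so warnings emitted mid-listing read as part of the module table. Standalone sketch:

    import warnings

    orig = warnings.formatwarning

    def override(*args, **kwargs) -> str:
        res = orig(*args, **kwargs)
        # indent every line of the formatted warning to reduce visual noise
        return ''.join('  ' + x for x in res.splitlines(keepends=True))

    warnings.formatwarning = override
    warnings.warn('demo')  # rendered indented on stderr
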
def _requires(modules: Sequence[str]) -> Sequence[str]: from .discovery_pure import module_by_name + mods = [module_by_name(module) for module in modules] res = [] for mod in mods: @@ -360,7 +380,7 @@ def module_requires(*, module: Sequence[str]) -> None: click.echo(x) -def module_install(*, user: bool, module: Sequence[str], parallel: bool=False, break_system_packages: bool=False) -> None: +def module_install(*, user: bool, module: Sequence[str], parallel: bool = False, break_system_packages: bool = False) -> None: if isinstance(module, str): # legacy behavior, used to take a since argument module = [module] @@ -437,7 +457,7 @@ def _ui_getchar_pick(choices: Sequence[str], prompt: str = 'Select from: ') -> i return result_map[ch] -def _locate_functions_or_prompt(qualified_names: List[str], *, prompt: bool = True) -> Iterable[Callable[..., Any]]: +def _locate_functions_or_prompt(qualified_names: list[str], *, prompt: bool = True) -> Iterable[Callable[..., Any]]: from .query import QueryException, locate_qualified_function from .stats import is_data_provider @@ -487,6 +507,7 @@ def _locate_functions_or_prompt(qualified_names: List[str], *, prompt: bool = Tr def _warn_exceptions(exc: Exception) -> None: from my.core import make_logger + logger = make_logger('CLI', level='warning') logger.exception(f'hpi query: {exc}') @@ -498,14 +519,14 @@ def query_hpi_functions( *, output: str = 'json', stream: bool = False, - qualified_names: List[str], - order_key: Optional[str], - order_by_value_type: Optional[Type], + qualified_names: list[str], + order_key: str | None, + order_by_value_type: type | None, after: Any, before: Any, within: Any, reverse: bool = False, - limit: Optional[int], + limit: int | None, drop_unsorted: bool, wrap_unsorted: bool, warn_exceptions: bool, @@ -529,7 +550,8 @@ def query_hpi_functions( warn_exceptions=warn_exceptions, warn_func=_warn_exceptions, raise_exceptions=raise_exceptions, - drop_exceptions=drop_exceptions) + drop_exceptions=drop_exceptions, + ) if output == 'json': from .serialize import dumps @@ -563,7 +585,7 @@ def query_hpi_functions( # can ignore the mypy warning here, locations_to_gpx yields any errors # if you didnt pass it something that matches the LocationProtocol - for exc in locations_to_gpx(res, sys.stdout): # type: ignore[arg-type] + for exc in locations_to_gpx(res, sys.stdout): # type: ignore[arg-type] if warn_exceptions: _warn_exceptions(exc) elif raise_exceptions: @@ -580,6 +602,7 @@ def query_hpi_functions( except ModuleNotFoundError: eprint("'repl' typically uses ipython, install it with 'python3 -m pip install ipython'. 
falling back to stdlib...") import code + code.interact(local=locals()) else: IPython.embed() @@ -619,13 +642,13 @@ def main(*, debug: bool) -> None: @functools.lru_cache(maxsize=1) -def _all_mod_names() -> List[str]: +def _all_mod_names() -> list[str]: """Should include all modules, in case user is trying to diagnose issues""" # sort this, so that the order doesn't change while tabbing through return sorted([m.name for m in modules()]) -def _module_autocomplete(ctx: click.Context, args: Sequence[str], incomplete: str) -> List[str]: +def _module_autocomplete(ctx: click.Context, args: Sequence[str], incomplete: str) -> list[str]: return [m for m in _all_mod_names() if m.startswith(incomplete)] @@ -784,14 +807,14 @@ def query_cmd( function_name: Sequence[str], output: str, stream: bool, - order_key: Optional[str], - order_type: Optional[str], - after: Optional[str], - before: Optional[str], - within: Optional[str], - recent: Optional[str], + order_key: str | None, + order_type: str | None, + after: str | None, + before: str | None, + within: str | None, + recent: str | None, reverse: bool, - limit: Optional[int], + limit: int | None, drop_unsorted: bool, wrap_unsorted: bool, warn_exceptions: bool, @@ -827,7 +850,7 @@ def query_cmd( from datetime import date, datetime - chosen_order_type: Optional[Type] + chosen_order_type: type | None if order_type == "datetime": chosen_order_type = datetime elif order_type == "date": @@ -863,7 +886,8 @@ def query_cmd( wrap_unsorted=wrap_unsorted, warn_exceptions=warn_exceptions, raise_exceptions=raise_exceptions, - drop_exceptions=drop_exceptions) + drop_exceptions=drop_exceptions, + ) except QueryException as qe: eprint(str(qe)) sys.exit(1) @@ -878,6 +902,7 @@ def query_cmd( def test_requires() -> None: from click.testing import CliRunner + result = CliRunner().invoke(main, ['module', 'requires', 'my.github.ghexport', 'my.browser.export']) assert result.exit_code == 0 assert "github.com/karlicoss/ghexport" in result.output diff --git a/my/core/_cpu_pool.py b/my/core/_cpu_pool.py index 2369075..6b107a7 100644 --- a/my/core/_cpu_pool.py +++ b/my/core/_cpu_pool.py @@ -10,15 +10,18 @@ how many cores we want to dedicate to the DAL. Enabled by the env variable, specifying how many cores to dedicate e.g. "HPI_CPU_POOL=4 hpi query ..." 
""" + +from __future__ import annotations + import os from concurrent.futures import ProcessPoolExecutor -from typing import Optional, cast +from typing import cast _NOT_SET = cast(ProcessPoolExecutor, object()) -_INSTANCE: Optional[ProcessPoolExecutor] = _NOT_SET +_INSTANCE: ProcessPoolExecutor | None = _NOT_SET -def get_cpu_pool() -> Optional[ProcessPoolExecutor]: +def get_cpu_pool() -> ProcessPoolExecutor | None: global _INSTANCE if _INSTANCE is _NOT_SET: use_cpu_pool = os.environ.get('HPI_CPU_POOL') diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index cd27a7f..ce14fad 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -1,16 +1,17 @@ """ Various helpers for compression """ + # fmt: off from __future__ import annotations import io import pathlib -import sys +from collections.abc import Iterator, Sequence from datetime import datetime from functools import total_ordering from pathlib import Path -from typing import IO, Any, Iterator, Sequence, Union +from typing import IO, Any, Union PathIsh = Union[Path, str] diff --git a/my/core/cachew.py b/my/core/cachew.py index dc6ed79..9ccee09 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -1,16 +1,18 @@ -from .internal import assert_subpackage; assert_subpackage(__name__) +from __future__ import annotations + +from .internal import assert_subpackage + +assert_subpackage(__name__) import logging import sys +from collections.abc import Iterator from contextlib import contextmanager from pathlib import Path from typing import ( TYPE_CHECKING, Any, Callable, - Iterator, - Optional, - Type, TypeVar, Union, cast, @@ -21,7 +23,6 @@ import appdirs # type: ignore[import-untyped] from . import warnings - PathIsh = Union[str, Path] # avoid circular import from .common @@ -60,12 +61,12 @@ def _appdirs_cache_dir() -> Path: _CACHE_DIR_NONE_HACK = Path('/tmp/hpi/cachew_none_hack') -def cache_dir(suffix: Optional[PathIsh] = None) -> Path: +def cache_dir(suffix: PathIsh | None = None) -> Path: from . import core_config as CC cdir_ = CC.config.get_cache_dir() - sp: Optional[Path] = None + sp: Path | None = None if suffix is not None: sp = Path(suffix) # guess if you do need absolute, better path it directly instead of as suffix? @@ -144,21 +145,19 @@ if TYPE_CHECKING: # we need two versions due to @doublewrap # this is when we just annotate as @cachew without any args @overload # type: ignore[no-overload-impl] - def mcachew(fun: F) -> F: - ... + def mcachew(fun: F) -> F: ... @overload def mcachew( - cache_path: Optional[PathProvider] = ..., + cache_path: PathProvider | None = ..., *, force_file: bool = ..., - cls: Optional[Type] = ..., + cls: type | None = ..., depends_on: HashFunction = ..., - logger: Optional[logging.Logger] = ..., + logger: logging.Logger | None = ..., chunk_by: int = ..., - synthetic_key: Optional[str] = ..., - ) -> Callable[[F], F]: - ... + synthetic_key: str | None = ..., + ) -> Callable[[F], F]: ... else: mcachew = _mcachew_impl diff --git a/my/core/cfg.py b/my/core/cfg.py index a71a7e3..9851443 100644 --- a/my/core/cfg.py +++ b/my/core/cfg.py @@ -3,28 +3,32 @@ from __future__ import annotations import importlib import re import sys +from collections.abc import Iterator from contextlib import ExitStack, contextmanager -from typing import Any, Callable, Dict, Iterator, Optional, Type, TypeVar +from typing import Any, Callable, TypeVar -Attrs = Dict[str, Any] +Attrs = dict[str, Any] C = TypeVar('C') + # todo not sure about it, could be overthinking... 
# but short enough to change later # TODO document why it's necessary? -def make_config(cls: Type[C], migration: Callable[[Attrs], Attrs]=lambda x: x) -> C: +def make_config(cls: type[C], migration: Callable[[Attrs], Attrs] = lambda x: x) -> C: user_config = cls.__base__ old_props = { # NOTE: deliberately use gettatr to 'force' class properties here - k: getattr(user_config, k) for k in vars(user_config) + k: getattr(user_config, k) + for k in vars(user_config) } new_props = migration(old_props) from dataclasses import fields + params = { k: v for k, v in new_props.items() - if k in {f.name for f in fields(cls)} # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115 + if k in {f.name for f in fields(cls)} # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115 } # todo maybe return type here? return cls(**params) @@ -51,6 +55,8 @@ def _override_config(config: F) -> Iterator[F]: ModuleRegex = str + + @contextmanager def _reload_modules(modules: ModuleRegex) -> Iterator[None]: # need to use list here, otherwise reordering with set might mess things up @@ -81,13 +87,14 @@ def _reload_modules(modules: ModuleRegex) -> Iterator[None]: @contextmanager -def tmp_config(*, modules: Optional[ModuleRegex]=None, config=None): +def tmp_config(*, modules: ModuleRegex | None = None, config=None): if modules is None: assert config is None if modules is not None: assert config is not None import my.config + with ExitStack() as module_reload_stack, _override_config(my.config) as new_config: if config is not None: overrides = {k: v for k, v in vars(config).items() if not k.startswith('__')} @@ -102,6 +109,7 @@ def tmp_config(*, modules: Optional[ModuleRegex]=None, config=None): def test_tmp_config() -> None: class extra: data_path = '/path/to/data' + with tmp_config() as c: assert c.google != 'whatever' assert not hasattr(c, 'extra') diff --git a/my/core/common.py b/my/core/common.py index a2c2ad3..91fe9bd 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -1,20 +1,18 @@ +from __future__ import annotations + import os +from collections.abc import Iterable, Sequence from glob import glob as do_glob from pathlib import Path from typing import ( TYPE_CHECKING, Callable, Generic, - Iterable, - List, - Sequence, - Tuple, TypeVar, Union, ) -from . import compat -from . import warnings +from . import compat, warnings # some helper functions # TODO start deprecating this? soon we'd be able to use Path | str syntax which is shorter and more explicit @@ -24,20 +22,22 @@ Paths = Union[Sequence[PathIsh], PathIsh] DEFAULT_GLOB = '*' + + def get_files( pp: Paths, - glob: str=DEFAULT_GLOB, + glob: str = DEFAULT_GLOB, *, - sort: bool=True, - guess_compression: bool=True, -) -> Tuple[Path, ...]: + sort: bool = True, + guess_compression: bool = True, +) -> tuple[Path, ...]: """ Helper function to avoid boilerplate. Tuple as return type is a bit friendlier for hashing/caching, so hopefully makes sense """ # TODO FIXME mm, some wrapper to assert iterator isn't empty? - sources: List[Path] + sources: list[Path] if isinstance(pp, Path): sources = [pp] elif isinstance(pp, str): @@ -54,7 +54,7 @@ def get_files( # TODO ugh. very flaky... -3 because [, get_files(), ] return traceback.extract_stack()[-3].filename - paths: List[Path] = [] + paths: list[Path] = [] for src in sources: if src.parts[0] == '~': src = src.expanduser() @@ -64,7 +64,7 @@ def get_files( if glob != DEFAULT_GLOB: warnings.medium(f"{caller()}: treating {gs} as glob path. 
Explicit glob={glob} argument is ignored!") paths.extend(map(Path, do_glob(gs))) - elif os.path.isdir(str(src)): + elif os.path.isdir(str(src)): # noqa: PTH112 # NOTE: we're using os.path here on purpose instead of src.is_dir # the reason is is_dir for archives might return True and then # this clause would try globbing insize the archives @@ -234,16 +234,14 @@ if not TYPE_CHECKING: return types.asdict(*args, **kwargs) # todo wrap these in deprecated decorator as well? + # TODO hmm how to deprecate these in runtime? + # tricky cause they are actually classes/types + from typing import Literal # noqa: F401 + from .cachew import mcachew # noqa: F401 # this is kinda internal, should just use my.core.logging.setup_logger if necessary from .logging import setup_logger - - # TODO hmm how to deprecate these in runtime? - # tricky cause they are actually classes/types - - from typing import Literal # noqa: F401 - from .stats import Stats from .types import ( Json, diff --git a/my/core/compat.py b/my/core/compat.py index 3273ff4..8f719a8 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -3,6 +3,8 @@ Contains backwards compatibility helpers for different python versions. If something is relevant to HPI itself, please put it in .hpi_compat instead ''' +from __future__ import annotations + import sys from typing import TYPE_CHECKING @@ -29,6 +31,7 @@ if not TYPE_CHECKING: @deprecated('use .removesuffix method on string directly instead') def removesuffix(text: str, suffix: str) -> str: return text.removesuffix(suffix) + ## ## used to have compat function before 3.8 for these, keeping for runtime back compatibility @@ -46,13 +49,13 @@ else: # bisect_left doesn't have a 'key' parameter (which we use) # till python3.10 if sys.version_info[:2] <= (3, 9): - from typing import Any, Callable, List, Optional, TypeVar + from typing import Any, Callable, List, Optional, TypeVar # noqa: UP035 X = TypeVar('X') # copied from python src # fmt: off - def bisect_left(a: List[Any], x: Any, lo: int=0, hi: Optional[int]=None, *, key: Optional[Callable[..., Any]]=None) -> int: + def bisect_left(a: list[Any], x: Any, lo: int=0, hi: int | None=None, *, key: Callable[..., Any] | None=None) -> int: if lo < 0: raise ValueError('lo must be non-negative') if hi is None: diff --git a/my/core/core_config.py b/my/core/core_config.py index 9036971..3f26c03 100644 --- a/my/core/core_config.py +++ b/my/core/core_config.py @@ -2,18 +2,21 @@ Bindings for the 'core' HPI configuration ''' +from __future__ import annotations + import re +from collections.abc import Sequence from dataclasses import dataclass from pathlib import Path -from typing import Optional, Sequence -from . import PathIsh, warnings +from . import warnings try: from my.config import core as user_config # type: ignore[attr-defined] except Exception as e: try: from my.config import common as user_config # type: ignore[attr-defined] + warnings.high("'common' config section is deprecated. Please rename it to 'core'.") except Exception as e2: # make it defensive, because it's pretty commonly used and would be annoying if it breaks hpi doctor etc. @@ -24,6 +27,7 @@ except Exception as e: _HPI_CACHE_DIR_DEFAULT = '' + @dataclass class Config(user_config): ''' @@ -34,7 +38,7 @@ class Config(user_config): cache_dir = '/your/custom/cache/path' ''' - cache_dir: Optional[PathIsh] = _HPI_CACHE_DIR_DEFAULT + cache_dir: Path | str | None = _HPI_CACHE_DIR_DEFAULT ''' Base directory for cachew. 
- if None , means cache is disabled @@ -44,7 +48,7 @@ class Config(user_config): NOTE: you shouldn't use this attribute in HPI modules directly, use Config.get_cache_dir()/cachew.cache_dir() instead ''' - tmp_dir: Optional[PathIsh] = None + tmp_dir: Path | str | None = None ''' Path to a temporary directory. This can be used temporarily while extracting zipfiles etc... @@ -52,34 +56,36 @@ class Config(user_config): - otherwise , use the specified directory as the base temporary directory ''' - enabled_modules : Optional[Sequence[str]] = None + enabled_modules: Sequence[str] | None = None ''' list of regexes/globs - None means 'rely on disabled_modules' ''' - disabled_modules: Optional[Sequence[str]] = None + disabled_modules: Sequence[str] | None = None ''' list of regexes/globs - None means 'rely on enabled_modules' ''' - def get_cache_dir(self) -> Optional[Path]: + def get_cache_dir(self) -> Path | None: cdir = self.cache_dir if cdir is None: return None if cdir == _HPI_CACHE_DIR_DEFAULT: from .cachew import _appdirs_cache_dir + return _appdirs_cache_dir() else: return Path(cdir).expanduser() def get_tmp_dir(self) -> Path: - tdir: Optional[PathIsh] = self.tmp_dir + tdir: Path | str | None = self.tmp_dir tpath: Path # use tempfile if unset if tdir is None: import tempfile + tpath = Path(tempfile.gettempdir()) / 'HPI' else: tpath = Path(tdir) @@ -87,10 +93,10 @@ class Config(user_config): tpath.mkdir(parents=True, exist_ok=True) return tpath - def _is_module_active(self, module: str) -> Optional[bool]: + def _is_module_active(self, module: str) -> bool | None: # None means the config doesn't specify anything # todo might be nice to return the 'reason' too? e.g. which option has matched - def matches(specs: Sequence[str]) -> Optional[str]: + def matches(specs: Sequence[str]) -> str | None: for spec in specs: # not sure because . (packages separate) matches anything, but I guess unlikely to clash if re.match(spec, module): @@ -106,10 +112,10 @@ class Config(user_config): return None else: return False - else: # not None + else: # not None if off is None: return True - else: # not None + else: # not None # fallback onto the 'enable everything', then the user will notice warnings.medium(f"[module]: conflicting regexes '{on}' and '{off}' are set in the config. 
Please only use one of them.") return True @@ -121,8 +127,8 @@ config = make_config(Config) ### tests start +from collections.abc import Iterator from contextlib import contextmanager as ctx -from typing import Iterator @ctx @@ -163,4 +169,5 @@ def test_active_modules() -> None: assert cc._is_module_active("my.body.exercise") is True assert len(record_warnings) == 1 + ### tests end diff --git a/my/core/denylist.py b/my/core/denylist.py index 92faf2c..c92f9a0 100644 --- a/my/core/denylist.py +++ b/my/core/denylist.py @@ -5,23 +5,25 @@ A helper module for defining denylists for sources programmatically For docs, see doc/DENYLIST.md """ +from __future__ import annotations + import functools import json import sys from collections import defaultdict +from collections.abc import Iterator, Mapping from pathlib import Path -from typing import Any, Dict, Iterator, List, Mapping, Set, TypeVar +from typing import Any, TypeVar import click from more_itertools import seekable -from my.core.common import PathIsh -from my.core.serialize import dumps -from my.core.warnings import medium +from .serialize import dumps +from .warnings import medium T = TypeVar("T") -DenyMap = Mapping[str, Set[Any]] +DenyMap = Mapping[str, set[Any]] def _default_key_func(obj: T) -> str: @@ -29,9 +31,9 @@ def _default_key_func(obj: T) -> str: class DenyList: - def __init__(self, denylist_file: PathIsh): + def __init__(self, denylist_file: Path | str) -> None: self.file = Path(denylist_file).expanduser().absolute() - self._deny_raw_list: List[Dict[str, Any]] = [] + self._deny_raw_list: list[dict[str, Any]] = [] self._deny_map: DenyMap = defaultdict(set) # deny cli, user can override these @@ -45,7 +47,7 @@ class DenyList: return deny_map: DenyMap = defaultdict(set) - data: List[Dict[str, Any]]= json.loads(self.file.read_text()) + data: list[dict[str, Any]] = json.loads(self.file.read_text()) self._deny_raw_list = data for ignore in data: @@ -112,7 +114,7 @@ class DenyList: self._load() self._deny_raw({key: self._stringify_value(value)}, write=write) - def _deny_raw(self, data: Dict[str, Any], *, write: bool = False) -> None: + def _deny_raw(self, data: dict[str, Any], *, write: bool = False) -> None: self._deny_raw_list.append(data) if write: self.write() @@ -131,7 +133,7 @@ class DenyList: def _deny_cli_remember( self, items: Iterator[T], - mem: Dict[str, T], + mem: dict[str, T], ) -> Iterator[str]: keyf = self._deny_cli_key_func or _default_key_func # i.e., convert each item to a string, and map str -> item @@ -157,10 +159,8 @@ class DenyList: # reset the iterator sit.seek(0) # so we can map the selected string from fzf back to the original objects - memory_map: Dict[str, T] = {} - picker = FzfPrompt( - executable_path=self.fzf_path, default_options="--no-multi" - ) + memory_map: dict[str, T] = {} + picker = FzfPrompt(executable_path=self.fzf_path, default_options="--no-multi") picked_l = picker.prompt( self._deny_cli_remember(itr, memory_map), "--read0", diff --git a/my/core/discovery_pure.py b/my/core/discovery_pure.py index b753de8..18a19c4 100644 --- a/my/core/discovery_pure.py +++ b/my/core/discovery_pure.py @@ -10,6 +10,8 @@ This potentially allows it to be: It should be free of external modules, importlib, exec, etc. etc. 
''' +from __future__ import annotations + REQUIRES = 'REQUIRES' NOT_HPI_MODULE_VAR = '__NOT_HPI_MODULE__' @@ -19,8 +21,9 @@ import ast import logging import os import re +from collections.abc import Iterable, Sequence from pathlib import Path -from typing import Any, Iterable, List, NamedTuple, Optional, Sequence, cast +from typing import Any, NamedTuple, Optional, cast ''' None means that requirements weren't defined (different from empty requirements) @@ -30,11 +33,11 @@ Requires = Optional[Sequence[str]] class HPIModule(NamedTuple): name: str - skip_reason: Optional[str] - doc: Optional[str] = None - file: Optional[Path] = None + skip_reason: str | None + doc: str | None = None + file: Path | None = None requires: Requires = None - legacy: Optional[str] = None # contains reason/deprecation warning + legacy: str | None = None # contains reason/deprecation warning def ignored(m: str) -> bool: @@ -55,13 +58,13 @@ def has_stats(src: Path) -> bool: def _has_stats(code: str) -> bool: a: ast.Module = ast.parse(code) for x in a.body: - try: # maybe assign + try: # maybe assign [tg] = cast(Any, x).targets if tg.id == 'stats': return True except: pass - try: # maybe def? + try: # maybe def? name = cast(Any, x).name if name == 'stats': return True @@ -144,7 +147,7 @@ def all_modules() -> Iterable[HPIModule]: def _iter_my_roots() -> Iterable[Path]: import my # doesn't import any code, because of namespace package - paths: List[str] = list(my.__path__) + paths: list[str] = list(my.__path__) if len(paths) == 0: # should probably never happen?, if this code is running, it was imported # because something was added to __path__ to match this name diff --git a/my/core/error.py b/my/core/error.py index ed26dda..b308869 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -3,19 +3,16 @@ Various error handling helpers See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail """ +from __future__ import annotations + import traceback +from collections.abc import Iterable, Iterator from datetime import datetime from itertools import tee from typing import ( Any, Callable, - Iterable, - Iterator, - List, Literal, - Optional, - Tuple, - Type, TypeVar, Union, cast, @@ -33,7 +30,7 @@ Res = ResT[T, Exception] ErrorPolicy = Literal["yield", "raise", "drop"] -def notnone(x: Optional[T]) -> T: +def notnone(x: T | None) -> T: assert x is not None return x @@ -60,13 +57,15 @@ def raise_exceptions(itr: Iterable[Res[T]]) -> Iterator[T]: yield o -def warn_exceptions(itr: Iterable[Res[T]], warn_func: Optional[Callable[[Exception], None]] = None) -> Iterator[T]: +def warn_exceptions(itr: Iterable[Res[T]], warn_func: Callable[[Exception], None] | None = None) -> Iterator[T]: # if not provided, use the 'warnings' module if warn_func is None: from my.core.warnings import medium + def _warn_func(e: Exception) -> None: # TODO: print traceback? but user could always --raise-exceptions as well medium(str(e)) + warn_func = _warn_func for o in itr: @@ -81,7 +80,7 @@ def echain(ex: E, cause: Exception) -> E: return ex -def split_errors(l: Iterable[ResT[T, E]], ET: Type[E]) -> Tuple[Iterable[T], Iterable[E]]: +def split_errors(l: Iterable[ResT[T, E]], ET: type[E]) -> tuple[Iterable[T], Iterable[E]]: # TODO would be nice to have ET=Exception default? but it causes some mypy complaints? vit, eit = tee(l) # TODO ugh, not sure if I can reconcile type checking and runtime and convince mypy that ET and E are the same type? 
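These error helpers all revolve around Res[T] = Union[T, Exception]: providers yield errors inline instead of raising, and callers pick a policy at the edges. A tiny sketch using the isinstance/unwrap idioms:

    from my.core.error import Res, unwrap

    def provider() -> list[Res[int]]:
        return [1, RuntimeError('boom'), 3]

    good = [x for x in provider() if not isinstance(x, Exception)]
    assert good == [1, 3]
    assert unwrap(provider()[0]) == 1  # unwrap would raise if handed the RuntimeError
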
@@ -99,7 +98,9 @@ def split_errors(l: Iterable[ResT[T, E]], ET: Type[E]) -> Tuple[Iterable[T], Ite K = TypeVar('K') -def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> List[Res[T]]: + + +def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> list[Res[T]]: """ Sort a sequence potentially interleaved with errors/entries on which the key can't be computed. The general idea is: the error sticks to the non-error entry that follows it @@ -107,7 +108,7 @@ def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> List[Res[T] group = [] groups = [] for i in items: - k: Optional[K] + k: K | None try: k = key(i) except Exception: # error white computing key? dunno, might be nice to handle... @@ -117,10 +118,10 @@ def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> List[Res[T] groups.append((k, group)) group = [] - results: List[Res[T]] = [] - for _v, grp in sorted(groups, key=lambda p: p[0]): # type: ignore[return-value, arg-type] # TODO SupportsLessThan?? + results: list[Res[T]] = [] + for _v, grp in sorted(groups, key=lambda p: p[0]): # type: ignore[return-value, arg-type] # TODO SupportsLessThan?? results.extend(grp) - results.extend(group) # handle last group (it will always be errors only) + results.extend(group) # handle last group (it will always be errors only) return results @@ -162,20 +163,20 @@ def test_sort_res_by() -> None: # helpers to associate timestamps with the errors (so something meaningful could be displayed on the plots, for example) # todo document it under 'patterns' somewhere... # todo proper typevar? -def set_error_datetime(e: Exception, dt: Optional[datetime]) -> None: +def set_error_datetime(e: Exception, dt: datetime | None) -> None: if dt is None: return e.args = (*e.args, dt) # todo not sure if should return new exception? 
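set_error_datetime above smuggles a timestamp through the exception's args tuple, and extract_error_datetime recovers it by scanning args from the end, which is what lets errors appear at a meaningful position on plots. A round-trip sketch:

    from datetime import datetime

    from my.core.error import extract_error_datetime, set_error_datetime

    e = RuntimeError('parse failed')
    set_error_datetime(e, datetime(2024, 10, 19))  # appends the dt to e.args
    assert extract_error_datetime(e) == datetime(2024, 10, 19)
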
-def attach_dt(e: Exception, *, dt: Optional[datetime]) -> Exception: +def attach_dt(e: Exception, *, dt: datetime | None) -> Exception: set_error_datetime(e, dt) return e # todo it might be problematic because might mess with timezones (when it's converted to string, it's converted to a shift) -def extract_error_datetime(e: Exception) -> Optional[datetime]: +def extract_error_datetime(e: Exception) -> datetime | None: import re for x in reversed(e.args): @@ -201,10 +202,10 @@ MODULE_SETUP_URL = 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#p def warn_my_config_import_error( - err: Union[ImportError, AttributeError], + err: ImportError | AttributeError, *, - help_url: Optional[str] = None, - module_name: Optional[str] = None, + help_url: str | None = None, + module_name: str | None = None, ) -> bool: """ If the user tried to import something from my.config but it failed, @@ -265,7 +266,7 @@ def test_datetime_errors() -> None: import pytz # noqa: I001 dt_notz = datetime.now() - dt_tz = datetime.now(tz=pytz.timezone('Europe/Amsterdam')) + dt_tz = datetime.now(tz=pytz.timezone('Europe/Amsterdam')) for dt in [dt_tz, dt_notz]: e1 = RuntimeError('whatever') assert extract_error_datetime(e1) is None diff --git a/my/core/experimental.py b/my/core/experimental.py index 1a78272..0a1c3b4 100644 --- a/my/core/experimental.py +++ b/my/core/experimental.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import sys import types -from typing import Any, Dict, Optional +from typing import Any # The idea behind this one is to support accessing "overlaid/shadowed" modules from namespace packages @@ -20,7 +22,7 @@ def import_original_module( file: str, *, star: bool = False, - globals: Optional[Dict[str, Any]] = None, + globals: dict[str, Any] | None = None, ) -> types.ModuleType: module_to_restore = sys.modules[module_name] diff --git a/my/core/freezer.py b/my/core/freezer.py index 93bceb7..4fb0e25 100644 --- a/my/core/freezer.py +++ b/my/core/freezer.py @@ -1,29 +1,29 @@ -from .internal import assert_subpackage; assert_subpackage(__name__) +from __future__ import annotations -import dataclasses as dcl +from .internal import assert_subpackage + +assert_subpackage(__name__) + +import dataclasses import inspect -from typing import Any, Type, TypeVar +from typing import Any, Generic, TypeVar D = TypeVar('D') -def _freeze_dataclass(Orig: Type[D]): - ofields = [(f.name, f.type, f) for f in dcl.fields(Orig)] # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115 +def _freeze_dataclass(Orig: type[D]): + ofields = [(f.name, f.type, f) for f in dataclasses.fields(Orig)] # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115 # extract properties along with their types - props = list(inspect.getmembers(Orig, lambda o: isinstance(o, property))) + props = list(inspect.getmembers(Orig, lambda o: isinstance(o, property))) pfields = [(name, inspect.signature(getattr(prop, 'fget')).return_annotation) for name, prop in props] # FIXME not sure about name? # NOTE: sadly passing bases=[Orig] won't work, python won't let us override properties with fields - RRR = dcl.make_dataclass('RRR', fields=[*ofields, *pfields]) + RRR = dataclasses.make_dataclass('RRR', fields=[*ofields, *pfields]) # todo maybe even declare as slots? return props, RRR -# todo need some decorator thingie? -from typing import Generic - - class Freezer(Generic[D]): ''' Some magic which converts dataclass properties into fields. 
@@ -31,13 +31,13 @@ class Freezer(Generic[D]): For now only supports dataclasses. ''' - def __init__(self, Orig: Type[D]) -> None: + def __init__(self, Orig: type[D]) -> None: self.Orig = Orig self.props, self.Frozen = _freeze_dataclass(Orig) def freeze(self, value: D) -> D: pvalues = {name: getattr(value, name) for name, _ in self.props} - return self.Frozen(**dcl.asdict(value), **pvalues) # type: ignore[call-overload] # see https://github.com/python/typing_extensions/issues/115 + return self.Frozen(**dataclasses.asdict(value), **pvalues) # type: ignore[call-overload] # see https://github.com/python/typing_extensions/issues/115 ### tests @@ -45,7 +45,7 @@ class Freezer(Generic[D]): # this needs to be defined here to prevent a mypy bug # see https://github.com/python/mypy/issues/7281 -@dcl.dataclass +@dataclasses.dataclass class _A: x: Any @@ -71,6 +71,7 @@ def test_freezer() -> None: assert fd['typed'] == 123 assert fd['untyped'] == [1, 2, 3] + ### # TODO shit. what to do with exceptions? diff --git a/my/core/hpi_compat.py b/my/core/hpi_compat.py index 949046d..3687483 100644 --- a/my/core/hpi_compat.py +++ b/my/core/hpi_compat.py @@ -3,11 +3,14 @@ Contains various backwards compatibility/deprecation helpers relevant to HPI its (as opposed to .compat module which implements compatibility between python versions) """ +from __future__ import annotations + import inspect import os import re +from collections.abc import Iterator, Sequence from types import ModuleType -from typing import Iterator, List, Optional, Sequence, TypeVar +from typing import TypeVar from . import warnings @@ -15,7 +18,7 @@ from . import warnings def handle_legacy_import( parent_module_name: str, legacy_submodule_name: str, - parent_module_path: List[str], + parent_module_path: list[str], ) -> bool: ### # this is to trick mypy into treating this as a proper namespace package @@ -122,8 +125,8 @@ class always_supports_sequence(Iterator[V]): def __init__(self, it: Iterator[V]) -> None: self._it = it - self._list: Optional[List[V]] = None - self._lit: Optional[Iterator[V]] = None + self._list: list[V] | None = None + self._lit: Iterator[V] | None = None def __iter__(self) -> Iterator[V]: # noqa: PYI034 if self._list is not None: @@ -142,7 +145,7 @@ class always_supports_sequence(Iterator[V]): return getattr(self._it, name) @property - def _aslist(self) -> List[V]: + def _aslist(self) -> list[V]: if self._list is None: qualname = getattr(self._it, '__qualname__', '') # defensive just in case warnings.medium(f'Using {qualname} as list is deprecated. Migrate to iterative processing or call list() explicitly.') diff --git a/my/core/influxdb.py b/my/core/influxdb.py index 25eeba1..78a439a 100644 --- a/my/core/influxdb.py +++ b/my/core/influxdb.py @@ -2,9 +2,14 @@ TODO doesn't really belong to 'core' morally, but can think of moving out later ''' -from .internal import assert_subpackage; assert_subpackage(__name__) +from __future__ import annotations -from typing import Any, Dict, Iterable, Optional +from .internal import assert_subpackage + +assert_subpackage(__name__) + +from collections.abc import Iterable +from typing import Any import click @@ -21,7 +26,7 @@ class config: RESET_DEFAULT = False -def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_col: str='dt') -> None: +def fill(it: Iterable[Any], *, measurement: str, reset: bool = RESET_DEFAULT, dt_col: str = 'dt') -> None: # todo infer dt column automatically, reuse in stat? # it doesn't like dots, ends up some syntax error? 
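Stepping back to Freezer above: it generates a new dataclass whose fields are the original fields plus materialized @property values, which helps when downstream consumers (serialization, cachew) only look at fields. A sketch along the lines of the test in this patch:

    import dataclasses

    from my.core.freezer import Freezer

    @dataclasses.dataclass
    class _B:
        x: int

        @property
        def typed(self) -> int:
            return self.x + 100

    frozen = Freezer(_B).freeze(_B(x=23))
    assert frozen.typed == 123                    # the property is now a plain field
    assert dataclasses.asdict(frozen)['x'] == 23  # original fields are carried over
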
measurement = measurement.replace('.', '_') @@ -30,6 +35,7 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c db = config.db from influxdb import InfluxDBClient # type: ignore + client = InfluxDBClient() # todo maybe create if not exists? # client.create_database(db) @@ -40,7 +46,7 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c client.delete_series(database=db, measurement=measurement) # TODO need to take schema here... - cache: Dict[str, bool] = {} + cache: dict[str, bool] = {} def good(f, v) -> bool: c = cache.get(f) @@ -59,9 +65,9 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c def dit() -> Iterable[Json]: for i in it: d = asdict(i) - tags: Optional[Json] = None - tags_ = d.get('tags') # meh... handle in a more robust manner - if tags_ is not None and isinstance(tags_, dict): # FIXME meh. + tags: Json | None = None + tags_ = d.get('tags') # meh... handle in a more robust manner + if tags_ is not None and isinstance(tags_, dict): # FIXME meh. del d['tags'] tags = tags_ @@ -84,6 +90,7 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c } from more_itertools import chunked + # "The optimal batch size is 5000 lines of line protocol." # some chunking is def necessary, otherwise it fails inserted = 0 @@ -97,9 +104,9 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool=RESET_DEFAULT, dt_c # todo "Specify timestamp precision when writing to InfluxDB."? -def magic_fill(it, *, name: Optional[str]=None, reset: bool=RESET_DEFAULT) -> None: +def magic_fill(it, *, name: str | None = None, reset: bool = RESET_DEFAULT) -> None: if name is None: - assert callable(it) # generators have no name/module + assert callable(it) # generators have no name/module name = f'{it.__module__}:{it.__name__}' assert name is not None @@ -109,6 +116,7 @@ def magic_fill(it, *, name: Optional[str]=None, reset: bool=RESET_DEFAULT) -> No from itertools import tee from more_itertools import first, one + it, x = tee(it) f = first(x, default=None) if f is None: @@ -118,9 +126,11 @@ def magic_fill(it, *, name: Optional[str]=None, reset: bool=RESET_DEFAULT) -> No # TODO can we reuse pandas code or something? # from .pandas import _as_columns + schema = _as_columns(type(f)) from datetime import datetime + dtex = RuntimeError(f'expected single datetime field. schema: {schema}') dtf = one((f for f, t in schema.items() if t == datetime), too_short=dtex, too_long=dtex) @@ -137,6 +147,7 @@ def main() -> None: @click.argument('FUNCTION_NAME', type=str, required=True) def populate(*, function_name: str, reset: bool) -> None: from .__main__ import _locate_functions_or_prompt + [provider] = list(_locate_functions_or_prompt([function_name])) # todo could have a non-interactive version which populates from all data sources for the provider? magic_fill(provider, reset=reset) diff --git a/my/core/init.py b/my/core/init.py index 7a30955..644c7b4 100644 --- a/my/core/init.py +++ b/my/core/init.py @@ -19,6 +19,7 @@ def setup_config() -> None: from pathlib import Path from .preinit import get_mycfg_dir + mycfg_dir = get_mycfg_dir() if not mycfg_dir.exists(): @@ -43,6 +44,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-mo except ImportError as ex: # just in case... who knows what crazy setup users have import logging + logging.exception(ex) warnings.warn(f""" Importing 'my.config' failed! (error: {ex}). This is likely to result in issues. 
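fill() above writes points in batches via more_itertools.chunked, following the InfluxDB guidance quoted in its comment ("the optimal batch size is 5000 lines of line protocol"). The batching skeleton, with the actual client call stubbed out since it needs a live database:

    from more_itertools import chunked

    points = ({'measurement': 'sleep', 'fields': {'n': i}} for i in range(12_345))

    inserted = 0
    for chunk in chunked(points, 5000):
        # client.write_points(chunk, database=db)  # as in fill()
        inserted += len(chunk)
    print(f'inserted {inserted} points')  # three batches: 5000 + 5000 + 2345
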
diff --git a/my/core/kompress.py b/my/core/kompress.py index 7cbf310..8accb2d 100644 --- a/my/core/kompress.py +++ b/my/core/kompress.py @@ -1,4 +1,6 @@ -from .internal import assert_subpackage; assert_subpackage(__name__) +from .internal import assert_subpackage + +assert_subpackage(__name__) from . import warnings diff --git a/my/core/konsume.py b/my/core/konsume.py index 0e4a2fe..6d24167 100644 --- a/my/core/konsume.py +++ b/my/core/konsume.py @@ -5,17 +5,21 @@ This can potentially allow both for safer defensive parsing, and let you know if TODO perhaps need to get some inspiration from linear logic to decide on a nice API... ''' +from __future__ import annotations + from collections import OrderedDict -from typing import Any, List +from typing import Any def ignore(w, *keys): for k in keys: w[k].ignore() + def zoom(w, *keys): return [w[k].zoom() for k in keys] + # TODO need to support lists class Zoomable: def __init__(self, parent, *args, **kwargs) -> None: @@ -40,7 +44,7 @@ class Zoomable: assert self.parent is not None self.parent._remove(self) - def zoom(self) -> 'Zoomable': + def zoom(self) -> Zoomable: self.consume() return self @@ -63,6 +67,7 @@ class Wdict(Zoomable, OrderedDict): def this_consumed(self): return len(self) == 0 + # TODO specify mypy type for the index special method? @@ -77,6 +82,7 @@ class Wlist(Zoomable, list): def this_consumed(self): return len(self) == 0 + class Wvalue(Zoomable): def __init__(self, parent, value: Any) -> None: super().__init__(parent) @@ -87,23 +93,20 @@ class Wvalue(Zoomable): return [] def this_consumed(self): - return True # TODO not sure.. + return True # TODO not sure.. def __repr__(self): return 'WValue{' + repr(self.value) + '}' -from typing import Tuple - - -def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]: +def _wrap(j, parent=None) -> tuple[Zoomable, list[Zoomable]]: res: Zoomable - cc: List[Zoomable] + cc: list[Zoomable] if isinstance(j, dict): res = Wdict(parent) cc = [res] for k, v in j.items(): - vv, c = _wrap(v, parent=res) + vv, c = _wrap(v, parent=res) res[k] = vv cc.extend(c) return res, cc @@ -122,13 +125,14 @@ def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]: raise RuntimeError(f'Unexpected type: {type(j)} {j}') +from collections.abc import Iterator from contextlib import contextmanager -from typing import Iterator class UnconsumedError(Exception): pass + # TODO think about error policy later... @contextmanager def wrap(j, *, throw=True) -> Iterator[Zoomable]: @@ -137,7 +141,7 @@ def wrap(j, *, throw=True) -> Iterator[Zoomable]: yield w for c in children: - if not c.this_consumed(): # TODO hmm. how does it figure out if it's consumed??? + if not c.this_consumed(): # TODO hmm. how does it figure out if it's consumed??? if throw: # TODO need to keep a full path or something... 
raise UnconsumedError(f''' @@ -153,6 +157,7 @@ from typing import cast def test_unconsumed() -> None: import pytest + with pytest.raises(UnconsumedError): with wrap({'a': 1234}) as w: w = cast(Wdict, w) @@ -163,6 +168,7 @@ def test_unconsumed() -> None: w = cast(Wdict, w) d = w['c']['d'].zoom() + def test_consumed() -> None: with wrap({'a': 1234}) as w: w = cast(Wdict, w) @@ -173,6 +179,7 @@ def test_consumed() -> None: c = w['c'].zoom() d = c['d'].zoom() + def test_types() -> None: # (string, number, object, array, boolean or nul with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w: @@ -181,9 +188,10 @@ def test_types() -> None: w['number'].consume() w['boolean'].zoom() w['null'].zoom() - for x in list(w['list'].zoom()): # TODO eh. how to avoid the extra list thing? + for x in list(w['list'].zoom()): # TODO eh. how to avoid the extra list thing? x.consume() + def test_consume_all() -> None: with wrap({'aaa': {'bbb': {'hi': 123}}}) as w: w = cast(Wdict, w) @@ -193,11 +201,9 @@ def test_consume_all() -> None: def test_consume_few() -> None: import pytest + pytest.skip('Will think about it later..') - with wrap({ - 'important': 123, - 'unimportant': 'whatever' - }) as w: + with wrap({'important': 123, 'unimportant': 'whatever'}) as w: w = cast(Wdict, w) w['important'].zoom() w.consume_all() @@ -206,6 +212,7 @@ def test_consume_few() -> None: def test_zoom() -> None: import pytest + with wrap({'aaa': 'whatever'}) as w: w = cast(Wdict, w) with pytest.raises(KeyError): diff --git a/my/core/mime.py b/my/core/mime.py index cf5bdf5..8235960 100644 --- a/my/core/mime.py +++ b/my/core/mime.py @@ -2,11 +2,14 @@ Utils for mime/filetype handling """ -from .internal import assert_subpackage; assert_subpackage(__name__) +from __future__ import annotations + +from .internal import assert_subpackage + +assert_subpackage(__name__) import functools - -from .common import PathIsh +from pathlib import Path @functools.lru_cache(1) @@ -23,7 +26,7 @@ import mimetypes # todo do I need init()? # todo wtf? fastermime thinks it's mime is application/json even if the extension is xz?? # whereas magic detects correctly: application/x-zstd and application/x-xz -def fastermime(path: PathIsh) -> str: +def fastermime(path: Path | str) -> str: paths = str(path) # mimetypes is faster, so try it first (mime, _) = mimetypes.guess_type(paths) diff --git a/my/core/orgmode.py b/my/core/orgmode.py index 979f288..96c09a4 100644 --- a/my/core/orgmode.py +++ b/my/core/orgmode.py @@ -1,6 +1,7 @@ """ Various helpers for reading org-mode data """ + from datetime import datetime @@ -22,17 +23,20 @@ def parse_org_datetime(s: str) -> datetime: # TODO I guess want to borrow inspiration from bs4? 
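The konsume tests above pin down the contract: every node wrapped by wrap() must be consumed (via zoom()/consume()) by the time the context exits, otherwise UnconsumedError fires, and that is how unhandled JSON fields surface during defensive parsing. Minimal usage sketch:

    from typing import cast

    from my.core.konsume import UnconsumedError, Wdict, wrap

    try:
        with wrap({'handled': 1, 'forgotten': 2}) as w:
            w = cast(Wdict, w)
            w['handled'].zoom()  # consumed
            # 'forgotten' is left untouched, so the context manager raises on exit
    except UnconsumedError as e:
        print('unhandled fields:', e)
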
element type <-> tag; and similar logic for find_one, find_all
-from typing import Callable, Iterable, TypeVar
+from collections.abc import Iterable
+from typing import Callable, TypeVar

 from orgparse import OrgNode

 V = TypeVar('V')

+
 def collect(n: OrgNode, cfun: Callable[[OrgNode], Iterable[V]]) -> Iterable[V]:
     yield from cfun(n)
     for c in n.children:
         yield from collect(c, cfun)

+
 from more_itertools import one
 from orgparse.extra import Table

@@ -46,7 +50,7 @@ class TypedTable(Table):
         tt = super().__new__(TypedTable)
         tt.__dict__ = orig.__dict__
         blocks = list(orig.blocks)
-        header = blocks[0] # fist block is schema
+        header = blocks[0]  # first block is schema
         if len(header) == 2:
             # TODO later interpret first line as types
             header = header[1:]
diff --git a/my/core/pandas.py b/my/core/pandas.py
index 8f5fd29..d444965 100644
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@@ -7,17 +7,14 @@ from __future__ import annotations
 # todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
 # NOTE: this file is meant to be importable without Pandas installed
 import dataclasses
+from collections.abc import Iterable, Iterator
 from datetime import datetime, timezone
 from pprint import pformat
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
-    Dict,
-    Iterable,
-    Iterator,
     Literal,
-    Type,
     TypeVar,
 )

@@ -178,7 +175,7 @@ def _to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:

 Schema = Any

-def _as_columns(s: Schema) -> Dict[str, Type]:
+def _as_columns(s: Schema) -> dict[str, type]:
     # todo would be nice to extract properties; add tests for this as well
     if dataclasses.is_dataclass(s):
         return {f.name: f.type for f in dataclasses.fields(s)}  # type: ignore[misc]  # ugh, why mypy thinks f.type can return str??
diff --git a/my/core/preinit.py b/my/core/preinit.py
index be5477b..eb3a34f 100644
--- a/my/core/preinit.py
+++ b/my/core/preinit.py
@@ -8,6 +8,7 @@ def get_mycfg_dir() -> Path:
     import os
     import appdirs  # type: ignore[import-untyped]
+
     # not sure if that's necessary, i.e. could rely on PYTHONPATH instead
     # on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path?
mvar = os.environ.get('MY_CONFIG') diff --git a/my/core/pytest.py b/my/core/pytest.py index e514957..ad9e7d7 100644 --- a/my/core/pytest.py +++ b/my/core/pytest.py @@ -2,7 +2,9 @@ Helpers to prevent depending on pytest in runtime """ -from .internal import assert_subpackage; assert_subpackage(__name__) +from .internal import assert_subpackage + +assert_subpackage(__name__) import sys import typing diff --git a/my/core/query.py b/my/core/query.py index 45806fb..50724a7 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -5,23 +5,20 @@ The main entrypoint to this library is the 'select' function below; try: python3 -c "from my.core.query import select; help(select)" """ +from __future__ import annotations + import dataclasses import importlib import inspect import itertools +from collections.abc import Iterable, Iterator from datetime import datetime from typing import ( Any, Callable, - Dict, - Iterable, - Iterator, - List, NamedTuple, Optional, - Tuple, TypeVar, - Union, ) import more_itertools @@ -51,6 +48,7 @@ class Unsortable(NamedTuple): class QueryException(ValueError): """Used to differentiate query-related errors, so the CLI interface is more expressive""" + pass @@ -63,7 +61,7 @@ def locate_function(module_name: str, function_name: str) -> Callable[[], Iterab """ try: mod = importlib.import_module(module_name) - for (fname, f) in inspect.getmembers(mod, inspect.isfunction): + for fname, f in inspect.getmembers(mod, inspect.isfunction): if fname == function_name: return f # in case the function is defined dynamically, @@ -83,10 +81,10 @@ def locate_qualified_function(qualified_name: str) -> Callable[[], Iterable[ET]] if "." not in qualified_name: raise QueryException("Could not find a '.' in the function name, e.g. my.reddit.rexport.comments") rdot_index = qualified_name.rindex(".") - return locate_function(qualified_name[:rdot_index], qualified_name[rdot_index + 1:]) + return locate_function(qualified_name[:rdot_index], qualified_name[rdot_index + 1 :]) -def attribute_func(obj: T, where: Where, default: Optional[U] = None) -> Optional[OrderFunc]: +def attribute_func(obj: T, where: Where, default: U | None = None) -> OrderFunc | None: """ Attempts to find an attribute which matches the 'where_function' on the object, using some getattr/dict checks. 
    Returns a function which when called with
@@ -133,11 +131,11 @@ def attribute_func(obj: T, where: Where, default: Optional[U] = None) -> Optiona

 def _generate_order_by_func(
     obj_res: Res[T],
     *,
-    key: Optional[str] = None,
-    where_function: Optional[Where] = None,
-    default: Optional[U] = None,
+    key: str | None = None,
+    where_function: Where | None = None,
+    default: U | None = None,
     force_unsortable: bool = False,
-) -> Optional[OrderFunc]:
+) -> OrderFunc | None:
     """
     Accepts an object Res[T] (Instance of some class or Exception)
@@ -202,7 +200,7 @@ pass 'drop_exceptions' to ignore exceptions""")

     # user must provide either a key or a where predicate
     if where_function is not None:
-        func: Optional[OrderFunc] = attribute_func(obj, where_function, default)
+        func: OrderFunc | None = attribute_func(obj, where_function, default)
         if func is not None:
             return func

@@ -218,8 +216,6 @@ pass 'drop_exceptions' to ignore exceptions""")

     return None  # couldn't compute an OrderFunc for this class/instance

-
-
 # currently using the 'key set' as a proxy for 'this is the same type of thing'
 def _determine_order_by_value_key(obj_res: ET) -> Any:
     """
@@ -244,7 +240,7 @@ def _drop_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> Iterator[ET]:

 # try getting the first value from the iterator
 # similar to my.core.common.warn_if_empty? this doesn't go through the whole iterator though
-def _peek_iter(itr: Iterator[ET]) -> Tuple[Optional[ET], Iterator[ET]]:
+def _peek_iter(itr: Iterator[ET]) -> tuple[ET | None, Iterator[ET]]:
     itr = more_itertools.peekable(itr)
     try:
         first_item = itr.peek()
@@ -255,9 +251,9 @@ def _peek_iter(itr: Iterator[ET]) -> Tuple[Optional[ET], Iterator[ET]]:

 # similar to 'my.core.error.sort_res_by'?
-def _wrap_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> Tuple[Iterator[Unsortable], Iterator[ET]]:
-    unsortable: List[Unsortable] = []
-    sortable: List[ET] = []
+def _wrap_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> tuple[Iterator[Unsortable], Iterator[ET]]:
+    unsortable: list[Unsortable] = []
+    sortable: list[ET] = []
     for o in itr:
         # if input to select was another select
         if isinstance(o, Unsortable):
@@ -279,7 +275,7 @@ def _handle_unsorted(
     orderfunc: OrderFunc,
     drop_unsorted: bool,
     wrap_unsorted: bool
-) -> Tuple[Iterator[Unsortable], Iterator[ET]]:
+) -> tuple[Iterator[Unsortable], Iterator[ET]]:
     # prefer drop_unsorted to wrap_unsorted, if both were present
     if drop_unsorted:
         return iter([]), _drop_unsorted(itr, orderfunc)
@@ -294,16 +290,16 @@ def _handle_unsorted(
 # different types. ***This consumes the iterator***, so
 # you should definitely itertools.tee it beforehand
 # so as not to exhaust the values
-def _generate_order_value_func(itr: Iterator[ET], order_value: Where, default: Optional[U] = None) -> OrderFunc:
+def _generate_order_value_func(itr: Iterator[ET], order_value: Where, default: U | None = None) -> OrderFunc:
     # TODO: add a kwarg to force lookup for every item?
would sort of be like core.common.guess_datetime then - order_by_lookup: Dict[Any, OrderFunc] = {} + order_by_lookup: dict[Any, OrderFunc] = {} # need to go through a copy of the whole iterator here to # pre-generate functions to support sorting mixed types for obj_res in itr: key: Any = _determine_order_by_value_key(obj_res) if key not in order_by_lookup: - keyfunc: Optional[OrderFunc] = _generate_order_by_func( + keyfunc: OrderFunc | None = _generate_order_by_func( obj_res, where_function=order_value, default=default, @@ -324,12 +320,12 @@ def _generate_order_value_func(itr: Iterator[ET], order_value: Where, default: O def _handle_generate_order_by( itr, *, - order_by: Optional[OrderFunc] = None, - order_key: Optional[str] = None, - order_value: Optional[Where] = None, - default: Optional[U] = None, -) -> Tuple[Optional[OrderFunc], Iterator[ET]]: - order_by_chosen: Optional[OrderFunc] = order_by # if the user just supplied a function themselves + order_by: OrderFunc | None = None, + order_key: str | None = None, + order_value: Where | None = None, + default: U | None = None, +) -> tuple[OrderFunc | None, Iterator[ET]]: + order_by_chosen: OrderFunc | None = order_by # if the user just supplied a function themselves if order_by is not None: return order_by, itr if order_key is not None: @@ -354,19 +350,19 @@ def _handle_generate_order_by( def select( - src: Union[Iterable[ET], Callable[[], Iterable[ET]]], + src: Iterable[ET] | Callable[[], Iterable[ET]], *, - where: Optional[Where] = None, - order_by: Optional[OrderFunc] = None, - order_key: Optional[str] = None, - order_value: Optional[Where] = None, - default: Optional[U] = None, + where: Where | None = None, + order_by: OrderFunc | None = None, + order_key: str | None = None, + order_value: Where | None = None, + default: U | None = None, reverse: bool = False, - limit: Optional[int] = None, + limit: int | None = None, drop_unsorted: bool = False, wrap_unsorted: bool = True, warn_exceptions: bool = False, - warn_func: Optional[Callable[[Exception], None]] = None, + warn_func: Callable[[Exception], None] | None = None, drop_exceptions: bool = False, raise_exceptions: bool = False, ) -> Iterator[ET]: @@ -617,7 +613,7 @@ class _B(NamedTuple): # move these to tests/? 
They are re-used so much in the tests below, # not sure where the best place for these is -def _mixed_iter() -> Iterator[Union[_A, _B]]: +def _mixed_iter() -> Iterator[_A | _B]: yield _A(x=datetime(year=2009, month=5, day=10, hour=4, minute=10, second=1), y=5, z=10) yield _B(y=datetime(year=2015, month=5, day=10, hour=4, minute=10, second=1)) yield _A(x=datetime(year=2005, month=5, day=10, hour=4, minute=10, second=1), y=10, z=2) @@ -626,7 +622,7 @@ def _mixed_iter() -> Iterator[Union[_A, _B]]: yield _A(x=datetime(year=2005, month=4, day=10, hour=4, minute=10, second=1), y=2, z=-5) -def _mixed_iter_errors() -> Iterator[Res[Union[_A, _B]]]: +def _mixed_iter_errors() -> Iterator[Res[_A | _B]]: m = _mixed_iter() yield from itertools.islice(m, 0, 3) yield RuntimeError("Unhandled error!") diff --git a/my/core/query_range.py b/my/core/query_range.py index 1f4a7ff..2a8d7bd 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -7,11 +7,14 @@ filtered iterator See the select_range function below """ +from __future__ import annotations + import re import time +from collections.abc import Iterator from datetime import date, datetime, timedelta -from functools import lru_cache -from typing import Any, Callable, Iterator, NamedTuple, Optional, Type +from functools import cache +from typing import Any, Callable, NamedTuple import more_itertools @@ -25,7 +28,9 @@ from .query import ( select, ) -timedelta_regex = re.compile(r"^((?P[\.\d]+?)w)?((?P[\.\d]+?)d)?((?P[\.\d]+?)h)?((?P[\.\d]+?)m)?((?P[\.\d]+?)s)?$") +timedelta_regex = re.compile( + r"^((?P[\.\d]+?)w)?((?P[\.\d]+?)d)?((?P[\.\d]+?)h)?((?P[\.\d]+?)m)?((?P[\.\d]+?)s)?$" +) # https://stackoverflow.com/a/51916936 @@ -88,7 +93,7 @@ def parse_datetime_float(date_str: str) -> float: # dateparser is a bit more lenient than the above, lets you type # all sorts of dates as inputs # https://github.com/scrapinghub/dateparser#how-to-use - res: Optional[datetime] = dateparser.parse(ds, settings={"DATE_ORDER": "YMD"}) + res: datetime | None = dateparser.parse(ds, settings={"DATE_ORDER": "YMD"}) if res is not None: return res.timestamp() @@ -98,7 +103,7 @@ def parse_datetime_float(date_str: str) -> float: # probably DateLike input? 
but a user could specify an order_key # which is an epoch timestamp or a float value which they # expect to be converted to a datetime to compare -@lru_cache(maxsize=None) +@cache def _datelike_to_float(dl: Any) -> float: if isinstance(dl, datetime): return dl.timestamp() @@ -130,11 +135,12 @@ class RangeTuple(NamedTuple): of the timeframe -- 'before' - before and after - anything after 'after' and before 'before', acts as a time range """ + # technically doesn't need to be Optional[Any], # just to make it more clear these can be None - after: Optional[Any] - before: Optional[Any] - within: Optional[Any] + after: Any | None + before: Any | None + within: Any | None Converter = Callable[[Any], Any] @@ -145,9 +151,9 @@ def _parse_range( unparsed_range: RangeTuple, end_parser: Converter, within_parser: Converter, - parsed_range: Optional[RangeTuple] = None, - error_message: Optional[str] = None -) -> Optional[RangeTuple]: + parsed_range: RangeTuple | None = None, + error_message: str | None = None, +) -> RangeTuple | None: if parsed_range is not None: return parsed_range @@ -176,11 +182,11 @@ def _create_range_filter( end_parser: Converter, within_parser: Converter, attr_func: Where, - parsed_range: Optional[RangeTuple] = None, - default_before: Optional[Any] = None, - value_coercion_func: Optional[Converter] = None, - error_message: Optional[str] = None, -) -> Optional[Where]: + parsed_range: RangeTuple | None = None, + default_before: Any | None = None, + value_coercion_func: Converter | None = None, + error_message: str | None = None, +) -> Where | None: """ Handles: - parsing the user input into values that are comparable to items the iterable returns @@ -272,17 +278,17 @@ def _create_range_filter( def select_range( itr: Iterator[ET], *, - where: Optional[Where] = None, - order_key: Optional[str] = None, - order_value: Optional[Where] = None, - order_by_value_type: Optional[Type] = None, - unparsed_range: Optional[RangeTuple] = None, + where: Where | None = None, + order_key: str | None = None, + order_value: Where | None = None, + order_by_value_type: type | None = None, + unparsed_range: RangeTuple | None = None, reverse: bool = False, - limit: Optional[int] = None, + limit: int | None = None, drop_unsorted: bool = False, wrap_unsorted: bool = False, warn_exceptions: bool = False, - warn_func: Optional[Callable[[Exception], None]] = None, + warn_func: Callable[[Exception], None] | None = None, drop_exceptions: bool = False, raise_exceptions: bool = False, ) -> Iterator[ET]: @@ -317,9 +323,10 @@ def select_range( drop_exceptions=drop_exceptions, raise_exceptions=raise_exceptions, warn_exceptions=warn_exceptions, - warn_func=warn_func) + warn_func=warn_func, + ) - order_by_chosen: Optional[OrderFunc] = None + order_by_chosen: OrderFunc | None = None # if the user didn't specify an attribute to order value, but specified a type # we should search for on each value in the iterator @@ -345,7 +352,7 @@ Specify a type or a key to order the value by""") # force drop_unsorted=True so we can use _create_range_filter # sort the iterable by the generated order_by_chosen function itr = select(itr, order_by=order_by_chosen, drop_unsorted=True) - filter_func: Optional[Where] + filter_func: Where | None if order_by_value_type in [datetime, date]: filter_func = _create_range_filter( unparsed_range=unparsed_range, @@ -353,7 +360,8 @@ Specify a type or a key to order the value by""") within_parser=parse_timedelta_float, attr_func=order_by_chosen, # type: ignore[arg-type] default_before=time.time(), - 
value_coercion_func=_datelike_to_float) + value_coercion_func=_datelike_to_float, + ) elif order_by_value_type in [int, float]: # allow primitives to be converted using the default int(), float() callables filter_func = _create_range_filter( @@ -362,7 +370,8 @@ Specify a type or a key to order the value by""") within_parser=order_by_value_type, attr_func=order_by_chosen, # type: ignore[arg-type] default_before=None, - value_coercion_func=order_by_value_type) + value_coercion_func=order_by_value_type, + ) else: # TODO: add additional kwargs to let the user sort by other values, by specifying the parsers? # would need to allow passing the end_parser, within parser, default before and value_coercion_func... @@ -470,7 +479,7 @@ def test_range_predicate() -> None: # filter from 0 to 5 rn: RangeTuple = RangeTuple("0", "5", None) - zero_to_five_filter: Optional[Where] = int_filter_func(unparsed_range=rn) + zero_to_five_filter: Where | None = int_filter_func(unparsed_range=rn) assert zero_to_five_filter is not None # this is just a Where function, given some input it return True/False if the value is allowed assert zero_to_five_filter(3) is True @@ -483,6 +492,7 @@ def test_range_predicate() -> None: rn = RangeTuple(None, 3, "3.5") assert list(filter(int_filter_func(unparsed_range=rn, attr_func=identity), src())) == ["0", "1", "2"] + def test_parse_range() -> None: from functools import partial diff --git a/my/core/serialize.py b/my/core/serialize.py index ab11a20..e36da8f 100644 --- a/my/core/serialize.py +++ b/my/core/serialize.py @@ -1,9 +1,11 @@ +from __future__ import annotations + import datetime from dataclasses import asdict, is_dataclass from decimal import Decimal -from functools import lru_cache +from functools import cache from pathlib import Path -from typing import Any, Callable, NamedTuple, Optional +from typing import Any, Callable, NamedTuple from .error import error_to_json from .pytest import parametrize @@ -57,12 +59,12 @@ def _default_encode(obj: Any) -> Any: # could possibly run multiple times/raise warning if you provide different 'default' # functions or change the kwargs? 
The alternative is to maintain all of this at the module # level, which is just as annoying -@lru_cache(maxsize=None) +@cache def _dumps_factory(**kwargs) -> Callable[[Any], str]: use_default: DefaultEncoder = _default_encode # if the user passed an additional 'default' parameter, # try using that to serialize before before _default_encode - _additional_default: Optional[DefaultEncoder] = kwargs.get("default") + _additional_default: DefaultEncoder | None = kwargs.get("default") if _additional_default is not None and callable(_additional_default): def wrapped_default(obj: Any) -> Any: @@ -78,9 +80,9 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: kwargs["default"] = use_default - prefer_factory: Optional[str] = kwargs.pop('_prefer_factory', None) + prefer_factory: str | None = kwargs.pop('_prefer_factory', None) - def orjson_factory() -> Optional[Dumps]: + def orjson_factory() -> Dumps | None: try: import orjson except ModuleNotFoundError: @@ -95,7 +97,7 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: return _orjson_dumps - def simplejson_factory() -> Optional[Dumps]: + def simplejson_factory() -> Dumps | None: try: from simplejson import dumps as simplejson_dumps except ModuleNotFoundError: @@ -115,7 +117,7 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: return _simplejson_dumps - def stdlib_factory() -> Optional[Dumps]: + def stdlib_factory() -> Dumps | None: import json from .warnings import high @@ -150,7 +152,7 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]: def dumps( obj: Any, - default: Optional[DefaultEncoder] = None, + default: DefaultEncoder | None = None, **kwargs, ) -> str: """ diff --git a/my/core/source.py b/my/core/source.py index 52c58c1..a309d13 100644 --- a/my/core/source.py +++ b/my/core/source.py @@ -3,9 +3,12 @@ Decorator to gracefully handle importing a data source, or warning and yielding nothing (or a default) when its not available """ +from __future__ import annotations + import warnings +from collections.abc import Iterable, Iterator from functools import wraps -from typing import Any, Callable, Iterable, Iterator, Optional, TypeVar +from typing import Any, Callable, TypeVar from .warnings import medium @@ -26,8 +29,8 @@ _DEFAULT_ITR = () def import_source( *, default: Iterable[T] = _DEFAULT_ITR, - module_name: Optional[str] = None, - help_url: Optional[str] = None, + module_name: str | None = None, + help_url: str | None = None, ) -> Callable[..., Callable[..., Iterator[T]]]: """ doesn't really play well with types, but is used to catch @@ -50,6 +53,7 @@ def import_source( except (ImportError, AttributeError) as err: from . 
import core_config as CC from .error import warn_my_config_import_error + suppressed_in_conf = False if module_name is not None and CC.config._is_module_active(module_name) is False: suppressed_in_conf = True @@ -72,5 +76,7 @@ class core: if not matched_config_err and isinstance(err, AttributeError): raise err yield from default + return wrapper + return decorator diff --git a/my/core/sqlite.py b/my/core/sqlite.py index 08a80e5..aa41ab3 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -1,12 +1,16 @@ -from .internal import assert_subpackage; assert_subpackage(__name__) +from __future__ import annotations +from .internal import assert_subpackage # noqa: I001 + +assert_subpackage(__name__) import shutil import sqlite3 +from collections.abc import Iterator from contextlib import contextmanager from pathlib import Path from tempfile import TemporaryDirectory -from typing import Any, Callable, Iterator, Literal, Optional, Tuple, Union, overload +from typing import Any, Callable, Literal, Union, overload from .common import PathIsh from .compat import assert_never @@ -22,6 +26,7 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None: conn.execute('CREATE TABLE testtable (col)') import pytest + with pytest.raises(sqlite3.OperationalError, match='readonly database'): with sqlite_connect_immutable(db) as conn: conn.execute('DROP TABLE testtable') @@ -33,6 +38,7 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None: SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any] + def dict_factory(cursor, row): fields = [column[0] for column in cursor.description] return dict(zip(fields, row)) @@ -40,8 +46,9 @@ def dict_factory(cursor, row): Factory = Union[SqliteRowFactory, Literal['row', 'dict']] + @contextmanager -def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]: +def sqlite_connection(db: PathIsh, *, immutable: bool = False, row_factory: Factory | None = None) -> Iterator[sqlite3.Connection]: dbp = f'file:{db}' # https://www.sqlite.org/draft/uri.html#uriimmutable if immutable: @@ -97,30 +104,32 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection: # and then the return type ends up as Iterator[Tuple[str, ...]], which isn't desirable :( # a bit annoying to have this copy-pasting, but hopefully not a big issue +# fmt: off @overload -def select(cols: Tuple[str ], rest: str, *, db: sqlite3.Connection) -> \ - Iterator[Tuple[Any ]]: ... +def select(cols: tuple[str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[tuple[Any ]]: ... @overload -def select(cols: Tuple[str, str ], rest: str, *, db: sqlite3.Connection) -> \ - Iterator[Tuple[Any, Any ]]: ... +def select(cols: tuple[str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[tuple[Any, Any ]]: ... @overload -def select(cols: Tuple[str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ - Iterator[Tuple[Any, Any, Any ]]: ... +def select(cols: tuple[str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[tuple[Any, Any, Any ]]: ... @overload -def select(cols: Tuple[str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ - Iterator[Tuple[Any, Any, Any, Any ]]: ... +def select(cols: tuple[str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[tuple[Any, Any, Any, Any ]]: ... @overload -def select(cols: Tuple[str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ - Iterator[Tuple[Any, Any, Any, Any, Any ]]: ... 
+def select(cols: tuple[str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[tuple[Any, Any, Any, Any, Any ]]: ... @overload -def select(cols: Tuple[str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ - Iterator[Tuple[Any, Any, Any, Any, Any, Any ]]: ... +def select(cols: tuple[str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[tuple[Any, Any, Any, Any, Any, Any ]]: ... @overload -def select(cols: Tuple[str, str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ - Iterator[Tuple[Any, Any, Any, Any, Any, Any, Any ]]: ... +def select(cols: tuple[str, str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[tuple[Any, Any, Any, Any, Any, Any, Any ]]: ... @overload -def select(cols: Tuple[str, str, str, str, str, str, str, str], rest: str, *, db: sqlite3.Connection) -> \ - Iterator[Tuple[Any, Any, Any, Any, Any, Any, Any, Any]]: ... +def select(cols: tuple[str, str, str, str, str, str, str, str], rest: str, *, db: sqlite3.Connection) -> \ + Iterator[tuple[Any, Any, Any, Any, Any, Any, Any, Any]]: ... +# fmt: on def select(cols, rest, *, db): # db arg is last cause that results in nicer code formatting.. diff --git a/my/core/stats.py b/my/core/stats.py index 674a8d1..a553db3 100644 --- a/my/core/stats.py +++ b/my/core/stats.py @@ -2,10 +2,13 @@ Helpers for hpi doctor/stats functionality. ''' +from __future__ import annotations + import collections.abc import importlib import inspect import typing +from collections.abc import Iterable, Iterator, Sequence from contextlib import contextmanager from datetime import datetime from pathlib import Path @@ -13,20 +16,13 @@ from types import ModuleType from typing import ( Any, Callable, - Dict, - Iterable, - Iterator, - List, - Optional, Protocol, - Sequence, - Union, cast, ) from .types import asdict -Stats = Dict[str, Any] +Stats = dict[str, Any] class StatsFun(Protocol): @@ -55,10 +51,10 @@ def quick_stats(): def stat( - func: Union[Callable[[], Iterable[Any]], Iterable[Any]], + func: Callable[[], Iterable[Any]] | Iterable[Any], *, quick: bool = False, - name: Optional[str] = None, + name: str | None = None, ) -> Stats: """ Extracts various statistics from a passed iterable/callable, e.g.: @@ -153,8 +149,8 @@ def test_stat() -> None: # -def get_stats(module_name: str, *, guess: bool = False) -> Optional[StatsFun]: - stats: Optional[StatsFun] = None +def get_stats(module_name: str, *, guess: bool = False) -> StatsFun | None: + stats: StatsFun | None = None try: module = importlib.import_module(module_name) except Exception: @@ -167,7 +163,7 @@ def get_stats(module_name: str, *, guess: bool = False) -> Optional[StatsFun]: # TODO maybe could be enough to annotate OUTPUTS or something like that? # then stats could just use them as hints? 
-def guess_stats(module: ModuleType) -> Optional[StatsFun]: +def guess_stats(module: ModuleType) -> StatsFun | None: """ If the module doesn't have explicitly defined 'stat' function, this is used to try to guess what could be included in stats automatically @@ -206,7 +202,7 @@ def test_guess_stats() -> None: } -def _guess_data_providers(module: ModuleType) -> Dict[str, Callable]: +def _guess_data_providers(module: ModuleType) -> dict[str, Callable]: mfunctions = inspect.getmembers(module, inspect.isfunction) return {k: v for k, v in mfunctions if is_data_provider(v)} @@ -263,7 +259,7 @@ def test_is_data_provider() -> None: lam = lambda: [1, 2] assert not idp(lam) - def has_extra_args(count) -> List[int]: + def has_extra_args(count) -> list[int]: return list(range(count)) assert not idp(has_extra_args) @@ -340,10 +336,10 @@ def test_type_is_iterable() -> None: assert not fun(None) assert not fun(int) assert not fun(Any) - assert not fun(Dict[int, int]) + assert not fun(dict[int, int]) - assert fun(List[int]) - assert fun(Sequence[Dict[str, str]]) + assert fun(list[int]) + assert fun(Sequence[dict[str, str]]) assert fun(Iterable[Any]) @@ -434,7 +430,7 @@ def test_stat_iterable() -> None: # experimental, not sure about it.. -def _guess_datetime(x: Any) -> Optional[datetime]: +def _guess_datetime(x: Any) -> datetime | None: # todo hmm implement without exception.. try: d = asdict(x) diff --git a/my/core/structure.py b/my/core/structure.py index fa26532..bb049e4 100644 --- a/my/core/structure.py +++ b/my/core/structure.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import atexit import os import shutil @@ -5,9 +7,9 @@ import sys import tarfile import tempfile import zipfile +from collections.abc import Generator, Sequence from contextlib import contextmanager from pathlib import Path -from typing import Generator, List, Sequence, Tuple, Union from .logging import make_logger @@ -42,10 +44,10 @@ TARGZ_EXT = {".tar.gz"} @contextmanager def match_structure( base: Path, - expected: Union[str, Sequence[str]], + expected: str | Sequence[str], *, partial: bool = False, -) -> Generator[Tuple[Path, ...], None, None]: +) -> Generator[tuple[Path, ...], None, None]: """ Given a 'base' directory or archive (zip/tar.gz), recursively search for one or more paths that match the pattern described in 'expected'. That can be a single string, or a list @@ -140,8 +142,8 @@ def match_structure( if not searchdir.is_dir(): raise NotADirectoryError(f"Expected either a zip/tar.gz archive or a directory, received {searchdir}") - matches: List[Path] = [] - possible_targets: List[Path] = [searchdir] + matches: list[Path] = [] + possible_targets: list[Path] = [searchdir] while len(possible_targets) > 0: p = possible_targets.pop(0) @@ -172,7 +174,7 @@ def warn_leftover_files() -> None: from . import core_config as CC base_tmp: Path = CC.config.get_tmp_dir() - leftover: List[Path] = list(base_tmp.iterdir()) + leftover: list[Path] = list(base_tmp.iterdir()) if leftover: logger.debug(f"at exit warning: Found leftover files in temporary directory '{leftover}'. 
this may be because you have multiple hpi processes running -- if so this can be ignored") diff --git a/my/core/tests/auto_stats.py b/my/core/tests/auto_stats.py index d10d4c4..fc49e03 100644 --- a/my/core/tests/auto_stats.py +++ b/my/core/tests/auto_stats.py @@ -2,11 +2,11 @@ Helper 'module' for test_guess_stats """ +from collections.abc import Iterable, Iterator, Sequence from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime, timedelta from pathlib import Path -from typing import Iterable, Iterator, Sequence @dataclass diff --git a/my/core/tests/common.py b/my/core/tests/common.py index 22a74d7..073ea5f 100644 --- a/my/core/tests/common.py +++ b/my/core/tests/common.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import os +from collections.abc import Iterator from contextlib import contextmanager -from typing import Iterator, Optional import pytest @@ -15,7 +17,7 @@ skip_if_uses_optional_deps = pytest.mark.skipif( # TODO maybe move to hpi core? @contextmanager -def tmp_environ_set(key: str, value: Optional[str]) -> Iterator[None]: +def tmp_environ_set(key: str, value: str | None) -> Iterator[None]: prev_value = os.environ.get(key) if value is None: os.environ.pop(key, None) diff --git a/my/core/tests/denylist.py b/my/core/tests/denylist.py index 2688319..73c3165 100644 --- a/my/core/tests/denylist.py +++ b/my/core/tests/denylist.py @@ -1,8 +1,9 @@ import json import warnings +from collections.abc import Iterator from datetime import datetime from pathlib import Path -from typing import Iterator, NamedTuple +from typing import NamedTuple from ..denylist import DenyList diff --git a/my/core/tests/test_cachew.py b/my/core/tests/test_cachew.py index 70ac76f..a0d2267 100644 --- a/my/core/tests/test_cachew.py +++ b/my/core/tests/test_cachew.py @@ -1,6 +1,6 @@ -from .common import skip_if_uses_optional_deps as pytestmark +from __future__ import annotations -from typing import List +from .common import skip_if_uses_optional_deps as pytestmark # TODO ugh, this is very messy.. need to sort out config overriding here @@ -16,7 +16,7 @@ def test_cachew() -> None: # TODO ugh. 
need doublewrap or something to avoid having to pass parens @mcachew() - def cf() -> List[int]: + def cf() -> list[int]: nonlocal called called += 1 return [1, 2, 3] @@ -43,7 +43,7 @@ def test_cachew_dir_none() -> None: called = 0 @mcachew(cache_path=cache_dir() / 'ctest') - def cf() -> List[int]: + def cf() -> list[int]: nonlocal called called += 1 return [called, called, called] diff --git a/my/core/tests/test_config.py b/my/core/tests/test_config.py index 78d1a62..f6d12ba 100644 --- a/my/core/tests/test_config.py +++ b/my/core/tests/test_config.py @@ -2,8 +2,8 @@ Various tests that are checking behaviour of user config wrt to various things """ -import sys import os +import sys from pathlib import Path import pytest diff --git a/my/core/time.py b/my/core/time.py index fa20a7c..a9b180d 100644 --- a/my/core/time.py +++ b/my/core/time.py @@ -1,5 +1,7 @@ -from functools import lru_cache -from typing import Dict, Sequence +from __future__ import annotations + +from collections.abc import Sequence +from functools import cache, lru_cache import pytz @@ -11,6 +13,7 @@ def user_forced() -> Sequence[str]: # https://stackoverflow.com/questions/36067621/python-all-possible-timezone-abbreviations-for-given-timezone-name-and-vise-ve try: from my.config import time as user_config + return user_config.tz.force_abbreviations # type: ignore[attr-defined] # noqa: TRY300 # note: noqa since we're catching case where config doesn't have attribute here as well except: @@ -19,15 +22,15 @@ def user_forced() -> Sequence[str]: @lru_cache(1) -def _abbr_to_timezone_map() -> Dict[str, pytz.BaseTzInfo]: +def _abbr_to_timezone_map() -> dict[str, pytz.BaseTzInfo]: # also force UTC to always correspond to utc # this makes more sense than Zulu it ends up by default timezones = [*pytz.all_timezones, 'UTC', *user_forced()] - res: Dict[str, pytz.BaseTzInfo] = {} + res: dict[str, pytz.BaseTzInfo] = {} for tzname in timezones: tz = pytz.timezone(tzname) - infos = getattr(tz, '_tzinfos', []) # not sure if can rely on attr always present? + infos = getattr(tz, '_tzinfos', []) # not sure if can rely on attr always present? for info in infos: abbr = info[-1] # todo could support this with a better error handling strategy? @@ -43,7 +46,7 @@ def _abbr_to_timezone_map() -> Dict[str, pytz.BaseTzInfo]: return res -@lru_cache(maxsize=None) +@cache def abbr_to_timezone(abbr: str) -> pytz.BaseTzInfo: return _abbr_to_timezone_map()[abbr] diff --git a/my/core/types.py b/my/core/types.py index b1cf103..dc19c19 100644 --- a/my/core/types.py +++ b/my/core/types.py @@ -1,14 +1,15 @@ -from .internal import assert_subpackage; assert_subpackage(__name__) +from __future__ import annotations + +from .internal import assert_subpackage + +assert_subpackage(__name__) from dataclasses import asdict as dataclasses_asdict from dataclasses import is_dataclass from datetime import datetime -from typing import ( - Any, - Dict, -) +from typing import Any -Json = Dict[str, Any] +Json = dict[str, Any] # for now just serves documentation purposes... but one day might make it statically verifiable where possible? 
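NOTE: the lru_cache(maxsize=None) -> cache swaps in this patch (my/core/query_range.py, my/core/serialize.py, my/core/time.py above) are behaviour-preserving: functools.cache, added in python 3.9, is literally just an unbounded lru_cache. A minimal sketch of the equivalence -- toy example, not code from this repo:

    from functools import cache

    @cache  # behaves exactly like @lru_cache(maxsize=None)
    def fib(n: int) -> int:
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    # unbounded memoization: runs in linear time, and entries are never
    # evicted -- fine for small keyspaces like timezone abbreviations
    assert fib(50) == 12586269025
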
diff --git a/my/core/util.py b/my/core/util.py index a247f81..74e71e1 100644 --- a/my/core/util.py +++ b/my/core/util.py @@ -1,10 +1,12 @@ +from __future__ import annotations + import os import pkgutil import sys +from collections.abc import Iterable from itertools import chain from pathlib import Path from types import ModuleType -from typing import Iterable, List, Optional from .discovery_pure import HPIModule, _is_not_module_src, has_stats, ignored @@ -20,13 +22,14 @@ from .discovery_pure import NOT_HPI_MODULE_VAR assert NOT_HPI_MODULE_VAR in globals() # check name consistency -def is_not_hpi_module(module: str) -> Optional[str]: + +def is_not_hpi_module(module: str) -> str | None: ''' None if a module, otherwise returns reason ''' import importlib.util - path: Optional[str] = None + path: str | None = None try: # TODO annoying, this can cause import of the parent module? spec = importlib.util.find_spec(module) @@ -35,7 +38,7 @@ def is_not_hpi_module(module: str) -> Optional[str]: except Exception as e: # todo a bit misleading.. it actually shouldn't import in most cases, it's just the weird parent module import thing return "import error (possibly missing config entry)" # todo add exc message? - assert path is not None # not sure if can happen? + assert path is not None # not sure if can happen? if _is_not_module_src(Path(path)): return f"marked explicitly (via {NOT_HPI_MODULE_VAR})" @@ -57,9 +60,10 @@ def _iter_all_importables(pkg: ModuleType) -> Iterable[HPIModule]: def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModule]: - from .core_config import config - """Yield all importables under a given path and package.""" + + from .core_config import config # noqa: F401 + for dir_path, dirs, file_names in os.walk(pkg_pth): file_names.sort() # NOTE: sorting dirs in place is intended, it's the way you're supposed to do it with os.walk @@ -82,6 +86,7 @@ def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModu # TODO might need to make it defensive and yield Exception (otherwise hpi doctor might fail for no good reason) # use onerror=? + # ignored explicitly -> not HPI # if enabled in config -> HPI # if disabled in config -> HPI @@ -90,7 +95,7 @@ def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModu # TODO when do we need to recurse? -def _walk_packages(path: Iterable[str], prefix: str='', onerror=None) -> Iterable[HPIModule]: +def _walk_packages(path: Iterable[str], prefix: str = '', onerror=None) -> Iterable[HPIModule]: """ Modified version of https://github.com/python/cpython/blob/d50a0700265536a20bcce3fb108c954746d97625/Lib/pkgutil.py#L53, to avoid importing modules that are skipped @@ -153,8 +158,9 @@ def _walk_packages(path: Iterable[str], prefix: str='', onerror=None) -> Iterabl path = [p for p in path if not seen(p)] yield from _walk_packages(path, mname + '.', onerror) + # deprecate? 
-def get_modules() -> List[HPIModule]: +def get_modules() -> list[HPIModule]: return list(modules()) @@ -169,14 +175,14 @@ def test_module_detection() -> None: with reset() as cc: cc.disabled_modules = ['my.location.*', 'my.body.*', 'my.workouts.*', 'my.private.*'] mods = {m.name: m for m in modules()} - assert mods['my.demo'] .skip_reason == "has no 'stats()' function" + assert mods['my.demo'].skip_reason == "has no 'stats()' function" with reset() as cc: cc.disabled_modules = ['my.location.*', 'my.body.*', 'my.workouts.*', 'my.private.*', 'my.lastfm'] - cc.enabled_modules = ['my.demo'] + cc.enabled_modules = ['my.demo'] mods = {m.name: m for m in modules()} - assert mods['my.demo'] .skip_reason is None # not skipped + assert mods['my.demo'].skip_reason is None # not skipped assert mods['my.lastfm'].skip_reason == "suppressed in the user config" diff --git a/my/core/utils/concurrent.py b/my/core/utils/concurrent.py index 73944ec..515c3f1 100644 --- a/my/core/utils/concurrent.py +++ b/my/core/utils/concurrent.py @@ -1,6 +1,7 @@ -import sys +from __future__ import annotations + from concurrent.futures import Executor, Future -from typing import Any, Callable, Optional, TypeVar +from typing import Any, Callable, TypeVar from ..compat import ParamSpec @@ -15,7 +16,7 @@ class DummyExecutor(Executor): but also want to provide an option to run the code serially (e.g. for debugging) """ - def __init__(self, max_workers: Optional[int] = 1) -> None: + def __init__(self, max_workers: int | None = 1) -> None: self._shutdown = False self._max_workers = max_workers diff --git a/my/core/utils/imports.py b/my/core/utils/imports.py index 4666a5e..e0fb01d 100644 --- a/my/core/utils/imports.py +++ b/my/core/utils/imports.py @@ -1,27 +1,27 @@ +from __future__ import annotations + import importlib import importlib.util import sys from pathlib import Path from types import ModuleType -from typing import Optional - -from ..common import PathIsh # TODO only used in tests? not sure if useful at all. -def import_file(p: PathIsh, name: Optional[str] = None) -> ModuleType: +def import_file(p: Path | str, name: str | None = None) -> ModuleType: p = Path(p) if name is None: name = p.stem spec = importlib.util.spec_from_file_location(name, p) assert spec is not None, f"Fatal error; Could not create module spec from {name} {p}" foo = importlib.util.module_from_spec(spec) - loader = spec.loader; assert loader is not None + loader = spec.loader + assert loader is not None loader.exec_module(foo) return foo -def import_from(path: PathIsh, name: str) -> ModuleType: +def import_from(path: Path | str, name: str) -> ModuleType: path = str(path) sys.path.append(path) try: @@ -30,7 +30,7 @@ def import_from(path: PathIsh, name: str) -> ModuleType: sys.path.remove(path) -def import_dir(path: PathIsh, extra: str = '') -> ModuleType: +def import_dir(path: Path | str, extra: str = '') -> ModuleType: p = Path(path) if p.parts[0] == '~': p = p.expanduser() # TODO eh. not sure about this.. 
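NOTE: the annotation hunks in this patch all follow one recipe: add 'from __future__ import annotations' (PEP 563), then spell annotations as PEP 604 'X | None' and PEP 585 'list[...]'/'dict[...]'. The future import keeps annotations as plain strings instead of evaluating them at import time, so the new spellings are safe on python 3.9 even though 'str | None' as a runtime *expression* only works on 3.10+. A toy sketch of the effect (the load function here is hypothetical, not from this repo):

    from __future__ import annotations

    def load(path: str, name: str | None = None) -> list[str]:
        # toy body -- only the annotation style matters here
        return [name or path]

    # annotations stay as unevaluated strings:
    print(load.__annotations__)
    # -> {'path': 'str', 'name': 'str | None', 'return': 'list[str]'}

Positions that *are* evaluated at runtime (e.g. the Factory = Union[...] alias kept in my/core/sqlite.py above) can't use the new syntax yet, which is why some typing.Union imports survive.
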
diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index ae9402d..501ebbe 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -4,17 +4,13 @@ Various helpers/transforms of iterators Ideally this should be as small as possible and we should rely on stdlib itertools or more_itertools """ +from __future__ import annotations + import warnings -from collections.abc import Hashable +from collections.abc import Hashable, Iterable, Iterator, Sized from typing import ( TYPE_CHECKING, Callable, - Dict, - Iterable, - Iterator, - List, - Optional, - Sized, TypeVar, Union, cast, @@ -23,9 +19,8 @@ from typing import ( import more_itertools from decorator import decorator -from ..compat import ParamSpec from .. import warnings as core_warnings - +from ..compat import ParamSpec T = TypeVar('T') K = TypeVar('K') @@ -39,7 +34,7 @@ def _identity(v: T) -> V: # type: ignore[type-var] # ugh. nothing in more_itertools? # perhaps duplicates_everseen? but it doesn't yield non-unique elements? def ensure_unique(it: Iterable[T], *, key: Callable[[T], K]) -> Iterable[T]: - key2item: Dict[K, T] = {} + key2item: dict[K, T] = {} for i in it: k = key(i) pi = key2item.get(k, None) @@ -72,10 +67,10 @@ def make_dict( key: Callable[[T], K], # TODO make value optional instead? but then will need a typing override for it? value: Callable[[T], V] = _identity, -) -> Dict[K, V]: +) -> dict[K, V]: with_keys = ((key(i), i) for i in it) uniques = ensure_unique(with_keys, key=lambda p: p[0]) - res: Dict[K, V] = {} + res: dict[K, V] = {} for k, i in uniques: res[k] = i if value is None else value(i) return res @@ -93,8 +88,8 @@ def test_make_dict() -> None: d = make_dict(it, key=lambda i: i % 2, value=lambda i: i) # check type inference - d2: Dict[str, int] = make_dict(it, key=lambda i: str(i)) - d3: Dict[str, bool] = make_dict(it, key=lambda i: str(i), value=lambda i: i % 2 == 0) + d2: dict[str, int] = make_dict(it, key=lambda i: str(i)) + d3: dict[str, bool] = make_dict(it, key=lambda i: str(i), value=lambda i: i % 2 == 0) LFP = ParamSpec('LFP') @@ -102,7 +97,7 @@ LV = TypeVar('LV') @decorator -def _listify(func: Callable[LFP, Iterable[LV]], *args: LFP.args, **kwargs: LFP.kwargs) -> List[LV]: +def _listify(func: Callable[LFP, Iterable[LV]], *args: LFP.args, **kwargs: LFP.kwargs) -> list[LV]: """ Wraps a function's return value in wrapper (e.g. list) Useful when an algorithm can be expressed more cleanly as a generator @@ -115,7 +110,7 @@ def _listify(func: Callable[LFP, Iterable[LV]], *args: LFP.args, **kwargs: LFP.k # so seems easiest to just use specialize instantiations of decorator instead if TYPE_CHECKING: - def listify(func: Callable[LFP, Iterable[LV]]) -> Callable[LFP, List[LV]]: ... # noqa: ARG001 + def listify(func: Callable[LFP, Iterable[LV]]) -> Callable[LFP, list[LV]]: ... # noqa: ARG001 else: listify = _listify @@ -130,7 +125,7 @@ def test_listify() -> None: yield 2 res = it() - assert_type(res, List[int]) + assert_type(res, list[int]) assert res == [1, 2] @@ -201,24 +196,24 @@ def test_warn_if_empty_list() -> None: ll = [1, 2, 3] @warn_if_empty - def nonempty() -> List[int]: + def nonempty() -> list[int]: return ll with warnings.catch_warnings(record=True) as w: res1 = nonempty() assert len(w) == 0 - assert_type(res1, List[int]) + assert_type(res1, list[int]) assert isinstance(res1, list) assert res1 is ll # object should be unchanged! 
@warn_if_empty - def empty() -> List[str]: + def empty() -> list[str]: return [] with warnings.catch_warnings(record=True) as w: res2 = empty() assert len(w) == 1 - assert_type(res2, List[str]) + assert_type(res2, list[str]) assert isinstance(res2, list) assert res2 == [] @@ -242,7 +237,7 @@ def check_if_hashable(iterable: Iterable[_HT]) -> Iterable[_HT]: """ NOTE: Despite Hashable bound, typing annotation doesn't guarantee runtime safety Consider hashable type X, and Y that inherits from X, but not hashable - Then l: List[X] = [Y(...)] is a valid expression, and type checks against Hashable, + Then l: list[X] = [Y(...)] is a valid expression, and type checks against Hashable, but isn't runtime hashable """ # Sadly this doesn't work 100% correctly with dataclasses atm... @@ -268,28 +263,27 @@ def check_if_hashable(iterable: Iterable[_HT]) -> Iterable[_HT]: # TODO different policies -- error/warn/ignore? def test_check_if_hashable() -> None: from dataclasses import dataclass - from typing import Set, Tuple import pytest from ..compat import assert_type - x1: List[int] = [1, 2] + x1: list[int] = [1, 2] r1 = check_if_hashable(x1) assert_type(r1, Iterable[int]) assert r1 is x1 - x2: Iterator[Union[int, str]] = iter((123, 'aba')) + x2: Iterator[int | str] = iter((123, 'aba')) r2 = check_if_hashable(x2) assert_type(r2, Iterable[Union[int, str]]) assert list(r2) == [123, 'aba'] - x3: Tuple[object, ...] = (789, 'aba') + x3: tuple[object, ...] = (789, 'aba') r3 = check_if_hashable(x3) assert_type(r3, Iterable[object]) assert r3 is x3 # object should be unchanged - x4: List[Set[int]] = [{1, 2, 3}, {4, 5, 6}] + x4: list[set[int]] = [{1, 2, 3}, {4, 5, 6}] with pytest.raises(Exception): # should be rejected by mypy sice set isn't Hashable, but also throw at runtime r4 = check_if_hashable(x4) # type: ignore[type-var] @@ -307,7 +301,7 @@ def test_check_if_hashable() -> None: class X: a: int - x6: List[X] = [X(a=123)] + x6: list[X] = [X(a=123)] r6 = check_if_hashable(x6) assert x6 is r6 @@ -316,7 +310,7 @@ def test_check_if_hashable() -> None: class Y(X): b: str - x7: List[Y] = [Y(a=123, b='aba')] + x7: list[Y] = [Y(a=123, b='aba')] with pytest.raises(Exception): # ideally that would also be rejected by mypy, but currently there is a bug # which treats all dataclasses as hashable: https://github.com/python/mypy/issues/11463 @@ -331,11 +325,8 @@ _UEU = TypeVar('_UEU') # instead of just iterator # TODO maybe deprecated Callable support? not sure def unique_everseen( - fun: Union[ - Callable[[], Iterable[_UET]], - Iterable[_UET] - ], - key: Optional[Callable[[_UET], _UEU]] = None, + fun: Callable[[], Iterable[_UET]] | Iterable[_UET], + key: Callable[[_UET], _UEU] | None = None, ) -> Iterator[_UET]: import os diff --git a/my/core/warnings.py b/my/core/warnings.py index 2ffc3e4..d67ec7d 100644 --- a/my/core/warnings.py +++ b/my/core/warnings.py @@ -5,14 +5,16 @@ since who looks at the terminal output? E.g. would be nice to propagate the warnings in the UI (it's even a subclass of Exception!) 
''' +from __future__ import annotations + import sys import warnings -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING import click -def _colorize(x: str, color: Optional[str] = None) -> str: +def _colorize(x: str, color: str | None = None) -> str: if color is None: return x @@ -24,7 +26,7 @@ def _colorize(x: str, color: Optional[str] = None) -> str: return click.style(x, fg=color) -def _warn(message: str, *args, color: Optional[str] = None, **kwargs) -> None: +def _warn(message: str, *args, color: str | None = None, **kwargs) -> None: stacklevel = kwargs.get('stacklevel', 1) kwargs['stacklevel'] = stacklevel + 2 # +1 for this function, +1 for medium/high wrapper warnings.warn(_colorize(message, color=color), *args, **kwargs) # noqa: B028 From 8496d131e7e44b3effcc289762a4218aa1457725 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 19 Oct 2024 22:10:40 +0100 Subject: [PATCH 293/302] general: migrate modules to use 3.9 features --- my/arbtt.py | 23 +++++++----- my/bluemaestro.py | 11 +++--- my/body/blood.py | 36 ++++++++++--------- my/body/exercise/all.py | 6 ++-- my/body/exercise/cardio.py | 1 - my/body/exercise/cross_trainer.py | 18 ++++++---- my/body/sleep/common.py | 5 +-- my/body/sleep/main.py | 5 ++- my/body/weight.py | 8 ++--- my/books/kobo.py | 7 ++-- my/browser/active_browser.py | 8 +++-- my/browser/all.py | 6 ++-- my/browser/export.py | 9 ++--- my/bumble/android.py | 26 +++++++------- my/calendar/holidays.py | 3 +- my/cfg.py | 1 - my/codeforces.py | 9 +++-- my/coding/commits.py | 34 +++++++++--------- my/common.py | 2 +- my/config.py | 36 ++++++++++--------- my/core/_deprecated/kompress.py | 2 +- my/core/common.py | 4 +-- my/demo.py | 6 ++-- my/emfit/__init__.py | 46 +++++++++++++----------- my/endomondo.py | 24 ++++++++----- my/error.py | 2 +- my/experimental/destructive_parsing.py | 9 ++--- my/fbmessenger/__init__.py | 1 + my/fbmessenger/all.py | 6 ++-- my/fbmessenger/android.py | 35 ++++++++++--------- my/fbmessenger/common.py | 18 ++++++---- my/fbmessenger/export.py | 9 +++-- my/foursquare.py | 9 +++-- my/github/all.py | 3 +- my/github/common.py | 21 ++++++----- my/github/gdpr.py | 3 +- my/github/ghexport.py | 25 +++++++++----- my/goodreads.py | 16 +++++---- my/google/maps/_android_protobuf.py | 4 +-- my/google/maps/android.py | 12 +++---- my/google/takeout/html.py | 28 ++++++++------- my/google/takeout/parser.py | 20 ++++++----- my/google/takeout/paths.py | 12 ++++--- my/hackernews/dogsheep.py | 12 +++---- my/hackernews/harmonic.py | 25 +++++++++----- my/hackernews/materialistic.py | 11 +++--- my/hypothesis.py | 10 +++--- my/instagram/all.py | 5 ++- my/instagram/android.py | 37 ++++++++++---------- my/instagram/common.py | 9 ++--- my/instagram/gdpr.py | 19 +++++----- my/instapaper.py | 10 ++++-- my/ip/all.py | 3 +- my/ip/common.py | 7 ++-- my/jawbone/__init__.py | 23 ++++++------ my/jawbone/plots.py | 17 ++++----- my/kobo.py | 31 +++++++++-------- my/kython/kompress.py | 3 +- my/lastfm.py | 14 +++++--- my/location/all.py | 5 ++- my/location/common.py | 11 +++--- my/location/fallback/all.py | 10 +++--- my/location/fallback/common.py | 31 +++++++++-------- my/location/fallback/via_home.py | 32 ++++++++--------- my/location/fallback/via_ip.py | 8 ++--- my/location/google.py | 21 ++++++----- my/location/google_takeout.py | 7 ++-- my/location/google_takeout_semantic.py | 11 +++--- my/location/gpslogger.py | 10 +++--- my/location/home.py | 4 +-- my/location/via_ip.py | 4 +-- my/materialistic.py | 1 + my/media/imdb.py | 10 +++--- 
my/media/youtube.py | 2 +- my/monzo/monzoexport.py | 5 +-- my/orgmode.py | 10 +++--- my/pdfs.py | 14 ++++---- my/photos/main.py | 29 +++++++++------- my/photos/utils.py | 15 ++++---- my/pinboard.py | 9 ++--- my/pocket.py | 12 ++++--- my/polar.py | 33 ++++++++++-------- my/reddit/__init__.py | 1 + my/reddit/all.py | 7 ++-- my/reddit/common.py | 10 +++--- my/reddit/pushshift.py | 12 +++---- my/reddit/rexport.py | 17 ++++----- my/rescuetime.py | 23 +++++++----- my/roamresearch.py | 29 +++++++++------- my/rss/all.py | 4 +-- my/rss/common.py | 16 +++++---- my/rss/feedbin.py | 8 ++--- my/rss/feedly.py | 3 +- my/rtm.py | 24 ++++++------- my/runnerup.py | 14 ++++---- my/simple.py | 5 ++- my/smscalls.py | 48 ++++++++++++++------------ my/stackexchange/gdpr.py | 20 ++++++++--- my/stackexchange/stexport.py | 3 +- my/taplog.py | 14 ++++---- my/telegram/telegram_backup.py | 30 ++++++++-------- my/tests/bluemaestro.py | 2 +- my/tests/body/weight.py | 6 ++-- my/tests/commits.py | 7 ++-- my/tests/location/fallback.py | 2 +- my/tests/reddit.py | 10 +++--- my/time/tz/common.py | 1 - my/time/tz/main.py | 1 + my/time/tz/via_location.py | 36 +++++++++---------- my/tinder/android.py | 18 +++++----- my/topcoder.py | 8 ++--- my/twitter/all.py | 6 ++-- my/twitter/android.py | 16 ++++----- my/twitter/archive.py | 3 +- my/twitter/common.py | 10 +++--- my/twitter/talon.py | 3 +- my/twitter/twint.py | 10 +++--- my/util/hpi_heartbeat.py | 11 +++--- my/vk/favorites.py | 13 +++---- my/vk/vk_messages_backup.py | 12 +++---- my/whatsapp/android.py | 23 ++++++------ my/youtube/takeout.py | 3 +- my/zotero.py | 22 ++++++------ my/zulip/organization.py | 2 +- ruff.toml | 16 ++++----- 125 files changed, 889 insertions(+), 739 deletions(-) diff --git a/my/arbtt.py b/my/arbtt.py index 2bcf291..5d4bf8e 100644 --- a/my/arbtt.py +++ b/my/arbtt.py @@ -2,20 +2,22 @@ [[https://github.com/nomeata/arbtt#arbtt-the-automatic-rule-based-time-tracker][Arbtt]] time tracking ''' +from __future__ import annotations + REQUIRES = ['ijson', 'cffi'] # NOTE likely also needs libyajl2 from apt or elsewhere? +from collections.abc import Iterable, Sequence from dataclasses import dataclass from pathlib import Path -from typing import Sequence, Iterable, List, Optional def inputs() -> Sequence[Path]: try: from my.config import arbtt as user_config except ImportError: - from .core.warnings import low + from my.core.warnings import low low("Couldn't find 'arbtt' config section, falling back to the default capture.log (usually in HOME dir). Add 'arbtt' section with logfiles = '' to suppress this warning.") return [] else: @@ -55,7 +57,7 @@ class Entry: return fromisoformat(ds) @property - def active(self) -> Optional[str]: + def active(self) -> str | None: # NOTE: WIP, might change this in the future... 
ait = (w for w in self.json['windows'] if w['active']) a = next(ait, None) @@ -74,17 +76,18 @@ class Entry: def entries() -> Iterable[Entry]: inps = list(inputs()) - base: List[PathIsh] = ['arbtt-dump', '--format=json'] + base: list[PathIsh] = ['arbtt-dump', '--format=json'] - cmds: List[List[PathIsh]] + cmds: list[list[PathIsh]] if len(inps) == 0: cmds = [base] # rely on default else: # otherwise, 'merge' them cmds = [[*base, '--logfile', f] for f in inps] - import ijson.backends.yajl2_cffi as ijson # type: ignore - from subprocess import Popen, PIPE + from subprocess import PIPE, Popen + + import ijson.backends.yajl2_cffi as ijson # type: ignore for cmd in cmds: with Popen(cmd, stdout=PIPE) as p: out = p.stdout; assert out is not None @@ -93,8 +96,8 @@ def entries() -> Iterable[Entry]: def fill_influxdb() -> None: - from .core.influxdb import magic_fill from .core.freezer import Freezer + from .core.influxdb import magic_fill freezer = Freezer(Entry) fit = (freezer.freeze(e) for e in entries()) # TODO crap, influxdb doesn't like None https://github.com/influxdata/influxdb/issues/7722 @@ -106,6 +109,8 @@ def fill_influxdb() -> None: magic_fill(fit, name=f'{entries.__module__}:{entries.__name__}') -from .core import stat, Stats +from .core import Stats, stat + + def stats() -> Stats: return stat(entries) diff --git a/my/bluemaestro.py b/my/bluemaestro.py index 4c33fd1..8c739f0 100644 --- a/my/bluemaestro.py +++ b/my/bluemaestro.py @@ -2,14 +2,17 @@ [[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor """ +from __future__ import annotations + # todo most of it belongs to DAL... but considering so few people use it I didn't bother for now import re import sqlite3 from abc import abstractmethod +from collections.abc import Iterable, Sequence from dataclasses import dataclass from datetime import datetime, timedelta from pathlib import Path -from typing import Iterable, Optional, Protocol, Sequence, Set +from typing import Protocol import pytz @@ -87,17 +90,17 @@ def measurements() -> Iterable[Res[Measurement]]: total = len(paths) width = len(str(total)) - last: Optional[datetime] = None + last: datetime | None = None # tables are immutable, so can save on processing.. - processed_tables: Set[str] = set() + processed_tables: set[str] = set() for idx, path in enumerate(paths): logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') tot = 0 new = 0 # todo assert increasing timestamp? 
with sqlite_connect_immutable(path) as db: - db_dt: Optional[datetime] = None + db_dt: datetime | None = None try: datas = db.execute( f'SELECT "{path.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index' diff --git a/my/body/blood.py b/my/body/blood.py index fb035eb..867568c 100644 --- a/my/body/blood.py +++ b/my/body/blood.py @@ -2,41 +2,42 @@ Blood tracking (manual org-mode entries) """ +from __future__ import annotations + +from collections.abc import Iterable from datetime import datetime -from typing import Iterable, NamedTuple, Optional +from typing import NamedTuple -from ..core.error import Res -from ..core.orgmode import parse_org_datetime, one_table - - -import pandas as pd import orgparse - +import pandas as pd from my.config import blood as config # type: ignore[attr-defined] +from ..core.error import Res +from ..core.orgmode import one_table, parse_org_datetime + class Entry(NamedTuple): dt: datetime - ketones : Optional[float]=None - glucose : Optional[float]=None + ketones : float | None=None + glucose : float | None=None - vitamin_d : Optional[float]=None - vitamin_b12 : Optional[float]=None + vitamin_d : float | None=None + vitamin_b12 : float | None=None - hdl : Optional[float]=None - ldl : Optional[float]=None - triglycerides: Optional[float]=None + hdl : float | None=None + ldl : float | None=None + triglycerides: float | None=None - source : Optional[str]=None - extra : Optional[str]=None + source : str | None=None + extra : str | None=None Result = Res[Entry] -def try_float(s: str) -> Optional[float]: +def try_float(s: str) -> float | None: l = s.split() if len(l) == 0: return None @@ -105,6 +106,7 @@ def blood_tests_data() -> Iterable[Result]: def data() -> Iterable[Result]: from itertools import chain + from ..core.error import sort_res_by datas = chain(glucose_ketones_data(), blood_tests_data()) return sort_res_by(datas, key=lambda e: e.dt) diff --git a/my/body/exercise/all.py b/my/body/exercise/all.py index e86a5af..d0df747 100644 --- a/my/body/exercise/all.py +++ b/my/body/exercise/all.py @@ -7,10 +7,10 @@ from ...core.pandas import DataFrameT, check_dataframe @check_dataframe def dataframe() -> DataFrameT: # this should be somehow more flexible... - from ...endomondo import dataframe as EDF - from ...runnerup import dataframe as RDF - import pandas as pd + + from ...endomondo import dataframe as EDF + from ...runnerup import dataframe as RDF return pd.concat([ EDF(), RDF(), diff --git a/my/body/exercise/cardio.py b/my/body/exercise/cardio.py index 083b972..d8a6afd 100644 --- a/my/body/exercise/cardio.py +++ b/my/body/exercise/cardio.py @@ -3,7 +3,6 @@ Cardio data, filtered from various data sources ''' from ...core.pandas import DataFrameT, check_dataframe - CARDIO = { 'Running', 'Running, treadmill', diff --git a/my/body/exercise/cross_trainer.py b/my/body/exercise/cross_trainer.py index edbb557..30f96f9 100644 --- a/my/body/exercise/cross_trainer.py +++ b/my/body/exercise/cross_trainer.py @@ -5,16 +5,18 @@ This is probably too specific to my needs, so later I will move it away to a per For now it's worth keeping it here as an example and perhaps utility functions might be useful for other HPI modules. 
''' -from datetime import datetime, timedelta -from typing import Optional +from __future__ import annotations -from ...core.pandas import DataFrameT, check_dataframe as cdf -from ...core.orgmode import collect, Table, parse_org_datetime, TypedTable +from datetime import datetime, timedelta + +import pytz from my.config import exercise as config +from ...core.orgmode import Table, TypedTable, collect, parse_org_datetime +from ...core.pandas import DataFrameT +from ...core.pandas import check_dataframe as cdf -import pytz # FIXME how to attach it properly? tz = pytz.timezone('Europe/London') @@ -114,7 +116,7 @@ def dataframe() -> DataFrameT: rows.append(rd) # presumably has an error set continue - idx: Optional[int] + idx: int | None close = edf[edf['start_time'].apply(lambda t: pd_date_diff(t, mdate)).abs() < _DELTA] if len(close) == 0: idx = None @@ -163,7 +165,9 @@ def dataframe() -> DataFrameT: # TODO wtf?? where is speed coming from?? -from ...core import stat, Stats +from ...core import Stats, stat + + def stats() -> Stats: return stat(cross_trainer_data) diff --git a/my/body/sleep/common.py b/my/body/sleep/common.py index 1100814..fc288e5 100644 --- a/my/body/sleep/common.py +++ b/my/body/sleep/common.py @@ -1,5 +1,6 @@ -from ...core import stat, Stats -from ...core.pandas import DataFrameT, check_dataframe as cdf +from ...core import Stats, stat +from ...core.pandas import DataFrameT +from ...core.pandas import check_dataframe as cdf class Combine: diff --git a/my/body/sleep/main.py b/my/body/sleep/main.py index 29b12a7..2460e03 100644 --- a/my/body/sleep/main.py +++ b/my/body/sleep/main.py @@ -1,7 +1,6 @@ -from ... import jawbone -from ... import emfit - +from ... import emfit, jawbone from .common import Combine + _combined = Combine([ jawbone, emfit, diff --git a/my/body/weight.py b/my/body/weight.py index 51e6513..d5478ef 100644 --- a/my/body/weight.py +++ b/my/body/weight.py @@ -2,14 +2,14 @@ Weight data (manually logged) ''' +from collections.abc import Iterator from dataclasses import dataclass from datetime import datetime -from typing import Any, Iterator - -from my.core import make_logger -from my.core.error import Res, extract_error_datetime, set_error_datetime +from typing import Any from my import orgmode +from my.core import make_logger +from my.core.error import Res, extract_error_datetime, set_error_datetime config = Any diff --git a/my/books/kobo.py b/my/books/kobo.py index 2a469d0..899ef31 100644 --- a/my/books/kobo.py +++ b/my/books/kobo.py @@ -1,7 +1,6 @@ -from ..core import warnings +from my.core import warnings warnings.high('my.books.kobo is deprecated! 
Please use my.kobo instead!') -from ..core.util import __NOT_HPI_MODULE__ - -from ..kobo import * # type: ignore[no-redef] +from my.core.util import __NOT_HPI_MODULE__ +from my.kobo import * # type: ignore[no-redef] diff --git a/my/browser/active_browser.py b/my/browser/active_browser.py index 6f335bd..8051f1b 100644 --- a/my/browser/active_browser.py +++ b/my/browser/active_browser.py @@ -19,16 +19,18 @@ class config(user_config.active_browser): export_path: Paths +from collections.abc import Iterator, Sequence from pathlib import Path -from typing import Sequence, Iterator -from my.core import get_files, Stats, make_logger -from browserexport.merge import read_visits, Visit +from browserexport.merge import Visit, read_visits from sqlite_backup import sqlite_backup +from my.core import Stats, get_files, make_logger + logger = make_logger(__name__) from .common import _patch_browserexport_logs + _patch_browserexport_logs(logger.level) diff --git a/my/browser/all.py b/my/browser/all.py index a7d12b4..feb973a 100644 --- a/my/browser/all.py +++ b/my/browser/all.py @@ -1,9 +1,9 @@ -from typing import Iterator +from collections.abc import Iterator + +from browserexport.merge import Visit, merge_visits from my.core import Stats from my.core.source import import_source -from browserexport.merge import merge_visits, Visit - src_export = import_source(module_name="my.browser.export") src_active = import_source(module_name="my.browser.active_browser") diff --git a/my/browser/export.py b/my/browser/export.py index 1b428b5..351cf6e 100644 --- a/my/browser/export.py +++ b/my/browser/export.py @@ -4,11 +4,12 @@ Parses browser history using [[http://github.com/seanbreckenridge/browserexport] REQUIRES = ["browserexport"] +from collections.abc import Iterator, Sequence from dataclasses import dataclass from pathlib import Path -from typing import Iterator, Sequence -import my.config +from browserexport.merge import Visit, read_and_merge + from my.core import ( Paths, Stats, @@ -18,10 +19,10 @@ from my.core import ( ) from my.core.cachew import mcachew -from browserexport.merge import read_and_merge, Visit - from .common import _patch_browserexport_logs +import my.config # isort: skip + @dataclass class config(my.config.browser.export): diff --git a/my/bumble/android.py b/my/bumble/android.py index 54a0441..3f9fa13 100644 --- a/my/bumble/android.py +++ b/my/bumble/android.py @@ -3,24 +3,24 @@ Bumble data from Android app database (in =/data/data/com.bumble.app/databases/C """ from __future__ import annotations +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime -from typing import Iterator, Sequence, Optional, Dict +from pathlib import Path from more_itertools import unique_everseen -from my.config import bumble as user_config +from my.core import Paths, get_files + +from my.config import bumble as user_config # isort: skip -from ..core import Paths @dataclass class config(user_config.android): # paths[s]/glob to the exported sqlite databases export_path: Paths -from ..core import get_files -from pathlib import Path def inputs() -> Sequence[Path]: return get_files(config.export_path) @@ -43,22 +43,24 @@ class _BaseMessage: @dataclass(unsafe_hash=True) class _Message(_BaseMessage): conversation_id: str - reply_to_id: Optional[str] + reply_to_id: str | None @dataclass(unsafe_hash=True) class Message(_BaseMessage): person: Person - reply_to: Optional[Message] + reply_to: Message | None import json -from typing import Union -from ..core import Res 
import sqlite3 -from ..core.sqlite import sqlite_connect_immutable, select +from typing import Union + from my.core.compat import assert_never +from ..core import Res +from ..core.sqlite import select, sqlite_connect_immutable + EntitiesRes = Res[Union[Person, _Message]] def _entities() -> Iterator[EntitiesRes]: @@ -120,8 +122,8 @@ _UNKNOWN_PERSON = "UNKNOWN_PERSON" def messages() -> Iterator[Res[Message]]: - id2person: Dict[str, Person] = {} - id2msg: Dict[str, Message] = {} + id2person: dict[str, Person] = {} + id2msg: dict[str, Message] = {} for x in unique_everseen(_entities(), key=_key): if isinstance(x, Exception): yield x diff --git a/my/calendar/holidays.py b/my/calendar/holidays.py index af51696..522672e 100644 --- a/my/calendar/holidays.py +++ b/my/calendar/holidays.py @@ -15,7 +15,8 @@ from my.core.time import zone_to_countrycode @lru_cache(1) def _calendar(): - from workalendar.registry import registry # type: ignore + from workalendar.registry import registry # type: ignore + # todo switch to using time.tz.main once _get_tz stabilizes? from ..time.tz import via_location as LTZ # TODO would be nice to do it dynamically depending on the past timezones... diff --git a/my/cfg.py b/my/cfg.py index e4020b4..9331e8a 100644 --- a/my/cfg.py +++ b/my/cfg.py @@ -1,7 +1,6 @@ import my.config as config from .core import __NOT_HPI_MODULE__ - from .core import warnings as W # still used in Promnesia, maybe in dashboard? diff --git a/my/codeforces.py b/my/codeforces.py index f2d150a..9c6b7c9 100644 --- a/my/codeforces.py +++ b/my/codeforces.py @@ -1,13 +1,12 @@ +import json +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone from functools import cached_property -import json from pathlib import Path -from typing import Dict, Iterator, Sequence - -from my.core import get_files, Res, datetime_aware from my.config import codeforces as config # type: ignore[attr-defined] +from my.core import Res, datetime_aware, get_files def inputs() -> Sequence[Path]: @@ -39,7 +38,7 @@ class Competition: class Parser: def __init__(self, *, inputs: Sequence[Path]) -> None: self.inputs = inputs - self.contests: Dict[ContestId, Contest] = {} + self.contests: dict[ContestId, Contest] = {} def _parse_allcontests(self, p: Path) -> Iterator[Contest]: j = json.loads(p.read_text()) diff --git a/my/coding/commits.py b/my/coding/commits.py index 31c366e..fe17dee 100644 --- a/my/coding/commits.py +++ b/my/coding/commits.py @@ -1,29 +1,32 @@ """ Git commits data for repositories on your filesystem """ + +from __future__ import annotations + REQUIRES = [ 'gitpython', ] - import shutil -from pathlib import Path -from datetime import datetime, timezone +from collections.abc import Iterator, Sequence from dataclasses import dataclass, field -from typing import List, Optional, Iterator, Set, Sequence, cast +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional, cast - -from my.core import PathIsh, LazyLogger, make_config +from my.core import LazyLogger, PathIsh, make_config from my.core.cachew import cache_dir, mcachew from my.core.warnings import high +from my.config import commits as user_config # isort: skip + -from my.config import commits as user_config @dataclass class commits_cfg(user_config): roots: Sequence[PathIsh] = field(default_factory=list) - emails: Optional[Sequence[str]] = None - names: Optional[Sequence[str]] = None + emails: Sequence[str] | None = None + names: Sequence[str] | None = None # experiment 
to make it lazy? @@ -40,7 +43,6 @@ def config() -> commits_cfg: import git from git.repo.fun import is_git_dir - log = LazyLogger(__name__, level='info') @@ -93,7 +95,7 @@ def _git_root(git_dir: PathIsh) -> Path: return gd # must be bare -def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Commit]: +def _repo_commits_aux(gr: git.Repo, rev: str, emitted: set[str]) -> Iterator[Commit]: # without path might not handle pull heads properly for c in gr.iter_commits(rev=rev): if not by_me(c): @@ -120,7 +122,7 @@ def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Com def repo_commits(repo: PathIsh): gr = git.Repo(str(repo)) - emitted: Set[str] = set() + emitted: set[str] = set() for r in gr.references: yield from _repo_commits_aux(gr=gr, rev=r.path, emitted=emitted) @@ -141,14 +143,14 @@ def canonical_name(repo: Path) -> str: def _fd_path() -> str: # todo move it to core - fd_path: Optional[str] = shutil.which("fdfind") or shutil.which("fd-find") or shutil.which("fd") + fd_path: str | None = shutil.which("fdfind") or shutil.which("fd-find") or shutil.which("fd") if fd_path is None: high("my.coding.commits requires 'fd' to be installed, See https://github.com/sharkdp/fd#installation") assert fd_path is not None return fd_path -def git_repos_in(roots: List[Path]) -> List[Path]: +def git_repos_in(roots: list[Path]) -> list[Path]: from subprocess import check_output outputs = check_output([ _fd_path(), @@ -172,7 +174,7 @@ def git_repos_in(roots: List[Path]) -> List[Path]: return repos -def repos() -> List[Path]: +def repos() -> list[Path]: return git_repos_in(list(map(Path, config().roots))) @@ -190,7 +192,7 @@ def _repo_depends_on(_repo: Path) -> int: raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}") -def _commits(_repos: List[Path]) -> Iterator[Commit]: +def _commits(_repos: list[Path]) -> Iterator[Commit]: for r in _repos: yield from _cached_commits(r) diff --git a/my/common.py b/my/common.py index 1b56fb5..22e9487 100644 --- a/my/common.py +++ b/my/common.py @@ -1,6 +1,6 @@ from .core.warnings import high + high("DEPRECATED! Please use my.core.common instead.") from .core import __NOT_HPI_MODULE__ - from .core.common import * diff --git a/my/config.py b/my/config.py index 2dd9cda..301bf49 100644 --- a/my/config.py +++ b/my/config.py @@ -9,17 +9,18 @@ This file is used for: - mypy: this file provides some type annotations - for loading the actual user config ''' + +from __future__ import annotations + #### NOTE: you won't need this line VVVV in your personal config -from my.core import init # noqa: F401 +from my.core import init # noqa: F401 # isort: skip ### from datetime import tzinfo from pathlib import Path -from typing import List - -from my.core import Paths, PathIsh +from my.core import PathIsh, Paths class hypothesis: @@ -75,14 +76,16 @@ class google: takeout_path: Paths = '' -from typing import Sequence, Union, Tuple -from datetime import datetime, date, timedelta +from collections.abc import Sequence +from datetime import date, datetime, timedelta +from typing import Union + DateIsh = Union[datetime, date, str] -LatLon = Tuple[float, float] +LatLon = tuple[float, float] class location: # todo ugh, need to think about it... mypy wants the type here to be general, otherwise it can't deduce # and we can't import the types from the module itself, otherwise would be circular. common module? 
- home: Union[LatLon, Sequence[Tuple[DateIsh, LatLon]]] = (1.0, -1.0) + home: LatLon | Sequence[tuple[DateIsh, LatLon]] = (1.0, -1.0) home_accuracy = 30_000.0 class via_ip: @@ -103,6 +106,8 @@ class location: from typing import Literal + + class time: class tz: policy: Literal['keep', 'convert', 'throw'] @@ -121,10 +126,9 @@ class arbtt: logfiles: Paths -from typing import Optional class commits: - emails: Optional[Sequence[str]] - names: Optional[Sequence[str]] + emails: Sequence[str] | None + names: Sequence[str] | None roots: Sequence[PathIsh] @@ -150,8 +154,8 @@ class tinder: class instagram: class android: export_path: Paths - username: Optional[str] - full_name: Optional[str] + username: str | None + full_name: str | None class gdpr: export_path: Paths @@ -169,7 +173,7 @@ class materialistic: class fbmessenger: class fbmessengerexport: export_db: PathIsh - facebook_id: Optional[str] + facebook_id: str | None class android: export_path: Paths @@ -247,7 +251,7 @@ class runnerup: class emfit: export_path: Path timezone: tzinfo - excluded_sids: List[str] + excluded_sids: list[str] class foursquare: @@ -270,7 +274,7 @@ class roamresearch: class whatsapp: class android: export_path: Paths - my_user_id: Optional[str] + my_user_id: str | None class harmonic: diff --git a/my/core/_deprecated/kompress.py b/my/core/_deprecated/kompress.py index ce14fad..c3f333f 100644 --- a/my/core/_deprecated/kompress.py +++ b/my/core/_deprecated/kompress.py @@ -11,7 +11,7 @@ from collections.abc import Iterator, Sequence from datetime import datetime from functools import total_ordering from pathlib import Path -from typing import IO, Any, Union +from typing import IO, Union PathIsh = Union[Path, str] diff --git a/my/core/common.py b/my/core/common.py index 91fe9bd..aa994ea 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -63,7 +63,7 @@ def get_files( if '*' in gs: if glob != DEFAULT_GLOB: warnings.medium(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!") - paths.extend(map(Path, do_glob(gs))) + paths.extend(map(Path, do_glob(gs))) # noqa: PTH207 elif os.path.isdir(str(src)): # noqa: PTH112 # NOTE: we're using os.path here on purpose instead of src.is_dir # the reason is is_dir for archives might return True and then @@ -157,7 +157,7 @@ def get_valid_filename(s: str) -> str: # TODO deprecate and suggest to use one from my.core directly? 
not sure -from .utils.itertools import unique_everseen +from .utils.itertools import unique_everseen # noqa: F401 ### legacy imports, keeping them here for backwards compatibility ## hiding behind TYPE_CHECKING so it works in runtime diff --git a/my/demo.py b/my/demo.py index 0c54792..fa80b2a 100644 --- a/my/demo.py +++ b/my/demo.py @@ -1,12 +1,14 @@ ''' Just a demo module for testing and documentation purposes ''' +from __future__ import annotations import json +from collections.abc import Iterable, Sequence from dataclasses import dataclass from datetime import datetime, timezone, tzinfo from pathlib import Path -from typing import Iterable, Optional, Protocol, Sequence +from typing import Protocol from my.core import Json, PathIsh, Paths, get_files @@ -20,7 +22,7 @@ class config(Protocol): # this is to check optional attribute handling timezone: tzinfo = timezone.utc - external: Optional[PathIsh] = None + external: PathIsh | None = None @property def external_module(self): diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index 9934903..0d50b06 100644 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -4,31 +4,34 @@ Consumes data exported by https://github.com/karlicoss/emfitexport """ +from __future__ import annotations + REQUIRES = [ 'git+https://github.com/karlicoss/emfitexport', ] -from contextlib import contextmanager import dataclasses -from datetime import datetime, time, timedelta import inspect +from collections.abc import Iterable, Iterator +from contextlib import contextmanager +from datetime import datetime, time, timedelta from pathlib import Path -from typing import Any, Dict, Iterable, Iterator, List, Optional - -from my.core import ( - get_files, - stat, - Res, - Stats, -) -from my.core.cachew import cache_dir, mcachew -from my.core.error import set_error_datetime, extract_error_datetime -from my.core.pandas import DataFrameT - -from my.config import emfit as config +from typing import Any import emfitexport.dal as dal +from my.core import ( + Res, + Stats, + get_files, + stat, +) +from my.core.cachew import cache_dir, mcachew +from my.core.error import extract_error_datetime, set_error_datetime +from my.core.pandas import DataFrameT + +from my.config import emfit as config # isort: skip + Emfit = dal.Emfit @@ -85,7 +88,7 @@ def datas() -> Iterable[Res[Emfit]]: # TODO should be used for jawbone data as well? def pre_dataframe() -> Iterable[Res[Emfit]]: # TODO shit. I need some sort of interrupted sleep detection? - g: List[Emfit] = [] + g: list[Emfit] = [] def flush() -> Iterable[Res[Emfit]]: if len(g) == 0: @@ -112,10 +115,10 @@ def pre_dataframe() -> Iterable[Res[Emfit]]: def dataframe() -> DataFrameT: - dicts: List[Dict[str, Any]] = [] - last: Optional[Emfit] = None + dicts: list[dict[str, Any]] = [] + last: Emfit | None = None for s in pre_dataframe(): - d: Dict[str, Any] + d: dict[str, Any] if isinstance(s, Exception): edt = extract_error_datetime(s) d = { @@ -166,11 +169,12 @@ def stats() -> Stats: @contextmanager def fake_data(nights: int = 500) -> Iterator: - from my.core.cfg import tmp_config from tempfile import TemporaryDirectory import pytz + from my.core.cfg import tmp_config + with TemporaryDirectory() as td: tdir = Path(td) gen = dal.FakeData() @@ -187,7 +191,7 @@ def fake_data(nights: int = 500) -> Iterator: # TODO remove/deprecate it? I think used by timeline -def get_datas() -> List[Emfit]: +def get_datas() -> list[Emfit]: # todo ugh. 
run lint properly return sorted(datas(), key=lambda e: e.start) # type: ignore diff --git a/my/endomondo.py b/my/endomondo.py index 293a542..7732c00 100644 --- a/my/endomondo.py +++ b/my/endomondo.py @@ -7,13 +7,14 @@ REQUIRES = [ ] # todo use ast in setup.py or doctor to extract the corresponding pip packages? +from collections.abc import Iterable, Sequence from dataclasses import dataclass from pathlib import Path -from typing import Sequence, Iterable + +from my.config import endomondo as user_config from .core import Paths, get_files -from my.config import endomondo as user_config @dataclass class endomondo(user_config): @@ -33,15 +34,17 @@ def inputs() -> Sequence[Path]: import endoexport.dal as dal from endoexport.dal import Point, Workout # noqa: F401 - from .core import Res + + # todo cachew? def workouts() -> Iterable[Res[Workout]]: _dal = dal.DAL(inputs()) yield from _dal.workouts() -from .core.pandas import check_dataframe, DataFrameT +from .core.pandas import DataFrameT, check_dataframe + @check_dataframe def dataframe(*, defensive: bool=True) -> DataFrameT: @@ -75,7 +78,9 @@ def dataframe(*, defensive: bool=True) -> DataFrameT: return df -from .core import stat, Stats +from .core import Stats, stat + + def stats() -> Stats: return { # todo pretty print stats? @@ -86,13 +91,16 @@ def stats() -> Stats: # TODO make sure it's possible to 'advise' functions and override stuff +from collections.abc import Iterator from contextlib import contextmanager -from typing import Iterator + + @contextmanager def fake_data(count: int=100) -> Iterator: - from my.core.cfg import tmp_config - from tempfile import TemporaryDirectory import json + from tempfile import TemporaryDirectory + + from my.core.cfg import tmp_config with TemporaryDirectory() as td: tdir = Path(td) fd = dal.FakeData() diff --git a/my/error.py b/my/error.py index c0b734c..e3c1e11 100644 --- a/my/error.py +++ b/my/error.py @@ -1,6 +1,6 @@ from .core.warnings import high + high("DEPRECATED! Please use my.core.error instead.") from .core import __NOT_HPI_MODULE__ - from .core.error import * diff --git a/my/experimental/destructive_parsing.py b/my/experimental/destructive_parsing.py index b389f7e..0c4092a 100644 --- a/my/experimental/destructive_parsing.py +++ b/my/experimental/destructive_parsing.py @@ -1,5 +1,6 @@ +from collections.abc import Iterator from dataclasses import dataclass -from typing import Any, Iterator, List, Tuple +from typing import Any from my.core.compat import NoneType, assert_never @@ -9,7 +10,7 @@ from my.core.compat import NoneType, assert_never class Helper: manager: 'Manager' item: Any # todo realistically, list or dict? could at least type as indexable or something - path: Tuple[str, ...] + path: tuple[str, ...] def pop_if_primitive(self, *keys: str) -> None: """ @@ -40,9 +41,9 @@ def is_empty(x) -> bool: class Manager: def __init__(self) -> None: - self.helpers: List[Helper] = [] + self.helpers: list[Helper] = [] - def helper(self, item: Any, *, path: Tuple[str, ...] = ()) -> Helper: + def helper(self, item: Any, *, path: tuple[str, ...] 
= ()) -> Helper: res = Helper(manager=self, item=item, path=path) self.helpers.append(res) return res diff --git a/my/fbmessenger/__init__.py b/my/fbmessenger/__init__.py index 40fb235..f729de9 100644 --- a/my/fbmessenger/__init__.py +++ b/my/fbmessenger/__init__.py @@ -20,6 +20,7 @@ REQUIRES = [ from my.core.hpi_compat import handle_legacy_import + is_legacy_import = handle_legacy_import( parent_module_name=__name__, legacy_submodule_name='export', diff --git a/my/fbmessenger/all.py b/my/fbmessenger/all.py index 13689db..a057dca 100644 --- a/my/fbmessenger/all.py +++ b/my/fbmessenger/all.py @@ -1,10 +1,10 @@ -from typing import Iterator -from my.core import Res, stat, Stats +from collections.abc import Iterator + +from my.core import Res, Stats from my.core.source import import_source from .common import Message, _merge_messages - src_export = import_source(module_name='my.fbmessenger.export') src_android = import_source(module_name='my.fbmessenger.android') diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index effabab..a16d924 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -4,19 +4,20 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab from __future__ import annotations +import sqlite3 +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -import sqlite3 -from typing import Iterator, Sequence, Optional, Dict, Union, List +from typing import Union -from my.core import get_files, Paths, datetime_aware, Res, LazyLogger, make_config +from my.core import LazyLogger, Paths, Res, datetime_aware, get_files, make_config from my.core.common import unique_everseen from my.core.compat import assert_never from my.core.error import echain from my.core.sqlite import sqlite_connection -from my.config import fbmessenger as user_config +from my.config import fbmessenger as user_config # isort: skip logger = LazyLogger(__name__) @@ -27,7 +28,7 @@ class Config(user_config.android): # paths[s]/glob to the exported sqlite databases export_path: Paths - facebook_id: Optional[str] = None + facebook_id: str | None = None # hmm. this is necessary for default value (= None) to work @@ -42,13 +43,13 @@ def inputs() -> Sequence[Path]: @dataclass(unsafe_hash=True) class Sender: id: str - name: Optional[str] + name: str | None @dataclass(unsafe_hash=True) class Thread: id: str - name: Optional[str] # isn't set for groups or one to one messages + name: str | None # isn't set for groups or one to one messages # todo not sure about order of fields... 
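NB: the Optional[...] -> ... | None rewrites in this file are safe on python < 3.10
only because of the `from __future__ import annotations` visible at the top: with
PEP 563 in effect annotations are kept as strings and never evaluated, so the
PEP 604 union syntax needs no runtime support. A minimal sketch, reusing the
Sender class from the hunk above:

    from __future__ import annotations  # annotations stay as strings (PEP 563)

    from dataclasses import dataclass

    @dataclass(unsafe_hash=True)
    class Sender:
        id: str
        name: str | None  # ok even on 3.8: the annotation is never evaluated

Without the future import, evaluating `str | None` in the class body would raise
TypeError: unsupported operand type(s) for | on older interpreters.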
@@ -56,14 +57,14 @@ class Thread: class _BaseMessage: id: str dt: datetime_aware - text: Optional[str] + text: str | None @dataclass(unsafe_hash=True) class _Message(_BaseMessage): thread_id: str sender_id: str - reply_to_id: Optional[str] + reply_to_id: str | None # todo hmm, on the one hand would be kinda nice to inherit common.Message protocol here @@ -72,7 +73,7 @@ class _Message(_BaseMessage): class Message(_BaseMessage): thread: Thread sender: Sender - reply_to: Optional[Message] + reply_to: Message | None Entity = Union[Sender, Thread, _Message] @@ -110,7 +111,7 @@ def _normalise_thread_id(key) -> str: # NOTE: this is sort of copy pasted from other _process_db method # maybe later could unify them def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: - senders: Dict[str, Sender] = {} + senders: dict[str, Sender] = {} for r in db.execute('SELECT CAST(id AS TEXT) AS id, name FROM contacts'): s = Sender( id=r['id'], # looks like it's server id? same used on facebook site @@ -127,7 +128,7 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: # TODO can we get it from db? could infer as the most common id perhaps? self_id = config.facebook_id - thread_users: Dict[str, List[Sender]] = {} + thread_users: dict[str, list[Sender]] = {} for r in db.execute('SELECT CAST(thread_key AS TEXT) AS thread_key, CAST(contact_id AS TEXT) AS contact_id FROM participants'): thread_key = r['thread_key'] user_key = r['contact_id'] @@ -193,7 +194,7 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: def _process_db_threads_db2(db: sqlite3.Connection) -> Iterator[Res[Entity]]: - senders: Dict[str, Sender] = {} + senders: dict[str, Sender] = {} for r in db.execute('''SELECT * FROM thread_users'''): # for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None # but they are still referenced, so need to keep @@ -207,7 +208,7 @@ def _process_db_threads_db2(db: sqlite3.Connection) -> Iterator[Res[Entity]]: yield s self_id = config.facebook_id - thread_users: Dict[str, List[Sender]] = {} + thread_users: dict[str, list[Sender]] = {} for r in db.execute('SELECT * from thread_participants'): thread_key = r['thread_key'] user_key = r['user_key'] @@ -267,9 +268,9 @@ def contacts() -> Iterator[Res[Sender]]: def messages() -> Iterator[Res[Message]]: - senders: Dict[str, Sender] = {} - msgs: Dict[str, Message] = {} - threads: Dict[str, Thread] = {} + senders: dict[str, Sender] = {} + msgs: dict[str, Message] = {} + threads: dict[str, Thread] = {} for x in unique_everseen(_entities): if isinstance(x, Exception): yield x diff --git a/my/fbmessenger/common.py b/my/fbmessenger/common.py index 33d1b20..0f5a374 100644 --- a/my/fbmessenger/common.py +++ b/my/fbmessenger/common.py @@ -1,6 +1,9 @@ -from my.core import __NOT_HPI_MODULE__ +from __future__ import annotations -from typing import Iterator, Optional, Protocol +from my.core import __NOT_HPI_MODULE__ # isort: skip + +from collections.abc import Iterator +from typing import Protocol from my.core import datetime_aware @@ -10,7 +13,7 @@ class Thread(Protocol): def id(self) -> str: ... @property - def name(self) -> Optional[str]: ... + def name(self) -> str | None: ... class Sender(Protocol): @@ -18,7 +21,7 @@ class Sender(Protocol): def id(self) -> str: ... @property - def name(self) -> Optional[str]: ... + def name(self) -> str | None: ... class Message(Protocol): @@ -29,7 +32,7 @@ class Message(Protocol): def dt(self) -> datetime_aware: ... @property - def text(self) -> Optional[str]: ... 
+ def text(self) -> str | None: ... @property def thread(self) -> Thread: ... @@ -39,8 +42,11 @@ class Message(Protocol): from itertools import chain + from more_itertools import unique_everseen -from my.core import warn_if_empty, Res + +from my.core import Res, warn_if_empty + @warn_if_empty def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: diff --git a/my/fbmessenger/export.py b/my/fbmessenger/export.py index 201fad8..3b06618 100644 --- a/my/fbmessenger/export.py +++ b/my/fbmessenger/export.py @@ -7,16 +7,15 @@ REQUIRES = [ 'git+https://github.com/karlicoss/fbmessengerexport', ] +from collections.abc import Iterator from contextlib import ExitStack, contextmanager from dataclasses import dataclass -from typing import Iterator - -from my.core import PathIsh, Res, stat, Stats -from my.core.warnings import high -from my.config import fbmessenger as user_config import fbmessengerexport.dal as messenger +from my.config import fbmessenger as user_config +from my.core import PathIsh, Res, Stats, stat +from my.core.warnings import high ### # support old style config diff --git a/my/foursquare.py b/my/foursquare.py index 394fdf3..3b418aa 100644 --- a/my/foursquare.py +++ b/my/foursquare.py @@ -2,15 +2,14 @@ Foursquare/Swarm checkins ''' -from datetime import datetime, timezone, timedelta -from itertools import chain import json +from datetime import datetime, timedelta, timezone +from itertools import chain -# TODO pytz for timezone??? - -from my.core import get_files, make_logger from my.config import foursquare as config +# TODO pytz for timezone??? +from my.core import get_files, make_logger logger = make_logger(__name__) diff --git a/my/github/all.py b/my/github/all.py index f885dde..f5e13cf 100644 --- a/my/github/all.py +++ b/my/github/all.py @@ -3,8 +3,7 @@ Unified Github data (merged from GDPR export and periodic API updates) """ from . import gdpr, ghexport - -from .common import merge_events, Results +from .common import Results, merge_events def events() -> Results: diff --git a/my/github/common.py b/my/github/common.py index e54bc4d..22ba47e 100644 --- a/my/github/common.py +++ b/my/github/common.py @@ -1,24 +1,27 @@ """ Github events and their metadata: comments/issues/pull requests """ -from ..core import __NOT_HPI_MODULE__ + +from __future__ import annotations + +from my.core import __NOT_HPI_MODULE__ # isort: skip +from collections.abc import Iterable from datetime import datetime, timezone -from typing import Optional, NamedTuple, Iterable, Set, Tuple +from typing import NamedTuple, Optional -from ..core import warn_if_empty, LazyLogger -from ..core.error import Res +from my.core import make_logger, warn_if_empty +from my.core.error import Res - -logger = LazyLogger(__name__) +logger = make_logger(__name__) class Event(NamedTuple): dt: datetime summary: str eid: str link: Optional[str] - body: Optional[str]=None + body: Optional[str] = None is_bot: bool = False @@ -27,7 +30,7 @@ Results = Iterable[Res[Event]] @warn_if_empty def merge_events(*sources: Results) -> Results: from itertools import chain - emitted: Set[Tuple[datetime, str]] = set() + emitted: set[tuple[datetime, str]] = set() for e in chain(*sources): if isinstance(e, Exception): yield e @@ -52,7 +55,7 @@ def parse_dt(s: str) -> datetime: # experimental way of supportint event ids... 
not sure class EventIds: @staticmethod - def repo_created(*, dts: str, name: str, ref_type: str, ref: Optional[str]) -> str: + def repo_created(*, dts: str, name: str, ref_type: str, ref: str | None) -> str: return f'{dts}_repocreated_{name}_{ref_type}_{ref}' @staticmethod diff --git a/my/github/gdpr.py b/my/github/gdpr.py index a56ff46..be56454 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -6,8 +6,9 @@ from __future__ import annotations import json from abc import abstractmethod +from collections.abc import Iterator, Sequence from pathlib import Path -from typing import Any, Iterator, Sequence +from typing import Any from my.core import Paths, Res, Stats, get_files, make_logger, stat, warnings from my.core.error import echain diff --git a/my/github/ghexport.py b/my/github/ghexport.py index 80106a5..3e17c10 100644 --- a/my/github/ghexport.py +++ b/my/github/ghexport.py @@ -1,13 +1,17 @@ """ Github data: events, comments, etc. (API data) """ + +from __future__ import annotations + REQUIRES = [ 'git+https://github.com/karlicoss/ghexport', ] + from dataclasses import dataclass -from my.core import Paths from my.config import github as user_config +from my.core import Paths @dataclass @@ -21,7 +25,9 @@ class github(user_config): ### -from my.core.cfg import make_config, Attrs +from my.core.cfg import Attrs, make_config + + def migration(attrs: Attrs) -> Attrs: export_dir = 'export_dir' if export_dir in attrs: # legacy name @@ -41,15 +47,14 @@ except ModuleNotFoundError as e: ############################ +from collections.abc import Sequence from functools import lru_cache from pathlib import Path -from typing import Tuple, Dict, Sequence, Optional -from my.core import get_files, LazyLogger +from my.core import LazyLogger, get_files from my.core.cachew import mcachew -from .common import Event, parse_dt, Results, EventIds - +from .common import Event, EventIds, Results, parse_dt logger = LazyLogger(__name__) @@ -82,7 +87,9 @@ def _events() -> Results: yield e -from my.core import stat, Stats +from my.core import Stats, stat + + def stats() -> Stats: return { **stat(events), @@ -99,7 +106,7 @@ def _log_if_unhandled(e) -> None: Link = str EventId = str Body = str -def _get_summary(e) -> Tuple[str, Optional[Link], Optional[EventId], Optional[Body]]: +def _get_summary(e) -> tuple[str, Link | None, EventId | None, Body | None]: # TODO would be nice to give access to raw event within timeline dts = e['created_at'] eid = e['id'] @@ -195,7 +202,7 @@ def _get_summary(e) -> Tuple[str, Optional[Link], Optional[EventId], Optional[Bo return tp, None, None, None -def _parse_event(d: Dict) -> Event: +def _parse_event(d: dict) -> Event: summary, link, eid, body = _get_summary(d) if eid is None: eid = d['id'] # meh diff --git a/my/goodreads.py b/my/goodreads.py index 864bd64..559efda 100644 --- a/my/goodreads.py +++ b/my/goodreads.py @@ -7,15 +7,18 @@ REQUIRES = [ from dataclasses import dataclass -from my.core import datetime_aware, Paths + from my.config import goodreads as user_config +from my.core import Paths, datetime_aware + @dataclass class goodreads(user_config): # paths[s]/glob to the exported JSON data export_path: Paths -from my.core.cfg import make_config, Attrs +from my.core.cfg import Attrs, make_config + def _migration(attrs: Attrs) -> Attrs: export_dir = 'export_dir' @@ -29,18 +32,19 @@ config = make_config(goodreads, migration=_migration) #############################3 -from my.core import get_files -from typing import Sequence, Iterator +from collections.abc import Iterator, 
Sequence from pathlib import Path +from my.core import get_files + + def inputs() -> Sequence[Path]: return get_files(config.export_path) from datetime import datetime + import pytz - - from goodrexport import dal diff --git a/my/google/maps/_android_protobuf.py b/my/google/maps/_android_protobuf.py index 1d43ae0..615623d 100644 --- a/my/google/maps/_android_protobuf.py +++ b/my/google/maps/_android_protobuf.py @@ -1,8 +1,8 @@ -from my.core import __NOT_HPI_MODULE__ +from my.core import __NOT_HPI_MODULE__ # isort: skip # NOTE: this tool was quite useful https://github.com/aj3423/aproto -from google.protobuf import descriptor_pool, descriptor_pb2, message_factory +from google.protobuf import descriptor_pb2, descriptor_pool, message_factory TYPE_STRING = descriptor_pb2.FieldDescriptorProto.TYPE_STRING TYPE_BYTES = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES diff --git a/my/google/maps/android.py b/my/google/maps/android.py index 279231a..95ecacf 100644 --- a/my/google/maps/android.py +++ b/my/google/maps/android.py @@ -7,20 +7,20 @@ REQUIRES = [ "protobuf", # for parsing blobs from the database ] +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Any, Iterator, Optional, Sequence +from typing import Any from urllib.parse import quote -from my.core import datetime_aware, get_files, LazyLogger, Paths, Res +from my.core import LazyLogger, Paths, Res, datetime_aware, get_files from my.core.common import unique_everseen from my.core.sqlite import sqlite_connection -import my.config - from ._android_protobuf import parse_labeled, parse_list, parse_place +import my.config # isort: skip logger = LazyLogger(__name__) @@ -59,8 +59,8 @@ class Place: updated_at: datetime_aware # TODO double check it's utc? 
title: str location: Location - address: Optional[str] - note: Optional[str] + address: str | None + note: str | None @property def place_url(self) -> str: diff --git a/my/google/takeout/html.py b/my/google/takeout/html.py index 750beac..3f2b5db 100644 --- a/my/google/takeout/html.py +++ b/my/google/takeout/html.py @@ -2,18 +2,22 @@ Google Takeout exports: browsing history, search/youtube/google play activity ''' -from enum import Enum +from __future__ import annotations + +from my.core import __NOT_HPI_MODULE__ # isort: skip + import re -from pathlib import Path +from collections.abc import Iterable from datetime import datetime +from enum import Enum from html.parser import HTMLParser -from typing import List, Optional, Any, Callable, Iterable, Tuple +from pathlib import Path +from typing import Any, Callable from urllib.parse import unquote import pytz -from ...core.time import abbr_to_timezone - +from my.core.time import abbr_to_timezone # NOTE: https://bugs.python.org/issue22377 %Z doesn't work properly _TIME_FORMATS = [ @@ -36,7 +40,7 @@ def parse_dt(s: str) -> datetime: s, tzabbr = s.rsplit(maxsplit=1) tz = abbr_to_timezone(tzabbr) - dt: Optional[datetime] = None + dt: datetime | None = None for fmt in _TIME_FORMATS: try: dt = datetime.strptime(s, fmt) @@ -73,7 +77,7 @@ class State(Enum): Url = str Title = str -Parsed = Tuple[datetime, Url, Title] +Parsed = tuple[datetime, Url, Title] Callback = Callable[[datetime, Url, Title], None] @@ -83,9 +87,9 @@ class TakeoutHTMLParser(HTMLParser): super().__init__() self.state: State = State.OUTSIDE - self.title_parts: List[str] = [] - self.title: Optional[str] = None - self.url: Optional[str] = None + self.title_parts: list[str] = [] + self.title: str | None = None + self.url: str | None = None self.callback = callback @@ -148,7 +152,7 @@ class TakeoutHTMLParser(HTMLParser): def read_html(tpath: Path, file: str) -> Iterable[Parsed]: - results: List[Parsed] = [] + results: list[Parsed] = [] def cb(dt: datetime, url: Url, title: Title) -> None: results.append((dt, url, title)) parser = TakeoutHTMLParser(callback=cb) @@ -156,5 +160,3 @@ def read_html(tpath: Path, file: str) -> Iterable[Parsed]: data = fo.read() parser.feed(data) return results - -from ...core import __NOT_HPI_MODULE__ diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 170553a..80c2be1 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -14,24 +14,27 @@ the cachew cache REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] +import os +from collections.abc import Sequence from contextlib import ExitStack from dataclasses import dataclass -import os -from typing import List, Sequence, cast from pathlib import Path -from my.core import make_config, stat, Stats, get_files, Paths, make_logger +from typing import cast + +from google_takeout_parser.parse_html.html_time_utils import ABBR_TIMEZONES + +from my.core import Paths, Stats, get_files, make_config, make_logger, stat from my.core.cachew import mcachew from my.core.error import ErrorPolicy from my.core.structure import match_structure - from my.core.time import user_forced -from google_takeout_parser.parse_html.html_time_utils import ABBR_TIMEZONES + ABBR_TIMEZONES.extend(user_forced()) import google_takeout_parser -from google_takeout_parser.path_dispatch import TakeoutParser -from google_takeout_parser.merge import GoogleEventSet, CacheResults +from google_takeout_parser.merge import CacheResults, GoogleEventSet from google_takeout_parser.models import 
BaseEvent +from google_takeout_parser.path_dispatch import TakeoutParser # see https://github.com/seanbreckenridge/dotfiles/blob/master/.config/my/my/config/__init__.py for an example from my.config import google as user_config @@ -56,6 +59,7 @@ logger = make_logger(__name__, level="warning") # patch the takeout parser logger to match the computed loglevel from google_takeout_parser.log import setup as setup_takeout_logger + setup_takeout_logger(logger.level) @@ -83,7 +87,7 @@ except ImportError: google_takeout_version = str(getattr(google_takeout_parser, '__version__', 'unknown')) -def _cachew_depends_on() -> List[str]: +def _cachew_depends_on() -> list[str]: exports = sorted([str(p) for p in inputs()]) # add google takeout parser pip version to hash, so this re-creates on breaking changes exports.insert(0, f"google_takeout_version: {google_takeout_version}") diff --git a/my/google/takeout/paths.py b/my/google/takeout/paths.py index 948cf2e..6a523e2 100644 --- a/my/google/takeout/paths.py +++ b/my/google/takeout/paths.py @@ -2,13 +2,17 @@ Module for locating and accessing [[https://takeout.google.com][Google Takeout]] data ''' +from __future__ import annotations + +from my.core import __NOT_HPI_MODULE__ # isort: skip + from abc import abstractmethod +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional, Protocol from more_itertools import last -from my.core import __NOT_HPI_MODULE__, Paths, get_files +from my.core import Paths, get_files class config: @@ -33,7 +37,7 @@ def make_config() -> config: return combined_config() -def get_takeouts(*, path: Optional[str] = None) -> Iterable[Path]: +def get_takeouts(*, path: str | None = None) -> Iterable[Path]: """ Sometimes google splits takeout into multiple archives, so we need to detect the ones that contain the path we need """ @@ -45,7 +49,7 @@ def get_takeouts(*, path: Optional[str] = None) -> Iterable[Path]: yield takeout -def get_last_takeout(*, path: Optional[str] = None) -> Optional[Path]: +def get_last_takeout(*, path: str | None = None) -> Path | None: return last(get_takeouts(path=path), default=None) diff --git a/my/hackernews/dogsheep.py b/my/hackernews/dogsheep.py index de6c58d..8303284 100644 --- a/my/hackernews/dogsheep.py +++ b/my/hackernews/dogsheep.py @@ -3,14 +3,14 @@ Hackernews data via Dogsheep [[hacker-news-to-sqlite][https://github.com/dogshee """ from __future__ import annotations +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Iterator, Sequence, Optional -from my.core import get_files, Paths, Res, datetime_aware -from my.core.sqlite import sqlite_connection import my.config +from my.core import Paths, Res, datetime_aware, get_files +from my.core.sqlite import sqlite_connection from .common import hackernews_link @@ -33,9 +33,9 @@ class Item: id: str type: str created: datetime_aware # checked and it's utc - title: Optional[str] # only present for Story - text_html: Optional[str] # should be present for Comment and might for Story - url: Optional[str] # might be present for Story + title: str | None # only present for Story + text_html: str | None # should be present for Comment and might for Story + url: str | None # might be present for Story # todo process 'deleted'? fields? # todo process 'parent'? 
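A recurring pattern in the sqlite-backed modules (dogsheep above, fbmessenger,
bumble, instagram): exports are opened via helpers like sqlite_connection /
sqlite_connect_immutable rather than plain sqlite3.connect, so reading can never
mutate the export. Roughly -- this is a sketch of the idea, not the helpers'
actual implementation, and the query is illustrative -- it boils down to sqlite's
immutable URI flag:

    import sqlite3
    from contextlib import closing
    from pathlib import Path

    def read_rows(db: Path, query: str = 'SELECT * FROM items'):
        # immutable=1 promises sqlite the file won't change underneath it,
        # so no locks are taken and no -wal/-shm files appear next to the export
        with closing(sqlite3.connect(f'file:{db}?immutable=1', uri=True)) as conn:
            conn.row_factory = sqlite3.Row  # access columns by name
            yield from conn.execute(query)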
diff --git a/my/hackernews/harmonic.py b/my/hackernews/harmonic.py index 3b4ae61..08a82e6 100644 --- a/my/hackernews/harmonic.py +++ b/my/hackernews/harmonic.py @@ -1,17 +1,22 @@ """ [[https://play.google.com/store/apps/details?id=com.simon.harmonichackernews][Harmonic]] app for Hackernews """ + +from __future__ import annotations + REQUIRES = ['lxml', 'orjson'] +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone -import orjson from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Sequence, TypedDict, cast +from typing import Any, TypedDict, cast +import orjson from lxml import etree from more_itertools import one +import my.config from my.core import ( Paths, Res, @@ -22,8 +27,10 @@ from my.core import ( stat, ) from my.core.common import unique_everseen -import my.config -from .common import hackernews_link, SavedBase + +from .common import SavedBase, hackernews_link + +import my.config # isort: skip logger = make_logger(__name__) @@ -43,7 +50,7 @@ class Cached(TypedDict): created_at_i: int id: str points: int - test: Optional[str] + test: str | None title: str type: str # TODO Literal['story', 'comment']? comments are only in 'children' field tho url: str @@ -94,16 +101,16 @@ def _saved() -> Iterator[Res[Saved]]: # TODO defensive for each item! tr = etree.parse(path) - res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORIES_STRINGS"]'))) + res = one(cast(list[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORIES_STRINGS"]'))) cached_ids = [x.text.split('-')[0] for x in res] - cached: Dict[str, Cached] = {} + cached: dict[str, Cached] = {} for sid in cached_ids: - res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORY{sid}"]'))) + res = one(cast(list[Any], tr.xpath(f'//*[@name="{_PREFIX}_CACHED_STORY{sid}"]'))) j = orjson.loads(res.text) cached[sid] = j - res = one(cast(List[Any], tr.xpath(f'//*[@name="{_PREFIX}_BOOKMARKS"]'))) + res = one(cast(list[Any], tr.xpath(f'//*[@name="{_PREFIX}_BOOKMARKS"]'))) for x in res.text.split('-'): ids, item_timestamp = x.split('q') # not sure if timestamp is any useful? 
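my.hackernews.harmonic above recovers saved stories from an Android
shared-preferences XML dump, using more_itertools.one to insist that exactly one
preference node matches. A stdlib-flavoured sketch of the same idea (the
preference name and payload layout here are illustrative, not Harmonic's actual
ones):

    import xml.etree.ElementTree as ET

    from more_itertools import one

    def bookmarked_ids(prefs_xml: str) -> list[str]:
        # shared_prefs files look like <map><string name="...">payload</string></map>
        root = ET.fromstring(prefs_xml)
        node = one(e for e in root.iter('string') if e.get('name') == 'BOOKMARKS')
        return [chunk.split('q')[0] for chunk in (node.text or '').split('-')]

one() raises unless the iterable holds exactly one element, which turns a
silently wrong parse into a loud error.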
diff --git a/my/hackernews/materialistic.py b/my/hackernews/materialistic.py index 4d5cd47..ccf285b 100644 --- a/my/hackernews/materialistic.py +++ b/my/hackernews/materialistic.py @@ -1,19 +1,20 @@ """ [[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews """ +from collections.abc import Iterator, Sequence from datetime import datetime, timezone from pathlib import Path -from typing import Any, Dict, Iterator, NamedTuple, Sequence +from typing import Any, NamedTuple from more_itertools import unique_everseen -from my.core import get_files, datetime_aware, make_logger +from my.core import datetime_aware, get_files, make_logger from my.core.sqlite import sqlite_connection -from my.config import materialistic as config # todo migrate config to my.hackernews.materialistic - from .common import hackernews_link +# todo migrate config to my.hackernews.materialistic +from my.config import materialistic as config # isort: skip logger = make_logger(__name__) @@ -22,7 +23,7 @@ def inputs() -> Sequence[Path]: return get_files(config.export_path) -Row = Dict[str, Any] +Row = dict[str, Any] class Saved(NamedTuple): diff --git a/my/hypothesis.py b/my/hypothesis.py index 82104cd..15e854b 100644 --- a/my/hypothesis.py +++ b/my/hypothesis.py @@ -4,20 +4,22 @@ REQUIRES = [ 'git+https://github.com/karlicoss/hypexport', ] +from collections.abc import Iterator, Sequence from dataclasses import dataclass from pathlib import Path -from typing import Iterator, Sequence, TYPE_CHECKING +from typing import TYPE_CHECKING from my.core import ( - get_files, - stat, Paths, Res, Stats, + get_files, + stat, ) from my.core.cfg import make_config from my.core.hpi_compat import always_supports_sequence -import my.config + +import my.config # isort: skip @dataclass diff --git a/my/instagram/all.py b/my/instagram/all.py index 8007399..214e6ac 100644 --- a/my/instagram/all.py +++ b/my/instagram/all.py @@ -1,11 +1,10 @@ -from typing import Iterator +from collections.abc import Iterator -from my.core import Res, stat, Stats +from my.core import Res, Stats, stat from my.core.source import import_source from .common import Message, _merge_messages - src_gdpr = import_source(module_name='my.instagram.gdpr') @src_gdpr def _messages_gdpr() -> Iterator[Res[Message]]: diff --git a/my/instagram/android.py b/my/instagram/android.py index 96b75d2..12c11d3 100644 --- a/my/instagram/android.py +++ b/my/instagram/android.py @@ -3,30 +3,29 @@ Bumble data from Android app database (in =/data/data/com.instagram.android/data """ from __future__ import annotations +import json +import sqlite3 +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime -import json from pathlib import Path -import sqlite3 -from typing import Iterator, Sequence, Optional, Dict, Union from my.core import ( - get_files, - Paths, - make_config, - make_logger, - datetime_naive, Json, + Paths, Res, assert_never, + datetime_naive, + get_files, + make_config, + make_logger, ) -from my.core.common import unique_everseen from my.core.cachew import mcachew +from my.core.common import unique_everseen from my.core.error import echain -from my.core.sqlite import sqlite_connect_immutable, select - -from my.config import instagram as user_config +from my.core.sqlite import select, sqlite_connect_immutable +from my.config import instagram as user_config # isort: skip logger = make_logger(__name__) @@ -38,8 +37,8 @@ class instagram_android_config(user_config.android): # 
sadly doesn't seem easy to extract user's own handle/name from the db... # todo maybe makes more sense to keep in parent class? not sure... - username: Optional[str] = None - full_name: Optional[str] = None + username: str | None = None + full_name: str | None = None config = make_config(instagram_android_config) @@ -101,13 +100,13 @@ class MessageError(RuntimeError): return self.rest == other.rest -def _parse_message(j: Json) -> Optional[_Message]: +def _parse_message(j: Json) -> _Message | None: id = j['item_id'] t = j['item_type'] tid = j['thread_key']['thread_id'] uid = j['user_id'] created = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000) - text: Optional[str] = None + text: str | None = None if t == 'text': text = j['text'] elif t == 'reel_share': @@ -133,7 +132,7 @@ def _parse_message(j: Json) -> Optional[_Message]: ) -def _process_db(db: sqlite3.Connection) -> Iterator[Res[Union[User, _Message]]]: +def _process_db(db: sqlite3.Connection) -> Iterator[Res[User | _Message]]: # TODO ugh. seems like no way to extract username? # sometimes messages (e.g. media_share) contain it in message field # but generally it's not present. ugh @@ -175,7 +174,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Res[Union[User, _Message]]]: yield e -def _entities() -> Iterator[Res[Union[User, _Message]]]: +def _entities() -> Iterator[Res[User | _Message]]: # NOTE: definitely need to merge multiple, app seems to recycle old messages # TODO: hmm hard to guarantee timestamp ordering when we use synthetic input data... # todo use TypedDict? @@ -194,7 +193,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: @mcachew(depends_on=inputs) def messages() -> Iterator[Res[Message]]: - id2user: Dict[str, User] = {} + id2user: dict[str, User] = {} for x in unique_everseen(_entities): if isinstance(x, Exception): yield x diff --git a/my/instagram/common.py b/my/instagram/common.py index 4df07a1..17d130f 100644 --- a/my/instagram/common.py +++ b/my/instagram/common.py @@ -1,9 +1,10 @@ +from collections.abc import Iterator from dataclasses import replace from datetime import datetime from itertools import chain -from typing import Iterator, Dict, Any, Protocol +from typing import Any, Protocol -from my.core import warn_if_empty, Res +from my.core import Res, warn_if_empty class User(Protocol): @@ -40,7 +41,7 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: # ugh. 
seems that GDPR thread ids are completely uncorrelated to any android ids (tried searching over all sqlite dump) # so the only way to correlate is to try and match messages # we also can't use unique_everseen here, otherwise will never get a chance to unify threads - mmap: Dict[str, Message] = {} + mmap: dict[str, Message] = {} thread_map = {} user_map = {} @@ -60,7 +61,7 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: user_map[m.user.id] = mm.user else: # not emitted yet, need to emit - repls: Dict[str, Any] = {} + repls: dict[str, Any] = {} tid = thread_map.get(m.thread_id) if tid is not None: repls['thread_id'] = tid diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 1415d55..7454a04 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -2,26 +2,27 @@ Instagram data (uses [[https://www.instagram.com/download/request][official GDPR export]]) """ +from __future__ import annotations + +import json +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime -import json from pathlib import Path -from typing import Iterator, Sequence, Dict, Union from more_itertools import bucket from my.core import ( - get_files, Paths, - datetime_naive, Res, assert_never, + datetime_naive, + get_files, make_logger, ) from my.core.common import unique_everseen -from my.config import instagram as user_config - +from my.config import instagram as user_config # isort: skip logger = make_logger(__name__) @@ -70,7 +71,7 @@ def _decode(s: str) -> str: return s.encode('latin-1').decode('utf8') -def _entities() -> Iterator[Res[Union[User, _Message]]]: +def _entities() -> Iterator[Res[User | _Message]]: # it's worth processing all previous export -- sometimes instagram removes some metadata from newer ones # NOTE: here there are basically two options # - process inputs as is (from oldest to newest) @@ -84,7 +85,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]: yield from _entitites_from_path(path) -def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]: +def _entitites_from_path(path: Path) -> Iterator[Res[User | _Message]]: # TODO make sure it works both with plan directory # idelaly get_files should return the right thing, and we won't have to force ZipPath/match_structure here # e.g. possible options are: @@ -202,7 +203,7 @@ def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]: # TODO basically copy pasted from android.py... 
hmm def messages() -> Iterator[Res[Message]]: - id2user: Dict[str, User] = {} + id2user: dict[str, User] = {} for x in unique_everseen(_entities): if isinstance(x, Exception): yield x diff --git a/my/instapaper.py b/my/instapaper.py index df1f70b..d79e7e4 100644 --- a/my/instapaper.py +++ b/my/instapaper.py @@ -7,10 +7,10 @@ REQUIRES = [ from dataclasses import dataclass -from .core import Paths - from my.config import instapaper as user_config +from .core import Paths + @dataclass class instapaper(user_config): @@ -22,6 +22,7 @@ class instapaper(user_config): from .core.cfg import make_config + config = make_config(instapaper) @@ -39,9 +40,12 @@ Bookmark = dal.Bookmark Page = dal.Page -from typing import Sequence, Iterable +from collections.abc import Iterable, Sequence from pathlib import Path + from .core import get_files + + def inputs() -> Sequence[Path]: return get_files(config.export_path) diff --git a/my/ip/all.py b/my/ip/all.py index 46c1fec..e8277c1 100644 --- a/my/ip/all.py +++ b/my/ip/all.py @@ -9,10 +9,9 @@ For an example of how this could be used, see https://github.com/seanbreckenridg REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] -from typing import Iterator +from collections.abc import Iterator from my.core import Stats, warn_if_empty - from my.ip.common import IP diff --git a/my/ip/common.py b/my/ip/common.py index 244ddc5..ef54ee3 100644 --- a/my/ip/common.py +++ b/my/ip/common.py @@ -2,11 +2,12 @@ Provides location/timezone data from IP addresses, using [[https://github.com/seanbreckenridge/ipgeocache][ipgeocache]] """ -from my.core import __NOT_HPI_MODULE__ +from my.core import __NOT_HPI_MODULE__ # isort: skip import ipaddress -from typing import NamedTuple, Iterator, Tuple +from collections.abc import Iterator from datetime import datetime +from typing import NamedTuple import ipgeocache @@ -22,7 +23,7 @@ class IP(NamedTuple): return ipgeocache.get(self.addr) @property - def latlon(self) -> Tuple[float, float]: + def latlon(self) -> tuple[float, float]: loc: str = self.ipgeocache()["loc"] lat, _, lon = loc.partition(",") return float(lat), float(lon) diff --git a/my/jawbone/__init__.py b/my/jawbone/__init__.py index 35112ba..463d735 100644 --- a/my/jawbone/__init__.py +++ b/my/jawbone/__init__.py @@ -1,10 +1,11 @@ from __future__ import annotations -from typing import Dict, Any, List, Iterable import json +from collections.abc import Iterable +from datetime import date, datetime, time, timedelta from functools import lru_cache -from datetime import datetime, date, time, timedelta from pathlib import Path +from typing import Any import pytz @@ -14,7 +15,6 @@ logger = make_logger(__name__) from my.config import jawbone as config # type: ignore[attr-defined] - BDIR = config.export_dir PHASES_FILE = BDIR / 'phases.json' SLEEPS_FILE = BDIR / 'sleeps.json' @@ -24,7 +24,7 @@ GRAPHS_DIR = BDIR / 'graphs' XID = str # TODO how to shared with backup thing? -Phases = Dict[XID, Any] +Phases = dict[XID, Any] @lru_cache(1) def get_phases() -> Phases: return json.loads(PHASES_FILE.read_text()) @@ -89,7 +89,7 @@ class SleepEntry: # TODO might be useful to cache these?? @property - def phases(self) -> List[datetime]: + def phases(self) -> list[datetime]: # TODO make sure they are consistent with emfit? 
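# (note on the @lru_cache(1) just above: there are no arguments, so maxsize=1
#  means phases.json gets parsed exactly once per process and the dict is shared
#  afterwards. same trick as a self-contained sketch, with a hypothetical loader
#  that does take an argument:
#
#      import json
#      from functools import lru_cache
#      from pathlib import Path
#
#      @lru_cache(1)
#      def load_json(path: Path) -> dict:
#          # re-runs only when the argument changes (maxsize=1 -> one cached entry)
#          return json.loads(path.read_text())
#  )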
return [self._fromts(i['time']) for i in get_phases()[self.xid]] @@ -100,12 +100,13 @@ class SleepEntry: return str(self) -def load_sleeps() -> List[SleepEntry]: +def load_sleeps() -> list[SleepEntry]: sleeps = json.loads(SLEEPS_FILE.read_text()) return [SleepEntry(js) for js in sleeps] -from ..core.error import Res, set_error_datetime, extract_error_datetime +from ..core.error import Res, extract_error_datetime, set_error_datetime + def pre_dataframe() -> Iterable[Res[SleepEntry]]: from more_itertools import bucket @@ -129,9 +130,9 @@ def pre_dataframe() -> Iterable[Res[SleepEntry]]: def dataframe(): - dicts: List[Dict[str, Any]] = [] + dicts: list[dict[str, Any]] = [] for s in pre_dataframe(): - d: Dict[str, Any] + d: dict[str, Any] if isinstance(s, Exception): dt = extract_error_datetime(s) d = { @@ -181,7 +182,7 @@ def plot_one(sleep: SleepEntry, fig, axes, xlims=None, *, showtext=True): print(f"{sleep.xid} span: {span}") # pip install imageio - from imageio import imread # type: ignore + from imageio import imread # type: ignore img = imread(sleep.graph) # all of them are 300x300 images apparently @@ -260,8 +261,8 @@ def predicate(sleep: SleepEntry): # TODO move to dashboard def plot() -> None: - from matplotlib.figure import Figure # type: ignore[import-not-found] import matplotlib.pyplot as plt # type: ignore[import-not-found] + from matplotlib.figure import Figure # type: ignore[import-not-found] # TODO FIXME melatonin data melatonin_data = {} # type: ignore[var-annotated] diff --git a/my/jawbone/plots.py b/my/jawbone/plots.py index d26d606..5968412 100755 --- a/my/jawbone/plots.py +++ b/my/jawbone/plots.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 # TODO this should be in dashboard -from pathlib import Path # from kython.plotting import * from csv import DictReader +from pathlib import Path +from typing import Any, NamedTuple -from typing import Dict, Any, NamedTuple +import matplotlib.pylab as pylab # type: ignore # sleep = [] # with open('2017.csv', 'r') as fo: @@ -12,16 +13,14 @@ from typing import Dict, Any, NamedTuple # for line in islice(reader, 0, 10): # sleep # print(line) - -import matplotlib.pyplot as plt # type: ignore +import matplotlib.pyplot as plt # type: ignore from numpy import genfromtxt -import matplotlib.pylab as pylab # type: ignore pylab.rcParams['figure.figsize'] = (32.0, 24.0) pylab.rcParams['font.size'] = 10 jawboneDataFeatures = Path(__file__).parent / 'features.csv' # Data File Path -featureDesc: Dict[str, str] = {} +featureDesc: dict[str, str] = {} for x in genfromtxt(jawboneDataFeatures, dtype='unicode', delimiter=','): featureDesc[x[0]] = x[1] @@ -52,7 +51,7 @@ class SleepData(NamedTuple): quality: float # ??? @classmethod - def from_jawbone_dict(cls, d: Dict[str, Any]): + def from_jawbone_dict(cls, d: dict[str, Any]): return cls( date=d['DATE'], asleep_time=_safe_mins(_safe_float(d['s_asleep_time'])), @@ -75,7 +74,7 @@ class SleepData(NamedTuple): def iter_useful(data_file: str): - with open(data_file) as fo: + with Path(data_file).open() as fo: reader = DictReader(fo) for d in reader: dt = SleepData.from_jawbone_dict(d) @@ -95,6 +94,7 @@ files = [ ] from kython import concat, parse_date # type: ignore + useful = concat(*(list(iter_useful(str(f))) for f in files)) # for u in useful: @@ -108,6 +108,7 @@ dates = [parse_date(u.date, yearfirst=True, dayfirst=False) for u in useful] # TODO don't need this anymore? 
it's gonna be in dashboards package from kython.plotting import plot_timestamped # type: ignore + for attr, lims, mavg, fig in [ ('light', (0, 400), 5, None), ('deep', (0, 600), 5, None), diff --git a/my/kobo.py b/my/kobo.py index 85bc50f..b4a1575 100644 --- a/my/kobo.py +++ b/my/kobo.py @@ -7,21 +7,22 @@ REQUIRES = [ 'kobuddy', ] +from collections.abc import Iterator from dataclasses import dataclass -from typing import Iterator - -from my.core import ( - get_files, - stat, - Paths, - Stats, -) -from my.core.cfg import make_config -import my.config import kobuddy -from kobuddy import Highlight, get_highlights from kobuddy import * +from kobuddy import Highlight, get_highlights + +from my.core import ( + Paths, + Stats, + get_files, + stat, +) +from my.core.cfg import make_config + +import my.config # isort: skip @dataclass @@ -51,7 +52,7 @@ def stats() -> Stats: ## TODO hmm. not sure if all this really belongs here?... perhaps orger? -from typing import Callable, Union, List +from typing import Callable, Union # TODO maybe type over T? _Predicate = Callable[[str], bool] @@ -69,17 +70,17 @@ def from_predicatish(p: Predicatish) -> _Predicate: return p -def by_annotation(predicatish: Predicatish, **kwargs) -> List[Highlight]: +def by_annotation(predicatish: Predicatish, **kwargs) -> list[Highlight]: pred = from_predicatish(predicatish) - res: List[Highlight] = [] + res: list[Highlight] = [] for h in get_highlights(**kwargs): if pred(h.annotation): res.append(h) return res -def get_todos() -> List[Highlight]: +def get_todos() -> list[Highlight]: def with_todo(ann): if ann is None: ann = '' diff --git a/my/kython/kompress.py b/my/kython/kompress.py index 01e24e4..a5d9c29 100644 --- a/my/kython/kompress.py +++ b/my/kython/kompress.py @@ -1,5 +1,4 @@ -from my.core import __NOT_HPI_MODULE__ -from my.core import warnings +from my.core import __NOT_HPI_MODULE__, warnings warnings.high('my.kython.kompress is deprecated, please use "kompress" library directly. 
See https://github.com/karlicoss/kompress') diff --git a/my/lastfm.py b/my/lastfm.py index d20ebf3..cd9fa8b 100644 --- a/my/lastfm.py +++ b/my/lastfm.py @@ -3,9 +3,9 @@ Last.fm scrobbles ''' from dataclasses import dataclass -from my.core import Paths, Json, make_logger, get_files -from my.config import lastfm as user_config +from my.config import lastfm as user_config +from my.core import Json, Paths, get_files, make_logger logger = make_logger(__name__) @@ -19,13 +19,15 @@ class lastfm(user_config): from my.core.cfg import make_config + config = make_config(lastfm) -from datetime import datetime, timezone import json +from collections.abc import Iterable, Sequence +from datetime import datetime, timezone from pathlib import Path -from typing import NamedTuple, Sequence, Iterable +from typing import NamedTuple from my.core.cachew import mcachew @@ -76,7 +78,9 @@ def scrobbles() -> Iterable[Scrobble]: yield Scrobble(raw=raw) -from my.core import stat, Stats +from my.core import Stats, stat + + def stats() -> Stats: return stat(scrobbles) diff --git a/my/location/all.py b/my/location/all.py index fd88721..c6e8cab 100644 --- a/my/location/all.py +++ b/my/location/all.py @@ -2,14 +2,13 @@ Merges location data from multiple sources """ -from typing import Iterator +from collections.abc import Iterator -from my.core import Stats, LazyLogger +from my.core import LazyLogger, Stats from my.core.source import import_source from .common import Location - logger = LazyLogger(__name__, level="warning") diff --git a/my/location/common.py b/my/location/common.py index f406370..4c47ef0 100644 --- a/my/location/common.py +++ b/my/location/common.py @@ -1,12 +1,13 @@ -from datetime import date, datetime -from typing import Union, Tuple, Optional, Iterable, TextIO, Iterator, Protocol -from dataclasses import dataclass +from my.core import __NOT_HPI_MODULE__ # isort: skip -from my.core import __NOT_HPI_MODULE__ +from collections.abc import Iterable, Iterator +from dataclasses import dataclass +from datetime import date, datetime +from typing import Optional, Protocol, TextIO, Union DateIsh = Union[datetime, date, str] -LatLon = Tuple[float, float] +LatLon = tuple[float, float] class LocationProtocol(Protocol): diff --git a/my/location/fallback/all.py b/my/location/fallback/all.py index a5daa05..d340148 100644 --- a/my/location/fallback/all.py +++ b/my/location/fallback/all.py @@ -1,14 +1,16 @@ # TODO: add config here which passes kwargs to estimate_from (under_accuracy) # overwritable by passing the kwarg name here to the top-level estimate_location -from typing import Iterator, Optional +from __future__ import annotations + +from collections.abc import Iterator from my.core.source import import_source from my.location.fallback.common import ( - estimate_from, - FallbackLocation, DateExact, + FallbackLocation, LocationEstimator, + estimate_from, ) @@ -24,7 +26,7 @@ def fallback_estimators() -> Iterator[LocationEstimator]: yield _home_estimate -def estimate_location(dt: DateExact, *, first_match: bool=False, under_accuracy: Optional[int] = None) -> FallbackLocation: +def estimate_location(dt: DateExact, *, first_match: bool=False, under_accuracy: int | None = None) -> FallbackLocation: loc = estimate_from(dt, estimators=list(fallback_estimators()), first_match=first_match, under_accuracy=under_accuracy) # should never happen if the user has home configured if loc is None: diff --git a/my/location/fallback/common.py b/my/location/fallback/common.py index 13bc603..622b2f5 100644 --- 
a/my/location/fallback/common.py +++ b/my/location/fallback/common.py @@ -1,9 +1,12 @@ from __future__ import annotations -from dataclasses import dataclass -from typing import Optional, Callable, Sequence, Iterator, List, Union -from datetime import datetime, timedelta, timezone -from ..common import LocationProtocol, Location +from collections.abc import Iterator, Sequence +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from typing import Callable, Union + +from ..common import Location, LocationProtocol + DateExact = Union[datetime, float, int] # float/int as epoch timestamps Second = float @@ -13,10 +16,10 @@ class FallbackLocation(LocationProtocol): lat: float lon: float dt: datetime - duration: Optional[Second] = None - accuracy: Optional[float] = None - elevation: Optional[float] = None - datasource: Optional[str] = None # which module provided this, useful for debugging + duration: Second | None = None + accuracy: float | None = None + elevation: float | None = None + datasource: str | None = None # which module provided this, useful for debugging def to_location(self, *, end: bool = False) -> Location: ''' @@ -43,9 +46,9 @@ class FallbackLocation(LocationProtocol): lon: float, dt: datetime, end_dt: datetime, - accuracy: Optional[float] = None, - elevation: Optional[float] = None, - datasource: Optional[str] = None, + accuracy: float | None = None, + elevation: float | None = None, + datasource: str | None = None, ) -> FallbackLocation: ''' Create FallbackLocation from a start date and an end date @@ -93,13 +96,13 @@ def estimate_from( estimators: LocationEstimators, *, first_match: bool = False, - under_accuracy: Optional[int] = None, -) -> Optional[FallbackLocation]: + under_accuracy: int | None = None, +) -> FallbackLocation | None: ''' first_match: if True, return the first location found under_accuracy: if set, only return locations with accuracy under this value ''' - found: List[FallbackLocation] = [] + found: list[FallbackLocation] = [] for loc in _iter_estimate_from(dt, estimators): if under_accuracy is not None and loc.accuracy is not None and loc.accuracy > under_accuracy: continue diff --git a/my/location/fallback/via_home.py b/my/location/fallback/via_home.py index e44c59d..f88fee0 100644 --- a/my/location/fallback/via_home.py +++ b/my/location/fallback/via_home.py @@ -2,25 +2,22 @@ Simple location provider, serving as a fallback when more detailed data isn't available ''' +from __future__ import annotations + +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, time, timezone -from functools import lru_cache -from typing import Sequence, Tuple, Union, cast, List, Iterator +from functools import cache +from typing import cast from my.config import location as user_config +from my.location.common import DateIsh, LatLon +from my.location.fallback.common import DateExact, FallbackLocation -from my.location.common import LatLon, DateIsh -from my.location.fallback.common import FallbackLocation, DateExact @dataclass class Config(user_config): - home: Union[ - LatLon, # either single, 'current' location - Sequence[Tuple[ # or, a sequence of location history - DateIsh, # date when you moved to - LatLon, # the location - ]] - ] + home: LatLon | Sequence[tuple[DateIsh, LatLon]] # default ~30km accuracy # this is called 'home_accuracy' since it lives on the base location.config object, @@ -29,13 +26,13 @@ class Config(user_config): # TODO could make current Optional and somehow 
determine from system settings? @property - def _history(self) -> Sequence[Tuple[datetime, LatLon]]: + def _history(self) -> Sequence[tuple[datetime, LatLon]]: home1 = self.home # todo ugh, can't test for isnstance LatLon, it's a tuple itself - home2: Sequence[Tuple[DateIsh, LatLon]] + home2: Sequence[tuple[DateIsh, LatLon]] if isinstance(home1[0], tuple): # already a sequence - home2 = cast(Sequence[Tuple[DateIsh, LatLon]], home1) + home2 = cast(Sequence[tuple[DateIsh, LatLon]], home1) else: # must be a pair of coordinates. also doesn't really matter which date to pick? loc = cast(LatLon, home1) @@ -60,10 +57,11 @@ class Config(user_config): from ...core.cfg import make_config + config = make_config(Config) -@lru_cache(maxsize=None) +@cache def get_location(dt: datetime) -> LatLon: ''' Interpolates the location at dt @@ -74,8 +72,8 @@ def get_location(dt: datetime) -> LatLon: # TODO: in python3.8, use functools.cached_property instead? -@lru_cache(maxsize=None) -def homes_cached() -> List[Tuple[datetime, LatLon]]: +@cache +def homes_cached() -> list[tuple[datetime, LatLon]]: return list(config._history) diff --git a/my/location/fallback/via_ip.py b/my/location/fallback/via_ip.py index 79a452c..732af67 100644 --- a/my/location/fallback/via_ip.py +++ b/my/location/fallback/via_ip.py @@ -7,8 +7,8 @@ REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] from dataclasses import dataclass from datetime import timedelta -from my.core import Stats, make_config from my.config import location +from my.core import Stats, make_config from my.core.warnings import medium @@ -24,13 +24,13 @@ class ip_config(location.via_ip): config = make_config(ip_config) +from collections.abc import Iterator from functools import lru_cache -from typing import Iterator, List from my.core import make_logger from my.core.compat import bisect_left from my.location.common import Location -from my.location.fallback.common import FallbackLocation, DateExact, _datetime_timestamp +from my.location.fallback.common import DateExact, FallbackLocation, _datetime_timestamp logger = make_logger(__name__, level="warning") @@ -60,7 +60,7 @@ def locations() -> Iterator[Location]: @lru_cache(1) -def _sorted_fallback_locations() -> List[FallbackLocation]: +def _sorted_fallback_locations() -> list[FallbackLocation]: fl = list(filter(lambda l: l.duration is not None, fallback_locations())) logger.debug(f"Fallback locations: {len(fl)}, sorting...:") fl.sort(key=lambda l: l.dt.timestamp()) diff --git a/my/location/google.py b/my/location/google.py index b966ec6..750c847 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -3,28 +3,27 @@ Location data from Google Takeout DEPRECATED: setup my.google.takeout.parser and use my.location.google_takeout instead """ +from __future__ import annotations REQUIRES = [ 'geopy', # checking that coordinates are valid 'ijson', ] +import re +from collections.abc import Iterable, Sequence from datetime import datetime, timezone from itertools import islice from pathlib import Path -from subprocess import Popen, PIPE -from typing import Iterable, NamedTuple, Optional, Sequence, IO, Tuple -import re +from subprocess import PIPE, Popen +from typing import IO, NamedTuple, Optional # pip3 install geopy -import geopy # type: ignore +import geopy # type: ignore -from my.core import stat, Stats, make_logger +from my.core import Stats, make_logger, stat, warnings from my.core.cachew import cache_dir, mcachew -from my.core import warnings - - warnings.high("Please set up 
my.google.takeout.parser module for better takeout support") @@ -43,7 +42,7 @@ class Location(NamedTuple): alt: Optional[float] -TsLatLon = Tuple[int, int, int] +TsLatLon = tuple[int, int, int] def _iter_via_ijson(fo) -> Iterable[TsLatLon]: @@ -51,10 +50,10 @@ def _iter_via_ijson(fo) -> Iterable[TsLatLon]: # todo extract to common? try: # pip3 install ijson cffi - import ijson.backends.yajl2_cffi as ijson # type: ignore + import ijson.backends.yajl2_cffi as ijson # type: ignore except: warnings.medium("Falling back to default ijson because 'cffi' backend isn't found. It's up to 2x faster, you might want to check it out") - import ijson # type: ignore + import ijson # type: ignore for d in ijson.items(fo, 'locations.item'): yield ( diff --git a/my/location/google_takeout.py b/my/location/google_takeout.py index eb757ce..cb5bef3 100644 --- a/my/location/google_takeout.py +++ b/my/location/google_takeout.py @@ -4,13 +4,14 @@ Extracts locations using google_takeout_parser -- no shared code with the deprec REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] -from typing import Iterator +from collections.abc import Iterator -from my.google.takeout.parser import events, _cachew_depends_on from google_takeout_parser.models import Location as GoogleLocation -from my.core import stat, Stats, LazyLogger +from my.core import LazyLogger, Stats, stat from my.core.cachew import mcachew +from my.google.takeout.parser import _cachew_depends_on, events + from .common import Location logger = LazyLogger(__name__) diff --git a/my/location/google_takeout_semantic.py b/my/location/google_takeout_semantic.py index 5f2c055..7bddfa8 100644 --- a/my/location/google_takeout_semantic.py +++ b/my/location/google_takeout_semantic.py @@ -7,21 +7,24 @@ Extracts semantic location history using google_takeout_parser REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] +from collections.abc import Iterator from dataclasses import dataclass -from typing import Iterator, List -from my.google.takeout.parser import events, _cachew_depends_on as _parser_cachew_depends_on from google_takeout_parser.models import PlaceVisit as SemanticLocation -from my.core import make_config, stat, LazyLogger, Stats +from my.core import LazyLogger, Stats, make_config, stat from my.core.cachew import mcachew from my.core.error import Res +from my.google.takeout.parser import _cachew_depends_on as _parser_cachew_depends_on +from my.google.takeout.parser import events + from .common import Location logger = LazyLogger(__name__) from my.config import location as user_config + @dataclass class semantic_locations_config(user_config.google_takeout_semantic): # a value between 0 and 100, 100 being the most confident @@ -36,7 +39,7 @@ config = make_config(semantic_locations_config) # add config to cachew dependency so it recomputes on config changes -def _cachew_depends_on() -> List[str]: +def _cachew_depends_on() -> list[str]: dep = _parser_cachew_depends_on() dep.insert(0, f"require_confidence={config.require_confidence} accuracy={config.accuracy}") return dep diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py index 6d158a0..bbbf70e 100644 --- a/my/location/gpslogger.py +++ b/my/location/gpslogger.py @@ -20,20 +20,20 @@ class config(location.gpslogger): accuracy: float = 50.0 -from itertools import chain +from collections.abc import Iterator, Sequence from datetime import datetime, timezone +from itertools import chain from pathlib import Path -from typing import Iterator, Sequence, List 
import gpxpy from gpxpy.gpx import GPXXMLSyntaxException from more_itertools import unique_everseen -from my.core import Stats, LazyLogger +from my.core import LazyLogger, Stats from my.core.cachew import mcachew from my.core.common import get_files -from .common import Location +from .common import Location logger = LazyLogger(__name__, level="warning") @@ -49,7 +49,7 @@ def inputs() -> Sequence[Path]: return sorted(get_files(config.export_path, glob="*.gpx", sort=False), key=_input_sort_key) -def _cachew_depends_on() -> List[float]: +def _cachew_depends_on() -> list[float]: return [p.stat().st_mtime for p in inputs()] diff --git a/my/location/home.py b/my/location/home.py index f6e6978..c82dda7 100644 --- a/my/location/home.py +++ b/my/location/home.py @@ -1,7 +1,7 @@ -from .fallback.via_home import * - from my.core.warnings import high +from .fallback.via_home import * + high( "my.location.home is deprecated, use my.location.fallback.via_home instead, or estimate locations using the higher-level my.location.fallback.all.estimate_location" ) diff --git a/my/location/via_ip.py b/my/location/via_ip.py index df48f8b..d465ad0 100644 --- a/my/location/via_ip.py +++ b/my/location/via_ip.py @@ -1,7 +1,7 @@ REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] -from .fallback.via_ip import * - from my.core.warnings import high +from .fallback.via_ip import * + high("my.location.via_ip is deprecated, use my.location.fallback.via_ip instead") diff --git a/my/materialistic.py b/my/materialistic.py index 8a6a997..45af3f9 100644 --- a/my/materialistic.py +++ b/my/materialistic.py @@ -1,4 +1,5 @@ from .core.warnings import high + high("DEPRECATED! Please use my.hackernews.materialistic instead.") from .hackernews.materialistic import * diff --git a/my/media/imdb.py b/my/media/imdb.py index df31032..131f6a7 100644 --- a/my/media/imdb.py +++ b/my/media/imdb.py @@ -1,10 +1,12 @@ import csv +from collections.abc import Iterator from datetime import datetime -from typing import Iterator, List, NamedTuple +from typing import NamedTuple -from ..core import get_files +from my.core import get_files + +from my.config import imdb as config # isort: skip -from my.config import imdb as config def _get_last(): return max(get_files(config.export_path)) @@ -31,7 +33,7 @@ def iter_movies() -> Iterator[Movie]: yield Movie(created=created, title=title, rating=rating) -def get_movies() -> List[Movie]: +def get_movies() -> list[Movie]: return sorted(iter_movies(), key=lambda m: m.created) diff --git a/my/media/youtube.py b/my/media/youtube.py index 3ddbc14..9a38c43 100644 --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -1,4 +1,4 @@ -from my.core import __NOT_HPI_MODULE__ +from my.core import __NOT_HPI_MODULE__ # isort: skip from typing import TYPE_CHECKING diff --git a/my/monzo/monzoexport.py b/my/monzo/monzoexport.py index 3aa0cf5..f5e1cd1 100644 --- a/my/monzo/monzoexport.py +++ b/my/monzo/monzoexport.py @@ -5,16 +5,17 @@ REQUIRES = [ 'git+https://github.com/karlicoss/monzoexport', ] +from collections.abc import Iterator, Sequence from dataclasses import dataclass from pathlib import Path -from typing import Sequence, Iterator from my.core import ( Paths, get_files, make_logger, ) -import my.config + +import my.config # isort: skip @dataclass diff --git a/my/orgmode.py b/my/orgmode.py index cf14e43..10f53c0 100644 --- a/my/orgmode.py +++ b/my/orgmode.py @@ -1,15 +1,17 @@ ''' Programmatic access and queries to org-mode files on the filesystem ''' +from __future__ import annotations REQUIRES = [ 
'orgparse', ] import re +from collections.abc import Iterable, Sequence from datetime import datetime from pathlib import Path -from typing import Iterable, List, NamedTuple, Optional, Sequence, Tuple +from typing import NamedTuple, Optional import orgparse @@ -34,7 +36,7 @@ def make_config() -> config: class OrgNote(NamedTuple): created: Optional[datetime] heading: str - tags: List[str] + tags: list[str] def inputs() -> Sequence[Path]: @@ -45,7 +47,7 @@ def inputs() -> Sequence[Path]: _rgx = re.compile(orgparse.date.gene_timestamp_regex(brtype='inactive'), re.VERBOSE) -def _created(n: orgparse.OrgNode) -> Tuple[Optional[datetime], str]: +def _created(n: orgparse.OrgNode) -> tuple[datetime | None, str]: heading = n.heading # meh.. support in orgparse? pp = {} if n.is_root() else n.properties @@ -68,7 +70,7 @@ def _created(n: orgparse.OrgNode) -> Tuple[Optional[datetime], str]: def to_note(x: orgparse.OrgNode) -> OrgNote: # ugh. hack to merely make it cacheable heading = x.heading - created: Optional[datetime] + created: datetime | None try: c, heading = _created(x) if isinstance(c, datetime): diff --git a/my/pdfs.py b/my/pdfs.py index de9324d..eefd573 100644 --- a/my/pdfs.py +++ b/my/pdfs.py @@ -1,6 +1,7 @@ ''' PDF documents and annotations on your filesystem ''' +from __future__ import annotations as _annotations REQUIRES = [ 'git+https://github.com/0xabu/pdfannots', @@ -8,9 +9,10 @@ REQUIRES = [ ] import time +from collections.abc import Iterator, Sequence from datetime import datetime from pathlib import Path -from typing import Iterator, List, NamedTuple, Optional, Protocol, Sequence, TYPE_CHECKING +from typing import TYPE_CHECKING, NamedTuple, Optional, Protocol import pdfannots from more_itertools import bucket @@ -72,7 +74,7 @@ class Annotation(NamedTuple): created: Optional[datetime] # note: can be tz unaware in some bad pdfs... @property - def date(self) -> Optional[datetime]: + def date(self) -> datetime | None: # legacy name return self.created @@ -93,7 +95,7 @@ def _as_annotation(*, raw: pdfannots.Annotation, path: str) -> Annotation: ) -def get_annots(p: Path) -> List[Annotation]: +def get_annots(p: Path) -> list[Annotation]: b = time.time() with p.open('rb') as fo: doc = pdfannots.process_file(fo, emit_progress_to=None) @@ -150,17 +152,17 @@ class Pdf(NamedTuple): annotations: Sequence[Annotation] @property - def created(self) -> Optional[datetime]: + def created(self) -> datetime | None: annots = self.annotations return None if len(annots) == 0 else annots[-1].created @property - def date(self) -> Optional[datetime]: + def date(self) -> datetime | None: # legacy return self.created -def annotated_pdfs(*, filelist: Optional[Sequence[PathIsh]] = None) -> Iterator[Res[Pdf]]: +def annotated_pdfs(*, filelist: Sequence[PathIsh] | None = None) -> Iterator[Res[Pdf]]: if filelist is not None: # hacky... 
keeping it backwards compatible # https://github.com/karlicoss/HPI/pull/74 diff --git a/my/photos/main.py b/my/photos/main.py index bf912e4..f98cb15 100644 --- a/my/photos/main.py +++ b/my/photos/main.py @@ -1,27 +1,30 @@ """ Photos and videos on your filesystem, their GPS and timestamps """ + +from __future__ import annotations + REQUIRES = [ 'geopy', 'magic', ] # NOTE: also uses fdfind to search photos +import json +from collections.abc import Iterable, Iterator from concurrent.futures import ProcessPoolExecutor as Pool from datetime import datetime -import json from pathlib import Path -from typing import Optional, NamedTuple, Iterator, Iterable, List +from typing import NamedTuple, Optional from geopy.geocoders import Nominatim # type: ignore from my.core import LazyLogger -from my.core.error import Res, sort_res_by from my.core.cachew import cache_dir, mcachew +from my.core.error import Res, sort_res_by from my.core.mime import fastermime -from my.config import photos as config # type: ignore[attr-defined] - +from my.config import photos as config # type: ignore[attr-defined] # isort: skip logger = LazyLogger(__name__) @@ -55,15 +58,15 @@ class Photo(NamedTuple): return f'{config.base_url}{self._basename}' -from .utils import get_exif_from_file, ExifTags, Exif, dt_from_path, convert_ref +from .utils import Exif, ExifTags, convert_ref, dt_from_path, get_exif_from_file Result = Res[Photo] -def _make_photo_aux(*args, **kwargs) -> List[Result]: +def _make_photo_aux(*args, **kwargs) -> list[Result]: # for the process pool.. return list(_make_photo(*args, **kwargs)) -def _make_photo(photo: Path, mtype: str, *, parent_geo: Optional[LatLon]) -> Iterator[Result]: +def _make_photo(photo: Path, mtype: str, *, parent_geo: LatLon | None) -> Iterator[Result]: exif: Exif if any(x in mtype for x in ['image/png', 'image/x-ms-bmp', 'video']): # TODO don't remember why.. @@ -77,7 +80,7 @@ def _make_photo(photo: Path, mtype: str, *, parent_geo: Optional[LatLon]) -> Ite yield e exif = {} - def _get_geo() -> Optional[LatLon]: + def _get_geo() -> LatLon | None: meta = exif.get(ExifTags.GPSINFO, {}) if ExifTags.LAT in meta and ExifTags.LON in meta: return LatLon( @@ -87,7 +90,7 @@ def _make_photo(photo: Path, mtype: str, *, parent_geo: Optional[LatLon]) -> Ite return parent_geo # TODO aware on unaware? - def _get_dt() -> Optional[datetime]: + def _get_dt() -> datetime | None: edt = exif.get(ExifTags.DATETIME, None) if edt is not None: dtimes = edt.replace(' 24', ' 00') # jeez maybe log it? @@ -123,7 +126,7 @@ def _make_photo(photo: Path, mtype: str, *, parent_geo: Optional[LatLon]) -> Ite def _candidates() -> Iterable[Res[str]]: # TODO that could be a bit slow if there are to many extra files? - from subprocess import Popen, PIPE + from subprocess import PIPE, Popen # TODO could extract this to common? # TODO would be nice to reuse get_files (or even let it use find) # that way would be easier to exclude @@ -162,7 +165,7 @@ def _photos(candidates: Iterable[Res[str]]) -> Iterator[Result]: from functools import lru_cache @lru_cache(None) - def get_geo(d: Path) -> Optional[LatLon]: + def get_geo(d: Path) -> LatLon | None: geof = d / 'geo.json' if not geof.exists(): if d == d.parent: @@ -214,5 +217,7 @@ def print_all() -> None: # todo cachew -- invalidate if function code changed? 
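# stats() below follows the usual HPI pattern: wrap the main provider in stat() so the hpi CLI can sanity-check the module and report item counts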
from ..core import Stats, stat + + def stats() -> Stats: return stat(photos) diff --git a/my/photos/utils.py b/my/photos/utils.py index c614c4a..e88def2 100644 --- a/my/photos/utils.py +++ b/my/photos/utils.py @@ -1,11 +1,13 @@ +from __future__ import annotations + +from ..core import __NOT_HPI_MODULE__ # isort: skip + from pathlib import Path -from typing import Dict import PIL.Image -from PIL.ExifTags import TAGS, GPSTAGS +from PIL.ExifTags import GPSTAGS, TAGS - -Exif = Dict +Exif = dict # TODO PIL.ExifTags.TAGS @@ -62,18 +64,15 @@ def convert_ref(cstr, ref: str) -> float: import re from datetime import datetime -from typing import Optional # TODO surely there is a library that does it?? # TODO this belongs to a private overlay or something # basically have a function that patches up dates after the files were yielded.. _DT_REGEX = re.compile(r'\D(\d{8})\D*(\d{6})\D') -def dt_from_path(p: Path) -> Optional[datetime]: +def dt_from_path(p: Path) -> datetime | None: name = p.stem mm = _DT_REGEX.search(name) if mm is None: return None dates = mm.group(1) + mm.group(2) return datetime.strptime(dates, "%Y%m%d%H%M%S") - -from ..core import __NOT_HPI_MODULE__ diff --git a/my/pinboard.py b/my/pinboard.py index ef4ca36..e98dc78 100644 --- a/my/pinboard.py +++ b/my/pinboard.py @@ -5,15 +5,16 @@ REQUIRES = [ 'git+https://github.com/karlicoss/pinbexport', ] +from collections.abc import Iterator, Sequence from dataclasses import dataclass from pathlib import Path -from typing import Iterator, Sequence - -from my.core import get_files, Paths, Res -import my.config import pinbexport.dal as pinbexport +from my.core import Paths, Res, get_files + +import my.config # isort: skip + @dataclass class config(my.config.pinboard): # TODO rename to pinboard.pinbexport? diff --git a/my/pocket.py b/my/pocket.py index b638fba..ff9a788 100644 --- a/my/pocket.py +++ b/my/pocket.py @@ -7,10 +7,10 @@ REQUIRES = [ from dataclasses import dataclass from typing import TYPE_CHECKING -from .core import Paths - from my.config import pocket as user_config +from .core import Paths + @dataclass class pocket(user_config): @@ -23,6 +23,7 @@ class pocket(user_config): from .core.cfg import make_config + config = make_config(pocket) @@ -37,7 +38,7 @@ except ModuleNotFoundError as e: Article = dal.Article -from typing import Sequence, Iterable +from collections.abc import Iterable, Sequence # todo not sure if should be defensive against empty? @@ -51,9 +52,12 @@ def articles() -> Iterable[Article]: yield from _dal().articles() -from .core import stat, Stats +from .core import Stats, stat + + def stats() -> Stats: from itertools import chain + from more_itertools import ilen return { **stat(articles), diff --git a/my/polar.py b/my/polar.py index e52bb14..2172014 100644 --- a/my/polar.py +++ b/my/polar.py @@ -1,11 +1,12 @@ """ [[https://github.com/burtonator/polar-bookshelf][Polar]] articles and highlights """ +from __future__ import annotations + from pathlib import Path -from typing import cast, TYPE_CHECKING +from typing import TYPE_CHECKING, cast - -import my.config +import my.config # isort: skip # todo use something similar to tz.via_location for config fallback if not TYPE_CHECKING: @@ -20,8 +21,11 @@ if user_config is None: pass -from .core import PathIsh from dataclasses import dataclass + +from .core import PathIsh + + @dataclass class polar(user_config): ''' @@ -32,20 +36,21 @@ class polar(user_config): from .core import make_config + config = make_config(polar) # todo not sure where it keeps stuff on Windows? 
# https://github.com/burtonator/polar-bookshelf/issues/296 -from datetime import datetime -from typing import List, Dict, Iterable, NamedTuple, Sequence, Optional import json +from collections.abc import Iterable, Sequence +from datetime import datetime +from typing import NamedTuple -from .core import LazyLogger, Json, Res +from .core import Json, LazyLogger, Res from .core.compat import fromisoformat from .core.error import echain, sort_res_by -from .core.konsume import wrap, Zoomable, Wdict - +from .core.konsume import Wdict, Zoomable, wrap logger = LazyLogger(__name__) @@ -65,7 +70,7 @@ class Highlight(NamedTuple): comments: Sequence[Comment] tags: Sequence[str] page: int # 1-indexed - color: Optional[str] = None + color: str | None = None Uid = str @@ -73,7 +78,7 @@ class Book(NamedTuple): created: datetime uid: Uid path: Path - title: Optional[str] + title: str | None # TODO hmmm. I think this needs to be defensive as well... # think about it later. items: Sequence[Highlight] @@ -129,7 +134,7 @@ class Loader: pi['dimensions'].consume_all() # TODO how to make it nicer? - cmap: Dict[Hid, List[Comment]] = {} + cmap: dict[Hid, list[Comment]] = {} vals = list(comments) for v in vals: cid = v['id'].zoom() @@ -163,7 +168,7 @@ class Loader: h['rects'].ignore() # TODO make it more generic.. - htags: List[str] = [] + htags: list[str] = [] if 'tags' in h: ht = h['tags'].zoom() for _k, v in list(ht.items()): @@ -242,7 +247,7 @@ def iter_entries() -> Iterable[Result]: yield err -def get_entries() -> List[Result]: +def get_entries() -> list[Result]: # sorting by first annotation is reasonable I guess??? # todo perhaps worth making it a pattern? X() returns iterable, get_X returns reasonably sorted list? return list(sort_res_by(iter_entries(), key=lambda e: e.created)) diff --git a/my/reddit/__init__.py b/my/reddit/__init__.py index e81aaf9..f344eeb 100644 --- a/my/reddit/__init__.py +++ b/my/reddit/__init__.py @@ -20,6 +20,7 @@ REQUIRES = [ from my.core.hpi_compat import handle_legacy_import + is_legacy_import = handle_legacy_import( parent_module_name=__name__, legacy_submodule_name='rexport', diff --git a/my/reddit/all.py b/my/reddit/all.py index daedba1..27e22df 100644 --- a/my/reddit/all.py +++ b/my/reddit/all.py @@ -1,8 +1,9 @@ -from typing import Iterator -from my.core import stat, Stats +from collections.abc import Iterator + +from my.core import Stats, stat from my.core.source import import_source -from .common import Save, Upvote, Comment, Submission, _merge_comments +from .common import Comment, Save, Submission, Upvote, _merge_comments # Man... 
ideally an all.py file isn't this verbose, but # reddit just feels like that much of a complicated source and diff --git a/my/reddit/common.py b/my/reddit/common.py index c01258b..40f9f6e 100644 --- a/my/reddit/common.py +++ b/my/reddit/common.py @@ -2,12 +2,14 @@ This defines Protocol classes, which make sure that each different type of shared models have a standardized interface """ -from my.core import __NOT_HPI_MODULE__ -from typing import Set, Iterator, Protocol +from my.core import __NOT_HPI_MODULE__ # isort: skip + +from collections.abc import Iterator from itertools import chain +from typing import Protocol -from my.core import datetime_aware, Json +from my.core import Json, datetime_aware # common fields across all the Protocol classes, so generic code can be written @@ -49,7 +51,7 @@ class Submission(RedditBase, Protocol): def _merge_comments(*sources: Iterator[Comment]) -> Iterator[Comment]: #from .rexport import logger #ignored = 0 - emitted: Set[str] = set() + emitted: set[str] = set() for e in chain(*sources): uid = e.id if uid in emitted: diff --git a/my/reddit/pushshift.py b/my/reddit/pushshift.py index 9580005..1bfa048 100644 --- a/my/reddit/pushshift.py +++ b/my/reddit/pushshift.py @@ -10,13 +10,13 @@ REQUIRES = [ from dataclasses import dataclass +# note: keeping pushshift import before config import, so it's handled gracefully by import_source +from pushshift_comment_export.dal import PComment, read_file + +from my.config import reddit as uconfig from my.core import Paths, Stats, stat from my.core.cfg import make_config -# note: keeping pushshift import before config import, so it's handled gracefully by import_source -from pushshift_comment_export.dal import read_file, PComment - -from my.config import reddit as uconfig @dataclass class pushshift_config(uconfig.pushshift): @@ -29,10 +29,10 @@ class pushshift_config(uconfig.pushshift): config = make_config(pushshift_config) -from my.core import get_files -from typing import Sequence, Iterator +from collections.abc import Iterator, Sequence from pathlib import Path +from my.core import get_files def inputs() -> Sequence[Path]: diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index 5dcd7d9..cb6af01 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -7,23 +7,24 @@ REQUIRES = [ 'git+https://github.com/karlicoss/rexport', ] -from dataclasses import dataclass import inspect +from collections.abc import Iterator, Sequence +from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Iterator, Sequence +from typing import TYPE_CHECKING from my.core import ( - get_files, - make_logger, - warnings, - stat, Paths, Stats, + get_files, + make_logger, + stat, + warnings, ) from my.core.cachew import mcachew -from my.core.cfg import make_config, Attrs +from my.core.cfg import Attrs, make_config -from my.config import reddit as uconfig +from my.config import reddit as uconfig # isort: skip logger = make_logger(__name__) diff --git a/my/rescuetime.py b/my/rescuetime.py index 76a0d4c..0c9fd28 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -5,16 +5,15 @@ REQUIRES = [ 'git+https://github.com/karlicoss/rescuexport', ] -from pathlib import Path +from collections.abc import Iterable, Sequence from datetime import timedelta -from typing import Sequence, Iterable +from pathlib import Path -from my.core import get_files, make_logger, stat, Stats +from my.core import Stats, get_files, make_logger, stat from my.core.cachew import mcachew from my.core.error import Res, split_errors 
-from my.config import rescuetime as config - +from my.config import rescuetime as config # isort: skip logger = make_logger(__name__) @@ -24,6 +23,7 @@ def inputs() -> Sequence[Path]: import rescuexport.dal as dal + DAL = dal.DAL Entry = dal.Entry @@ -43,6 +43,8 @@ def groups(gap: timedelta=timedelta(hours=3)) -> Iterable[Res[Sequence[Entry]]]: # todo automatic dataframe interface? from .core.pandas import DataFrameT, as_dataframe + + def dataframe() -> DataFrameT: return as_dataframe(entries()) @@ -56,16 +58,19 @@ def stats() -> Stats: # basically, hack config and populate it with fake data? fake data generated by DAL, but the rest is handled by this? +from collections.abc import Iterator from contextlib import contextmanager -from typing import Iterator + + # todo take seed, or what? @contextmanager def fake_data(rows: int=1000) -> Iterator: # todo also disable cachew automatically for such things? - from my.core.cfg import tmp_config - from my.core.cachew import disabled_cachew - from tempfile import TemporaryDirectory import json + from tempfile import TemporaryDirectory + + from my.core.cachew import disabled_cachew + from my.core.cfg import tmp_config with disabled_cachew(), TemporaryDirectory() as td: tdir = Path(td) f = tdir / 'rescuetime.json' diff --git a/my/roamresearch.py b/my/roamresearch.py index 2fe06d4..7322774 100644 --- a/my/roamresearch.py +++ b/my/roamresearch.py @@ -1,16 +1,19 @@ """ [[https://roamresearch.com][Roam]] data """ -from datetime import datetime, timezone -from pathlib import Path -from itertools import chain -import re -from typing import NamedTuple, Iterator, List, Optional +from __future__ import annotations -from .core import get_files, LazyLogger, Json +import re +from collections.abc import Iterator +from datetime import datetime, timezone +from itertools import chain +from pathlib import Path +from typing import NamedTuple from my.config import roamresearch as config +from .core import Json, LazyLogger, get_files + logger = LazyLogger(__name__) @@ -57,15 +60,15 @@ class Node(NamedTuple): return datetime.fromtimestamp(rt / 1000, tz=timezone.utc) @property - def title(self) -> Optional[str]: + def title(self) -> str | None: return self.raw.get(Keys.TITLE) @property - def body(self) -> Optional[str]: + def body(self) -> str | None: return self.raw.get(Keys.STRING) @property - def children(self) -> List['Node']: + def children(self) -> list[Node]: # TODO cache? needs a key argument (because of Json) ch = self.raw.get(Keys.CHILDREN, []) return list(map(Node, ch)) @@ -95,7 +98,7 @@ class Node(NamedTuple): # - heading -- notes that haven't been created yet return len(self.body or '') == 0 and len(self.children) == 0 - def traverse(self) -> Iterator['Node']: + def traverse(self) -> Iterator[Node]: # not sure about __iter__, because might be a bit unintuitive that it's recursive.. yield self for c in self.children: @@ -120,7 +123,7 @@ class Node(NamedTuple): return f'Node(created={self.created}, title={self.title}, body={self.body})' @staticmethod - def make(raw: Json) -> Iterator['Node']: + def make(raw: Json) -> Iterator[Node]: is_empty = set(raw.keys()) == {Keys.EDITED, Keys.EDIT_EMAIL, Keys.TITLE} # not sure about that... 
but daily notes end up like that if is_empty: @@ -130,11 +133,11 @@ class Node(NamedTuple): class Roam: - def __init__(self, raw: List[Json]) -> None: + def __init__(self, raw: list[Json]) -> None: self.raw = raw @property - def notes(self) -> List[Node]: + def notes(self) -> list[Node]: return list(chain.from_iterable(map(Node.make, self.raw))) def traverse(self) -> Iterator[Node]: diff --git a/my/rss/all.py b/my/rss/all.py index b4dbdbd..e10e4d2 100644 --- a/my/rss/all.py +++ b/my/rss/all.py @@ -3,9 +3,9 @@ Unified RSS data, merged from different services I used historically ''' # NOTE: you can comment out the sources you're not using -from . import feedbin, feedly +from collections.abc import Iterable -from typing import Iterable +from . import feedbin, feedly from .common import Subscription, compute_subscriptions diff --git a/my/rss/common.py b/my/rss/common.py index bb75297..bf9506e 100644 --- a/my/rss/common.py +++ b/my/rss/common.py @@ -1,10 +1,12 @@ -from my.core import __NOT_HPI_MODULE__ +from __future__ import annotations +from my.core import __NOT_HPI_MODULE__ # isort: skip + +from collections.abc import Iterable, Sequence from dataclasses import dataclass, replace from itertools import chain -from typing import Optional, List, Dict, Iterable, Tuple, Sequence -from my.core import warn_if_empty, datetime_aware +from my.core import datetime_aware, warn_if_empty @dataclass @@ -13,16 +15,16 @@ class Subscription: url: str id: str # TODO not sure about it... # eh, not all of them got reasonable 'created' time - created_at: Optional[datetime_aware] + created_at: datetime_aware | None subscribed: bool = True # snapshot of subscriptions at time -SubscriptionState = Tuple[datetime_aware, Sequence[Subscription]] +SubscriptionState = tuple[datetime_aware, Sequence[Subscription]] @warn_if_empty -def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscription]: +def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> list[Subscription]: """ Keeps track of everything I ever subscribed to. In addition, keeps track of unsubscribed as well (so you'd remember when and why you unsubscribed) @@ -30,7 +32,7 @@ def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscri states = list(chain.from_iterable(sources)) # TODO keep 'source'/'provider'/'service' attribute? - by_url: Dict[str, Subscription] = {} + by_url: dict[str, Subscription] = {} # ah. dates are used for sorting for _when, state in sorted(states): # TODO use 'when'? 
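Most hunks in this series apply the same mechanical modernization: builtin generics per PEP 585 (dict/list/tuple/set instead of typing.Dict etc., with Iterator/Sequence/Iterable imported from collections.abc), PEP 604 unions (X | None instead of Optional[X]), and isort-ordered imports, with "# isort: skip" wherever the order is load-bearing (e.g. config imports that must come after or before certain modules). Files that use the new syntax also gain "from __future__ import annotations" so the annotations still parse on older interpreters. A minimal illustrative sketch of the combined pattern -- a hypothetical dedup() helper, not taken from any file in this patch:

    from __future__ import annotations  # annotations become lazy strings, so 'int | None' parses even on python < 3.10

    from collections.abc import Iterator, Sequence  # preferred over typing.Iterator/typing.Sequence

    def dedup(items: Sequence[str], *, limit: int | None = None) -> Iterator[str]:
        # set[str] (PEP 585) replaces typing.Set[str]; 'int | None' (PEP 604) replaces Optional[int]
        emitted: set[str] = set()
        for x in items:
            if x in emitted:
                continue
            emitted.add(x)
            yield x
            if limit is not None and len(emitted) >= limit:
                return

This mirrors the emitted-set dedup idiom the modules themselves use (e.g. my/reddit/common.py, my/smscalls.py), just written with the new-style annotations.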
diff --git a/my/rss/feedbin.py b/my/rss/feedbin.py index dc13a17..5f4da0a 100644 --- a/my/rss/feedbin.py +++ b/my/rss/feedbin.py @@ -3,15 +3,15 @@ Feedbin RSS reader """ import json +from collections.abc import Iterator, Sequence from pathlib import Path -from typing import Iterator, Sequence -from my.core import get_files, stat, Stats +from my.core import Stats, get_files, stat from my.core.compat import fromisoformat + from .common import Subscription, SubscriptionState -from my.config import feedbin as config - +from my.config import feedbin as config # isort: skip def inputs() -> Sequence[Path]: return get_files(config.export_path) diff --git a/my/rss/feedly.py b/my/rss/feedly.py index 127ef61..9bf5429 100644 --- a/my/rss/feedly.py +++ b/my/rss/feedly.py @@ -4,9 +4,10 @@ Feedly RSS reader import json from abc import abstractmethod +from collections.abc import Iterator, Sequence from datetime import datetime, timezone from pathlib import Path -from typing import Iterator, Protocol, Sequence +from typing import Protocol from my.core import Paths, get_files diff --git a/my/rtm.py b/my/rtm.py index b559ba4..217c969 100644 --- a/my/rtm.py +++ b/my/rtm.py @@ -6,21 +6,19 @@ REQUIRES = [ 'icalendar', ] +import re +from collections.abc import Iterator from datetime import datetime from functools import cached_property -import re -from typing import Dict, List, Iterator -from my.core import make_logger, get_files -from my.core.utils.itertools import make_dict - -from my.config import rtm as config - - -from more_itertools import bucket import icalendar # type: ignore from icalendar.cal import Todo # type: ignore +from more_itertools import bucket +from my.core import get_files, make_logger +from my.core.utils.itertools import make_dict + +from my.config import rtm as config # isort: skip logger = make_logger(__name__) @@ -32,14 +30,14 @@ class MyTodo: self.revision = revision @cached_property - def notes(self) -> List[str]: + def notes(self) -> list[str]: # TODO can there be multiple?? desc = self.todo['DESCRIPTION'] notes = re.findall(r'---\n\n(.*?)\n\nUpdated:', desc, flags=re.DOTALL) return notes @cached_property - def tags(self) -> List[str]: + def tags(self) -> list[str]: desc = self.todo['DESCRIPTION'] [tags_str] = re.findall(r'\nTags: (.*?)\n', desc, flags=re.DOTALL) if tags_str == 'none': @@ -92,11 +90,11 @@ class DAL: for t in self.cal.walk('VTODO'): yield MyTodo(t, self.revision) - def get_todos_by_uid(self) -> Dict[str, MyTodo]: + def get_todos_by_uid(self) -> dict[str, MyTodo]: todos = self.all_todos() return make_dict(todos, key=lambda t: t.uid) - def get_todos_by_title(self) -> Dict[str, List[MyTodo]]: + def get_todos_by_title(self) -> dict[str, list[MyTodo]]: todos = self.all_todos() bucketed = bucket(todos, lambda todo: todo.title) return {k: list(bucketed[k]) for k in bucketed} diff --git a/my/runnerup.py b/my/runnerup.py index a21075a..f5d7d1e 100644 --- a/my/runnerup.py +++ b/my/runnerup.py @@ -6,17 +6,15 @@ REQUIRES = [ 'python-tcxparser', ] +from collections.abc import Iterable from datetime import timedelta from pathlib import Path -from typing import Iterable - -from my.core import Res, get_files, Json -from my.core.compat import fromisoformat import tcxparser # type: ignore[import-untyped] from my.config import runnerup as config - +from my.core import Json, Res, get_files +from my.core.compat import fromisoformat # TODO later, use a proper namedtuple? 
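# note: for now a Workout is just the raw Json mapping assembled from the tcx file (see the TODO above)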
Workout = Json @@ -70,6 +68,8 @@ def workouts() -> Iterable[Res[Workout]]: from .core.pandas import DataFrameT, check_dataframe, error_to_row + + @check_dataframe def dataframe() -> DataFrameT: def it(): @@ -85,6 +85,8 @@ def dataframe() -> DataFrameT: return df -from .core import stat, Stats +from .core import Stats, stat + + def stats() -> Stats: return stat(dataframe) diff --git a/my/simple.py b/my/simple.py index 7462291..b7f25cd 100644 --- a/my/simple.py +++ b/my/simple.py @@ -1,12 +1,11 @@ ''' Just a demo module for testing and documentation purposes ''' +from collections.abc import Iterator from dataclasses import dataclass -from typing import Iterator - -from my.core import make_config from my.config import simple as user_config +from my.core import make_config @dataclass diff --git a/my/smscalls.py b/my/smscalls.py index 78bf7ee..ccaac72 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -2,6 +2,7 @@ Phone calls and SMS messages Exported using https://play.google.com/store/apps/details?id=com.riteshsahu.SMSBackupRestore&hl=en_US """ +from __future__ import annotations # See: https://www.synctech.com.au/sms-backup-restore/fields-in-xml-backup-files/ for schema @@ -9,8 +10,9 @@ REQUIRES = ['lxml'] from dataclasses import dataclass -from my.core import get_files, stat, Paths, Stats from my.config import smscalls as user_config +from my.core import Paths, Stats, get_files, stat + @dataclass class smscalls(user_config): @@ -18,11 +20,13 @@ class smscalls(user_config): export_path: Paths from my.core.cfg import make_config + config = make_config(smscalls) +from collections.abc import Iterator from datetime import datetime, timezone from pathlib import Path -from typing import NamedTuple, Iterator, Set, Tuple, Optional, Any, Dict, List +from typing import Any, NamedTuple import lxml.etree as etree @@ -33,7 +37,7 @@ class Call(NamedTuple): dt: datetime dt_readable: str duration_s: int - who: Optional[str] + who: str | None # type - 1 = Incoming, 2 = Outgoing, 3 = Missed, 4 = Voicemail, 5 = Rejected, 6 = Refused List. call_type: int @@ -50,7 +54,7 @@ class Call(NamedTuple): # All the field values are read as-is from the underlying database and no conversion is done by the app in most cases. # # The '(Unknown)' is just what my android phone does, not sure if there are others -UNKNOWN: Set[str] = {'(Unknown)'} +UNKNOWN: set[str] = {'(Unknown)'} def _extract_calls(path: Path) -> Iterator[Res[Call]]: @@ -83,7 +87,7 @@ def calls() -> Iterator[Res[Call]]: files = get_files(config.export_path, glob='calls-*.xml') # TODO always replacing with the latter is good, we get better contact names?? 
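    # dedup across overlapping export files: the same call can appear in several backups, so key on its timestamp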
- emitted: Set[datetime] = set() + emitted: set[datetime] = set() for p in files: for c in _extract_calls(p): if isinstance(c, Exception): @@ -98,7 +102,7 @@ def calls() -> Iterator[Res[Call]]: class Message(NamedTuple): dt: datetime dt_readable: str - who: Optional[str] + who: str | None message: str phone_number: str # type - 1 = Received, 2 = Sent, 3 = Draft, 4 = Outbox, 5 = Failed, 6 = Queued @@ -112,7 +116,7 @@ class Message(NamedTuple): def messages() -> Iterator[Res[Message]]: files = get_files(config.export_path, glob='sms-*.xml') - emitted: Set[Tuple[datetime, Optional[str], bool]] = set() + emitted: set[tuple[datetime, str | None, bool]] = set() for p in files: for c in _extract_messages(p): if isinstance(c, Exception): @@ -155,20 +159,20 @@ class MMSContentPart(NamedTuple): sequence_index: int content_type: str filename: str - text: Optional[str] - data: Optional[str] + text: str | None + data: str | None class MMS(NamedTuple): dt: datetime dt_readable: str - parts: List[MMSContentPart] + parts: list[MMSContentPart] # NOTE: these is often something like 'Name 1, Name 2', but might be different depending on your client - who: Optional[str] + who: str | None # NOTE: This can be a single phone number, or multiple, split by '~' or ','. Its better to think # of this as a 'key' or 'conversation ID', phone numbers are also present in 'addresses' phone_number: str - addresses: List[Tuple[str, int]] + addresses: list[tuple[str, int]] # 1 = Received, 2 = Sent, 3 = Draft, 4 = Outbox message_type: int @@ -194,7 +198,7 @@ class MMS(NamedTuple): def mms() -> Iterator[Res[MMS]]: files = get_files(config.export_path, glob='sms-*.xml') - emitted: Set[Tuple[datetime, Optional[str], str]] = set() + emitted: set[tuple[datetime, str | None, str]] = set() for p in files: for c in _extract_mms(p): if isinstance(c, Exception): @@ -207,7 +211,7 @@ def mms() -> Iterator[Res[MMS]]: yield c -def _resolve_null_str(value: Optional[str]) -> Optional[str]: +def _resolve_null_str(value: str | None) -> str | None: if value is None: return None # hmm.. theres some risk of the text actually being 'null', but theres @@ -235,7 +239,7 @@ def _extract_mms(path: Path) -> Iterator[Res[MMS]]: yield RuntimeError(f'Missing one or more required attributes [date, readable_date, msg_box, address] in {mxml_str}') continue - addresses: List[Tuple[str, int]] = [] + addresses: list[tuple[str, int]] = [] for addr_parent in mxml.findall('addrs'): for addr in addr_parent.findall('addr'): addr_data = addr.attrib @@ -250,7 +254,7 @@ def _extract_mms(path: Path) -> Iterator[Res[MMS]]: continue addresses.append((user_address, int(user_type))) - content: List[MMSContentPart] = [] + content: list[MMSContentPart] = [] for part_root in mxml.findall('parts'): @@ -267,8 +271,8 @@ def _extract_mms(path: Path) -> Iterator[Res[MMS]]: # # man, attrib is some internal cpython ._Attrib type which can't # be typed by any sort of mappingproxy. maybe a protocol could work..? 
- part_data: Dict[str, Any] = part.attrib # type: ignore - seq: Optional[str] = part_data.get('seq') + part_data: dict[str, Any] = part.attrib # type: ignore + seq: str | None = part_data.get('seq') if seq == '-1': continue @@ -276,13 +280,13 @@ def _extract_mms(path: Path) -> Iterator[Res[MMS]]: yield RuntimeError(f'seq must be a number, was seq={seq} {type(seq)} in {part_data}') continue - charset_type: Optional[str] = _resolve_null_str(part_data.get('ct')) - filename: Optional[str] = _resolve_null_str(part_data.get('name')) + charset_type: str | None = _resolve_null_str(part_data.get('ct')) + filename: str | None = _resolve_null_str(part_data.get('name')) # in some cases (images, cards), the filename is set in 'cl' instead if filename is None: filename = _resolve_null_str(part_data.get('cl')) - text: Optional[str] = _resolve_null_str(part_data.get('text')) - data: Optional[str] = _resolve_null_str(part_data.get('data')) + text: str | None = _resolve_null_str(part_data.get('text')) + data: str | None = _resolve_null_str(part_data.get('data')) if charset_type is None or filename is None or (text is None and data is None): yield RuntimeError(f'Missing one or more required attributes [ct, name, (text, data)] must be present in {part_data}') diff --git a/my/stackexchange/gdpr.py b/my/stackexchange/gdpr.py index 5292bef..78987be 100644 --- a/my/stackexchange/gdpr.py +++ b/my/stackexchange/gdpr.py @@ -6,8 +6,11 @@ Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][officia ### config from dataclasses import dataclass + from my.config import stackexchange as user_config -from my.core import PathIsh, make_config, get_files, Json +from my.core import Json, PathIsh, get_files, make_config + + @dataclass class stackexchange(user_config): gdpr_path: PathIsh # path to GDPR zip file @@ -17,9 +20,13 @@ config = make_config(stackexchange) # TODO just merge all of them and then filter?.. not sure -from my.core.compat import fromisoformat -from typing import NamedTuple, Iterable +from collections.abc import Iterable from datetime import datetime +from typing import NamedTuple + +from my.core.compat import fromisoformat + + class Vote(NamedTuple): j: Json # todo ip? @@ -62,7 +69,10 @@ class Vote(NamedTuple): # todo expose vote type? import json + from ..core.error import Res + + def votes() -> Iterable[Res[Vote]]: # TODO there is also some site specific stuff in qa/ directory.. not sure if its' more detailed # todo should be defensive? 
not sure if present when user has no votes @@ -74,6 +84,8 @@ def votes() -> Iterable[Res[Vote]]: yield Vote(r) -from ..core import stat, Stats +from ..core import Stats, stat + + def stats() -> Stats: return stat(votes) diff --git a/my/stackexchange/stexport.py b/my/stackexchange/stexport.py index 812a155..111ed28 100644 --- a/my/stackexchange/stexport.py +++ b/my/stackexchange/stexport.py @@ -16,7 +16,8 @@ from my.core import ( make_config, stat, ) -import my.config + +import my.config # isort: skip @dataclass diff --git a/my/taplog.py b/my/taplog.py index 51eeb72..5e64a72 100644 --- a/my/taplog.py +++ b/my/taplog.py @@ -1,24 +1,26 @@ ''' [[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data ''' -from datetime import datetime -from typing import NamedTuple, Dict, Optional, Iterable +from __future__ import annotations -from my.core import get_files, stat, Stats -from my.core.sqlite import sqlite_connection +from collections.abc import Iterable +from datetime import datetime +from typing import NamedTuple from my.config import taplog as user_config +from my.core import Stats, get_files, stat +from my.core.sqlite import sqlite_connection class Entry(NamedTuple): - row: Dict + row: dict @property def id(self) -> str: return str(self.row['_id']) @property - def number(self) -> Optional[float]: + def number(self) -> float | None: ns = self.row['number'] # TODO ?? if isinstance(ns, str): diff --git a/my/telegram/telegram_backup.py b/my/telegram/telegram_backup.py index ff4f904..eea7e50 100644 --- a/my/telegram/telegram_backup.py +++ b/my/telegram/telegram_backup.py @@ -1,17 +1,17 @@ """ Telegram data via [fabianonline/telegram_backup](https://github.com/fabianonline/telegram_backup) tool """ +from __future__ import annotations +import sqlite3 +from collections.abc import Iterator from dataclasses import dataclass from datetime import datetime, timezone -from struct import unpack_from, calcsize -import sqlite3 -from typing import Dict, Iterator, Optional - -from my.core import datetime_aware, PathIsh -from my.core.sqlite import sqlite_connection +from struct import calcsize, unpack_from from my.config import telegram as user_config +from my.core import PathIsh, datetime_aware +from my.core.sqlite import sqlite_connection @dataclass @@ -23,17 +23,17 @@ class config(user_config.telegram_backup): @dataclass class Chat: id: str - name: Optional[str] + name: str | None # not all users have short handle + groups don't have them either? # TODO hmm some groups have it -- it's just the tool doesn't dump them?? - handle: Optional[str] + handle: str | None # not sure if need type? 
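# note: messages later resolve both source_id and sender_id through the same Chats mapping (see _message_from_row)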
@dataclass class User: id: str - name: Optional[str] + name: str | None @dataclass @@ -44,7 +44,7 @@ class Message: chat: Chat sender: User text: str - extra_media_info: Optional[str] = None + extra_media_info: str | None = None @property def permalink(self) -> str: @@ -61,7 +61,7 @@ class Message: -Chats = Dict[str, Chat] +Chats = dict[str, Chat] def _message_from_row(r: sqlite3.Row, *, chats: Chats, with_extra_media_info: bool) -> Message: ts = r['time'] # desktop export uses UTC (checked by exporting in winter time vs summer time) @@ -70,7 +70,7 @@ def _message_from_row(r: sqlite3.Row, *, chats: Chats, with_extra_media_info: bo chat = chats[r['source_id']] sender = chats[r['sender_id']] - extra_media_info: Optional[str] = None + extra_media_info: str | None = None if with_extra_media_info and r['has_media'] == 1: # also it's quite hacky, so at least for now it's just an optional attribute behind the flag # defensive because it's a bit tricky to correctly parse without a proper api parser.. @@ -90,7 +90,7 @@ def _message_from_row(r: sqlite3.Row, *, chats: Chats, with_extra_media_info: bo ) -def messages(*, extra_where: Optional[str]=None, with_extra_media_info: bool=False) -> Iterator[Message]: +def messages(*, extra_where: str | None=None, with_extra_media_info: bool=False) -> Iterator[Message]: messages_query = 'SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message")' if extra_where is not None: messages_query += ' AND ' + extra_where @@ -106,7 +106,7 @@ def messages(*, extra_where: Optional[str]=None, with_extra_media_info: bool=Fal for r in db.execute('SELECT * FROM users ORDER BY id'): first = r["first_name"] last = r["last_name"] - name: Optional[str] + name: str | None if first is not None and last is not None: name = f'{first} {last}' else: @@ -121,7 +121,7 @@ def messages(*, extra_where: Optional[str]=None, with_extra_media_info: bool=Fal yield _message_from_row(r, chats=chats, with_extra_media_info=with_extra_media_info) -def _extract_extra_media_info(data: bytes) -> Optional[str]: +def _extract_extra_media_info(data: bytes) -> str | None: # ugh... 
very hacky, but it does manage to extract from 90% of messages that have media pos = 0 diff --git a/my/tests/bluemaestro.py b/my/tests/bluemaestro.py index 2d7c81e..d139a8f 100644 --- a/my/tests/bluemaestro.py +++ b/my/tests/bluemaestro.py @@ -1,4 +1,4 @@ -from typing import Iterator +from collections.abc import Iterator import pytest from more_itertools import one diff --git a/my/tests/body/weight.py b/my/tests/body/weight.py index 069e940..f26ccf2 100644 --- a/my/tests/body/weight.py +++ b/my/tests/body/weight.py @@ -1,8 +1,10 @@ from pathlib import Path -import pytz -from my.core.cfg import tmp_config + import pytest +import pytz + from my.body.weight import from_orgmode +from my.core.cfg import tmp_config def test_body_weight() -> None: diff --git a/my/tests/commits.py b/my/tests/commits.py index c967027..48e349f 100644 --- a/my/tests/commits.py +++ b/my/tests/commits.py @@ -1,14 +1,11 @@ import os from pathlib import Path -from more_itertools import bucket import pytest - - -from my.core.cfg import tmp_config +from more_itertools import bucket from my.coding.commits import commits - +from my.core.cfg import tmp_config pytestmark = pytest.mark.skipif( os.name == 'nt', diff --git a/my/tests/location/fallback.py b/my/tests/location/fallback.py index 10a4e5b..c09b902 100644 --- a/my/tests/location/fallback.py +++ b/my/tests/location/fallback.py @@ -2,8 +2,8 @@ To test my.location.fallback_location.all """ +from collections.abc import Iterator from datetime import datetime, timedelta, timezone -from typing import Iterator import pytest from more_itertools import ilen diff --git a/my/tests/reddit.py b/my/tests/reddit.py index 4f1ec51..4ddccf8 100644 --- a/my/tests/reddit.py +++ b/my/tests/reddit.py @@ -1,16 +1,14 @@ import pytest from more_itertools import consume -from my.core.cfg import tmp_config -from my.core.utils.itertools import ensure_unique - -from .common import testdata - - # deliberately use mixed style imports on the top level and inside the methods to test tmp_config stuff # todo won't really be necessary once we migrate to lazy user config import my.reddit.all as my_reddit_all import my.reddit.rexport as my_reddit_rexport +from my.core.cfg import tmp_config +from my.core.utils.itertools import ensure_unique + +from .common import testdata def test_basic_1() -> None: diff --git a/my/time/tz/common.py b/my/time/tz/common.py index 13c8ac0..c0dd262 100644 --- a/my/time/tz/common.py +++ b/my/time/tz/common.py @@ -3,7 +3,6 @@ from typing import Callable, Literal, cast from my.core import datetime_aware - ''' Depending on the specific data provider and your level of paranoia you might expect different behaviour.. E.g.: - if your objects already have tz info, you might not need to call localize() at all diff --git a/my/time/tz/main.py b/my/time/tz/main.py index fafc5fe..bdd36b1 100644 --- a/my/time/tz/main.py +++ b/my/time/tz/main.py @@ -6,6 +6,7 @@ from datetime import datetime from my.core import datetime_aware + # todo hmm, kwargs isn't mypy friendly.. but specifying types would require duplicating default args. uhoh def localize(dt: datetime, **kwargs) -> datetime_aware: # todo document patterns for combining multiple data sources diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 4920333..58b5bf7 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -2,6 +2,8 @@ Timezone data provider, guesses timezone based on location data (e.g. 
GPS) ''' +from __future__ import annotations + REQUIRES = [ # for determining timezone by coordinate 'timezonefinder', @@ -10,6 +12,7 @@ REQUIRES = [ import heapq import os from collections import Counter +from collections.abc import Iterable, Iterator from dataclasses import dataclass from datetime import date, datetime from functools import lru_cache @@ -17,14 +20,7 @@ from itertools import groupby from typing import ( TYPE_CHECKING, Any, - Dict, - Iterable, - Iterator, - List, - Optional, Protocol, - Set, - Tuple, ) import pytz @@ -102,7 +98,7 @@ def _timezone_finder(*, fast: bool) -> Any: # for backwards compatibility -def _locations() -> Iterator[Tuple[LatLon, datetime_aware]]: +def _locations() -> Iterator[tuple[LatLon, datetime_aware]]: try: import my.location.all @@ -125,7 +121,7 @@ def _locations() -> Iterator[Tuple[LatLon, datetime_aware]]: # TODO: could use heapmerge or sort the underlying iterators somehow? # see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934 -def _sorted_locations() -> List[Tuple[LatLon, datetime_aware]]: +def _sorted_locations() -> list[tuple[LatLon, datetime_aware]]: return sorted(_locations(), key=lambda x: x[1]) @@ -140,7 +136,7 @@ class DayWithZone: zone: Zone -def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> Iterator[DayWithZone]: +def _find_tz_for_locs(finder: Any, locs: Iterable[tuple[LatLon, datetime]]) -> Iterator[DayWithZone]: for (lat, lon), dt in locs: # TODO right. its _very_ slow... zone = finder.timezone_at(lat=lat, lng=lon) @@ -172,7 +168,7 @@ def _iter_local_dates() -> Iterator[DayWithZone]: # TODO: warnings doesn't actually warn? # warnings = [] - locs: Iterable[Tuple[LatLon, datetime]] + locs: Iterable[tuple[LatLon, datetime]] locs = _sorted_locations() if cfg.sort_locations else _locations() yield from _find_tz_for_locs(finder, locs) @@ -187,7 +183,7 @@ def _iter_local_dates_fallback() -> Iterator[DayWithZone]: cfg = make_config() - def _fallback_locations() -> Iterator[Tuple[LatLon, datetime]]: + def _fallback_locations() -> Iterator[tuple[LatLon, datetime]]: for loc in sorted(flocs(), key=lambda x: x.dt): yield ((loc.lat, loc.lon), loc.dt) @@ -225,14 +221,14 @@ def _iter_tzs() -> Iterator[DayWithZone]: # we need to sort them first before we can do a groupby by_day = lambda p: p.day - local_dates: List[DayWithZone] = sorted(_iter_local_dates(), key=by_day) + local_dates: list[DayWithZone] = sorted(_iter_local_dates(), key=by_day) logger.debug(f"no. of items using exact locations: {len(local_dates)}") - local_dates_fallback: List[DayWithZone] = sorted(_iter_local_dates_fallback(), key=by_day) + local_dates_fallback: list[DayWithZone] = sorted(_iter_local_dates_fallback(), key=by_day) # find days that are in fallback but not in local_dates (i.e., missing days) - local_dates_set: Set[date] = {d.day for d in local_dates} - use_fallback_days: List[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set] + local_dates_set: set[date] = {d.day for d in local_dates} + use_fallback_days: list[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set] logger.debug(f"no. 
of items being used from fallback locations: {len(use_fallback_days)}") # combine local_dates and missing days from fallback into a sorted list @@ -246,20 +242,20 @@ def _iter_tzs() -> Iterator[DayWithZone]: @lru_cache(1) -def _day2zone() -> Dict[date, pytz.BaseTzInfo]: +def _day2zone() -> dict[date, pytz.BaseTzInfo]: # NOTE: kinda unfortunate that this will have to process all days before returning result for just one # however otherwise cachew cache might never be initialized properly # so we'll always end up recomputing everyting during subsequent runs return {dz.day: pytz.timezone(dz.zone) for dz in _iter_tzs()} -def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]: +def _get_day_tz(d: date) -> pytz.BaseTzInfo | None: return _day2zone().get(d) # ok to cache, there are only a few home locations? @lru_cache(None) -def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]: +def _get_home_tz(loc: LatLon) -> pytz.BaseTzInfo | None: (lat, lng) = loc finder = _timezone_finder(fast=False) # ok to use slow here for better precision zone = finder.timezone_at(lat=lat, lng=lng) @@ -270,7 +266,7 @@ def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]: return pytz.timezone(zone) -def get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: +def get_tz(dt: datetime) -> pytz.BaseTzInfo | None: ''' Given a datetime, returns the timezone for that date. ''' diff --git a/my/tinder/android.py b/my/tinder/android.py index d9b256b..a09794f 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -3,20 +3,22 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde """ from __future__ import annotations -from collections import defaultdict, Counter +import sqlite3 +from collections import Counter, defaultdict +from collections.abc import Iterator, Mapping, Sequence from dataclasses import dataclass from datetime import datetime, timezone from itertools import chain from pathlib import Path -import sqlite3 -from typing import Sequence, Iterator, Union, Dict, List, Mapping +from typing import Union -from my.core import Paths, get_files, Res, stat, Stats, datetime_aware, make_logger +from my.core import Paths, Res, Stats, datetime_aware, get_files, make_logger, stat from my.core.common import unique_everseen from my.core.compat import assert_never from my.core.error import echain from my.core.sqlite import sqlite_connection -import my.config + +import my.config # isort: skip logger = make_logger(__name__) @@ -164,8 +166,8 @@ def _parse_msg(row: sqlite3.Row) -> _Message: # todo maybe it's rich_entities method? def entities() -> Iterator[Res[Entity]]: - id2person: Dict[str, Person] = {} - id2match: Dict[str, Match] = {} + id2person: dict[str, Person] = {} + id2match: dict[str, Match] = {} for x in unique_everseen(_entities): if isinstance(x, Exception): yield x @@ -217,7 +219,7 @@ def messages() -> Iterator[Res[Message]]: # todo not sure, maybe it's not fundamental enough to keep here... 
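# (aside on unique_everseen, used in entities() above: HPI's variant from
#  my.core.common deduplicates while preserving order, and also accepts a
#  callable returning an iterator, not just an iterator. A tiny sketch:)

from my.core.common import unique_everseen

# duplicates are dropped on first sight; passing a callable lets the
# underlying provider be (re)invoked lazily
assert list(unique_everseen(lambda: iter([1, 1, 2, 1]))) == [1, 2]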
def match2messages() -> Iterator[Res[Mapping[Match, Sequence[Message]]]]: - res: Dict[Match, List[Message]] = defaultdict(list) + res: dict[Match, list[Message]] = defaultdict(list) for x in entities(): if isinstance(x, Exception): yield x diff --git a/my/topcoder.py b/my/topcoder.py index 07f71be..56403e2 100644 --- a/my/topcoder.py +++ b/my/topcoder.py @@ -1,14 +1,14 @@ +import json +from collections.abc import Iterator, Sequence from dataclasses import dataclass from functools import cached_property -import json from pathlib import Path -from typing import Iterator, Sequence -from my.core import get_files, Res, datetime_aware +from my.core import Res, datetime_aware, get_files from my.core.compat import fromisoformat from my.experimental.destructive_parsing import Manager -from my.config import topcoder as config # type: ignore[attr-defined] +from my.config import topcoder as config # type: ignore[attr-defined] # isort: skip def inputs() -> Sequence[Path]: diff --git a/my/twitter/all.py b/my/twitter/all.py index 4714021..c2c471e 100644 --- a/my/twitter/all.py +++ b/my/twitter/all.py @@ -1,11 +1,11 @@ """ Unified Twitter data (merged from the archive and periodic updates) """ -from typing import Iterator +from collections.abc import Iterator + from ..core import Res from ..core.source import import_source -from .common import merge_tweets, Tweet - +from .common import Tweet, merge_tweets # NOTE: you can comment out the sources you don't need src_twint = import_source(module_name='my.twitter.twint') diff --git a/my/twitter/android.py b/my/twitter/android.py index ada04ae..88c9389 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -4,21 +4,21 @@ Twitter data from official app for Android from __future__ import annotations +import re +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -import re from struct import unpack_from -from typing import Iterator, Sequence, Set -from my.core import datetime_aware, get_files, LazyLogger, Paths, Res +from my.core import LazyLogger, Paths, Res, datetime_aware, get_files from my.core.common import unique_everseen from my.core.sqlite import sqlite_connect_immutable -import my.config - from .common import permalink +import my.config # isort: skip + logger = LazyLogger(__name__) @@ -155,7 +155,7 @@ _SELECT_OWN_TWEETS = '_SELECT_OWN_TWEETS' def get_own_user_id(conn) -> str: # unclear what's the reliable way to query it, so we use multiple different ones and arbitrate # NOTE: 'SELECT DISTINCT ev_owner_id FROM lists' doesn't work, might include lists from other people? - res: Set[str] = set() + res: set[str] = set() # need to cast as it's int by default for q in [ 'SELECT DISTINCT CAST(list_mapping_user_id AS TEXT) FROM list_mapping', @@ -239,7 +239,7 @@ def _process_one(f: Path, *, where: str) -> Iterator[Res[Tweet]]: NOT (statuses.in_r_user_id == -1 AND statuses.in_r_status_id == -1 AND statuses.conversation_id == 0) ''' - def _query_one(*, where: str, quoted: Set[int]) -> Iterator[Res[Tweet]]: + def _query_one(*, where: str, quoted: set[int]) -> Iterator[Res[Tweet]]: for ( tweet_id, user_username, @@ -263,7 +263,7 @@ def _process_one(f: Path, *, where: str) -> Iterator[Res[Tweet]]: text=content, ) - quoted: Set[int] = set() + quoted: set[int] = set() yield from _query_one(where=db_where, quoted=quoted) # get quoted tweets 'recursively' # TODO maybe do it for favs/bookmarks too? 
not sure diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 1573754..c9d2dbc 100644 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -7,6 +7,7 @@ from __future__ import annotations import html import json # hmm interesting enough, orjson didn't give much speedup here? from abc import abstractmethod +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime from functools import cached_property @@ -14,8 +15,6 @@ from itertools import chain from pathlib import Path from typing import ( TYPE_CHECKING, - Iterator, - Sequence, ) from more_itertools import unique_everseen diff --git a/my/twitter/common.py b/my/twitter/common.py index 258216f..8c346f6 100644 --- a/my/twitter/common.py +++ b/my/twitter/common.py @@ -1,17 +1,19 @@ -from my.core import __NOT_HPI_MODULE__ +from my.core import __NOT_HPI_MODULE__ # isort: skip +from collections.abc import Iterator from itertools import chain -from typing import Iterator, Any +from typing import Any from more_itertools import unique_everseen - # TODO add proper Protocol for Tweet Tweet = Any TweetId = str -from my.core import warn_if_empty, Res +from my.core import Res, warn_if_empty + + @warn_if_empty def merge_tweets(*sources: Iterator[Res[Tweet]]) -> Iterator[Res[Tweet]]: def key(r: Res[Tweet]): diff --git a/my/twitter/talon.py b/my/twitter/talon.py index 1b79727..dbf2e2e 100644 --- a/my/twitter/talon.py +++ b/my/twitter/talon.py @@ -7,10 +7,11 @@ from __future__ import annotations import re import sqlite3 from abc import abstractmethod +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Iterator, Sequence, Union +from typing import Union from my.core import Paths, Res, datetime_aware, get_files from my.core.common import unique_everseen diff --git a/my/twitter/twint.py b/my/twitter/twint.py index ceb5406..5106923 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -1,17 +1,17 @@ """ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export. """ +from collections.abc import Iterator from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import NamedTuple, Iterator, List +from typing import NamedTuple - -from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats +from my.core import Json, LazyLogger, Paths, Res, Stats, datetime_aware, get_files, stat from my.core.cfg import make_config from my.core.sqlite import sqlite_connection -from my.config import twint as user_config +from my.config import twint as user_config # isort: skip # TODO move to twitter.twint config structure @@ -76,7 +76,7 @@ class Tweet(NamedTuple): return text @property - def urls(self) -> List[str]: + def urls(self) -> list[str]: ustr = self.row['urls'] if len(ustr) == 0: return [] diff --git a/my/util/hpi_heartbeat.py b/my/util/hpi_heartbeat.py index 84790a4..6dcac7e 100644 --- a/my/util/hpi_heartbeat.py +++ b/my/util/hpi_heartbeat.py @@ -5,12 +5,13 @@ In particular the behaviour of import_original_module function The idea of testing is that overlays extend this module, and add their own items to items(), and the checker asserts all overlays have contributed. 
""" -from my.core import __NOT_HPI_MODULE__ +from my.core import __NOT_HPI_MODULE__ # isort: skip + +import sys +from collections.abc import Iterator from dataclasses import dataclass from datetime import datetime -import sys -from typing import Iterator, List NOW = datetime.now() @@ -19,10 +20,10 @@ NOW = datetime.now() class Item: dt: datetime message: str - path: List[str] + path: list[str] -def get_pkg_path() -> List[str]: +def get_pkg_path() -> list[str]: pkg = sys.modules[__package__] return list(pkg.__path__) diff --git a/my/vk/favorites.py b/my/vk/favorites.py index 9caae6d..5f278ff 100644 --- a/my/vk/favorites.py +++ b/my/vk/favorites.py @@ -1,20 +1,21 @@ # todo: uses my private export script?, timezone +from __future__ import annotations + +import json +from collections.abc import Iterable, Iterator from dataclasses import dataclass from datetime import datetime, timezone -import json -from typing import Iterator, Iterable, Optional - -from my.core import Json, datetime_aware, stat, Stats -from my.core.error import Res from my.config import vk as config # type: ignore[attr-defined] +from my.core import Json, Stats, datetime_aware, stat +from my.core.error import Res @dataclass class Favorite: dt: datetime_aware title: str - url: Optional[str] + url: str | None text: str diff --git a/my/vk/vk_messages_backup.py b/my/vk/vk_messages_backup.py index c73587f..4f593c8 100644 --- a/my/vk/vk_messages_backup.py +++ b/my/vk/vk_messages_backup.py @@ -2,18 +2,16 @@ VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) ''' # note: could reuse the original repo, but little point I guess since VK closed their API +import json +from collections.abc import Iterator from dataclasses import dataclass from datetime import datetime -import json -from typing import Dict, Iterator import pytz -from my.core import stat, Stats, Json, Res, datetime_aware, get_files -from my.core.common import unique_everseen - from my.config import vk_messages_backup as config - +from my.core import Json, Res, Stats, datetime_aware, get_files, stat +from my.core.common import unique_everseen # I think vk_messages_backup used this tz? # not sure if vk actually used to return this tz in api? 
@@ -45,7 +43,7 @@ class Message: body: str -Users = Dict[Uid, User] +Users = dict[Uid, User] def users() -> Users: diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index 3dfed3e..27ee743 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -3,18 +3,19 @@ Whatsapp data from Android app database (in =/data/data/com.whatsapp/databases/m """ from __future__ import annotations +import sqlite3 +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -import sqlite3 -from typing import Union, Sequence, Iterator, Optional +from typing import Union -from my.core import get_files, Paths, datetime_aware, Res, make_logger, make_config +from my.core import Paths, Res, datetime_aware, get_files, make_config, make_logger from my.core.common import unique_everseen from my.core.error import echain, notnone from my.core.sqlite import sqlite_connection -import my.config +import my.config # isort: skip logger = make_logger(__name__) @@ -23,7 +24,7 @@ logger = make_logger(__name__) class Config(my.config.whatsapp.android): # paths[s]/glob to the exported sqlite databases export_path: Paths - my_user_id: Optional[str] = None + my_user_id: str | None = None config = make_config(Config) @@ -38,13 +39,13 @@ class Chat: id: str # todo not sure how to support renames? # could change Chat object itself, but this won't work well with incremental processing.. - name: Optional[str] + name: str | None @dataclass(unsafe_hash=True) class Sender: id: str - name: Optional[str] + name: str | None @dataclass(unsafe_hash=True) @@ -53,7 +54,7 @@ class Message: id: str dt: datetime_aware sender: Sender - text: Optional[str] + text: str | None Entity = Union[Chat, Sender, Message] @@ -125,9 +126,9 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: ts: int = notnone(r['timestamp']) dt = datetime.fromtimestamp(ts / 1000, tz=timezone.utc) - text: Optional[str] = r['text_data'] - media_file_path: Optional[str] = r['file_path'] - media_file_size: Optional[int] = r['file_size'] + text: str | None = r['text_data'] + media_file_path: str | None = r['file_path'] + media_file_size: int | None = r['file_size'] message_type = r['message_type'] diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index f29b2e3..703715f 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -1,7 +1,8 @@ from __future__ import annotations +from collections.abc import Iterable, Iterator from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Iterable, Iterator +from typing import TYPE_CHECKING, Any from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings from my.core.compat import deprecated diff --git a/my/zotero.py b/my/zotero.py index 4440aae..8eb34ba 100644 --- a/my/zotero.py +++ b/my/zotero.py @@ -1,14 +1,16 @@ +from __future__ import annotations as _annotations + +import json +import sqlite3 +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone -import json -from typing import Iterator, Optional, Dict, Any, Sequence from pathlib import Path -import sqlite3 +from typing import Any -from my.core import make_logger, Res, datetime_aware +from my.core import Res, datetime_aware, make_logger from my.core.sqlite import sqlite_copy_and_open - logger = make_logger(__name__) @@ -26,7 +28,7 @@ class Item: """Corresponds to 'Zotero item'""" file: Path title: str - url: Optional[Url] + url: Url | None tags: Sequence[str] 
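# (context for the sqlite_copy_and_open import above: zotero may keep its
#  database locked while the app is running, so reading from a snapshot avoids
#  locking errors. A rough stdlib-only sketch of the idea -- the real helper
#  in my.core.sqlite is more careful, e.g. about the -wal/-shm journal files:)

import shutil
import sqlite3
import tempfile
from pathlib import Path


def copy_and_open(db: Path) -> sqlite3.Connection:
    tmp = Path(tempfile.mkdtemp()) / db.name
    shutil.copy(db, tmp)  # query the snapshot, leave the original untouched
    return sqlite3.connect(f'file:{tmp}?immutable=1', uri=True)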
@@ -39,8 +41,8 @@ class Annotation: page: int """0-indexed""" - text: Optional[str] - comment: Optional[str] + text: str | None + comment: str | None tags: Sequence[str] color_hex: str """Original hex-encoded color in zotero""" @@ -97,7 +99,7 @@ WHERE ID.fieldID = 13 AND IA.itemID = ? # TODO maybe exclude 'private' methods from detection? -def _query_raw() -> Iterator[Res[Dict[str, Any]]]: +def _query_raw() -> Iterator[Res[dict[str, Any]]]: [db] = inputs() with sqlite_copy_and_open(db) as conn: @@ -157,7 +159,7 @@ def _hex2human(color_hex: str) -> str: }.get(color_hex, color_hex) -def _parse_annotation(r: Dict) -> Annotation: +def _parse_annotation(r: dict) -> Annotation: text = r['text'] comment = r['comment'] # todo use json query for this? diff --git a/my/zulip/organization.py b/my/zulip/organization.py index 2e0df4b..d0cfcb7 100644 --- a/my/zulip/organization.py +++ b/my/zulip/organization.py @@ -6,11 +6,11 @@ from __future__ import annotations import json from abc import abstractmethod +from collections.abc import Iterator, Sequence from dataclasses import dataclass from datetime import datetime, timezone from itertools import count from pathlib import Path -from typing import Iterator, Sequence from my.core import ( Json, diff --git a/ruff.toml b/ruff.toml index 5fbd657..3d803e7 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,4 @@ -target-version = "py38" # NOTE: inferred from pyproject.toml if present +target-version = "py39" # NOTE: inferred from pyproject.toml if present lint.extend-select = [ "F", # flakes rules -- default, but extend just in case @@ -26,8 +26,8 @@ lint.extend-select = [ "TID", # various imports suggestions "TRY", # various exception handling rules "UP", # detect deprecated python stdlib stuff - # "FA", # suggest using from __future__ import annotations TODO enable later after we make sure cachew works? - # "PTH", # pathlib migration -- TODO enable later + "FA", # suggest using from __future__ import annotations + "PTH", # pathlib migration "ARG", # unused argument checks # "A", # builtin shadowing -- TODO handle later # "EM", # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying @@ -35,6 +35,11 @@ lint.extend-select = [ # "ALL", # uncomment this to check for new rules! ] +# Preserve types, even if a file imports `from __future__ import annotations` +# we need this for cachew to work with HPI types on 3.9 +# can probably remove after 3.10? +lint.pyupgrade.keep-runtime-typing = true + lint.ignore = [ "D", # annoying nags about docstrings "N", # pep naming @@ -68,11 +73,6 @@ lint.ignore = [ "F841", # Local variable `count` is assigned to but never used ### -### TODO should be fine to use these with from __future__ import annotations? -### there was some issue with cachew though... double check this? - "UP006", # use type instead of Type - "UP007", # use X | Y instead of Union -### "RUF100", # unused noqa -- handle later "RUF012", # mutable class attrs should be annotated with ClassVar... 
ugh pretty annoying for user configs From a2b397ec4a83e6fded7c758470c49f6f18f2ab81 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 22 Oct 2024 20:50:37 +0100 Subject: [PATCH 294/302] my.whatsapp.android: adapt to new db format --- my/books/kobo.py | 2 +- my/whatsapp/android.py | 33 ++++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/my/books/kobo.py b/my/books/kobo.py index 899ef31..40b7ed7 100644 --- a/my/books/kobo.py +++ b/my/books/kobo.py @@ -3,4 +3,4 @@ from my.core import warnings warnings.high('my.books.kobo is deprecated! Please use my.kobo instead!') from my.core.util import __NOT_HPI_MODULE__ -from my.kobo import * # type: ignore[no-redef] +from my.kobo import * diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index 27ee743..3cd4436 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -1,6 +1,7 @@ """ Whatsapp data from Android app database (in =/data/data/com.whatsapp/databases/msgstore.db=) """ + from __future__ import annotations import sqlite3 @@ -63,11 +64,27 @@ Entity = Union[Chat, Sender, Message] def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: # TODO later, split out Chat/Sender objects separately to safe on object creation, similar to other android data sources + try: + db.execute('SELECT jid_row_id FROM chat_view') + except sqlite3.OperationalError as oe: + if 'jid_row_id' not in str(oe): + raise oe + new_version_202410 = False + else: + new_version_202410 = True + + if new_version_202410: + chat_id_col = 'jid.raw_string' + jid_join = 'JOIN jid ON jid._id == chat_view.jid_row_id' + else: + chat_id_col = 'chat_view.raw_string_jid' + jid_join = '' + chats = {} for r in db.execute( - ''' - SELECT raw_string_jid AS chat_id, subject - FROM chat_view + f''' + SELECT {chat_id_col} AS chat_id, subject + FROM chat_view {jid_join} WHERE chat_id IS NOT NULL /* seems that it might be null for chats that are 'recycled' (the db is more like an LRU cache) */ ''' ): @@ -89,6 +106,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: ): # TODO seems that msgstore.db doesn't have contact names # perhaps should extract from wa.db and match against wa_contacts.jid? + # TODO these can also be chats? not sure if need to include... s = Sender( id=r['raw_string'], name=None, @@ -100,9 +118,9 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: # so even if it seems as if it has a column (e.g. for attachment path), there is actually no such data # so makes more sense to just query message column directly for r in db.execute( - ''' + f''' SELECT - C.raw_string_jid AS chat_id, + {chat_id_col} AS chat_id, M.key_id, M.timestamp, sender_jid_row_id, M.from_me, @@ -111,8 +129,9 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: MM.file_size, M.message_type FROM message AS M - LEFT JOIN chat_view AS C ON M.chat_row_id = C._id - LEFT JOIN message_media AS MM ON M._id = MM.message_row_id + LEFT JOIN chat_view ON M.chat_row_id = chat_view._id + {jid_join} + left JOIN message_media AS MM ON M._id = MM.message_row_id WHERE M.key_id != -1 /* key_id -1 is some sort of fake message where everything is null */ /* type 7 seems to be some dummy system message. 
sometimes contain chat name, but usually null, so ignore them From 7ab6f0d5cbce2241ba8a7848ff1bf18e147d26cf Mon Sep 17 00:00:00 2001 From: purarue <7804791+purarue@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:39:00 -0700 Subject: [PATCH 295/302] chore: update urls --- README.org | 4 ++-- doc/DENYLIST.md | 6 +++--- doc/MODULES.org | 12 ++++++------ doc/MODULE_DESIGN.org | 8 ++++---- doc/OVERLAYS.org | 2 +- doc/QUERY.md | 6 +++--- doc/SETUP.org | 2 +- misc/.flake8-karlicoss | 2 +- my/browser/active_browser.py | 2 +- my/browser/export.py | 2 +- my/google/takeout/parser.py | 10 +++++----- my/ip/all.py | 4 ++-- my/ip/common.py | 2 +- my/location/fallback/via_ip.py | 2 +- my/location/google_takeout.py | 2 +- my/location/google_takeout_semantic.py | 2 +- my/location/via_ip.py | 2 +- my/reddit/pushshift.py | 6 +++--- 18 files changed, 38 insertions(+), 38 deletions(-) diff --git a/README.org b/README.org index c065a0c..79621a5 100644 --- a/README.org +++ b/README.org @@ -723,10 +723,10 @@ If you want to write modules for personal use but don't want to merge them into Other HPI Repositories: -- [[https://github.com/seanbreckenridge/HPI][seanbreckenridge/HPI]] +- [[https://github.com/purarue/HPI][purarue/HPI]] - [[https://github.com/madelinecameron/hpi][madelinecameron/HPI]] -If you want to create your own to create your own modules/override something here, you can use the [[https://github.com/seanbreckenridge/HPI-template][template]]. +If you want to create your own to create your own modules/override something here, you can use the [[https://github.com/purarue/HPI-template][template]]. * Related links :PROPERTIES: diff --git a/doc/DENYLIST.md b/doc/DENYLIST.md index 440715c..3d8dea0 100644 --- a/doc/DENYLIST.md +++ b/doc/DENYLIST.md @@ -76,7 +76,7 @@ This would typically be used in an overridden `all.py` file, or in a one-off scr which you may want to filter out some items from a source, progressively adding more items to the denylist as you go. -A potential `my/ip/all.py` file might look like (Sidenote: `discord` module from [here](https://github.com/seanbreckenridge/HPI)): +A potential `my/ip/all.py` file might look like (Sidenote: `discord` module from [here](https://github.com/purarue/HPI)): ```python from typing import Iterator @@ -119,9 +119,9 @@ python3 -c 'from my.ip import all; all.deny.deny_cli(all.ips())' To edit the `all.py`, you could either: - install it as editable (`python3 -m pip install --user -e ./HPI`), and then edit the file directly -- or, create a namespace package, which splits the package across multiple directories. For info on that see [`MODULE_DESIGN`](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#namespace-packages), [`reorder_editable`](https://github.com/seanbreckenridge/reorder_editable), and possibly the [`HPI-template`](https://github.com/seanbreckenridge/HPI-template) to create your own HPI namespace package to create your own `all.py` file. +- or, create a namespace package, which splits the package across multiple directories. For info on that see [`MODULE_DESIGN`](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#namespace-packages), [`reorder_editable`](https://github.com/purarue/reorder_editable), and possibly the [`HPI-template`](https://github.com/purarue/HPI-template) to create your own HPI namespace package to create your own `all.py` file. 
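For reference, the `DenyList` flow this document describes is small enough to sketch in full (the denylist path and the `discord` provider are examples, matching the `my/ip/all.py` snippet above):

```python
from my.core.denylist import DenyList

deny = DenyList("~/data/ips/denylist.json")  # example path

def ips():
    from my.ip import discord  # provider from the linked external HPI repo
    yield from deny.filter(discord.ips())
```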
-For a real example of this see, [seanbreckenridge/HPI-personal](https://github.com/seanbreckenridge/HPI-personal/blob/master/my/ip/all.py) +For a real example of this see, [purarue/HPI-personal](https://github.com/purarue/HPI-personal/blob/master/my/ip/all.py) Sidenote: the reason why we want to specifically override the all.py and not just create a script that filters out the items you're diff --git a/doc/MODULES.org b/doc/MODULES.org index 9f48024..347d88d 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -76,7 +76,7 @@ The config snippets below are meant to be modified accordingly and *pasted into You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works. -For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[https://github.com/seanbreckenridge/dotfiles/blob/master/.config/my/my/config/__init__.py][config]] +For an extensive/complex example, you can check out ~@purarue~'s [[https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py][config]] # Nested Configurations before the doc generation using the block below ** [[file:../my/reddit][my.reddit]] @@ -96,7 +96,7 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http class pushshift: ''' - Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments + Uses [[https://github.com/purarue/pushshift_comment_export][pushshift]] to get access to old comments ''' # path[s]/glob to the exported JSON data @@ -106,7 +106,7 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http ** [[file:../my/browser/][my.browser]] - Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]] + Parses browser history using [[http://github.com/purarue/browserexport][browserexport]] #+begin_src python class browser: @@ -132,7 +132,7 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http You might also be able to use [[file:../my/location/via_ip.py][my.location.via_ip]] which uses =my.ip.all= to provide geolocation data for an IPs (though no IPs are provided from any - of the sources here). For an example of usage, see [[https://github.com/seanbreckenridge/HPI/tree/master/my/ip][here]] + of the sources here). For an example of usage, see [[https://github.com/purarue/HPI/tree/master/my/ip][here]] #+begin_src python class location: @@ -256,9 +256,9 @@ for cls, p in modules: ** [[file:../my/google/takeout/parser.py][my.google.takeout.parser]] - Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] + Parses Google Takeout using [[https://github.com/purarue/google_takeout_parser][google_takeout_parser]] - See [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] for more information about how to export and organize your takeouts + See [[https://github.com/purarue/google_takeout_parser][google_takeout_parser]] for more information about how to export and organize your takeouts If the =DISABLE_TAKEOUT_CACHE= environment variable is set, this won't cache individual exports in =~/.cache/google_takeout_parser= diff --git a/doc/MODULE_DESIGN.org b/doc/MODULE_DESIGN.org index 7aedf2f..442dbf2 100644 --- a/doc/MODULE_DESIGN.org +++ b/doc/MODULE_DESIGN.org @@ -67,7 +67,7 @@ If you want to disable a source, you have a few options. ... 
that suppresses the warning message and lets you use ~my.location.all~ without having to change any lines of code -Another benefit is that all the custom sources/data is localized to the ~all.py~ file, so a user can override the ~all.py~ (see the sections below on ~namespace packages~) file in their own HPI repository, adding additional sources without having to maintain a fork and patching in changes as things eventually change. For a 'real world' example of that, see [[https://github.com/seanbreckenridge/HPI#partially-in-usewith-overrides][seanbreckenridge]]s location and ip modules. +Another benefit is that all the custom sources/data is localized to the ~all.py~ file, so a user can override the ~all.py~ (see the sections below on ~namespace packages~) file in their own HPI repository, adding additional sources without having to maintain a fork and patching in changes as things eventually change. For a 'real world' example of that, see [[https://github.com/purarue/HPI#partially-in-usewith-overrides][purarue]]s location and ip modules. This is of course not required for personal or single file modules, its just the pattern that seems to have the least amount of friction for the user, while being extendable, and without using a bulky plugin system to let users add additional sources. @@ -208,13 +208,13 @@ Where ~lastfm.py~ is your version of ~my.lastfm~, which you've copied from this Then, running ~python3 -m pip install -e .~ in that directory would install that as part of the namespace package, and assuming (see below for possible issues) this appears on ~sys.path~ before the upstream repository, your ~lastfm.py~ file overrides the upstream. Adding more files, like ~my.some_new_module~ into that directory immediately updates the global ~my~ package -- allowing you to quickly add new modules without having to re-install. -If you install both directories as editable packages (which has the benefit of any changes you making in either repository immediately updating the globally installed ~my~ package), there are some concerns with which editable install appears on your ~sys.path~ first. If you wanted your modules to override the upstream modules, yours would have to appear on the ~sys.path~ first (this is the same reason that =custom_lastfm_overlay= must be at the front of your ~PYTHONPATH~). For more details and examples on dealing with editable namespace packages in the context of HPI, see the [[https://github.com/seanbreckenridge/reorder_editable][reorder_editable]] repository. +If you install both directories as editable packages (which has the benefit of any changes you making in either repository immediately updating the globally installed ~my~ package), there are some concerns with which editable install appears on your ~sys.path~ first. If you wanted your modules to override the upstream modules, yours would have to appear on the ~sys.path~ first (this is the same reason that =custom_lastfm_overlay= must be at the front of your ~PYTHONPATH~). For more details and examples on dealing with editable namespace packages in the context of HPI, see the [[https://github.com/purarue/reorder_editable][reorder_editable]] repository. There is no limit to how many directories you could install into a single namespace package, which could be a possible way for people to install additional HPI modules, without worrying about the module count here becoming too large to manage. 
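A quick way to check which directories a namespace package actually resolved to (and therefore which editable install "wins") is to inspect ~__path__~ directly -- a small sketch, assuming an installed ~my~ package:

#+begin_src python
import my

# namespace packages record every contributing directory, in sys.path order;
# for clashing module names, the first directory listed here shadows the rest
print(list(my.__path__))
#+end_src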
-There are some other users [[https://github.com/hpi/hpi][who have begun publishing their own modules]] as namespace packages, which you could potentially install and use, in addition to this repository, if any of those interest you. If you want to create your own you can use the [[https://github.com/seanbreckenridge/HPI-template][template]] to get started. +There are some other users [[https://github.com/hpi/hpi][who have begun publishing their own modules]] as namespace packages, which you could potentially install and use, in addition to this repository, if any of those interest you. If you want to create your own you can use the [[https://github.com/purarue/HPI-template][template]] to get started. -Though, enabling this many modules may make ~hpi doctor~ look pretty busy. You can explicitly choose to enable/disable modules with a list of modules/regexes in your [[https://github.com/karlicoss/HPI/blob/f559e7cb899107538e6c6bbcf7576780604697ef/my/core/core_config.py#L24-L55][core config]], see [[https://github.com/seanbreckenridge/dotfiles/blob/a1a77c581de31bd55a6af3d11b8af588614a207e/.config/my/my/config/__init__.py#L42-L72][here]] for an example. +Though, enabling this many modules may make ~hpi doctor~ look pretty busy. You can explicitly choose to enable/disable modules with a list of modules/regexes in your [[https://github.com/karlicoss/HPI/blob/f559e7cb899107538e6c6bbcf7576780604697ef/my/core/core_config.py#L24-L55][core config]], see [[https://github.com/purarue/dotfiles/blob/a1a77c581de31bd55a6af3d11b8af588614a207e/.config/my/my/config/__init__.py#L42-L72][here]] for an example. You may use the other modules or [[https://github.com/karlicoss/hpi-personal-overlay][my overlay]] as reference, but python packaging is already a complicated issue, before adding complexities like namespace packages and editable installs on top of it... If you're having trouble extending HPI in this fashion, you can open an issue here, preferably with a link to your code/repository and/or ~setup.py~ you're trying to use. diff --git a/doc/OVERLAYS.org b/doc/OVERLAYS.org index 1e6cf8f..a573007 100644 --- a/doc/OVERLAYS.org +++ b/doc/OVERLAYS.org @@ -66,7 +66,7 @@ This basically means that modules will be searched in both paths, with overlay t ** Installing with =--use-pep517= -See here for discussion https://github.com/seanbreckenridge/reorder_editable/issues/2, but TLDR it should work similarly. +See here for discussion https://github.com/purarue/reorder_editable/issues/2, but TLDR it should work similarly. 
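The enable/disable toggle mentioned above lives in the user's core config -- roughly like this (a sketch; the module names/regexes are only examples):

#+begin_src python
# e.g. in your my/config/__init__.py
class core:
    enabled_modules  = ['my.reddit.*']       # regexes of modules to keep
    disabled_modules = ['my.twitter.twint']  # regexes of modules to skip
#+end_src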
* Testing runtime behaviour (editable install) diff --git a/doc/QUERY.md b/doc/QUERY.md index b672dff..a85450a 100644 --- a/doc/QUERY.md +++ b/doc/QUERY.md @@ -99,7 +99,7 @@ Commit(committed_dt=datetime.datetime(2023, 4, 14, 23, 9, 1, tzinfo=datetime.tim authored_dt=datetime.datetime(2023, 4, 14, 23, 4, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), message='sources.smscalls: propogate errors if there are breaking ' 'schema changes', - repo='/home/sean/Repos/promnesia-fork', + repo='/home/username/Repos/promnesia-fork', sha='22a434fca9a28df9b0915ccf16368df129d2c9ce', ref='refs/heads/smscalls-handle-result') ``` @@ -195,7 +195,7 @@ To preview, you can use something like [`qgis`](https://qgis.org/en/site/) or fo chicago trip -(Sidenote: this is [`@seanbreckenridge`](https://github.com/seanbreckenridge/)s locations, on a trip to Chicago) +(Sidenote: this is [`@purarue`](https://github.com/purarue/)s locations, on a trip to Chicago) ## Python reference @@ -301,4 +301,4 @@ The `hpi query` command is a CLI wrapper around the code in [`query.py`](../my/c If you specify a range, drop_unsorted is forced to be True ``` -Those can be imported and accept any sort of iterator, `hpi query` just defaults to the output of functions here. As an example, see [`listens`](https://github.com/seanbreckenridge/HPI-personal/blob/master/scripts/listens) which just passes an generator (iterator) as the first argument to `query_range` +Those can be imported and accept any sort of iterator, `hpi query` just defaults to the output of functions here. As an example, see [`listens`](https://github.com/purarue/HPI-personal/blob/master/scripts/listens) which just passes an generator (iterator) as the first argument to `query_range` diff --git a/doc/SETUP.org b/doc/SETUP.org index 0fced62..ee9571c 100644 --- a/doc/SETUP.org +++ b/doc/SETUP.org @@ -387,7 +387,7 @@ But there is an extra caveat: rexport is already coming with nice [[https://gith Several other HPI modules are following a similar pattern: hypothesis, instapaper, pinboard, kobo, etc. 
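The DAL pattern mentioned above keeps all parsing logic in the exporter's own repository, so the HPI side stays a thin wrapper. Very roughly, modeled on how ~my.reddit.rexport~ uses it (a sketch; the export path is made up):

#+begin_src python
from pathlib import Path

import rexport.dal as dal

# hand every known export file to the DAL; merging/deduplication happens inside
d = dal.DAL(sorted(Path('~/data/reddit').expanduser().glob('*.json')))
for comment in d.comments():
    print(comment)
#+end_src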
-Since the [[https://github.com/karlicoss/rexport#api-limitations][reddit API has limited results]], you can use [[https://github.com/seanbreckenridge/pushshift_comment_export][my.reddit.pushshift]] to access older reddit comments, which both then get merged into =my.reddit.all.comments= +Since the [[https://github.com/karlicoss/rexport#api-limitations][reddit API has limited results]], you can use [[https://github.com/purarue/pushshift_comment_export][my.reddit.pushshift]] to access older reddit comments, which both then get merged into =my.reddit.all.comments= ** Twitter diff --git a/misc/.flake8-karlicoss b/misc/.flake8-karlicoss index 3c98b96..5933253 100644 --- a/misc/.flake8-karlicoss +++ b/misc/.flake8-karlicoss @@ -32,6 +32,6 @@ ignore = # # as a reference: -# https://github.com/seanbreckenridge/cookiecutter-template/blob/master/%7B%7Bcookiecutter.module_name%7D%7D/setup.cfg +# https://github.com/purarue/cookiecutter-template/blob/master/%7B%7Bcookiecutter.module_name%7D%7D/setup.cfg # and this https://github.com/karlicoss/HPI/pull/151 # find ./my | entr flake8 --ignore=E402,E501,E741,W503,E266,E302,E305,E203,E261,E252,E251,E221,W291,E225,E303,E702,E202,F841,E731,E306,E127 E722,E231 my | grep -v __NOT_HPI_MODULE__ diff --git a/my/browser/active_browser.py b/my/browser/active_browser.py index 8051f1b..1686fc5 100644 --- a/my/browser/active_browser.py +++ b/my/browser/active_browser.py @@ -1,5 +1,5 @@ """ -Parses active browser history by backing it up with [[http://github.com/seanbreckenridge/sqlite_backup][sqlite_backup]] +Parses active browser history by backing it up with [[http://github.com/purarue/sqlite_backup][sqlite_backup]] """ REQUIRES = ["browserexport", "sqlite_backup"] diff --git a/my/browser/export.py b/my/browser/export.py index 351cf6e..52ade0e 100644 --- a/my/browser/export.py +++ b/my/browser/export.py @@ -1,5 +1,5 @@ """ -Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]] +Parses browser history using [[http://github.com/purarue/browserexport][browserexport]] """ REQUIRES = ["browserexport"] diff --git a/my/google/takeout/parser.py b/my/google/takeout/parser.py index 80c2be1..13fd04a 100644 --- a/my/google/takeout/parser.py +++ b/my/google/takeout/parser.py @@ -1,7 +1,7 @@ """ -Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] +Parses Google Takeout using [[https://github.com/purarue/google_takeout_parser][google_takeout_parser]] -See [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] for more information +See [[https://github.com/purarue/google_takeout_parser][google_takeout_parser]] for more information about how to export and organize your takeouts If the DISABLE_TAKEOUT_CACHE environment variable is set, this won't cache individual @@ -12,7 +12,7 @@ zip files of the exports, which are temporarily unpacked while creating the cachew cache """ -REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] +REQUIRES = ["git+https://github.com/purarue/google_takeout_parser"] import os from collections.abc import Sequence @@ -36,7 +36,7 @@ from google_takeout_parser.merge import CacheResults, GoogleEventSet from google_takeout_parser.models import BaseEvent from google_takeout_parser.path_dispatch import TakeoutParser -# see https://github.com/seanbreckenridge/dotfiles/blob/master/.config/my/my/config/__init__.py for an example +# see 
https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py for an example from my.config import google as user_config @@ -123,7 +123,7 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: else: results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True)) for m in results: - # e.g. /home/sean/data/google_takeout/Takeout-1634932457.zip") -> 'Takeout-1634932457' + # e.g. /home/username/data/google_takeout/Takeout-1634932457.zip") -> 'Takeout-1634932457' # means that zipped takeouts have nice filenames from cachew cw_id, _, _ = path.name.rpartition(".") # each takeout result is cached as well, in individual databases per-type diff --git a/my/ip/all.py b/my/ip/all.py index e8277c1..c267383 100644 --- a/my/ip/all.py +++ b/my/ip/all.py @@ -3,10 +3,10 @@ An example all.py stub module that provides ip data To use this, you'd add IP providers that yield IPs to the 'ips' function -For an example of how this could be used, see https://github.com/seanbreckenridge/HPI/tree/master/my/ip +For an example of how this could be used, see https://github.com/purarue/HPI/tree/master/my/ip """ -REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] +REQUIRES = ["git+https://github.com/purarue/ipgeocache"] from collections.abc import Iterator diff --git a/my/ip/common.py b/my/ip/common.py index ef54ee3..b551281 100644 --- a/my/ip/common.py +++ b/my/ip/common.py @@ -1,5 +1,5 @@ """ -Provides location/timezone data from IP addresses, using [[https://github.com/seanbreckenridge/ipgeocache][ipgeocache]] +Provides location/timezone data from IP addresses, using [[https://github.com/purarue/ipgeocache][ipgeocache]] """ from my.core import __NOT_HPI_MODULE__ # isort: skip diff --git a/my/location/fallback/via_ip.py b/my/location/fallback/via_ip.py index 732af67..8b50878 100644 --- a/my/location/fallback/via_ip.py +++ b/my/location/fallback/via_ip.py @@ -2,7 +2,7 @@ Converts IP addresses provided by my.location.ip to estimated locations """ -REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] +REQUIRES = ["git+https://github.com/purarue/ipgeocache"] from dataclasses import dataclass from datetime import timedelta diff --git a/my/location/google_takeout.py b/my/location/google_takeout.py index cb5bef3..8613257 100644 --- a/my/location/google_takeout.py +++ b/my/location/google_takeout.py @@ -2,7 +2,7 @@ Extracts locations using google_takeout_parser -- no shared code with the deprecated my.location.google """ -REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] +REQUIRES = ["git+https://github.com/purarue/google_takeout_parser"] from collections.abc import Iterator diff --git a/my/location/google_takeout_semantic.py b/my/location/google_takeout_semantic.py index 7bddfa8..e84a932 100644 --- a/my/location/google_takeout_semantic.py +++ b/my/location/google_takeout_semantic.py @@ -5,7 +5,7 @@ Extracts semantic location history using google_takeout_parser # This is a separate module to prevent ImportError and a new config block from breaking # previously functional my.location.google_takeout locations -REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] +REQUIRES = ["git+https://github.com/purarue/google_takeout_parser"] from collections.abc import Iterator from dataclasses import dataclass diff --git a/my/location/via_ip.py b/my/location/via_ip.py index d465ad0..240ec5f 100644 --- a/my/location/via_ip.py +++ b/my/location/via_ip.py @@ -1,4 +1,4 @@ -REQUIRES = 
["git+https://github.com/seanbreckenridge/ipgeocache"] +REQUIRES = ["git+https://github.com/purarue/ipgeocache"] from my.core.warnings import high diff --git a/my/reddit/pushshift.py b/my/reddit/pushshift.py index 1bfa048..12f592b 100644 --- a/my/reddit/pushshift.py +++ b/my/reddit/pushshift.py @@ -1,11 +1,11 @@ """ Gives you access to older comments possibly not accessible with rexport using pushshift -See https://github.com/seanbreckenridge/pushshift_comment_export +See https://github.com/purarue/pushshift_comment_export """ REQUIRES = [ - "git+https://github.com/seanbreckenridge/pushshift_comment_export", + "git+https://github.com/purarue/pushshift_comment_export", ] from dataclasses import dataclass @@ -21,7 +21,7 @@ from my.core.cfg import make_config @dataclass class pushshift_config(uconfig.pushshift): ''' - Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments + Uses [[https://github.com/purarue/pushshift_comment_export][pushshift]] to get access to old comments ''' # path[s]/glob to the exported JSON data From ad55c5c345888abaebf59ae85923339b7ceccbb4 Mon Sep 17 00:00:00 2001 From: Srajan Garg Date: Tue, 12 Nov 2024 19:05:27 -0500 Subject: [PATCH 296/302] fix typo in rexport DAL (#405) * fix typo in rexport DAL --- my/reddit/rexport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/my/reddit/rexport.py b/my/reddit/rexport.py index cb6af01..262635b 100644 --- a/my/reddit/rexport.py +++ b/my/reddit/rexport.py @@ -146,7 +146,7 @@ if not TYPE_CHECKING: # here we just check that types are available, we don't actually want to import them # fmt: off dal.Subreddit # noqa: B018 - dal.Profil # noqa: B018e + dal.Profile # noqa: B018 dal.Multireddit # noqa: B018 # fmt: on except AttributeError as ae: From a7f05c2cad0c500210f966e0f50e0b309490cc53 Mon Sep 17 00:00:00 2001 From: purarue <7804791+purarue@users.noreply.github.com> Date: Wed, 20 Nov 2024 00:03:40 -0800 Subject: [PATCH 297/302] doc: spelling fixes --- CHANGELOG.md | 2 +- doc/OVERLAYS.org | 6 +++--- doc/QUERY.md | 2 +- my/core/cachew.py | 2 +- my/core/konsume.py | 2 +- my/core/logging.py | 2 +- my/core/tests/test_tmp_config.py | 2 +- my/core/utils/itertools.py | 4 ++-- my/fbmessenger/__init__.py | 2 +- my/fbmessenger/android.py | 2 +- my/instagram/all.py | 2 +- my/instagram/gdpr.py | 4 ++-- my/reddit/__init__.py | 2 +- my/smscalls.py | 4 ++-- my/stackexchange/gdpr.py | 2 +- my/time/tz/via_location.py | 2 +- my/tinder/android.py | 2 +- my/topcoder.py | 2 +- my/twitter/android.py | 2 +- my/twitter/twint.py | 2 +- my/whatsapp/android.py | 2 +- my/youtube/takeout.py | 2 +- 22 files changed, 27 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dd19df..d60ef35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ General/my.core changes: - e81dddddf083ffd81aa7e2b715bd34f59949479c properly resolve class properties in make_config + add test Modules: -- some innitial work on filling **InfluxDB** with HPI data +- some initial work on filling **InfluxDB** with HPI data - pinboard - 42399f6250d9901d93dcedcfe05f7857babcf834: **breaking backwards compatibility**, use pinbexport module directly diff --git a/doc/OVERLAYS.org b/doc/OVERLAYS.org index a573007..7bafa48 100644 --- a/doc/OVERLAYS.org +++ b/doc/OVERLAYS.org @@ -10,7 +10,7 @@ Relevant discussion about overlays: https://github.com/karlicoss/HPI/issues/102 # You can see them TODO in overlays dir -Consider a toy package/module structure with minimal code, wihout any actual data 
parsing, just for demonstration purposes. +Consider a toy package/module structure with minimal code, without any actual data parsing, just for demonstration purposes. - =main= package structure # TODO do links @@ -19,7 +19,7 @@ Consider a toy package/module structure with minimal code, wihout any actual dat Extracts Twitter data from GDPR archive. - =my/twitter/all.py= Merges twitter data from multiple sources (only =gdpr= in this case), so data consumers are agnostic of specific data sources used. - This will be overriden by =overlay=. + This will be overridden by =overlay=. - =my/twitter/common.py= Contains helper function to merge data, so they can be reused by overlay's =all.py=. - =my/reddit.py= @@ -126,7 +126,7 @@ https://github.com/python/mypy/blob/1dd8e7fe654991b01bd80ef7f1f675d9e3910c3a/myp For now, I opened an issue in mypy repository https://github.com/python/mypy/issues/16683 -But ok, maybe mypy treats =main= as an external package somhow but still type checks it properly? +But ok, maybe mypy treats =main= as an external package somehow but still type checks it properly? Let's see what's going on with imports: : $ mypy --namespace-packages --strict -p my --follow-imports=error diff --git a/doc/QUERY.md b/doc/QUERY.md index a85450a..9a5d9d3 100644 --- a/doc/QUERY.md +++ b/doc/QUERY.md @@ -97,7 +97,7 @@ By default, this just returns the items in the order they were returned by the f hpi query my.coding.commits.commits --order-key committed_dt --limit 1 --reverse --output pprint --stream Commit(committed_dt=datetime.datetime(2023, 4, 14, 23, 9, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), authored_dt=datetime.datetime(2023, 4, 14, 23, 4, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), - message='sources.smscalls: propogate errors if there are breaking ' + message='sources.smscalls: propagate errors if there are breaking ' 'schema changes', repo='/home/username/Repos/promnesia-fork', sha='22a434fca9a28df9b0915ccf16368df129d2c9ce', diff --git a/my/core/cachew.py b/my/core/cachew.py index 9ccee09..8ce2f2b 100644 --- a/my/core/cachew.py +++ b/my/core/cachew.py @@ -136,7 +136,7 @@ if TYPE_CHECKING: CC = Callable[P, R] # need to give it a name, if inlined into bound=, mypy runs in a bug PathProvider = Union[PathIsh, Callable[P, PathIsh]] # NOTE: in cachew, HashFunction type returns str - # however in practice, cachew alwasy calls str for its result + # however in practice, cachew always calls str for its result # so perhaps better to switch it to Any in cachew as well HashFunction = Callable[P, Any] diff --git a/my/core/konsume.py b/my/core/konsume.py index 6d24167..41b5a4e 100644 --- a/my/core/konsume.py +++ b/my/core/konsume.py @@ -236,7 +236,7 @@ def test_zoom() -> None: # - very flexible, easy to adjust behaviour # - cons: # - can forget to assert about extra entities etc, so error prone -# - if we do something like =assert j.pop('status') == 200, j=, by the time assert happens we already popped item -- makes erro handling harder +# - if we do something like =assert j.pop('status') == 200, j=, by the time assert happens we already popped item -- makes error handling harder # - a bit verbose.. so probably requires some helper functions though (could be much leaner than current konsume though) # - if we assert, then terminates parsing too early, if we're defensive then inflates the code a lot with if statements # - TODO perhaps combine warnings somehow or at least only emit once per module? 
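For context on the HashFunction comment fixed in cachew.py above: cachew stringifies whatever the hash callable returns and compares it against the stored value to decide whether the cache is stale. The typical call site looks roughly like this (a sketch; the inputs() helper and data layout are made up):

```python
from collections.abc import Iterator
from pathlib import Path

from my.core.cachew import mcachew


def inputs() -> list[Path]:
    # hypothetical helper locating the export files the parser depends on
    return sorted(Path('~/data/export').expanduser().glob('*.json'))


@mcachew(depends_on=inputs)  # str() of inputs() decides when to invalidate
def messages() -> Iterator[str]:
    for f in inputs():
        yield f.read_text()  # stand-in for real parsing
```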
diff --git a/my/core/logging.py b/my/core/logging.py index bdee9aa..167a167 100644 --- a/my/core/logging.py +++ b/my/core/logging.py @@ -250,7 +250,7 @@ if __name__ == '__main__': test() -## legacy/deprecated methods for backwards compatilibity +## legacy/deprecated methods for backwards compatibility if not TYPE_CHECKING: from .compat import deprecated diff --git a/my/core/tests/test_tmp_config.py b/my/core/tests/test_tmp_config.py index e5a24cc..d99621d 100644 --- a/my/core/tests/test_tmp_config.py +++ b/my/core/tests/test_tmp_config.py @@ -12,7 +12,7 @@ def _init_default_config() -> None: def test_tmp_config() -> None: ## ugh. ideally this would be on the top level (would be a better test) - ## but pytest imports eveything first, executes hooks, and some reset_modules() fictures mess stuff up + ## but pytest imports everything first, executes hooks, and some reset_modules() fictures mess stuff up ## later would be nice to be a bit more careful about them _init_default_config() from my.simple import items diff --git a/my/core/utils/itertools.py b/my/core/utils/itertools.py index 501ebbe..42b2b77 100644 --- a/my/core/utils/itertools.py +++ b/my/core/utils/itertools.py @@ -321,7 +321,7 @@ _UET = TypeVar('_UET') _UEU = TypeVar('_UEU') -# NOTE: for historic reasons, this function had to accept Callable that retuns iterator +# NOTE: for historic reasons, this function had to accept Callable that returns iterator # instead of just iterator # TODO maybe deprecated Callable support? not sure def unique_everseen( @@ -358,7 +358,7 @@ def test_unique_everseen() -> None: assert list(unique_everseen(fun_good)) == [123] with pytest.raises(Exception): - # since function retuns a list rather than iterator, check happens immediately + # since function returns a list rather than iterator, check happens immediately # , even without advancing the iterator unique_everseen(fun_bad) diff --git a/my/fbmessenger/__init__.py b/my/fbmessenger/__init__.py index f729de9..e5e417c 100644 --- a/my/fbmessenger/__init__.py +++ b/my/fbmessenger/__init__.py @@ -9,7 +9,7 @@ since that allows for easier overriding using namespace packages See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. """ -# prevent it from apprearing in modules list/doctor +# prevent it from appearing in modules list/doctor from ..core import __NOT_HPI_MODULE__ # kinda annoying to keep it, but it's so legacy 'hpi module install my.fbmessenger' works diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index a16d924..db4cc54 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -174,7 +174,7 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: However seems that when message is not sent yet it doesn't have this server id yet (happened only once, but could be just luck of course!) We exclude these messages to avoid duplication. - However poisitive filter (e.g. message_id LIKE 'mid%') feels a bit wrong, e.g. what if mesage ids change or something + However poisitive filter (e.g. message_id LIKE 'mid%') feels a bit wrong, e.g. what if message ids change or something So instead this excludes only such unsent messages. 
*/ message_id != offline_threading_id diff --git a/my/instagram/all.py b/my/instagram/all.py index 214e6ac..ce78409 100644 --- a/my/instagram/all.py +++ b/my/instagram/all.py @@ -23,7 +23,7 @@ def messages() -> Iterator[Res[Message]]: # TODO in general best to prefer android, it has more data # - message ids # - usernames are correct for Android data - # - thread ids more meaninful? + # - thread ids more meaningful? # but for now prefer gdpr prefix since it makes a bit things a bit more consistent? # e.g. a new batch of android exports can throw off ids if we rely on it for mapping yield from _merge_messages( diff --git a/my/instagram/gdpr.py b/my/instagram/gdpr.py index 7454a04..d417fdb 100644 --- a/my/instagram/gdpr.py +++ b/my/instagram/gdpr.py @@ -76,7 +76,7 @@ def _entities() -> Iterator[Res[User | _Message]]: # NOTE: here there are basically two options # - process inputs as is (from oldest to newest) # this would be more stable wrt newer exports (e.g. existing thread ids won't change) - # the downside is that newer exports seem to have better thread ids, so might be preferrable to use them + # the downside is that newer exports seem to have better thread ids, so might be preferable to use them # - process inputs reversed (from newest to oldest) # the upside is that thread ids/usernames might be better # the downside is that if for example the user renames, thread ids will change _a lot_, might be undesirable.. @@ -137,7 +137,7 @@ def _entitites_from_path(path: Path) -> Iterator[Res[User | _Message]]: j = json.loads(ffile.read_text()) id_len = 10 - # NOTE: I'm not actually sure it's other user's id.., since it corresponds to the whole converstation + # NOTE: I'm not actually sure it's other user's id.., since it corresponds to the whole conversation # but I stared a bit at these ids vs database ids and can't see any way to find the correspondence :( # so basically the only way to merge is to actually try some magic and correlate timestamps/message texts? # another option is perhaps to query user id from username with some free API diff --git a/my/reddit/__init__.py b/my/reddit/__init__.py index f344eeb..982901a 100644 --- a/my/reddit/__init__.py +++ b/my/reddit/__init__.py @@ -9,7 +9,7 @@ since that allows for easier overriding using namespace packages See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. """ -# prevent it from apprearing in modules list/doctor +# prevent it from appearing in modules list/doctor from ..core import __NOT_HPI_MODULE__ # kinda annoying to keep it, but it's so legacy 'hpi module install my.reddit' works diff --git a/my/smscalls.py b/my/smscalls.py index ccaac72..324bc44 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -186,7 +186,7 @@ class MMS(NamedTuple): for (addr, _type) in self.addresses: if _type == 137: return addr - # hmm, maybe return instead? but this probably shouldnt happen, means + # hmm, maybe return instead? but this probably shouldn't happen, means # something is very broken raise RuntimeError(f'No from address matching 137 found in {self.addresses}') @@ -214,7 +214,7 @@ def mms() -> Iterator[Res[MMS]]: def _resolve_null_str(value: str | None) -> str | None: if value is None: return None - # hmm.. theres some risk of the text actually being 'null', but theres + # hmm.. 
there's some risk of the text actually being 'null', but there's # no way to distinguish that from XML values if value == 'null': return None diff --git a/my/stackexchange/gdpr.py b/my/stackexchange/gdpr.py index 78987be..8ed0d30 100644 --- a/my/stackexchange/gdpr.py +++ b/my/stackexchange/gdpr.py @@ -49,7 +49,7 @@ class Vote(NamedTuple): # hmm, this loads very raw comments without the rest of the page? # - https://meta.stackexchange.com/posts/27319/comments#comment-57475 # - # parentPostId is the original quesion + # parentPostId is the original question # TODO is not always present? fucking hell # seems like there is no way to get a hierarchical comment link.. guess this needs to be handled in Promnesia normalisation... # postId is the answer diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 58b5bf7..1b2275b 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -245,7 +245,7 @@ def _iter_tzs() -> Iterator[DayWithZone]: def _day2zone() -> dict[date, pytz.BaseTzInfo]: # NOTE: kinda unfortunate that this will have to process all days before returning result for just one # however otherwise cachew cache might never be initialized properly - # so we'll always end up recomputing everyting during subsequent runs + # so we'll always end up recomputing everything during subsequent runs return {dz.day: pytz.timezone(dz.zone) for dz in _iter_tzs()} diff --git a/my/tinder/android.py b/my/tinder/android.py index a09794f..5a5d887 100644 --- a/my/tinder/android.py +++ b/my/tinder/android.py @@ -106,7 +106,7 @@ def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]: user_profile_rows = list(db.execute('SELECT * FROM profile_user_view')) if len(user_profile_rows) == 0: - # shit, sometime in 2023 profile_user_view stoppped containing user profile.. + # shit, sometime in 2023 profile_user_view stopped containing user profile.. # presumably the most common from_id/to_id would be our own username counter = Counter([id_ for (id_,) in db.execute('SELECT from_id FROM message UNION ALL SELECT to_id FROM message')]) if len(counter) > 0: # this might happen if db is empty (e.g. user got logged out) diff --git a/my/topcoder.py b/my/topcoder.py index 56403e2..40df77c 100644 --- a/my/topcoder.py +++ b/my/topcoder.py @@ -81,7 +81,7 @@ def _parse_one(p: Path) -> Iterator[Res[Competition]]: # but also expects cooperation from .make method (e.g. popping items from the dict) # could also wrap in helper and pass to .make .. not sure # an argument could be made that .make isn't really a class methond.. - # it's pretty specific to this parser onl + # it's pretty specific to this parser only yield from Competition.make(j=c) yield from m.check() diff --git a/my/twitter/android.py b/my/twitter/android.py index 88c9389..8159ee7 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -192,7 +192,7 @@ def get_own_user_id(conn) -> str: # - timeline_data_type # 1 : the bulk of tweets, but also some notifications etc?? # 2 : who-to-follow/community-to-join. 
contains a couple of tweets, but their corresponding status_id is NULL -# 8 : who-to-follow/notfication +# 8 : who-to-follow/notification # 13: semantic-core/who-to-follow # 14: cursor # 17: trends diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 5106923..9d36a93 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -54,7 +54,7 @@ class Tweet(NamedTuple): # https://github.com/thomasancheriyil/Red-Tide-Detection-based-on-Twitter/blob/beb200be60cc66dcbc394e670513715509837812/python/twitterGapParse.py#L61-L62 # # twint is also saving 'timezone', but this is local machine timezone at the time of scraping? - # perhaps they thought date-time-ms was local time... or just kept it just in case (they are keepin lots on unnecessary stuff in the db) + # perhaps they thought date-time-ms was local time... or just kept it just in case (they are keeping lots on unnecessary stuff in the db) return datetime.fromtimestamp(seconds, tz=tz) @property diff --git a/my/whatsapp/android.py b/my/whatsapp/android.py index 3cd4436..a8dbe8d 100644 --- a/my/whatsapp/android.py +++ b/my/whatsapp/android.py @@ -199,7 +199,7 @@ def _process_db(db: sqlite3.Connection) -> Iterator[Entity]: sender_row_id = r['sender_jid_row_id'] if sender_row_id == 0: # seems that it's always 0 for 1-1 chats - # for group chats our onw id is still 0, but other ids are properly set + # for group chats our own id is still 0, but other ids are properly set if from_me: myself_user_id = config.my_user_id or 'MYSELF_USER_ID' sender = Sender(id=myself_user_id, name=None) # TODO set my own name as well? diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py index 703715f..8eca328 100644 --- a/my/youtube/takeout.py +++ b/my/youtube/takeout.py @@ -36,7 +36,7 @@ def watched() -> Iterator[Res[Watched]]: continue # older exports (e.g. html) didn't have microseconds - # wheras newer json ones do have them + # whereas newer json ones do have them # seconds resolution is enough to distinguish watched videos # also we're processing takeouts in HPI in reverse order, so first seen watch would contain microseconds, resulting in better data without_microsecond = w.when.replace(microsecond=0) From 95a16b956f8ab24bea3002d1428c0c10b30a3455 Mon Sep 17 00:00:00 2001 From: purarue <7804791+purarue@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:53:10 -0800 Subject: [PATCH 298/302] doc: some performance notes for query_range (#409) * doc: some performance notes for query_range * add ruff_cache to gitignore --- .gitignore | 3 +++ my/core/__init__.py | 33 ++++++++++++++++++--------------- my/core/__main__.py | 3 +++ my/core/query_range.py | 4 +++- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 19c3380..65ba630 100644 --- a/.gitignore +++ b/.gitignore @@ -155,6 +155,9 @@ celerybeat-schedule .dmypy.json dmypy.json +# linters +.ruff_cache/ + # Pyre type checker .pyre/ diff --git a/my/core/__init__.py b/my/core/__init__.py index cc549d5..a8a41f4 100644 --- a/my/core/__init__.py +++ b/my/core/__init__.py @@ -29,22 +29,25 @@ if not TYPE_CHECKING: __all__ = [ - 'get_files', 'PathIsh', 'Paths', - 'Json', - 'make_logger', - 'LazyLogger', # legacy import - 'warn_if_empty', - 'stat', 'Stats', - 'datetime_aware', 'datetime_naive', - 'assert_never', # TODO maybe deprecate from use in my.core? 
will be in stdlib soon - - 'make_config', - '__NOT_HPI_MODULE__', - - 'Res', 'unwrap', 'notnone', - - 'dataclass', 'Path', + 'Json', + 'LazyLogger', # legacy import + 'Path', + 'PathIsh', + 'Paths', + 'Res', + 'Stats', + 'assert_never', # TODO maybe deprecate from use in my.core? will be in stdlib soon + 'dataclass', + 'datetime_aware', + 'datetime_naive', + 'get_files', + 'make_config', + 'make_logger', + 'notnone', + 'stat', + 'unwrap', + 'warn_if_empty', ] diff --git a/my/core/__main__.py b/my/core/__main__.py index 00ac4ee..7e2d8f9 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -538,6 +538,9 @@ def query_hpi_functions( # chain list of functions from user, in the order they wrote them on the CLI input_src = chain(*(f() for f in _locate_functions_or_prompt(qualified_names))) + # NOTE: if passing just one function to this which returns a single namedtuple/dataclass, + # using both --order-key and --order-type will often be faster as it does not need to + # duplicate the iterator in memory, or try to find the --order-type type on each object before sorting res = select_range( input_src, order_key=order_key, diff --git a/my/core/query_range.py b/my/core/query_range.py index 2a8d7bd..83728bf 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -337,6 +337,8 @@ def select_range( # if the user supplied a order_key, and/or we've generated an order_value, create # the function that accesses that type on each value in the iterator if order_key is not None or order_value is not None: + # _generate_order_value_func internally here creates a copy of the iterator, which has to + # be consumed in-case we're sorting by mixed types order_by_chosen, itr = _handle_generate_order_by(itr, order_key=order_key, order_value=order_value) # signifies that itr is empty -- can early return here if order_by_chosen is None: @@ -398,7 +400,7 @@ Specify a type or a key to order the value by""") return itr -# re-use items from query for testing +# reuse items from query for testing from .query import _A, _B, _Float, _mixed_iter_errors From d8c53bde34e2a5e68f2bac18941ae426ed468b02 Mon Sep 17 00:00:00 2001 From: purarue <7804791+purarue@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:31:22 -0800 Subject: [PATCH 299/302] smscalls: add phone number to model --- my/smscalls.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/my/smscalls.py b/my/smscalls.py index 324bc44..0ff2553 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -37,6 +37,7 @@ class Call(NamedTuple): dt: datetime dt_readable: str duration_s: int + phone_number: str who: str | None # type - 1 = Incoming, 2 = Outgoing, 3 = Missed, 4 = Voicemail, 5 = Rejected, 6 = Refused List. 
call_type: int @@ -65,12 +66,13 @@ def _extract_calls(path: Path) -> Iterator[Res[Call]]: duration = cxml.get('duration') who = cxml.get('contact_name') call_type = cxml.get('type') + number = cxml.get('number') # if name is missing, its not None (its some string), depends on the phone/message app if who is not None and who in UNKNOWN: who = None - if dt is None or dt_readable is None or duration is None or call_type is None: + if dt is None or dt_readable is None or duration is None or call_type is None or number is None: call_str = etree.tostring(cxml).decode('utf-8') - yield RuntimeError(f"Missing one or more required attributes [date, readable_date, duration, type] in {call_str}") + yield RuntimeError(f"Missing one or more required attributes [date, readable_date, duration, type, number] in {call_str}") continue # TODO we've got local tz here, not sure if useful.. # ok, so readable date is local datetime, changing throughout the backup @@ -78,6 +80,7 @@ def _extract_calls(path: Path) -> Iterator[Res[Call]]: dt=_parse_dt_ms(dt), dt_readable=dt_readable, duration_s=int(duration), + phone_number=number, who=who, call_type=int(call_type), ) From f1d23c5e96d95819d383485f22b480d8d190fe98 Mon Sep 17 00:00:00 2001 From: purarue <7804791+purarue@users.noreply.github.com> Date: Sun, 22 Dec 2024 21:50:03 -0800 Subject: [PATCH 300/302] smscalls: allow large XML files as input once XML files increase past a certain size (was about 220MB for me), the parser just throws an error because the tree is too large (iirc for security reasons) could maybe look at using iterparse in the future to parse it without loading the whole file, but this seems to fix it fine for me --- my/smscalls.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/my/smscalls.py b/my/smscalls.py index 0ff2553..27d08be 100644 --- a/my/smscalls.py +++ b/my/smscalls.py @@ -57,9 +57,12 @@ class Call(NamedTuple): # The '(Unknown)' is just what my android phone does, not sure if there are others UNKNOWN: set[str] = {'(Unknown)'} +def _parse_xml(xml: Path) -> Any: + return etree.parse(str(xml), parser=etree.XMLParser(huge_tree=True)) + def _extract_calls(path: Path) -> Iterator[Res[Call]]: - tr = etree.parse(str(path)) + tr = _parse_xml(path) for cxml in tr.findall('call'): dt = cxml.get('date') dt_readable = cxml.get('readable_date') @@ -133,7 +136,7 @@ def messages() -> Iterator[Res[Message]]: def _extract_messages(path: Path) -> Iterator[Res[Message]]: - tr = etree.parse(str(path)) + tr = _parse_xml(path) for mxml in tr.findall('sms'): dt = mxml.get('date') dt_readable = mxml.get('readable_date') @@ -225,8 +228,7 @@ def _resolve_null_str(value: str | None) -> str | None: def _extract_mms(path: Path) -> Iterator[Res[MMS]]: - tr = etree.parse(str(path)) - + tr = _parse_xml(path) for mxml in tr.findall('mms'): dt = mxml.get('date') dt_readable = mxml.get('readable_date') @@ -271,10 +273,7 @@ def _extract_mms(path: Path) -> Iterator[Res[MMS]]: # # This seems pretty useless, so we should try and skip it, and just return the # text/images/data - # - # man, attrib is some internal cpython ._Attrib type which can't - # be typed by any sort of mappingproxy. maybe a protocol could work..? 
- part_data: dict[str, Any] = part.attrib # type: ignore + part_data: dict[str, Any] = part.attrib seq: str | None = part_data.get('seq') if seq == '-1': continue From 54df429f614a5e5d0617dcd196bf8566608e987c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 29 Dec 2024 15:06:49 +0000 Subject: [PATCH 301/302] core.sqlite: add helper SqliteTool to get table schemas --- my/core/sqlite.py | 43 +++++++++++++++++++++++++++++++++++++++ my/fbmessenger/android.py | 4 ++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/my/core/sqlite.py b/my/core/sqlite.py index aa41ab3..6167d2e 100644 --- a/my/core/sqlite.py +++ b/my/core/sqlite.py @@ -134,3 +134,46 @@ def select(cols: tuple[str, str, str, str, str, str, str, str], rest: str, *, db def select(cols, rest, *, db): # db arg is last cause that results in nicer code formatting.. return db.execute('SELECT ' + ','.join(cols) + ' ' + rest) + + +class SqliteTool: + def __init__(self, connection: sqlite3.Connection) -> None: + self.connection = connection + + def _get_sqlite_master(self) -> dict[str, str]: + res = {} + for c in self.connection.execute('SELECT name, type FROM sqlite_master'): + [name, type_] = c + assert type_ in {'table', 'index', 'view', 'trigger'}, (name, type_) # just in case + res[name] = type_ + return res + + def get_table_names(self) -> list[str]: + master = self._get_sqlite_master() + res = [] + for name, type_ in master.items(): + if type_ != 'table': + continue + res.append(name) + return res + + def get_table_schema(self, name: str) -> dict[str, str]: + """ + Returns map from column name to column type + + NOTE: Sometimes this doesn't work if the db has some extensions (e.g. happens for facebook apps) + In this case you might still be able to use get_table_names + """ + schema: dict[str, str] = {} + for row in self.connection.execute(f'PRAGMA table_info(`{name}`)'): + col = row[1] + type_ = row[2] + # hmm, somewhere between 3.34.1 and 3.37.2, sqlite started normalising type names to uppercase + # let's do this just in case since python < 3.10 are using the old version + # e.g. 
it could have returned 'blob' and that would confuse blob check (see _check_allowed_blobs) + type_ = type_.upper() + schema[col] = type_ + return schema + + def get_table_schemas(self) -> dict[str, dict[str, str]]: + return {name: self.get_table_schema(name) for name in self.get_table_names()} diff --git a/my/fbmessenger/android.py b/my/fbmessenger/android.py index db4cc54..f6fdb82 100644 --- a/my/fbmessenger/android.py +++ b/my/fbmessenger/android.py @@ -15,7 +15,7 @@ from my.core import LazyLogger, Paths, Res, datetime_aware, get_files, make_conf from my.core.common import unique_everseen from my.core.compat import assert_never from my.core.error import echain -from my.core.sqlite import sqlite_connection +from my.core.sqlite import sqlite_connection, SqliteTool from my.config import fbmessenger as user_config # isort: skip @@ -86,8 +86,8 @@ def _entities() -> Iterator[Res[Entity]]: for idx, path in enumerate(paths): logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') with sqlite_connection(path, immutable=True, row_factory='row') as db: + use_msys = "logging_events_v2" in SqliteTool(db).get_table_names() try: - use_msys = len(list(db.execute('SELECT * FROM sqlite_master WHERE name = "logging_events_v2"'))) > 0 if use_msys: yield from _process_db_msys(db) else: From bb703c8c6a7ef80205030f640316c222bc48a6e1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 29 Dec 2024 15:37:10 +0000 Subject: [PATCH 302/302] twitter.android: fix get_own_user_id for latest exports --- my/twitter/android.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/my/twitter/android.py b/my/twitter/android.py index 8159ee7..7e8f170 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -161,9 +161,22 @@ def get_own_user_id(conn) -> str: 'SELECT DISTINCT CAST(list_mapping_user_id AS TEXT) FROM list_mapping', 'SELECT DISTINCT CAST(owner_id AS TEXT) FROM cursors', 'SELECT DISTINCT CAST(user_id AS TEXT) FROM users WHERE _id == 1', + # ugh, sometimes all of the above are empty... + # for the rest it seems: + # - is_active_creator is NULL + # - is_graduated is NULL + # - profile_highlighted_info is NULL + 'SELECT DISTINCT CAST(user_id AS TEXT) FROM users WHERE is_active_creator == 0 AND is_graduated == 1 AND profile_highlights_info IS NOT NULL', ]: - for (r,) in conn.execute(q): - res.add(r) + res |= {r for (r,) in conn.execute(q)} + + assert len(res) <= 1, res + if len(res) == 0: + # sometimes even all of the above doesn't help... + # last resort is trying to get from status_groups table + # however we can't always use it because it might contain multiple different owner_id? + # not sure, maybe it will break as well and we'll need to fallback on the most common or something.. + res |= {r for (r,) in conn.execute('SELECT DISTINCT CAST(owner_id AS TEXT) FROM status_groups')} assert len(res) == 1, res [r] = res return r
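
A note on the performance remark patch 298 adds to my/core/__main__.py: passing both --order-key and --order-type lets select_range sort without first duplicating the iterator in memory to sniff the order type. A hedged CLI example, reusing the function and field names already shown in doc/QUERY.md:

    hpi query my.coding.commits.commits --order-key committed_dt --order-type datetime --limit 1 --reverse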
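
With patch 299 applied, the new phone_number field is available on every Call. A minimal consumer sketch, assuming the usual my.config.smscalls setup:

    from my.smscalls import calls

    for c in calls():
        if isinstance(c, Exception):
            # calls() yields Res[Call], so parse errors arrive as values
            continue
        print(c.dt, c.phone_number, c.duration_s, c.call_type)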
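
Patch 300's commit message floats iterparse as a way to avoid materialising the whole tree for oversized backups. A rough sketch of that direction (an assumption about future work, not what the patch does), using lxml's incremental API:

    from lxml import etree

    def _iter_call_attrs(path):
        # tag= filters to call elements; huge_tree is still needed for oversized nodes
        for _event, elem in etree.iterparse(str(path), tag='call', huge_tree=True):
            yield dict(elem.attrib)  # copy, since clear() below would empty a live attrib view
            # free processed elements as we go, otherwise iterparse still builds the full tree
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]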
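
SqliteTool from patch 301 is self-contained, so a usage sketch is straightforward (the table and columns here are made up for illustration):

    import sqlite3

    from my.core.sqlite import SqliteTool

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE messages (id INTEGER PRIMARY KEY, text TEXT, sent_at INTEGER)')

    tool = SqliteTool(conn)
    print(tool.get_table_names())             # ['messages']
    print(tool.get_table_schema('messages'))  # {'id': 'INTEGER', 'text': 'TEXT', 'sent_at': 'INTEGER'}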
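
The closing comment in patch 302 anticipates one more failure mode: status_groups containing several distinct owner_id values, in which case falling back on the most common one might be needed. A sketch of that heuristic (a hypothetical helper, mirroring the Counter approach the tinder module already uses):

    from collections import Counter

    def _most_common_owner_id(conn) -> str:
        counter = Counter(r for (r,) in conn.execute('SELECT CAST(owner_id AS TEXT) FROM status_groups'))
        [(owner_id, _count)] = counter.most_common(1)
        return owner_id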