Compare commits


No commits in common. "master" and "v0.4.20240506" have entirely different histories.

213 changed files with 3500 additions and 5258 deletions

.ci/run

@@ -11,8 +11,6 @@ if ! command -v sudo; then
 }
 fi
-# --parallel-live to show outputs while it's running
-tox_cmd='run-parallel --parallel-live'
 if [ -n "${CI-}" ]; then
     # install OS specific stuff here
     case "$OSTYPE" in
@@ -22,8 +20,7 @@ if [ -n "${CI-}" ]; then
         ;;
         cygwin* | msys* | win*)
             # windows
-            # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that
-            tox_cmd='run'
         ;;
         *)
             # must be linux?
@@ -40,9 +37,5 @@ if ! command -v python3 &> /dev/null; then
     PY_BIN="python"
 fi
-# TODO hmm for some reason installing uv with pip and then running
-# "$PY_BIN" -m uv tool fails with missing setuptools error??
-# just uvx directly works, but it's not present in PATH...
-"$PY_BIN" -m pip install --user pipx
-"$PY_BIN" -m pipx run uv tool run --with=tox-uv tox $tox_cmd "$@"
+"$PY_BIN" -m pip install --user tox
+"$PY_BIN" -m tox --parallel --parallel-live "$@"

.github/workflows/main.yml

@@ -21,20 +21,19 @@ on:
 jobs:
   build:
     strategy:
-      fail-fast: false
       matrix:
         platform: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
         exclude: [
           # windows runners are pretty scarce, so let's only run lowest and highest python version
+          {platform: windows-latest, python-version: '3.9' },
           {platform: windows-latest, python-version: '3.10'},
           {platform: windows-latest, python-version: '3.11'},
-          {platform: windows-latest, python-version: '3.12'},
           # same, macos is a bit too slow and ubuntu covers python quirks well
+          {platform: macos-latest , python-version: '3.9' },
           {platform: macos-latest , python-version: '3.10' },
           {platform: macos-latest , python-version: '3.11' },
-          {platform: macos-latest , python-version: '3.12' },
         ]
     runs-on: ${{ matrix.platform }}
@@ -64,13 +63,11 @@ jobs:
     - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms
       uses: actions/upload-artifact@v4
       with:
-        include-hidden-files: true
         name: .coverage.mypy-misc_${{ matrix.platform }}_${{ matrix.python-version }}
         path: .coverage.mypy-misc/
     - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms
       uses: actions/upload-artifact@v4
       with:
-        include-hidden-files: true
         name: .coverage.mypy-core_${{ matrix.platform }}_${{ matrix.python-version }}
         path: .coverage.mypy-core/
@@ -84,7 +81,7 @@ jobs:
     - uses: actions/setup-python@v5
       with:
-        python-version: '3.10'
+        python-version: '3.8'
     - uses: actions/checkout@v4
       with:

.gitignore

@@ -155,9 +155,6 @@ celerybeat-schedule
 .dmypy.json
 dmypy.json
-# linters
-.ruff_cache/
 # Pyre type checker
 .pyre/

CHANGELOG.md

@@ -20,7 +20,7 @@ General/my.core changes:
 - e81dddddf083ffd81aa7e2b715bd34f59949479c properly resolve class properties in make_config + add test
 Modules:
-- some initial work on filling **InfluxDB** with HPI data
+- some innitial work on filling **InfluxDB** with HPI data
 - pinboard
 - 42399f6250d9901d93dcedcfe05f7857babcf834: **breaking backwards compatibility**, use pinbexport module directly

README.org

@@ -723,10 +723,10 @@ If you want to write modules for personal use but don't want to merge them into
 Other HPI Repositories:
-- [[https://github.com/purarue/HPI][purarue/HPI]]
+- [[https://github.com/seanbreckenridge/HPI][seanbreckenridge/HPI]]
 - [[https://github.com/madelinecameron/hpi][madelinecameron/HPI]]
-If you want to create your own to create your own modules/override something here, you can use the [[https://github.com/purarue/HPI-template][template]].
+If you want to create your own to create your own modules/override something here, you can use the [[https://github.com/seanbreckenridge/HPI-template][template]].
 * Related links
 :PROPERTIES:

conftest.py

@@ -1,47 +0,0 @@
-# this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly
-# without it, pytest can't discover the package root for some reason
-# also see https://github.com/karlicoss/pytest_namespace_pkgs for more
-import os
-import pathlib
-from typing import Optional
-
-import _pytest.main
-import _pytest.pathlib
-
-# we consider all dirs in repo/ to be namespace packages
-root_dir = pathlib.Path(__file__).absolute().parent.resolve()  # / 'src'
-assert root_dir.exists(), root_dir
-
-# TODO assert it contains package name?? maybe get it via setuptools..
-namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()]
-
-# resolve_package_path is called from _pytest.pathlib.import_path
-# takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem
-resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path
-
-def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]:
-    result = path  # search from the test file upwards
-    for parent in result.parents:
-        if str(parent) in namespace_pkg_dirs:
-            return parent
-    if os.name == 'nt':
-        # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx
-        if path.name == 'conftest.py':
-            return resolve_pkg_path_orig(path)
-    raise RuntimeError("Couldn't determine path for ", path)
-
-_pytest.pathlib.resolve_package_path = resolve_package_path
-
-# without patching, the orig function returns just a package name for some reason
-# (I think it's used as a sort of fallback)
-# so we need to point it at the absolute path properly
-# not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure..
-search_pypath_orig = _pytest.main.search_pypath
-
-def search_pypath(module_name: str) -> str:
-    mpath = root_dir / module_name.replace('.', os.sep)
-    if not mpath.is_dir():
-        mpath = mpath.with_suffix('.py')
-    assert mpath.exists(), mpath  # just in case
-    return str(mpath)
-
-_pytest.main.search_pypath = search_pypath

doc/DENYLIST.md

@@ -76,7 +76,7 @@ This would typically be used in an overridden `all.py` file, or in a one-off scr
 which you may want to filter out some items from a source, progressively adding more
 items to the denylist as you go.
-A potential `my/ip/all.py` file might look like (Sidenote: `discord` module from [here](https://github.com/purarue/HPI)):
+A potential `my/ip/all.py` file might look like (Sidenote: `discord` module from [here](https://github.com/seanbreckenridge/HPI)):
 ```python
 from typing import Iterator
@@ -119,9 +119,9 @@ python3 -c 'from my.ip import all; all.deny.deny_cli(all.ips())'
 To edit the `all.py`, you could either:
 - install it as editable (`python3 -m pip install --user -e ./HPI`), and then edit the file directly
-- or, create a namespace package, which splits the package across multiple directories. For info on that see [`MODULE_DESIGN`](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#namespace-packages), [`reorder_editable`](https://github.com/purarue/reorder_editable), and possibly the [`HPI-template`](https://github.com/purarue/HPI-template) to create your own HPI namespace package to create your own `all.py` file.
+- or, create a namespace package, which splits the package across multiple directories. For info on that see [`MODULE_DESIGN`](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#namespace-packages), [`reorder_editable`](https://github.com/seanbreckenridge/reorder_editable), and possibly the [`HPI-template`](https://github.com/seanbreckenridge/HPI-template) to create your own HPI namespace package to create your own `all.py` file.
-For a real example of this see, [purarue/HPI-personal](https://github.com/purarue/HPI-personal/blob/master/my/ip/all.py)
+For a real example of this see, [seanbreckenridge/HPI-personal](https://github.com/seanbreckenridge/HPI-personal/blob/master/my/ip/all.py)
 Sidenote: the reason why we want to specifically override
 the all.py and not just create a script that filters out the items you're
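For reference, the `my/ip/all.py` example elided from the hunk above follows roughly this shape. This is a minimal sketch, assuming the `DenyList` helper from `my.core.denylist` behaves as the doc describes; the denylist path and the `my.ip.discord` source are hypothetical:

```python
from typing import Iterator

from my.core.denylist import DenyList
from my.ip.common import IP  # the shared IP model these modules build on

deny = DenyList("~/data/ip/denylist.json")  # hypothetical location

# all the data, unfiltered
def _ips() -> Iterator[IP]:
    from my.ip import discord  # hypothetical source from purarue/HPI
    yield from discord.ips()

# the public function, with denylisted items dropped
def ips() -> Iterator[IP]:
    yield from deny.filter(_ips())
```

The module-level `deny` here is what the `all.deny.deny_cli(all.ips())` one-liner shown above refers to.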

doc/MODULES.org

@@ -76,7 +76,7 @@ The config snippets below are meant to be modified accordingly and *pasted into
 You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.
-For an extensive/complex example, you can check out ~@purarue~'s [[https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py][config]]
+For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[https://github.com/seanbreckenridge/dotfiles/blob/master/.config/my/my/config/__init__.py][config]]
 # Nested Configurations before the doc generation using the block below
 ** [[file:../my/reddit][my.reddit]]
@@ -96,7 +96,7 @@ For an extensive/complex example, you can check out ~@purarue~'s [[https://githu
     class pushshift:
         '''
-        Uses [[https://github.com/purarue/pushshift_comment_export][pushshift]] to get access to old comments
+        Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
         '''
         # path[s]/glob to the exported JSON data
@@ -106,7 +106,7 @@ For an extensive/complex example, you can check out ~@purarue~'s [[https://githu
 ** [[file:../my/browser/][my.browser]]
-    Parses browser history using [[http://github.com/purarue/browserexport][browserexport]]
+    Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
 #+begin_src python
     class browser:
@@ -132,7 +132,7 @@ For an extensive/complex example, you can check out ~@purarue~'s [[https://githu
     You might also be able to use [[file:../my/location/via_ip.py][my.location.via_ip]] which uses =my.ip.all= to
     provide geolocation data for an IPs (though no IPs are provided from any
-    of the sources here). For an example of usage, see [[https://github.com/purarue/HPI/tree/master/my/ip][here]]
+    of the sources here). For an example of usage, see [[https://github.com/seanbreckenridge/HPI/tree/master/my/ip][here]]
 #+begin_src python
     class location:
@@ -256,9 +256,9 @@ for cls, p in modules:
 ** [[file:../my/google/takeout/parser.py][my.google.takeout.parser]]
-   Parses Google Takeout using [[https://github.com/purarue/google_takeout_parser][google_takeout_parser]]
+   Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]]
-   See [[https://github.com/purarue/google_takeout_parser][google_takeout_parser]] for more information about how to export and organize your takeouts
+   See [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] for more information about how to export and organize your takeouts
 If the =DISABLE_TAKEOUT_CACHE= environment variable is set, this won't
 cache individual exports in =~/.cache/google_takeout_parser=

doc/MODULE_DESIGN.org

@@ -67,7 +67,7 @@ If you want to disable a source, you have a few options.
 ... that suppresses the warning message and lets you use ~my.location.all~ without having to change any lines of code
-Another benefit is that all the custom sources/data is localized to the ~all.py~ file, so a user can override the ~all.py~ (see the sections below on ~namespace packages~) file in their own HPI repository, adding additional sources without having to maintain a fork and patching in changes as things eventually change. For a 'real world' example of that, see [[https://github.com/purarue/HPI#partially-in-usewith-overrides][purarue]]s location and ip modules.
+Another benefit is that all the custom sources/data is localized to the ~all.py~ file, so a user can override the ~all.py~ (see the sections below on ~namespace packages~) file in their own HPI repository, adding additional sources without having to maintain a fork and patching in changes as things eventually change. For a 'real world' example of that, see [[https://github.com/seanbreckenridge/HPI#partially-in-usewith-overrides][seanbreckenridge]]s location and ip modules.
 This is of course not required for personal or single file modules, its just the pattern that seems to have the least amount of friction for the user, while being extendable, and without using a bulky plugin system to let users add additional sources.
@@ -208,13 +208,13 @@ Where ~lastfm.py~ is your version of ~my.lastfm~, which you've copied from this
 Then, running ~python3 -m pip install -e .~ in that directory would install that as part of the namespace package, and assuming (see below for possible issues) this appears on ~sys.path~ before the upstream repository, your ~lastfm.py~ file overrides the upstream. Adding more files, like ~my.some_new_module~ into that directory immediately updates the global ~my~ package -- allowing you to quickly add new modules without having to re-install.
-If you install both directories as editable packages (which has the benefit of any changes you making in either repository immediately updating the globally installed ~my~ package), there are some concerns with which editable install appears on your ~sys.path~ first. If you wanted your modules to override the upstream modules, yours would have to appear on the ~sys.path~ first (this is the same reason that =custom_lastfm_overlay= must be at the front of your ~PYTHONPATH~). For more details and examples on dealing with editable namespace packages in the context of HPI, see the [[https://github.com/purarue/reorder_editable][reorder_editable]] repository.
+If you install both directories as editable packages (which has the benefit of any changes you making in either repository immediately updating the globally installed ~my~ package), there are some concerns with which editable install appears on your ~sys.path~ first. If you wanted your modules to override the upstream modules, yours would have to appear on the ~sys.path~ first (this is the same reason that =custom_lastfm_overlay= must be at the front of your ~PYTHONPATH~). For more details and examples on dealing with editable namespace packages in the context of HPI, see the [[https://github.com/seanbreckenridge/reorder_editable][reorder_editable]] repository.
 There is no limit to how many directories you could install into a single namespace package, which could be a possible way for people to install additional HPI modules, without worrying about the module count here becoming too large to manage.
-There are some other users [[https://github.com/hpi/hpi][who have begun publishing their own modules]] as namespace packages, which you could potentially install and use, in addition to this repository, if any of those interest you. If you want to create your own you can use the [[https://github.com/purarue/HPI-template][template]] to get started.
+There are some other users [[https://github.com/hpi/hpi][who have begun publishing their own modules]] as namespace packages, which you could potentially install and use, in addition to this repository, if any of those interest you. If you want to create your own you can use the [[https://github.com/seanbreckenridge/HPI-template][template]] to get started.
-Though, enabling this many modules may make ~hpi doctor~ look pretty busy. You can explicitly choose to enable/disable modules with a list of modules/regexes in your [[https://github.com/karlicoss/HPI/blob/f559e7cb899107538e6c6bbcf7576780604697ef/my/core/core_config.py#L24-L55][core config]], see [[https://github.com/purarue/dotfiles/blob/a1a77c581de31bd55a6af3d11b8af588614a207e/.config/my/my/config/__init__.py#L42-L72][here]] for an example.
+Though, enabling this many modules may make ~hpi doctor~ look pretty busy. You can explicitly choose to enable/disable modules with a list of modules/regexes in your [[https://github.com/karlicoss/HPI/blob/f559e7cb899107538e6c6bbcf7576780604697ef/my/core/core_config.py#L24-L55][core config]], see [[https://github.com/seanbreckenridge/dotfiles/blob/a1a77c581de31bd55a6af3d11b8af588614a207e/.config/my/my/config/__init__.py#L42-L72][here]] for an example.
 You may use the other modules or [[https://github.com/karlicoss/hpi-personal-overlay][my overlay]] as reference, but python packaging is already a complicated issue, before adding complexities like namespace packages and editable installs on top of it... If you're having trouble extending HPI in this fashion, you can open an issue here, preferably with a link to your code/repository and/or ~setup.py~ you're trying to use.
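To make the namespace-package mechanics above concrete, a minimal overlay repository's ~setup.py~ might look roughly like this; this is a sketch following the HPI-template pattern, and the distribution name is hypothetical:

```python
from setuptools import find_namespace_packages, setup

setup(
    name='my-hpi-overlay',  # hypothetical distribution name
    version='0.1.0',
    # no __init__.py anywhere under my/ -- it stays a namespace package,
    # so my/lastfm.py, my/some_new_module.py etc. merge with the upstream
    # `my` package at import time
    packages=find_namespace_packages(include=['my', 'my.*']),
)
```

Installed editable (~python3 -m pip install -e .~) alongside upstream HPI, this only overrides modules if it ends up earlier on ~sys.path~, which is exactly the ordering problem ~reorder_editable~ addresses.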

doc/OVERLAYS.org

@@ -10,7 +10,7 @@ Relevant discussion about overlays: https://github.com/karlicoss/HPI/issues/102
 # You can see them TODO in overlays dir
-Consider a toy package/module structure with minimal code, without any actual data parsing, just for demonstration purposes.
+Consider a toy package/module structure with minimal code, wihout any actual data parsing, just for demonstration purposes.
 - =main= package structure
   # TODO do links
@@ -19,7 +19,7 @@ Consider a toy package/module structure with minimal code, without any actual da
     Extracts Twitter data from GDPR archive.
   - =my/twitter/all.py=
     Merges twitter data from multiple sources (only =gdpr= in this case), so data consumers are agnostic of specific data sources used.
-    This will be overridden by =overlay=.
+    This will be overriden by =overlay=.
   - =my/twitter/common.py=
     Contains helper function to merge data, so they can be reused by overlay's =all.py=.
   - =my/reddit.py=
@@ -66,7 +66,7 @@ This basically means that modules will be searched in both paths, with overlay t
 ** Installing with =--use-pep517=
-See here for discussion https://github.com/purarue/reorder_editable/issues/2, but TLDR it should work similarly.
+See here for discussion https://github.com/seanbreckenridge/reorder_editable/issues/2, but TLDR it should work similarly.
 * Testing runtime behaviour (editable install)
@@ -126,7 +126,7 @@ https://github.com/python/mypy/blob/1dd8e7fe654991b01bd80ef7f1f675d9e3910c3a/myp
 For now, I opened an issue in mypy repository https://github.com/python/mypy/issues/16683
-But ok, maybe mypy treats =main= as an external package somehow but still type checks it properly?
+But ok, maybe mypy treats =main= as an external package somhow but still type checks it properly?
 Let's see what's going on with imports:
 : $ mypy --namespace-packages --strict -p my --follow-imports=error

doc/QUERY.md

@@ -97,9 +97,9 @@ By default, this just returns the items in the order they were returned by the f
 hpi query my.coding.commits.commits --order-key committed_dt --limit 1 --reverse --output pprint --stream
 Commit(committed_dt=datetime.datetime(2023, 4, 14, 23, 9, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))),
        authored_dt=datetime.datetime(2023, 4, 14, 23, 4, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))),
-       message='sources.smscalls: propagate errors if there are breaking '
+       message='sources.smscalls: propogate errors if there are breaking '
                'schema changes',
-       repo='/home/username/Repos/promnesia-fork',
+       repo='/home/sean/Repos/promnesia-fork',
        sha='22a434fca9a28df9b0915ccf16368df129d2c9ce',
        ref='refs/heads/smscalls-handle-result')
 ```
@@ -195,7 +195,7 @@ To preview, you can use something like [`qgis`](https://qgis.org/en/site/) or fo
 <img src="https://user-images.githubusercontent.com/7804791/232249184-7e203ee6-a3ec-4053-800c-751d2c28e690.png" width=500 alt="chicago trip" />
-(Sidenote: this is [`@purarue`](https://github.com/purarue/)s locations, on a trip to Chicago)
+(Sidenote: this is [`@seanbreckenridge`](https://github.com/seanbreckenridge/)s locations, on a trip to Chicago)
 ## Python reference
@@ -301,4 +301,4 @@ The `hpi query` command is a CLI wrapper around the code in [`query.py`](../my/c
 If you specify a range, drop_unsorted is forced to be True
 ```
-Those can be imported and accept any sort of iterator, `hpi query` just defaults to the output of functions here. As an example, see [`listens`](https://github.com/purarue/HPI-personal/blob/master/scripts/listens) which just passes an generator (iterator) as the first argument to `query_range`
+Those can be imported and accept any sort of iterator, `hpi query` just defaults to the output of functions here. As an example, see [`listens`](https://github.com/seanbreckenridge/HPI-personal/blob/master/scripts/listens) which just passes an generator (iterator) as the first argument to `query_range`
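The same query can also be driven from Python rather than the CLI. A rough sketch, assuming `my.core.query.select` keeps the keyword arguments that the `hpi query` help text above describes:

```python
from my.core.query import select
from my.coding.commits import commits

# latest commit by committed_dt, mirroring the
# `hpi query ... --order-key committed_dt --limit 1 --reverse` call above
for commit in select(commits, order_key='committed_dt', limit=1, reverse=True):
    print(commit)
```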

doc/SETUP.org

@@ -387,7 +387,7 @@ But there is an extra caveat: rexport is already coming with nice [[https://gith
 Several other HPI modules are following a similar pattern: hypothesis, instapaper, pinboard, kobo, etc.
-Since the [[https://github.com/karlicoss/rexport#api-limitations][reddit API has limited results]], you can use [[https://github.com/purarue/pushshift_comment_export][my.reddit.pushshift]] to access older reddit comments, which both then get merged into =my.reddit.all.comments=
+Since the [[https://github.com/karlicoss/rexport#api-limitations][reddit API has limited results]], you can use [[https://github.com/seanbreckenridge/pushshift_comment_export][my.reddit.pushshift]] to access older reddit comments, which both then get merged into =my.reddit.all.comments=
 ** Twitter

setup.cfg

@@ -32,6 +32,6 @@ ignore =
 #
 # as a reference:
-# https://github.com/purarue/cookiecutter-template/blob/master/%7B%7Bcookiecutter.module_name%7D%7D/setup.cfg
+# https://github.com/seanbreckenridge/cookiecutter-template/blob/master/%7B%7Bcookiecutter.module_name%7D%7D/setup.cfg
 # and this https://github.com/karlicoss/HPI/pull/151
 # find ./my | entr flake8 --ignore=E402,E501,E741,W503,E266,E302,E305,E203,E261,E252,E251,E221,W291,E225,E303,E702,E202,F841,E731,E306,E127 E722,E231 my | grep -v __NOT_HPI_MODULE__

my/arbtt.py

@@ -2,22 +2,19 @@
 [[https://github.com/nomeata/arbtt#arbtt-the-automatic-rule-based-time-tracker][Arbtt]] time tracking
 '''
-from __future__ import annotations
 REQUIRES = ['ijson', 'cffi']
 # NOTE likely also needs libyajl2 from apt or elsewhere?
-from collections.abc import Iterable, Sequence
-from dataclasses import dataclass
 from pathlib import Path
+from typing import Sequence, Iterable, List, Optional
 def inputs() -> Sequence[Path]:
     try:
         from my.config import arbtt as user_config
     except ImportError:
-        from my.core.warnings import low
+        from .core.warnings import low
         low("Couldn't find 'arbtt' config section, falling back to the default capture.log (usually in HOME dir). Add 'arbtt' section with logfiles = '' to suppress this warning.")
         return []
     else:
@@ -25,9 +22,8 @@ def inputs() -> Sequence[Path]:
         return get_files(user_config.logfiles)
-from my.core import Json, PathIsh, datetime_aware
-from my.core.compat import fromisoformat
+from .core import dataclass, Json, PathIsh, datetime_aware
+from .core.compat import fromisoformat
 @dataclass
@@ -57,7 +53,7 @@ class Entry:
         return fromisoformat(ds)
     @property
-    def active(self) -> str | None:
+    def active(self) -> Optional[str]:
         # NOTE: WIP, might change this in the future...
         ait = (w for w in self.json['windows'] if w['active'])
         a = next(ait, None)
@@ -76,18 +72,17 @@ class Entry:
 def entries() -> Iterable[Entry]:
     inps = list(inputs())
-    base: list[PathIsh] = ['arbtt-dump', '--format=json']
+    base: List[PathIsh] = ['arbtt-dump', '--format=json']
-    cmds: list[list[PathIsh]]
+    cmds: List[List[PathIsh]]
     if len(inps) == 0:
         cmds = [base]  # rely on default
     else:
         # otherwise, 'merge' them
-        cmds = [[*base, '--logfile', f] for f in inps]
+        cmds = [base + ['--logfile', f] for f in inps]
-    from subprocess import PIPE, Popen
     import ijson.backends.yajl2_cffi as ijson  # type: ignore
+    from subprocess import Popen, PIPE
     for cmd in cmds:
         with Popen(cmd, stdout=PIPE) as p:
             out = p.stdout; assert out is not None
@@ -96,8 +91,8 @@ def entries() -> Iterable[Entry]:
 def fill_influxdb() -> None:
-    from .core.freezer import Freezer
     from .core.influxdb import magic_fill
+    from .core.freezer import Freezer
     freezer = Freezer(Entry)
     fit = (freezer.freeze(e) for e in entries())
     # TODO crap, influxdb doesn't like None https://github.com/influxdata/influxdb/issues/7722
@@ -109,8 +104,6 @@ def fill_influxdb() -> None:
     magic_fill(fit, name=f'{entries.__module__}:{entries.__name__}')
-from .core import Stats, stat
+from .core import stat, Stats
 def stats() -> Stats:
     return stat(entries)

my/bluemaestro.py

@@ -1,63 +1,39 @@
+#!/usr/bin/python3
 """
 [[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor
 """
-from __future__ import annotations
 # todo most of it belongs to DAL... but considering so few people use it I didn't bother for now
-import re
-import sqlite3
-from abc import abstractmethod
-from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Protocol
+import re
+import sqlite3
+from typing import Iterable, Sequence, Set, Optional
 import pytz
 from my.core import (
-    Paths,
-    Res,
-    Stats,
     get_files,
     make_logger,
+    Res,
     stat,
-    unwrap,
+    Stats,
+    influxdb,
 )
-from my.core.cachew import mcachew
+from my.core.common import mcachew
+from my.core.error import unwrap
 from my.core.pandas import DataFrameT, as_dataframe
 from my.core.sqlite import sqlite_connect_immutable
+from my.config import bluemaestro as config
-class config(Protocol):
-    @property
-    @abstractmethod
-    def export_path(self) -> Paths:
-        raise NotImplementedError
-
-    @property
-    def tz(self) -> pytz.BaseTzInfo:
-        # fixme: later, rely on the timezone provider
-        # NOTE: the timezone should be set with respect to the export date!!!
-        return pytz.timezone('Europe/London')
-        # TODO when I change tz, check the diff
-
-def make_config() -> config:
-    from my.config import bluemaestro as user_config
-    class combined_config(user_config, config): ...
-    return combined_config()
 logger = make_logger(__name__)
 def inputs() -> Sequence[Path]:
-    cfg = make_config()
-    return get_files(cfg.export_path)
+    return get_files(config.export_path)
 Celsius = float
@@ -74,6 +50,12 @@ class Measurement:
     dewpoint: Celsius
+# fixme: later, rely on the timezone provider
+# NOTE: the timezone should be set with respect to the export date!!!
+tz = pytz.timezone('Europe/London')
+# TODO when I change tz, check the diff
 def is_bad_table(name: str) -> bool:
     # todo hmm would be nice to have a hook that can patch any module up to
     delegate = getattr(config, 'is_bad_table', None)
@@ -82,31 +64,28 @@ def is_bad_table(name: str) -> bool:
 @mcachew(depends_on=inputs)
 def measurements() -> Iterable[Res[Measurement]]:
-    cfg = make_config()
-    tz = cfg.tz
     # todo ideally this would be via arguments... but needs to be lazy
     paths = inputs()
     total = len(paths)
     width = len(str(total))
-    last: datetime | None = None
+    last: Optional[datetime] = None
     # tables are immutable, so can save on processing..
-    processed_tables: set[str] = set()
+    processed_tables: Set[str] = set()
     for idx, path in enumerate(paths):
        logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
        tot = 0
        new = 0
        # todo assert increasing timestamp?
        with sqlite_connect_immutable(path) as db:
-           db_dt: datetime | None = None
+           db_dt: Optional[datetime] = None
           try:
               datas = db.execute(
                   f'SELECT "{path.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index'
               )
              oldfmt = True
-             [(db_dts,)] = db.execute('SELECT last_download FROM info')
+             db_dts = list(db.execute('SELECT last_download FROM info'))[0][0]
             if db_dts == 'N/A':
                 # ??? happens for 20180923-20180928
                 continue
@@ -139,7 +118,7 @@ def measurements() -> Iterable[Res[Measurement]]:
             processed_tables |= set(log_tables)
             # todo use later?
-            frequencies = [list(db.execute(f'SELECT interval from {t.replace("_log", "_meta")}'))[0][0] for t in log_tables]  # noqa: RUF015
+            frequencies = [list(db.execute(f'SELECT interval from {t.replace("_log", "_meta")}'))[0][0] for t in log_tables]
             # todo could just filter out the older datapoints?? dunno.
@@ -155,7 +134,7 @@ def measurements() -> Iterable[Res[Measurement]]:
             oldfmt = False
             db_dt = None
-            for (name, tsc, temp, hum, pres, dewp) in datas:
+            for i, (name, tsc, temp, hum, pres, dewp) in enumerate(datas):
                 if is_bad_table(name):
                     continue
@@ -232,8 +211,6 @@ def dataframe() -> DataFrameT:
 def fill_influxdb() -> None:
-    from my.core import influxdb
     influxdb.fill(measurements(), measurement=__name__)

my/body/blood.py

@@ -2,42 +2,41 @@
 Blood tracking (manual org-mode entries)
 """
-from __future__ import annotations
-from collections.abc import Iterable
 from datetime import datetime
-from typing import NamedTuple
+from typing import Iterable, NamedTuple, Optional
-import orgparse
-import pandas as pd
-from my.config import blood as config  # type: ignore[attr-defined]
 from ..core.error import Res
-from ..core.orgmode import one_table, parse_org_datetime
+from ..core.orgmode import parse_org_datetime, one_table
+import pandas as pd
+import orgparse
+from my.config import blood as config  # type: ignore[attr-defined]
 class Entry(NamedTuple):
     dt: datetime
-    ketones      : float | None=None
-    glucose      : float | None=None
-    vitamin_d    : float | None=None
-    vitamin_b12  : float | None=None
-    hdl          : float | None=None
-    ldl          : float | None=None
-    triglycerides: float | None=None
-    source       : str | None=None
-    extra        : str | None=None
+    ketones      : Optional[float]=None
+    glucose      : Optional[float]=None
+    vitamin_d    : Optional[float]=None
+    vitamin_b12  : Optional[float]=None
+    hdl          : Optional[float]=None
+    ldl          : Optional[float]=None
+    triglycerides: Optional[float]=None
+    source       : Optional[str]=None
+    extra        : Optional[str]=None
 Result = Res[Entry]
-def try_float(s: str) -> float | None:
+def try_float(s: str) -> Optional[float]:
     l = s.split()
     if len(l) == 0:
         return None
@@ -106,7 +105,6 @@ def blood_tests_data() -> Iterable[Result]:
 def data() -> Iterable[Result]:
     from itertools import chain
     from ..core.error import sort_res_by
     datas = chain(glucose_ketones_data(), blood_tests_data())
     return sort_res_by(datas, key=lambda e: e.dt)

my/body/exercise/all.py

@@ -7,10 +7,10 @@ from ...core.pandas import DataFrameT, check_dataframe
 @check_dataframe
 def dataframe() -> DataFrameT:
     # this should be somehow more flexible...
-    import pandas as pd
     from ...endomondo import dataframe as EDF
     from ...runnerup import dataframe as RDF
+    import pandas as pd
     return pd.concat([
         EDF(),
         RDF(),

my/body/exercise/cardio.py

@@ -3,6 +3,7 @@ Cardio data, filtered from various data sources
 '''
 from ...core.pandas import DataFrameT, check_dataframe
 CARDIO = {
     'Running',
     'Running, treadmill',

my/body/exercise/cross_trainer.py

@@ -5,18 +5,16 @@ This is probably too specific to my needs, so later I will move it away to a per
 For now it's worth keeping it here as an example and perhaps utility functions might be useful for other HPI modules.
 '''
-from __future__ import annotations
 from datetime import datetime, timedelta
+from typing import Optional
-import pytz
+from ...core.pandas import DataFrameT, check_dataframe as cdf
+from ...core.orgmode import collect, Table, parse_org_datetime, TypedTable
 from my.config import exercise as config
-from ...core.orgmode import Table, TypedTable, collect, parse_org_datetime
-from ...core.pandas import DataFrameT
-from ...core.pandas import check_dataframe as cdf
+import pytz
 # FIXME how to attach it properly?
 tz = pytz.timezone('Europe/London')
@@ -107,7 +105,7 @@ def dataframe() -> DataFrameT:
     rows = []
     idxs = []  # type: ignore[var-annotated]
     NO_ENDOMONDO = 'no endomondo matches'
-    for _i, row in mdf.iterrows():
+    for i, row in mdf.iterrows():
         rd = row.to_dict()
         mdate = row['date']
         if pd.isna(mdate):
@@ -116,7 +114,7 @@ def dataframe() -> DataFrameT:
             rows.append(rd)  # presumably has an error set
             continue
-        idx: int | None
+        idx: Optional[int]
         close = edf[edf['start_time'].apply(lambda t: pd_date_diff(t, mdate)).abs() < _DELTA]
         if len(close) == 0:
             idx = None
@@ -165,9 +163,7 @@ def dataframe() -> DataFrameT:
 # TODO wtf?? where is speed coming from??
-from ...core import Stats, stat
+from ...core import stat, Stats
 def stats() -> Stats:
     return stat(cross_trainer_data)

my/body/sleep/common.py

@@ -1,6 +1,5 @@
-from ...core import Stats, stat
-from ...core.pandas import DataFrameT
-from ...core.pandas import check_dataframe as cdf
+from ...core import stat, Stats
+from ...core.pandas import DataFrameT, check_dataframe as cdf
 class Combine:
@@ -8,7 +7,7 @@ class Combine:
         self.modules = modules
     @cdf
-    def dataframe(self, *, with_temperature: bool=True) -> DataFrameT:
+    def dataframe(self, with_temperature: bool=True) -> DataFrameT:
         import pandas as pd
         # todo include 'source'?
         df = pd.concat([m.dataframe() for m in self.modules])

my/body/sleep/main.py

@@ -1,6 +1,7 @@
-from ... import emfit, jawbone
-from .common import Combine
+from ... import jawbone
+from ... import emfit
+from .common import Combine
 _combined = Combine([
     jawbone,
     emfit,

my/body/weight.py

@@ -2,29 +2,21 @@
 Weight data (manually logged)
 '''
-from collections.abc import Iterator
-from dataclasses import dataclass
 from datetime import datetime
-from typing import Any
+from typing import NamedTuple, Iterator
-from my import orgmode
-from my.core import make_logger
-from my.core.error import Res, extract_error_datetime, set_error_datetime
+from ..core import LazyLogger
+from ..core.error import Res, set_error_datetime, extract_error_datetime
-config = Any
+from .. import orgmode
+from my.config import weight as config  # type: ignore[attr-defined]
-def make_config() -> config:
-    from my.config import weight as user_config  # type: ignore[attr-defined]
-    return user_config()
-log = make_logger(__name__)
+log = LazyLogger('my.body.weight')
-@dataclass
-class Entry:
+class Entry(NamedTuple):
     dt: datetime
     value: float
     # TODO comment??
@@ -34,8 +26,6 @@ Result = Res[Entry]
 def from_orgmode() -> Iterator[Result]:
-    cfg = make_config()
     orgs = orgmode.query()
     for o in orgmode.query().all():
         if 'weight' not in o.tags:
@@ -56,7 +46,7 @@ def from_orgmode() -> Iterator[Result]:
             yield e
             continue
         # FIXME use timezone provider
-        created = cfg.default_timezone.localize(created)
+        created = config.default_timezone.localize(created)
         assert created is not None  #??? somehow mypy wasn't happy?
         yield Entry(
             dt=created,
@@ -67,7 +57,6 @@ def from_orgmode() -> Iterator[Result]:
 def make_dataframe(data: Iterator[Result]):
     import pandas as pd
     def it():
         for e in data:
             if isinstance(e, Exception):
@@ -81,9 +70,8 @@ def make_dataframe(data: Iterator[Result]):
                 'dt'    : e.dt,
                 'weight': e.value,
             }
     df = pd.DataFrame(it())
-    df = df.set_index('dt')
+    df.set_index('dt', inplace=True)
     # TODO not sure about UTC??
     df.index = pd.to_datetime(df.index, utc=True)
     return df
@@ -93,7 +81,6 @@ def dataframe():
     entries = from_orgmode()
     return make_dataframe(entries)
 # TODO move to a submodule? e.g. my.body.weight.orgmode?
 # so there could be more sources
 # not sure about my.body thing though

my/books/kobo.py

@@ -1,6 +1,7 @@
-from my.core import warnings
+from ..core import warnings
 warnings.high('my.books.kobo is deprecated! Please use my.kobo instead!')
-from my.core.util import __NOT_HPI_MODULE__
-from my.kobo import *
+from ..core.util import __NOT_HPI_MODULE__
+from ..kobo import *  # type: ignore[no-redef]

my/browser/active_browser.py

@@ -1,13 +1,12 @@
 """
-Parses active browser history by backing it up with [[http://github.com/purarue/sqlite_backup][sqlite_backup]]
+Parses active browser history by backing it up with [[http://github.com/seanbreckenridge/sqlite_backup][sqlite_backup]]
 """
 REQUIRES = ["browserexport", "sqlite_backup"]
-from dataclasses import dataclass
 from my.config import browser as user_config
-from my.core import Paths
+from my.core import Paths, dataclass
 @dataclass
@@ -19,18 +18,16 @@ class config(user_config.active_browser):
     export_path: Paths
-from collections.abc import Iterator, Sequence
 from pathlib import Path
+from typing import Sequence, Iterator
-from browserexport.merge import Visit, read_visits
+from my.core import get_files, Stats, make_logger
+from browserexport.merge import read_visits, Visit
 from sqlite_backup import sqlite_backup
-from my.core import Stats, get_files, make_logger
 logger = make_logger(__name__)
 from .common import _patch_browserexport_logs
 _patch_browserexport_logs(logger.level)

my/browser/all.py

@@ -1,9 +1,9 @@
-from collections.abc import Iterator
+from typing import Iterator
-from browserexport.merge import Visit, merge_visits
 from my.core import Stats
 from my.core.source import import_source
+from browserexport.merge import merge_visits, Visit
 src_export = import_source(module_name="my.browser.export")
 src_active = import_source(module_name="my.browser.active_browser")

my/browser/export.py

@@ -1,15 +1,14 @@
 """
-Parses browser history using [[http://github.com/purarue/browserexport][browserexport]]
+Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
 """
 REQUIRES = ["browserexport"]
-from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Iterator, Sequence
-from browserexport.merge import Visit, read_and_merge
+import my.config
 from my.core import (
     Paths,
     Stats,
@@ -17,12 +16,12 @@ from my.core import (
     make_logger,
     stat,
 )
-from my.core.cachew import mcachew
+from my.core.common import mcachew
+from browserexport.merge import read_and_merge, Visit
 from .common import _patch_browserexport_logs
-import my.config  # isort: skip
 @dataclass
 class config(my.config.browser.export):

my/bumble/android.py

@@ -3,24 +3,24 @@ Bumble data from Android app database (in =/data/data/com.bumble.app/databases/C
 """
 from __future__ import annotations
-from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from datetime import datetime
-from pathlib import Path
+from typing import Iterator, Sequence, Optional, Dict
 from more_itertools import unique_everseen
-from my.core import Paths, get_files
-from my.config import bumble as user_config  # isort: skip
+from my.config import bumble as user_config
+from ..core import Paths
 @dataclass
 class config(user_config.android):
     # paths[s]/glob to the exported sqlite databases
     export_path: Paths
+from ..core import get_files
+from pathlib import Path
 def inputs() -> Sequence[Path]:
     return get_files(config.export_path)
@@ -43,23 +43,20 @@ class _BaseMessage:
 @dataclass(unsafe_hash=True)
 class _Message(_BaseMessage):
     conversation_id: str
-    reply_to_id: str | None
+    reply_to_id: Optional[str]
 @dataclass(unsafe_hash=True)
 class Message(_BaseMessage):
     person: Person
-    reply_to: Message | None
+    reply_to: Optional[Message]
 import json
-import sqlite3
 from typing import Union
-from my.core.compat import assert_never
-from ..core import Res
-from ..core.sqlite import select, sqlite_connect_immutable
+from ..core import Res, assert_never
+import sqlite3
+from ..core.sqlite import sqlite_connect_immutable, select
 EntitiesRes = Res[Union[Person, _Message]]
@@ -122,8 +119,8 @@ _UNKNOWN_PERSON = "UNKNOWN_PERSON"
 def messages() -> Iterator[Res[Message]]:
-    id2person: dict[str, Person] = {}
-    id2msg: dict[str, Message] = {}
+    id2person: Dict[str, Person] = {}
+    id2msg: Dict[str, Message] = {}
     for x in unique_everseen(_entities(), key=_key):
         if isinstance(x, Exception):
             yield x

my/calendar/holidays.py

@@ -9,18 +9,16 @@ from datetime import date, datetime, timedelta
 from functools import lru_cache
 from typing import Union
-from my.core import Stats
-from my.core.time import zone_to_countrycode
+from ..core.time import zone_to_countrycode
 @lru_cache(1)
 def _calendar():
     from workalendar.registry import registry  # type: ignore
     # todo switch to using time.tz.main once _get_tz stabilizes?
     from ..time.tz import via_location as LTZ
     # TODO would be nice to do it dynamically depending on the past timezones...
-    tz = LTZ.get_tz(datetime.now())
+    tz = LTZ._get_tz(datetime.now())
     assert tz is not None
     zone = tz.zone; assert zone is not None
     code = zone_to_countrycode(zone)
@@ -48,6 +46,7 @@ def is_workday(d: DateIsh) -> bool:
     return not is_holiday(d)
+from ..core.common import Stats
 def stats() -> Stats:
     # meh, but not sure what would be a better test?
     res = {}

my/cfg.py

@@ -1,6 +1,7 @@
 import my.config as config
-from .core import __NOT_HPI_MODULE__
-from .core import warnings as W
+from .core import warnings as W
 # still used in Promnesia, maybe in dashboard?

my/codeforces.py

@@ -1,78 +1,86 @@
-import json
-from collections.abc import Iterator, Sequence
-from dataclasses import dataclass
-from datetime import datetime, timezone
-from functools import cached_property
-from pathlib import Path
-
-from my.config import codeforces as config  # type: ignore[attr-defined]
-from my.core import Res, datetime_aware, get_files
-
-
-def inputs() -> Sequence[Path]:
-    return get_files(config.export_path)
-
-
-ContestId = int
-
-
-@dataclass
-class Contest:
-    contest_id: ContestId
-    when: datetime_aware
-    name: str
-
-
-@dataclass
-class Competition:
-    contest: Contest
-    old_rating: int
-    new_rating: int
-
-    @cached_property
-    def when(self) -> datetime_aware:
-        return self.contest.when
-
-
-# todo not sure if parser is the best name? hmm
-class Parser:
-    def __init__(self, *, inputs: Sequence[Path]) -> None:
-        self.inputs = inputs
-        self.contests: dict[ContestId, Contest] = {}
-
-    def _parse_allcontests(self, p: Path) -> Iterator[Contest]:
-        j = json.loads(p.read_text())
-        for c in j['result']:
-            yield Contest(
-                contest_id=c['id'],
-                when=datetime.fromtimestamp(c['startTimeSeconds'], tz=timezone.utc),
-                name=c['name'],
-            )
-
-    def _parse_competitions(self, p: Path) -> Iterator[Competition]:
-        j = json.loads(p.read_text())
-        for c in j['result']:
-            contest_id = c['contestId']
-            contest = self.contests[contest_id]
-            yield Competition(
-                contest=contest,
-                old_rating=c['oldRating'],
-                new_rating=c['newRating'],
-            )
-
-    def parse(self) -> Iterator[Res[Competition]]:
-        for path in inputs():
-            if 'allcontests' in path.name:
-                # these contain information about all CF contests along with useful metadata
-                for contest in self._parse_allcontests(path):
-                    # TODO some method to assert on mismatch if it exists? not sure
-                    self.contests[contest.contest_id] = contest
-            elif 'codeforces' in path.name:
-                # these contain only contests the user participated in
-                yield from self._parse_competitions(path)
-            else:
-                raise RuntimeError(f"shouldn't happen: {path.name}")
-
-
-def data() -> Iterator[Res[Competition]]:
-    return Parser(inputs=inputs()).parse()
+from my.config import codeforces as config  # type: ignore[attr-defined]
+
+from datetime import datetime, timezone
+from functools import cached_property
+import json
+from typing import NamedTuple, Dict, Iterator
+
+from my.core import get_files, Res
+from my.core.konsume import ignore, wrap
+
+
+Cid = int
+
+class Contest(NamedTuple):
+    cid: Cid
+    when: datetime
+
+    @classmethod
+    def make(cls, j) -> 'Contest':
+        return cls(
+            cid=j['id'],
+            when=datetime.fromtimestamp(j['startTimeSeconds'], tz=timezone.utc),
+        )
+
+Cmap = Dict[Cid, Contest]
+
+
+def get_contests() -> Cmap:
+    last = max(get_files(config.export_path, 'allcontests*.json'))
+    j = json.loads(last.read_text())
+    d = {}
+    for c in j['result']:
+        cc = Contest.make(c)
+        d[cc.cid] = cc
+    return d
+
+
+class Competition(NamedTuple):
+    contest_id: Cid
+    contest: str
+    cmap: Cmap
+
+    @cached_property
+    def uid(self) -> Cid:
+        return self.contest_id
+
+    def __hash__(self):
+        return hash(self.contest_id)
+
+    @cached_property
+    def when(self) -> datetime:
+        return self.cmap[self.uid].when
+
+    @cached_property
+    def summary(self) -> str:
+        return f'participated in {self.contest}'  # TODO
+
+    @classmethod
+    def make(cls, cmap, json) -> Iterator[Res['Competition']]:
+        # TODO try here??
+        contest_id = json['contestId'].zoom().value
+        contest = json['contestName'].zoom().value
+        yield cls(
+            contest_id=contest_id,
+            contest=contest,
+            cmap=cmap,
+        )
+        # TODO ytry???
+        ignore(json, 'rank', 'oldRating', 'newRating')
+
+
+def data() -> Iterator[Res[Competition]]:
+    cmap = get_contests()
+    last = max(get_files(config.export_path, 'codeforces*.json'))
+    with wrap(json.loads(last.read_text())) as j:
+        j['status'].ignore()  # type: ignore[index]
+        res = j['result'].zoom()  # type: ignore[index]
+        for c in list(res):  # TODO maybe we want 'iter' method??
+            ignore(c, 'handle', 'ratingUpdateTimeSeconds')
+            yield from Competition.make(cmap=cmap, json=c)
+            c.consume()
+        # TODO maybe if they are all empty, no need to consume??
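Both versions are consumed the same way; based on the master-side code shown above, a hypothetical caller would handle the `Res[Competition]` error wrapper like so:

```python
from my.codeforces import data

for res in data():
    if isinstance(res, Exception):  # Res yields exceptions inline instead of raising
        print('failed to parse:', res)
    else:
        print(res.when, res.contest.name, res.old_rating, '->', res.new_rating)
```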
@@ -1,32 +1,30 @@
 """
 Git commits data for repositories on your filesystem
 """
-from __future__ import annotations
 
 REQUIRES = [
     'gitpython',
 ]
 
-import shutil
-from collections.abc import Iterator, Sequence
-from dataclasses import dataclass, field
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Optional, cast
-
-from my.core import LazyLogger, PathIsh, make_config
-from my.core.cachew import cache_dir, mcachew
+import shutil
+from pathlib import Path
+from datetime import datetime, timezone
+from dataclasses import dataclass, field
+from typing import List, Optional, Iterator, Set, Sequence, cast
+
+from my.core import PathIsh, LazyLogger, make_config
+from my.core.cachew import cache_dir
+from my.core.common import mcachew
 from my.core.warnings import high
 
-from my.config import commits as user_config  # isort: skip
+from my.config import commits as user_config
 
 
 @dataclass
 class commits_cfg(user_config):
     roots: Sequence[PathIsh] = field(default_factory=list)
-    emails: Sequence[str] | None = None
-    names: Sequence[str] | None = None
+    emails: Optional[Sequence[str]] = None
+    names: Optional[Sequence[str]] = None
 
 
 # experiment to make it lazy?
@@ -43,6 +41,7 @@ def config() -> commits_cfg:
 import git
 from git.repo.fun import is_git_dir
 
 log = LazyLogger(__name__, level='info')
@@ -95,7 +94,7 @@ def _git_root(git_dir: PathIsh) -> Path:
         return gd  # must be bare
 
-def _repo_commits_aux(gr: git.Repo, rev: str, emitted: set[str]) -> Iterator[Commit]:
+def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Commit]:
     # without path might not handle pull heads properly
     for c in gr.iter_commits(rev=rev):
         if not by_me(c):
@@ -122,7 +121,7 @@ def _repo_commits_aux(gr: git.Repo, rev: str, emitted: set[str]) -> Iterator[Com
 def repo_commits(repo: PathIsh):
     gr = git.Repo(str(repo))
-    emitted: set[str] = set()
+    emitted: Set[str] = set()
     for r in gr.references:
         yield from _repo_commits_aux(gr=gr, rev=r.path, emitted=emitted)
 
@@ -143,14 +142,14 @@ def canonical_name(repo: Path) -> str:
 def _fd_path() -> str:
     # todo move it to core
-    fd_path: str | None = shutil.which("fdfind") or shutil.which("fd-find") or shutil.which("fd")
+    fd_path: Optional[str] = shutil.which("fdfind") or shutil.which("fd-find") or shutil.which("fd")
     if fd_path is None:
         high("my.coding.commits requires 'fd' to be installed, See https://github.com/sharkdp/fd#installation")
     assert fd_path is not None
     return fd_path
 
-def git_repos_in(roots: list[Path]) -> list[Path]:
+def git_repos_in(roots: List[Path]) -> List[Path]:
     from subprocess import check_output
     outputs = check_output([
         _fd_path(),
@@ -163,36 +162,37 @@ def git_repos_in(roots: list[Path]) -> list[Path]:
         *roots,
     ]).decode('utf8').splitlines()
 
-    candidates = {Path(o).resolve().absolute().parent for o in outputs}
+    candidates = set(Path(o).resolve().absolute().parent for o in outputs)
     # exclude stuff within .git dirs (can happen for submodules?)
    candidates = {c for c in candidates if '.git' not in c.parts[:-1]}
 
     candidates = {c for c in candidates if is_git_dir(c)}
-    repos = sorted(map(_git_root, candidates))
+    repos = list(sorted(map(_git_root, candidates)))
     return repos
 
-def repos() -> list[Path]:
+def repos() -> List[Path]:
     return git_repos_in(list(map(Path, config().roots)))
 
 # returns modification time for an index to use as hash function
 def _repo_depends_on(_repo: Path) -> int:
-    for pp in [
+    for pp in {
         ".git/FETCH_HEAD",
         ".git/HEAD",
         "FETCH_HEAD",  # bare
         "HEAD",  # bare
-    ]:
+    }:
         ff = _repo / pp
         if ff.exists():
             return int(ff.stat().st_mtime)
-    raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}")
+    else:
+        raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}")
 
-def _commits(_repos: list[Path]) -> Iterator[Commit]:
+def _commits(_repos: List[Path]) -> Iterator[Commit]:
     for r in _repos:
         yield from _cached_commits(r)
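
A note on _repo_depends_on: since the loop body returns rather than breaks, the old for/else's else clause fired exactly on fall-through, so master's unconditional raise after the loop is behaviourally identical (and switching the set literal to a list makes the probe order deterministic). A standalone illustration of the for/else semantics:

    # for/else: the else block runs only when the loop wasn't exited via break
    for x in [1, 2, 3]:
        if x == 2:
            break
    else:
        print('no break')  # skipped: we broke out

    for x in [1, 3, 5]:
        if x == 2:
            break
    else:
        print('no break')  # printed: the loop fell through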
@@ -1,12 +1,9 @@
-from typing import TYPE_CHECKING
-
-from my.core import warnings
-
-warnings.high('my.coding.github is deprecated! Please use my.github.all instead!')
+import warnings
+
+warnings.warn('my.coding.github is deprecated! Please use my.github.all instead!')
 # todo why aren't DeprecationWarning shown by default??
-if not TYPE_CHECKING:
-    from ..github.all import events, get_events  # noqa: F401
-
-    # todo deprecate properly
-    iter_events = events
+from ..github.all import events, get_events
+
+# todo deprecate properly
+iter_events = events
@@ -1,6 +1,6 @@
 from .core.warnings import high
 high("DEPRECATED! Please use my.core.common instead.")
 
 from .core import __NOT_HPI_MODULE__
 from .core.common import *
@@ -9,18 +9,17 @@ This file is used for:
 - mypy: this file provides some type annotations
 - for loading the actual user config
 '''
-from __future__ import annotations
 
 #### NOTE: you won't need this line VVVV in your personal config
-from my.core import init  # noqa: F401  # isort: skip
+from my.core import init
 ###
 
 from datetime import tzinfo
 from pathlib import Path
+from typing import List
 
-from my.core import PathIsh, Paths
+from my.core import Paths, PathIsh
 
 
 class hypothesis:
@@ -76,16 +75,14 @@ class google:
     takeout_path: Paths = ''
 
-from collections.abc import Sequence
-from datetime import date, datetime, timedelta
-from typing import Union
+from typing import Sequence, Union, Tuple
+from datetime import datetime, date, timedelta
 
 DateIsh = Union[datetime, date, str]
-LatLon = tuple[float, float]
+LatLon = Tuple[float, float]
 
 class location:
     # todo ugh, need to think about it... mypy wants the type here to be general, otherwise it can't deduce
     # and we can't import the types from the module itself, otherwise would be circular. common module?
-    home: LatLon | Sequence[tuple[DateIsh, LatLon]] = (1.0, -1.0)
+    home: Union[LatLon, Sequence[Tuple[DateIsh, LatLon]]] = (1.0, -1.0)
     home_accuracy = 30_000.0
 
     class via_ip:
@@ -106,8 +103,6 @@ class location:
 from typing import Literal
 
 class time:
     class tz:
         policy: Literal['keep', 'convert', 'throw']
@@ -126,9 +121,10 @@ class arbtt:
     logfiles: Paths
 
+from typing import Optional
 
 class commits:
-    emails: Sequence[str] | None
-    names: Sequence[str] | None
+    emails: Optional[Sequence[str]]
+    names: Optional[Sequence[str]]
     roots: Sequence[PathIsh]
@@ -154,8 +150,8 @@ class tinder:
 class instagram:
     class android:
         export_path: Paths
-        username: str | None
-        full_name: str | None
+        username: Optional[str]
+        full_name: Optional[str]
 
     class gdpr:
         export_path: Paths
@@ -173,7 +169,7 @@ class materialistic:
 class fbmessenger:
     class fbmessengerexport:
         export_db: PathIsh
-        facebook_id: str | None
+        facebook_id: Optional[str]
     class android:
         export_path: Paths
@@ -251,7 +247,7 @@ class runnerup:
 class emfit:
     export_path: Path
     timezone: tzinfo
-    excluded_sids: list[str]
+    excluded_sids: List[str]
 
 class foursquare:
@@ -274,7 +270,7 @@ class roamresearch:
 class whatsapp:
     class android:
         export_path: Paths
-        my_user_id: str | None
+        my_user_id: Optional[str]
 
 class harmonic:
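
Everything in this stub is annotation-only; a personal config simply mirrors the attribute names with concrete values. A minimal sketch (all values are placeholders):

    # hypothetical user config, e.g. my/config/__init__.py in your private config dir
    class commits:
        roots = ['~/code']  # wherever your repositories live
        emails = None       # or e.g. ['me@example.com']
        names = None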
@@ -1,53 +1,40 @@
 # this file only keeps the most common & critical types/utility functions
-from typing import TYPE_CHECKING
+from .common import get_files, PathIsh, Paths
+from .common import Json
+from .common import warn_if_empty
+from .common import stat, Stats
+from .common import datetime_naive, datetime_aware
+from .common import assert_never
 
 from .cfg import make_config
-from .common import PathIsh, Paths, get_files
-from .compat import assert_never
-from .error import Res, notnone, unwrap
-from .logging import (
-    make_logger,
-)
-from .stats import Stats, stat
-from .types import (
-    Json,
-    datetime_aware,
-    datetime_naive,
-)
+from .error import Res, unwrap
+from .logging import make_logger, LazyLogger
 from .util import __NOT_HPI_MODULE__
-from .utils.itertools import warn_if_empty
 
-LazyLogger = make_logger  # TODO deprecate this in favor of make_logger
 
-if not TYPE_CHECKING:
-    # we used to keep these here for brevity, but feels like it only adds confusion,
-    # e.g. suggest that we perhaps somehow modify builtin behaviour or whatever
-    # so best to prefer explicit behaviour
-    from dataclasses import dataclass
-    from pathlib import Path
+# just for brevity in modules
+# todo not sure about these.. maybe best to rely on regular imports.. perhaps compare?
+from dataclasses import dataclass
+from pathlib import Path
 
 __all__ = [
-    '__NOT_HPI_MODULE__',
+    'get_files', 'PathIsh', 'Paths',
     'Json',
-    'LazyLogger',  # legacy import
-    'Path',
-    'PathIsh',
-    'Paths',
-    'Res',
-    'Stats',
-    'assert_never',  # TODO maybe deprecate from use in my.core? will be in stdlib soon
-    'dataclass',
-    'datetime_aware',
-    'datetime_naive',
-    'get_files',
-    'make_config',
     'make_logger',
-    'notnone',
-    'stat',
-    'unwrap',
+    'LazyLogger',  # legacy import
     'warn_if_empty',
+    'stat', 'Stats',
+    'datetime_aware', 'datetime_naive',
+    'assert_never',
+    'make_config',
+    '__NOT_HPI_MODULE__',
+    'Res', 'unwrap',
+    'dataclass', 'Path',
 ]
@@ -55,7 +42,7 @@ __all__ = [
 # you could put _init_hook.py next to your private my/config
 # that way you can configure logging/warnings/env variables on every HPI import
 try:
-    import my._init_hook  # type: ignore[import-not-found]  # noqa: F401
+    import my._init_hook  # type: ignore[import-not-found]
 except:
     pass
 ##
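
The guarded import above acts as a tiny plugin hook: if my/_init_hook.py exists next to the private config, it runs on every `import my`. A hypothetical hook, just to show the shape:

    # my/_init_hook.py -- optional and user-supplied; this example only tweaks warnings
    import warnings

    # e.g. silence DeprecationWarnings from noisy dependencies on HPI import
    warnings.filterwarnings('ignore', category=DeprecationWarning)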
@@ -1,26 +1,23 @@
-from __future__ import annotations
+from contextlib import ExitStack
 
 import functools
 import importlib
 import inspect
+from itertools import chain
 import os
 import shlex
 import shutil
 import sys
 import tempfile
 import traceback
-from collections.abc import Iterable, Sequence
-from contextlib import ExitStack
-from itertools import chain
+from typing import Optional, Sequence, Iterable, List, Type, Any, Callable
 from pathlib import Path
-from subprocess import PIPE, CompletedProcess, Popen, check_call, run
-from typing import Any, Callable
+from subprocess import check_call, run, PIPE, CompletedProcess, Popen
 
 import click
 
 
-@functools.lru_cache
-def mypy_cmd() -> Sequence[str] | None:
+@functools.lru_cache()
+def mypy_cmd() -> Optional[Sequence[str]]:
     try:
         # preferably, use mypy from current python env
         import mypy  # noqa: F401 fine not to use it
@@ -35,7 +32,7 @@ def mypy_cmd() -> Sequence[str] | None:
     return None
 
-def run_mypy(cfg_path: Path) -> CompletedProcess | None:
+def run_mypy(cfg_path: Path) -> Optional[CompletedProcess]:
     # todo dunno maybe use the same mypy config in repository?
     # I'd need to install mypy.ini then??
     env = {**os.environ}
@@ -46,7 +43,7 @@ def run_mypy(cfg_path: Path) -> CompletedProcess | None:
     cmd = mypy_cmd()
     if cmd is None:
         return None
-    mres = run([  # noqa: UP022,PLW1510
+    mres = run([
         *cmd,
         '--namespace-packages',
         '--color-output',  # not sure if works??
@@ -66,28 +63,22 @@ def eprint(x: str) -> None:
     # err=True prints to stderr
     click.echo(x, err=True)
 
 def indent(x: str) -> str:
-    # todo use textwrap.indent?
     return ''.join(' ' + l for l in x.splitlines(keepends=True))
 
 OK = '✅'
 OFF = '🔲'
 
 def info(x: str) -> None:
     eprint(OK + ' ' + x)
 
 def error(x: str) -> None:
     eprint('❌ ' + x)
 
 def warning(x: str) -> None:
     eprint('❗ ' + x)  # todo yellow?
 
 def tb(e: Exception) -> None:
     tb = ''.join(traceback.format_exception(Exception, e, e.__traceback__))
     sys.stderr.write(indent(tb))
@@ -95,7 +86,6 @@ def tb(e: Exception) -> None:
 def config_create() -> None:
     from .preinit import get_mycfg_dir
-
     mycfg_dir = get_mycfg_dir()
 
     created = False
@@ -104,8 +94,7 @@ def config_create() -> None:
         my_config = mycfg_dir / 'my' / 'config' / '__init__.py'
 
         my_config.parent.mkdir(parents=True)
-        my_config.write_text(
-            '''
+        my_config.write_text('''
 ### HPI personal config
 ## see
 # https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-modules
@@ -128,8 +117,7 @@ class example:
 ### you can insert your own configuration below
 ### but feel free to delete the stuff above if you don't need ti
-'''.lstrip()
-        )
+'''.lstrip())
         info(f'created empty config: {my_config}')
         created = True
     else:
@@ -142,13 +130,12 @@ class example:
 # todo return the config as a result?
 def config_ok() -> bool:
-    errors: list[Exception] = []
+    errors: List[Exception] = []
 
     # at this point 'my' should already be imported, so doesn't hurt to extract paths from it
     import my
     try:
-        paths: list[str] = list(my.__path__)
+        paths: List[str] = list(my.__path__)
     except Exception as e:
         errors.append(e)
         error('failed to determine module import path')
@@ -158,23 +145,19 @@ def config_ok() -> bool:
     # first try doing as much as possible without actually importing my.config
     from .preinit import get_mycfg_dir
-
     cfg_path = get_mycfg_dir()
     # alternative is importing my.config and then getting cfg_path from its __file__/__path__
     # not sure which is better tbh
 
     ## check we're not using stub config
     import my.core
     try:
         core_pkg_path = str(Path(my.core.__path__[0]).parent)
         if str(cfg_path).startswith(core_pkg_path):
-            error(
-                f'''
+            error(f'''
 Seems that the stub config is used ({cfg_path}). This is likely not going to work.
 See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-modules for more information
-'''.strip()
-            )
+'''.strip())
             errors.append(RuntimeError('bad config path'))
     except Exception as e:
         errors.append(e)
@@ -188,6 +171,8 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module
         # use a temporary directory, useful because
         # - compileall ignores -B, so always craps with .pyc files (annoyng on RO filesystems)
         # - compileall isn't following symlinks, just silently ignores them
+        # note: ugh, annoying that copytree requires a non-existing dir before 3.8.
+        # once we have min version 3.8, can use dirs_exist_ok=True param
         tdir = Path(td) / 'cfg'
         # NOTE: compileall still returns code 0 if the path doesn't exist..
         # but in our case hopefully it's not an issue
@@ -196,7 +181,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module
         try:
             # this will resolve symlinks when copying
             # should be under try/catch since might fail if some symlinks are missing
-            shutil.copytree(cfg_path, tdir, dirs_exist_ok=True)
+            shutil.copytree(cfg_path, tdir)
             check_call(cmd)
             info('syntax check: ' + ' '.join(cmd))
         except Exception as e:
@@ -229,15 +214,13 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-module
     if len(errors) > 0:
         error(f'config check: {len(errors)} errors')
         return False
-
-    # note: shouldn't exit here, might run something else
-    info('config check: success!')
-    return True
+    else:
+        # note: shouldn't exit here, might run something else
+        info('config check: success!')
+        return True
 from .util import HPIModule, modules
 
 
 def _modules(*, all: bool=False) -> Iterable[HPIModule]:
     skipped = []
     for m in modules():
@@ -249,7 +232,7 @@ def _modules(*, all: bool = False) -> Iterable[HPIModule]:
         warning(f'Skipped {len(skipped)} modules: {skipped}. Pass --all if you want to see them.')
 
-def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: list[str]) -> None:
+def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: List[str]) -> None:
     if len(for_modules) > 0:
         # if you're checking specific modules, show errors
         # hopefully makes sense?
@@ -260,9 +243,10 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: li
     import contextlib
 
+    from .common import quick_stats
+    from .util import get_stats, HPIModule
+    from .stats import guess_stats
     from .error import warn_my_config_import_error
-    from .stats import get_stats, quick_stats
-    from .util import HPIModule
 
     mods: Iterable[HPIModule]
     if len(for_modules) == 0:
@@ -292,8 +276,11 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: li
             continue
 
         info(f'{click.style("OK", fg="green")} : {m:<50}')
-        # TODO add hpi 'stats'? instead of doctor? not sure
-        stats = get_stats(m, guess=True)
+        # first try explicitly defined stats function:
+        stats = get_stats(m)
+        if stats is None:
+            # then try guessing.. not sure if should log somehow?
+            stats = guess_stats(m, quick=quick)
 
         if stats is None:
             eprint(" - no 'stats' function, can't check the data")
@@ -304,7 +291,6 @@ def modules_check(*, verbose: bool, list_all: bool, quick: bool, for_modules: li
         try:
             kwargs = {}
-            # todo hmm why wouldn't they be callable??
             if callable(stats) and 'quick' in inspect.signature(stats).parameters:
                 kwargs['quick'] = quick
             with quick_context:
@@ -340,20 +326,17 @@ def tabulate_warnings() -> None:
     Helper to avoid visual noise in hpi modules/doctor
     '''
     import warnings
 
     orig = warnings.formatwarning
 
     def override(*args, **kwargs) -> str:
         res = orig(*args, **kwargs)
         return ''.join(' ' + x for x in res.splitlines(keepends=True))
 
     warnings.formatwarning = override
     # TODO loggers as well?
 
 
 def _requires(modules: Sequence[str]) -> Sequence[str]:
     from .discovery_pure import module_by_name
 
     mods = [module_by_name(module) for module in modules]
     res = []
     for mod in mods:
@@ -391,9 +374,8 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool = False,
         warning('requirements list is empty, no need to install anything')
         return
 
-    use_uv = 'HPI_MODULE_INSTALL_USE_UV' in os.environ
     pre_cmd = [
-        sys.executable, '-m', *(['uv'] if use_uv else []), 'pip',
+        sys.executable, '-m', 'pip',
         'install',
         *(['--user'] if user else []),  # todo maybe instead, forward all the remaining args to pip?
         *(['--break-system-packages'] if break_system_packages else []),  # https://peps.python.org/pep-0668/
@@ -411,7 +393,7 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool = False,
     # I think it only helps for pypi artifacts (not git!),
     # and only if they weren't cached
     for r in requirements:
-        cmds.append([*pre_cmd, r])
+        cmds.append(pre_cmd + [r])
     else:
         if parallel:
             warning('parallel install is not supported on this platform, installing sequentially...')
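
Both variants ultimately assemble a 'python -m pip install ...' invocation (master can additionally route it through uv). A rough sketch of the command assembly with hypothetical inputs:

    import sys

    use_uv = False  # master derives this from the HPI_MODULE_INSTALL_USE_UV env var
    requirements = ['gitpython']  # hypothetical requirement list
    pre_cmd = [sys.executable, '-m', *(['uv'] if use_uv else []), 'pip', 'install']
    cmds = [[*pre_cmd, r] for r in requirements]
    # e.g. [[sys.executable, '-m', 'pip', 'install', 'gitpython']]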
@@ -457,8 +439,8 @@ def _ui_getchar_pick(choices: Sequence[str], prompt: str = 'Select from: ') -> i
     return result_map[ch]
 
-def _locate_functions_or_prompt(qualified_names: list[str], *, prompt: bool = True) -> Iterable[Callable[..., Any]]:
-    from .query import QueryException, locate_qualified_function
+def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True) -> Iterable[Callable[..., Any]]:
+    from .query import locate_qualified_function, QueryException
     from .stats import is_data_provider
 
     # if not connected to a terminal, can't prompt
@@ -475,9 +457,9 @@ def _locate_functions_or_prompt(qualified_names: list[str], *, prompt: bool = Tr
             # user to select a 'data provider' like function
             try:
                 mod = importlib.import_module(qualname)
-            except Exception as ie:
+            except Exception:
                 eprint(f"During fallback, importing '{qualname}' as module failed")
-                raise qr_err from ie
+                raise qr_err
 
             # find data providers in this module
             data_providers = [f for _, f in inspect.getmembers(mod, inspect.isfunction) if is_data_provider(f)]
@@ -506,9 +488,8 @@ def _locate_functions_or_prompt(qualified_names: list[str], *, prompt: bool = Tr
 def _warn_exceptions(exc: Exception) -> None:
-    from my.core import make_logger
-
-    logger = make_logger('CLI', level='warning')
+    from my.core.common import LazyLogger
+
+    logger = LazyLogger('CLI', level='warning')
     logger.exception(f'hpi query: {exc}')
 
@@ -519,28 +500,26 @@ def query_hpi_functions(
     *,
     output: str = 'json',
     stream: bool = False,
-    qualified_names: list[str],
-    order_key: str | None,
-    order_by_value_type: type | None,
+    qualified_names: List[str],
+    order_key: Optional[str],
+    order_by_value_type: Optional[Type],
     after: Any,
     before: Any,
     within: Any,
     reverse: bool = False,
-    limit: int | None,
+    limit: Optional[int],
     drop_unsorted: bool,
     wrap_unsorted: bool,
     warn_exceptions: bool,
     raise_exceptions: bool,
     drop_exceptions: bool,
 ) -> None:
-    from .query_range import RangeTuple, select_range
+    from .query_range import select_range, RangeTuple
+    import my.core.error as err
 
     # chain list of functions from user, in the order they wrote them on the CLI
     input_src = chain(*(f() for f in _locate_functions_or_prompt(qualified_names)))
 
-    # NOTE: if passing just one function to this which returns a single namedtuple/dataclass,
-    # using both --order-key and --order-type will often be faster as it does not need to
-    # duplicate the iterator in memory, or try to find the --order-type type on each object before sorting
     res = select_range(
         input_src,
         order_key=order_key,
@@ -553,8 +532,7 @@ def query_hpi_functions(
         warn_exceptions=warn_exceptions,
         warn_func=_warn_exceptions,
         raise_exceptions=raise_exceptions,
-        drop_exceptions=drop_exceptions,
-    )
+        drop_exceptions=drop_exceptions)
 
     if output == 'json':
         from .serialize import dumps
@@ -601,11 +579,10 @@ def query_hpi_functions(
         # output == 'repl'
         eprint(f"\nInteract with the results by using the {click.style('res', fg='green')} variable\n")
         try:
-            import IPython  # type: ignore[import,unused-ignore]
+            import IPython  # type: ignore[import]
         except ModuleNotFoundError:
             eprint("'repl' typically uses ipython, install it with 'python3 -m pip install ipython'. falling back to stdlib...")
             import code
             code.interact(local=locals())
         else:
             IPython.embed()
@@ -613,7 +590,7 @@ def query_hpi_functions(
 @click.group()
 @click.option("--debug", is_flag=True, default=False, help="Show debug logs")
-def main(*, debug: bool) -> None:
+def main(debug: bool) -> None:
     '''
     Human Programming Interface
@@ -639,19 +616,20 @@ def main(*, debug: bool) -> None:
     # to run things at the end (would need to use a callback or pass context)
     # https://click.palletsprojects.com/en/7.x/commands/#nested-handling-and-contexts
-    tdir = Path(tempfile.gettempdir()) / 'hpi_temp_dir'
-    tdir.mkdir(exist_ok=True)
+    tdir: str = os.path.join(tempfile.gettempdir(), 'hpi_temp_dir')
+    if not os.path.exists(tdir):
+        os.makedirs(tdir)
     os.chdir(tdir)
 
 
 @functools.lru_cache(maxsize=1)
-def _all_mod_names() -> list[str]:
+def _all_mod_names() -> List[str]:
     """Should include all modules, in case user is trying to diagnose issues"""
     # sort this, so that the order doesn't change while tabbing through
     return sorted([m.name for m in modules()])
 
 
-def _module_autocomplete(ctx: click.Context, args: Sequence[str], incomplete: str) -> list[str]:
+def _module_autocomplete(ctx: click.Context, args: Sequence[str], incomplete: str) -> List[str]:
     return [m for m in _all_mod_names() if m.startswith(incomplete)]
@@ -661,7 +639,7 @@ def _module_autocomplete(ctx: click.Context, args: Sequence[str], incomplete: st
 @click.option('-q', '--quick', is_flag=True, help='Only run partial checks (first 100 items)')
 @click.option('-S', '--skip-config-check', 'skip_conf', is_flag=True, help='Skip configuration check')
 @click.argument('MODULE', nargs=-1, required=False, shell_complete=_module_autocomplete)
-def doctor_cmd(*, verbose: bool, list_all: bool, quick: bool, skip_conf: bool, module: Sequence[str]) -> None:
+def doctor_cmd(verbose: bool, list_all: bool, quick: bool, skip_conf: bool, module: Sequence[str]) -> None:
     '''
     Run various checks
@@ -695,7 +673,7 @@ def config_create_cmd() -> None:
 @main.command(name='modules', short_help='list available modules')
 @click.option('--all', 'list_all', is_flag=True, help='List all modules, including disabled')
-def module_cmd(*, list_all: bool) -> None:
+def module_cmd(list_all: bool) -> None:
     '''List available modules'''
     list_modules(list_all=list_all)
@@ -708,7 +686,7 @@ def module_grp() -> None:
 @module_grp.command(name='requires', short_help='print module reqs')
 @click.argument('MODULES', shell_complete=_module_autocomplete, nargs=-1, required=True)
-def module_requires_cmd(*, modules: Sequence[str]) -> None:
+def module_requires_cmd(modules: Sequence[str]) -> None:
     '''
     Print MODULES requirements
@@ -725,7 +703,7 @@ def module_requires_cmd(*, modules: Sequence[str]) -> None:
     is_flag=True,
     help='Bypass PEP 668 and install dependencies into the system-wide python package directory.')
 @click.argument('MODULES', shell_complete=_module_autocomplete, nargs=-1, required=True)
-def module_install_cmd(*, user: bool, parallel: bool, break_system_packages: bool, modules: Sequence[str]) -> None:
+def module_install_cmd(user: bool, parallel: bool, break_system_packages: bool, modules: Sequence[str]) -> None:
     '''
     Install dependencies for modules using pip
@@ -806,18 +784,17 @@ def module_install_cmd(*, user: bool, parallel: bool, break_system_packages: boo
     help='ignore any errors returned as objects from the functions')
 @click.argument('FUNCTION_NAME', nargs=-1, required=True, shell_complete=_module_autocomplete)
 def query_cmd(
-    *,
     function_name: Sequence[str],
     output: str,
     stream: bool,
-    order_key: str | None,
-    order_type: str | None,
-    after: str | None,
-    before: str | None,
-    within: str | None,
-    recent: str | None,
+    order_key: Optional[str],
+    order_type: Optional[str],
+    after: Optional[str],
+    before: Optional[str],
+    within: Optional[str],
+    recent: Optional[str],
     reverse: bool,
-    limit: int | None,
+    limit: Optional[int],
     drop_unsorted: bool,
     wrap_unsorted: bool,
     warn_exceptions: bool,
@@ -851,9 +828,9 @@ def query_cmd(
         hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments
     '''
-    from datetime import date, datetime
+    from datetime import datetime, date
 
-    chosen_order_type: type | None
+    chosen_order_type: Optional[Type]
     if order_type == "datetime":
         chosen_order_type = datetime
     elif order_type == "date":
@@ -889,8 +866,7 @@ def query_cmd(
             wrap_unsorted=wrap_unsorted,
             warn_exceptions=warn_exceptions,
             raise_exceptions=raise_exceptions,
-            drop_exceptions=drop_exceptions,
-        )
+            drop_exceptions=drop_exceptions)
     except QueryException as qe:
         eprint(str(qe))
         sys.exit(1)
@@ -905,7 +881,6 @@ def query_cmd(
 def test_requires() -> None:
     from click.testing import CliRunner
 
     result = CliRunner().invoke(main, ['module', 'requires', 'my.github.ghexport', 'my.browser.export'])
     assert result.exit_code == 0
     assert "github.com/karlicoss/ghexport" in result.output
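
The same CliRunner trick works for exercising any other subcommand without spawning a shell, e.g. (the subcommand arguments here are just an example):

    from click.testing import CliRunner
    from my.core.__main__ import main

    result = CliRunner().invoke(main, ['modules', '--all'])
    assert result.exit_code == 0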
@@ -10,18 +10,16 @@ how many cores we want to dedicate to the DAL.
 Enabled by the env variable, specifying how many cores to dedicate
 e.g. "HPI_CPU_POOL=4 hpi query ..."
 """
-from __future__ import annotations
 
-import os
 from concurrent.futures import ProcessPoolExecutor
-from typing import cast
+import os
+from typing import cast, Optional
 
 _NOT_SET = cast(ProcessPoolExecutor, object())
-_INSTANCE: ProcessPoolExecutor | None = _NOT_SET
+_INSTANCE: Optional[ProcessPoolExecutor] = _NOT_SET
 
 
-def get_cpu_pool() -> ProcessPoolExecutor | None:
+def get_cpu_pool() -> Optional[ProcessPoolExecutor]:
     global _INSTANCE
     if _INSTANCE is _NOT_SET:
         use_cpu_pool = os.environ.get('HPI_CPU_POOL')
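
Callers are expected to degrade gracefully when the pool is disabled. A sketch of the consuming pattern (assumes the module lives at my.core._cpu_pool; the worker function is hypothetical and must be picklable for ProcessPoolExecutor):

    from my.core._cpu_pool import get_cpu_pool

    def _work(x: int) -> int:  # hypothetical worker
        return x * x

    pool = get_cpu_pool()
    items = range(10)
    if pool is not None:  # enabled via e.g. HPI_CPU_POOL=4
        results = list(pool.map(_work, items))
    else:
        results = [_work(x) for x in items]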
@@ -1,12 +0,0 @@
-from ..common import PathIsh
-from ..sqlite import sqlite_connect_immutable
-
-
-def connect_readonly(db: PathIsh):
-    import dataset  # type: ignore
-
-    # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122
-    # todo not sure if mode=ro has any benefit, but it doesn't work on read-only filesystems
-    # maybe it should autodetect readonly filesystems and apply this? not sure
-    creator = lambda: sqlite_connect_immutable(db)
-    return dataset.connect('sqlite:///', engine_kwargs={'creator': creator})
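
The interesting part of this helper is sqlite_connect_immutable; the usual mechanism behind such helpers is sqlite's immutable=1 URI flag, which avoids locking and journal files and therefore works on read-only filesystems too. A standalone sketch with plain sqlite3 (this is an assumption about what the helper does, not its actual code; the path is a placeholder):

    import sqlite3

    def connect_immutable(db: str) -> sqlite3.Connection:
        # immutable=1 promises sqlite the file can't change under it
        return sqlite3.connect(f'file:{db}?immutable=1', uri=True)

    conn = connect_immutable('/path/to/export.db')  # placeholder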
@@ -1,17 +1,16 @@
 """
 Various helpers for compression
 """
 # fmt: off
 from __future__ import annotations
 
-import io
-import pathlib
-from collections.abc import Iterator, Sequence
 from datetime import datetime
 from functools import total_ordering
+import io
+import pathlib
 from pathlib import Path
-from typing import IO, Union
+import sys
+from typing import Union, IO, Sequence, Any, Iterator
 
 PathIsh = Union[Path, str]
@@ -28,7 +27,7 @@ class Ext:
 def is_compressed(p: Path) -> bool:
     # todo kinda lame way for now.. use mime ideally?
     # should cooperate with kompress.kopen?
-    return any(p.name.endswith(ext) for ext in [Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz])
+    return any(p.name.endswith(ext) for ext in {Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz})
 
 def _zstd_open(path: Path, *args, **kwargs) -> IO:
@@ -88,7 +87,7 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO:
     elif name.endswith(Ext.lz4):
         import lz4.frame  # type: ignore
         return lz4.frame.open(str(pp), mode, *args, **kwargs)
-    elif name.endswith(Ext.zstd) or name.endswith(Ext.zst):  # noqa: PIE810
+    elif name.endswith(Ext.zstd) or name.endswith(Ext.zst):
         kwargs['mode'] = mode
         return _zstd_open(pp, *args, **kwargs)
     elif name.endswith(Ext.targz):
@@ -102,8 +101,8 @@ def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO:
         return pp.open(mode, *args, **kwargs)
 
-import os
 import typing
+import os
 
 if typing.TYPE_CHECKING:
     # otherwise mypy can't figure out that BasePath is a type alias..
@@ -121,7 +120,7 @@ class CPath(BasePath):
     Path only has _accessor and _closed slots, so can't directly set .open method
     _accessor.open has to return file descriptor, doesn't work for compressed stuff.
     """
-    def open(self, *args, **kwargs):  # noqa: ARG002
+    def open(self, *args, **kwargs):
        kopen_kwargs = {}
         mode = kwargs.get('mode')
         if mode is not None:
@@ -142,16 +141,20 @@ open = kopen  # TODO deprecate
 def kexists(path: PathIsh, subpath: str) -> bool:
     try:
         kopen(path, subpath)
-        return True
     except Exception:
         return False
+    else:
+        return True
 
 
 import zipfile
+
+if sys.version_info[:2] >= (3, 8):
-# meh... zipfile.Path is not available on 3.7
-zipfile_Path = zipfile.Path
+    # meh... zipfile.Path is not available on 3.7
+    zipfile_Path = zipfile.Path
+else:
+    if typing.TYPE_CHECKING:
+        zipfile_Path = Any
+    else:
+        zipfile_Path = object
 
 
 @total_ordering
@@ -211,7 +214,7 @@ class ZipPath(zipfile_Path):
     def iterdir(self) -> Iterator[ZipPath]:
         for s in self._as_dir().iterdir():
-            yield ZipPath(s.root, s.at)
+            yield ZipPath(s.root, s.at)  # type: ignore[attr-defined]
 
     @property
     def stem(self) -> str:
@@ -240,7 +243,7 @@ class ZipPath(zipfile_Path):
         # see https://en.wikipedia.org/wiki/ZIP_(file_format)#Structure
         dt = datetime(*self.root.getinfo(self.at).date_time)
         ts = int(dt.timestamp())
-        params = dict(  # noqa: C408
+        params = dict(
             st_mode=0,
             st_ino=0,
             st_dev=0,
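
kopen and CPath are the two entry points in both versions: kopen picks a decompressor from the file extension, while CPath routes Path.open() through kopen so callers can stay oblivious to compression. A usage sketch (file names are placeholders):

    from my.core.kompress import CPath, kopen

    # text mode by default (mode='rt'); dispatches on the .xz suffix
    with kopen('events.json.xz') as fo:
        head = fo.read(80)

    # behaves like pathlib.Path, but open() transparently decompresses
    p = CPath('events.json.xz')
    data = p.open().read()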
@@ -1,28 +1,14 @@
-from __future__ import annotations
-
-from .internal import assert_subpackage
-
-assert_subpackage(__name__)
-
-import logging
-import sys
-from collections.abc import Iterator
+from .common import assert_subpackage; assert_subpackage(__name__)
+
 from contextlib import contextmanager
+import logging
 from pathlib import Path
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    TypeVar,
-    Union,
-    cast,
-    overload,
-)
+import sys
+from typing import Optional, Iterator, cast, TYPE_CHECKING, TypeVar, Callable, overload, Union, Any, Type
+import warnings
 
 import appdirs  # type: ignore[import-untyped]
 
-from . import warnings
-
 PathIsh = Union[str, Path]  # avoid circular import from .common
@@ -61,12 +47,12 @@ def _appdirs_cache_dir() -> Path:
 _CACHE_DIR_NONE_HACK = Path('/tmp/hpi/cachew_none_hack')
 
 
-def cache_dir(suffix: PathIsh | None = None) -> Path:
+def cache_dir(suffix: Optional[PathIsh] = None) -> Path:
     from . import core_config as CC
 
     cdir_ = CC.config.get_cache_dir()
 
-    sp: Path | None = None
+    sp: Optional[Path] = None
     if suffix is not None:
         sp = Path(suffix)
         # guess if you do need absolute, better path it directly instead of as suffix?
@@ -119,7 +105,7 @@ def _mcachew_impl(cache_path=_cache_path_dflt, **kwargs):
     try:
         import cachew
     except ModuleNotFoundError:
-        warnings.high('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew')
+        warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew')
         return lambda orig_func: orig_func
     else:
         kwargs['cache_path'] = cache_path
@@ -136,7 +122,7 @@ if TYPE_CHECKING:
     CC = Callable[P, R]  # need to give it a name, if inlined into bound=, mypy runs in a bug
     PathProvider = Union[PathIsh, Callable[P, PathIsh]]
     # NOTE: in cachew, HashFunction type returns str
-    # however in practice, cachew always calls str for its result
+    # however in practice, cachew alwasy calls str for its result
     # so perhaps better to switch it to Any in cachew as well
     HashFunction = Callable[P, Any]
@@ -145,19 +131,21 @@ if TYPE_CHECKING:
     # we need two versions due to @doublewrap
     # this is when we just annotate as @cachew without any args
     @overload  # type: ignore[no-overload-impl]
-    def mcachew(fun: F) -> F: ...
+    def mcachew(fun: F) -> F:
+        ...
 
     @overload
     def mcachew(
-        cache_path: PathProvider | None = ...,
+        cache_path: Optional[PathProvider] = ...,
         *,
         force_file: bool = ...,
-        cls: type | None = ...,
+        cls: Optional[Type] = ...,
         depends_on: HashFunction = ...,
-        logger: logging.Logger | None = ...,
+        logger: Optional[logging.Logger] = ...,
         chunk_by: int = ...,
-        synthetic_key: str | None = ...,
-    ) -> Callable[[F], F]: ...
+        synthetic_key: Optional[str] = ...,
+    ) -> Callable[[F], F]:
+        ...
 else:
     mcachew = _mcachew_impl
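
In use, mcachew wraps a data-provider function and takes cachew's keyword arguments, matching the overloads above; if cachew isn't installed it degrades to a no-op. A sketch (the hash function and the decorated function are illustrative):

    from pathlib import Path
    from my.core.cachew import mcachew

    @mcachew(depends_on=lambda path: path.stat().st_mtime)  # recompute when the file changes
    def events(path: Path):
        for line in path.read_text().splitlines():
            yield line.strip()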
@@ -1,30 +1,22 @@
 from __future__ import annotations
 
-import importlib
-import re
-import sys
-from collections.abc import Iterator
-from contextlib import ExitStack, contextmanager
-from typing import Any, Callable, TypeVar
+from typing import TypeVar, Type, Callable, Dict, Any
 
-Attrs = dict[str, Any]
+Attrs = Dict[str, Any]
 
 C = TypeVar('C')
 
 # todo not sure about it, could be overthinking...
 # but short enough to change later
 # TODO document why it's necessary?
-def make_config(cls: type[C], migration: Callable[[Attrs], Attrs] = lambda x: x) -> C:
+def make_config(cls: Type[C], migration: Callable[[Attrs], Attrs]=lambda x: x) -> C:
     user_config = cls.__base__
     old_props = {
         # NOTE: deliberately use gettatr to 'force' class properties here
-        k: getattr(user_config, k)
-        for k in vars(user_config)
+        k: getattr(user_config, k) for k in vars(user_config)
     }
     new_props = migration(old_props)
 
     from dataclasses import fields
     params = {
         k: v
         for k, v in new_props.items()
@@ -35,8 +27,8 @@ def make_config(cls: type[C], migration: Callable[[Attrs], Attrs] = lambda x: x)
 F = TypeVar('F')
 
+from contextlib import contextmanager
+from typing import Iterator
 
 @contextmanager
 def _override_config(config: F) -> Iterator[F]:
     '''
@@ -54,9 +46,10 @@ def _override_config(config: F) -> Iterator[F]:
             delattr(config, k)
 
+import importlib
+import sys
+from typing import Optional
 
 ModuleRegex = str
 
 @contextmanager
 def _reload_modules(modules: ModuleRegex) -> Iterator[None]:
     # need to use list here, otherwise reordering with set might mess things up
@@ -86,15 +79,16 @@ def _reload_modules(modules: ModuleRegex) -> Iterator[None]:
         sys.modules.pop(m, None)
 
+from contextlib import ExitStack
+import re
 
 @contextmanager
-def tmp_config(*, modules: ModuleRegex | None = None, config=None):
+def tmp_config(*, modules: Optional[ModuleRegex]=None, config=None):
     if modules is None:
         assert config is None
     if modules is not None:
         assert config is not None
 
     import my.config
 
     with ExitStack() as module_reload_stack, _override_config(my.config) as new_config:
         if config is not None:
             overrides = {k: v for k, v in vars(config).items() if not k.startswith('__')}
@@ -109,7 +103,6 @@ def tmp_config(*, modules: ModuleRegex | None = None, config=None):
 def test_tmp_config() -> None:
     class extra:
         data_path = '/path/to/data'
 
     with tmp_config() as c:
         assert c.google != 'whatever'
         assert not hasattr(c, 'extra')
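
tmp_config is the supported way to swap configuration in tests: it patches attributes on my.config and, when modules= is given, reloads the matching modules inside the with-block. A sketch in the spirit of the test above (values are placeholders):

    from my.core.cfg import tmp_config

    class overlay:
        class commits:
            roots = ['/tmp/repos']  # placeholder
            emails = None
            names = None

    # note: modules= and config= must be passed together (asserted above)
    with tmp_config(modules='my.coding.commits', config=overlay) as cfg:
        assert cfg.commits.roots == ['/tmp/repos']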
@@ -1,43 +1,199 @@
-from __future__ import annotations
-
-import os
-from collections.abc import Iterable, Sequence
 from glob import glob as do_glob
 from pathlib import Path
+from datetime import datetime
+import functools
+from contextlib import contextmanager
+import os
+import sys
+import types
 from typing import (
-    TYPE_CHECKING,
+    Any,
     Callable,
-    Generic,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    NoReturn,
+    Optional,
+    Sequence,
+    TYPE_CHECKING,
+    Tuple,
     TypeVar,
     Union,
+    cast,
+    get_args,
+    get_type_hints,
+    get_origin,
 )
+import warnings
 
-from . import compat, warnings
+from . import warnings as core_warnings
 
 # some helper functions
-# TODO start deprecating this? soon we'd be able to use Path | str syntax which is shorter and more explicit
 PathIsh = Union[Path, str]
+# TODO only used in tests? not sure if useful at all.
+def import_file(p: PathIsh, name: Optional[str] = None) -> types.ModuleType:
+    p = Path(p)
+    if name is None:
+        name = p.stem
+    import importlib.util
+    spec = importlib.util.spec_from_file_location(name, p)
+    assert spec is not None, f"Fatal error; Could not create module spec from {name} {p}"
+    foo = importlib.util.module_from_spec(spec)
+    loader = spec.loader; assert loader is not None
+    loader.exec_module(foo)
+    return foo
+
+
+def import_from(path: PathIsh, name: str) -> types.ModuleType:
+    path = str(path)
+    try:
+        sys.path.append(path)
+        import importlib
+        return importlib.import_module(name)
+    finally:
+        sys.path.remove(path)
+
+
+def import_dir(path: PathIsh, extra: str='') -> types.ModuleType:
+    p = Path(path)
+    if p.parts[0] == '~':
+        p = p.expanduser()  # TODO eh. not sure about this..
+    return import_from(p.parent, p.name + extra)
+
+
+T = TypeVar('T')
+K = TypeVar('K')
+V = TypeVar('V')
+
+# TODO deprecate? more_itertools.one should be used
+def the(l: Iterable[T]) -> T:
+    it = iter(l)
+    try:
+        first = next(it)
+    except StopIteration:
+        raise RuntimeError('Empty iterator?')
+    assert all(e == first for e in it)
+    return first
+
+
+# TODO more_itertools.bucket?
+def group_by_key(l: Iterable[T], key: Callable[[T], K]) -> Dict[K, List[T]]:
+    res: Dict[K, List[T]] = {}
+    for i in l:
+        kk = key(i)
+        lst = res.get(kk, [])
+        lst.append(i)
+        res[kk] = lst
+    return res
+
+
+def _identity(v: T) -> V:  # type: ignore[type-var]
+    return cast(V, v)
+
+
+# ugh. nothing in more_itertools?
+def ensure_unique(
+    it: Iterable[T],
+    *,
+    key: Callable[[T], K],
+    value: Callable[[T], V]=_identity,
+    key2value: Optional[Dict[K, V]]=None
+) -> Iterable[T]:
+    if key2value is None:
+        key2value = {}
+    for i in it:
+        k = key(i)
+        v = value(i)
+        pv = key2value.get(k, None)
+        if pv is not None:
+            raise RuntimeError(f"Duplicate key: {k}. Previous value: {pv}, new value: {v}")
+        key2value[k] = v
+        yield i
+
+
+def test_ensure_unique() -> None:
+    import pytest
+    assert list(ensure_unique([1, 2, 3], key=lambda i: i)) == [1, 2, 3]
+
+    dups = [1, 2, 1, 4]
+    # this works because it's lazy
+    it = ensure_unique(dups, key=lambda i: i)
+
+    # but forcing throws
+    with pytest.raises(RuntimeError, match='Duplicate key'):
+        list(it)
+
+    # hacky way to force distinct objects?
+    list(ensure_unique(dups, key=lambda i: object()))
+
+
+def make_dict(
+    it: Iterable[T],
+    *,
+    key: Callable[[T], K],
+    value: Callable[[T], V]=_identity
+) -> Dict[K, V]:
+    res: Dict[K, V] = {}
+    uniques = ensure_unique(it, key=key, value=value, key2value=res)
+    for _ in uniques:
+        pass  # force the iterator
+    return res
+
+
+def test_make_dict() -> None:
+    it = range(5)
+    d = make_dict(it, key=lambda i: i, value=lambda i: i % 2)
+    assert d == {0: 0, 1: 1, 2: 0, 3: 1, 4: 0}
+
+    # check type inference
+    d2: Dict[str, int ] = make_dict(it, key=lambda i: str(i))
+    d3: Dict[str, bool] = make_dict(it, key=lambda i: str(i), value=lambda i: i % 2 == 0)
+
+
+# https://stackoverflow.com/a/12377059/706389
+def listify(fn=None, wrapper=list):
+    """
+    Wraps a function's return value in wrapper (e.g. list)
+    Useful when an algorithm can be expressed more cleanly as a generator
+    """
+    def listify_return(fn):
+        @functools.wraps(fn)
+        def listify_helper(*args, **kw):
+            return wrapper(fn(*args, **kw))
+        return listify_helper
+    if fn is None:
+        return listify_return
+    return listify_return(fn)
+
+
+# todo use in bluemaestro
+# def dictify(fn=None, key=None, value=None):
+#     def md(it):
+#         return make_dict(it, key=key, value=value)
+#     return listify(fn=fn, wrapper=md)
+
+
+from .logging import setup_logger, LazyLogger
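
the(), group_by_key and make_dict above are small eager helpers over iterables; a quick illustration of the two grouping ones:

    names = ['alice', 'bob', 'anna']

    by_initial = group_by_key(names, key=lambda s: s[0])
    # {'a': ['alice', 'anna'], 'b': ['bob']}

    lengths = make_dict(names, key=lambda s: s, value=len)
    # {'alice': 5, 'bob': 3, 'anna': 4}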
Paths = Union[Sequence[PathIsh], PathIsh]

DEFAULT_GLOB = '*'

def get_files(
    pp: Paths,
    glob: str=DEFAULT_GLOB,
    *,
    sort: bool=True,
    guess_compression: bool=True,
) -> tuple[Path, ...]:
    """
    Helper function to avoid boilerplate.

    Tuple as return type is a bit friendlier for hashing/caching, so hopefully makes sense
    """
    # TODO FIXME mm, some wrapper to assert iterator isn't empty?
    sources: list[Path]
    if isinstance(pp, Path):
        sources = [pp]
    elif isinstance(pp, str):
@@ -54,7 +210,7 @@ def get_files(
        # TODO ugh. very flaky... -3 because [<this function>, get_files(), <actual caller>]
        return traceback.extract_stack()[-3].filename

    paths: list[Path] = []
    for src in sources:
        if src.parts[0] == '~':
            src = src.expanduser()
@@ -62,9 +218,9 @@ def get_files(
        gs = str(src)
        if '*' in gs:
            if glob != DEFAULT_GLOB:
                warnings.medium(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!")
            paths.extend(map(Path, do_glob(gs)))  # noqa: PTH207
        elif os.path.isdir(str(src)):  # noqa: PTH112
            # NOTE: we're using os.path here on purpose instead of src.is_dir
            # the reason is is_dir for archives might return True and then
            # this clause would try globbing inside the archives
@@ -80,11 +236,11 @@ def get_files(
            paths.append(src)

    if sort:
        paths = sorted(paths)

    if len(paths) == 0:
        # todo make it conditionally defensive based on some global settings
        warnings.high(f'''
{caller()}: no paths were matched against {pp}. This might result in missing data. Likely, the directory you passed is empty.
'''.strip())
        # traceback is useful to figure out what config caused it?
@@ -93,7 +249,7 @@ def get_files(
        traceback.print_stack()

    if guess_compression:
        from .kompress import CPath, ZipPath, is_compressed

        # NOTE: wrap is just for backwards compat with vendorized kompress
        # with kompress library, only is_compressed check and CPath should be enough
@@ -110,33 +266,44 @@ def get_files(
    return tuple(paths)
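# Illustrative usage sketch (not from the diff; paths are made up):
#
#     get_files('~/data/exports')                  # directory -> all files under it, sorted tuple
#     get_files('~/data/exports/*.json')           # glob in the string wins over the glob= argument
#     get_files(['a.json', 'b.json'], sort=False)  # explicit list, order preserved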
@functools.lru_cache(1)
def _magic():
import magic # type: ignore
return magic.Magic(mime=True)
# TODO could reuse in pdf module?
import mimetypes # todo do I need init()?
# todo wtf? fastermime thinks its mime is application/json even if the extension is xz??
# whereas magic detects correctly: application/x-zstd and application/x-xz
def fastermime(path: PathIsh) -> str:
paths = str(path)
# mimetypes is faster
(mime, _) = mimetypes.guess_type(paths)
if mime is not None:
return mime
# magic is slower but returns more stuff
# TODO Result type?; it's kinda racey, but perhaps better to let the caller decide?
return _magic().from_file(paths)
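# Illustrative behaviour sketch (not from the diff; paths are made up):
#
#     fastermime('export.json')   # -> 'application/json', resolved by extension, no file access
#     fastermime('mystery_blob')  # no extension match -> falls back to libmagic content sniffing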
Json = Dict[str, Any]
from typing import TypeVar, Callable, Generic
_C = TypeVar('_C')
_R = TypeVar('_R') _R = TypeVar('_R')
# https://stackoverflow.com/a/5192374/706389 # https://stackoverflow.com/a/5192374/706389
# NOTE: it was added to stdlib in 3.9 and then deprecated in 3.11
# seems that the suggested solution is to use custom decorator?
class classproperty(Generic[_R]): class classproperty(Generic[_R]):
def __init__(self, f: Callable[..., _R]) -> None: def __init__(self, f: Callable[[_C], _R]) -> None:
self.f = f self.f = f
def __get__(self, obj, cls) -> _R: def __get__(self, obj: None, cls: _C) -> _R:
return self.f(cls) return self.f(cls)
def test_classproperty() -> None:
from .compat import assert_type
class C:
@classproperty
def prop(cls) -> str:
return 'hello'
res = C.prop
assert_type(res, str)
assert res == 'hello'
# hmm, this doesn't really work with mypy well..
# https://github.com/python/mypy/issues/6244
# class staticproperty(Generic[_R]):
@@ -146,117 +313,387 @@ def test_classproperty() -> None:
#     def __get__(self) -> _R:
#         return self.f()
# for now just serves documentation purposes... but one day might make it statically verifiable where possible?
# TODO e.g. maybe use opaque mypy alias?
datetime_naive = datetime
datetime_aware = datetime
# TODO deprecate
tzdatetime = datetime_aware
# TODO deprecate (although could be used in modules)
from .compat import fromisoformat as isoparse
import re

# https://stackoverflow.com/a/295466/706389
def get_valid_filename(s: str) -> str:
    s = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)
# TODO deprecate and suggest to use one from my.core directly? not sure
from .utils.itertools import unique_everseen  # noqa: F401

### legacy imports, keeping them here for backwards compatibility
## hiding behind TYPE_CHECKING so it works in runtime
## in principle, warnings.deprecated decorator should cooperate with mypy, but doesn't look like it works atm?
## perhaps it doesn't work when it's used from typing_extensions
if not TYPE_CHECKING:
    from .compat import deprecated

    @deprecated('use my.core.compat.assert_never instead')
    def assert_never(*args, **kwargs):
        return compat.assert_never(*args, **kwargs)

    @deprecated('use my.core.compat.fromisoformat instead')
    def isoparse(*args, **kwargs):
        return compat.fromisoformat(*args, **kwargs)

    @deprecated('use more_itertools.one instead')
    def the(*args, **kwargs):
        import more_itertools
        return more_itertools.one(*args, **kwargs)

    @deprecated('use functools.cached_property instead')
    def cproperty(*args, **kwargs):
        import functools
        return functools.cached_property(*args, **kwargs)

from typing import Generic, Sized, Callable

# X = TypeVar('X')
def _warn_iterator(it, f: Any=None):
    emitted = False
    for i in it:
        yield i
        emitted = True
    if not emitted:
        warnings.warn(f"Function {f} didn't emit any data, make sure your config paths are correct")

# TODO ugh, so I want to express something like:
# X = TypeVar('X')
# C = TypeVar('C', bound=Iterable[X])
# _warn_iterable(it: C) -> C
# but apparently I can't??? ugh.
# https://github.com/python/typing/issues/548
# I guess for now overloads are fine...

from typing import overload
X = TypeVar('X')
@overload
def _warn_iterable(it: List[X], f: Any=None) -> List[X]: ...
@overload
def _warn_iterable(it: Iterable[X], f: Any=None) -> Iterable[X]: ...
def _warn_iterable(it, f=None):
    if isinstance(it, Sized):
        sz = len(it)
        if sz == 0:
            warnings.warn(f"Function {f} returned empty container, make sure your config paths are correct")
        return it
    else:
        return _warn_iterator(it, f=f)

# ok, this seems to work...
# https://github.com/python/mypy/issues/1927#issue-167100413
FL = TypeVar('FL', bound=Callable[..., List])
FI = TypeVar('FI', bound=Callable[..., Iterable])
@overload
def warn_if_empty(f: FL) -> FL: ...
@overload
def warn_if_empty(f: FI) -> FI: ...
def warn_if_empty(f):
    from functools import wraps

    @wraps(f)
    def wrapped(*args, **kwargs):
        res = f(*args, **kwargs)
        return _warn_iterable(res, f=f)
    return wrapped
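# Illustrative usage sketch (not from the diff; the provider below is made up):
#
#     @warn_if_empty
#     def events() -> Iterable[int]:
#         yield from []  # e.g. a misconfigured data source
#
#     list(events())  # emits the "didn't emit any data" warning, result is still []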
# global state that turns on/off quick stats
# can use the 'quick_stats' contextmanager
# to enable/disable this in cli so that module 'stats'
# functions don't have to implement custom 'quick' logic
QUICK_STATS = False
# in case user wants to use the stats functions/quick option
# elsewhere -- can use this decorator instead of editing
# the global state directly
@contextmanager
def quick_stats():
global QUICK_STATS
prev = QUICK_STATS
try:
QUICK_STATS = True
yield
finally:
QUICK_STATS = prev
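# Illustrative usage sketch (not from the diff; my_provider is hypothetical):
#
#     with quick_stats():
#         res = stat(my_provider)  # samples ~100 items instead of exhausting the iterator
#     # QUICK_STATS is restored to its previous value on exit, even on exceptions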
C = TypeVar('C')
Stats = Dict[str, Any]
StatsFun = Callable[[], Stats]
# todo not sure about return type...
def stat(
func: Union[Callable[[], Iterable[C]], Iterable[C]],
*,
quick: bool = False,
name: Optional[str] = None,
) -> Stats:
if callable(func):
fr = func()
if hasattr(fr, '__enter__') and hasattr(fr, '__exit__'):
# context managers have Iterable type, but they aren't data providers
# sadly doesn't look like there is a way to tell from typing annotations
return {}
fname = func.__name__
else:
# meh. means it's just a list.. not sure how to generate a name then
fr = func
fname = f'unnamed_{id(fr)}'
type_name = type(fr).__name__
if type_name == 'DataFrame':
# dynamic, because pandas is an optional dependency..
df = cast(Any, fr) # todo ugh, not sure how to annotate properly
res = dict(
dtypes=df.dtypes.to_dict(),
rows=len(df),
)
else:
res = _stat_iterable(fr, quick=quick)
stat_name = name if name is not None else fname
return {
stat_name: res,
}
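# Illustrative usage sketch (not from the diff):
#
#     def my_data():
#         yield from range(10)
#
#     stat(my_data)  # -> {'my_data': {'count': 10}}
#     # 'first'/'last' keys appear additionally when items carry datetime fields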
def _stat_iterable(it: Iterable[C], quick: bool = False) -> Any:
from more_itertools import ilen, take, first
# todo not sure if there is something in more_itertools to compute this?
total = 0
errors = 0
first_item = None
last_item = None
def funcit():
nonlocal errors, first_item, last_item, total
for x in it:
total += 1
if isinstance(x, Exception):
errors += 1
else:
last_item = x
if first_item is None:
first_item = x
yield x
eit = funcit()
count: Any
if quick or QUICK_STATS:
initial = take(100, eit)
count = len(initial)
if first(eit, None) is not None: # todo can actually be none...
# haven't exhausted
count = f'{count}+'
else:
count = ilen(eit)
res = {
'count': count,
}
if total == 0:
# not sure but I guess a good balance? wouldn't want to throw early here?
res['warning'] = 'THE ITERABLE RETURNED NO DATA'
if errors > 0:
res['errors'] = errors
def stat_item(item):
if item is None:
return None
if isinstance(item, Path):
return str(item)
return guess_datetime(item)
if (stat_first := stat_item(first_item)) is not None:
res['first'] = stat_first
if (stat_last := stat_item(last_item)) is not None:
res['last'] = stat_last
    return res


def test_stat_iterable() -> None:
    from datetime import datetime, timedelta
    from typing import NamedTuple

    dd = datetime.utcfromtimestamp(123)
    day = timedelta(days=3)

    X = NamedTuple('X', [('x', int), ('d', datetime)])

    def it():
        yield RuntimeError('oops!')
        for i in range(2):
            yield X(x=i, d=dd + day * i)
        yield RuntimeError('bad!')
        for i in range(3):
            yield X(x=i * 10, d=dd + day * (i * 10))
        yield X(x=123, d=dd + day * 50)

    res = _stat_iterable(it())
    assert res['count'] == 1 + 2 + 1 + 3 + 1
    assert res['errors'] == 1 + 1
    assert res['last'] == dd + day * 50


# experimental, not sure about it..
def guess_datetime(x: Any) -> Optional[datetime]:
    # todo hmm implement without exception..
    try:
        d = asdict(x)
    except:  # noqa: E722 bare except
        return None
    for k, v in d.items():
        if isinstance(v, datetime):
            return v
    return None


def test_guess_datetime() -> None:
    from datetime import datetime
    from dataclasses import dataclass
    from typing import NamedTuple

    dd = isoparse('2021-02-01T12:34:56Z')

    # ugh.. https://github.com/python/mypy/issues/7281
    A = NamedTuple('A', [('x', int)])
    B = NamedTuple('B', [('x', int), ('created', datetime)])

    assert guess_datetime(A(x=4)) is None
    assert guess_datetime(B(x=4, created=dd)) == dd

    @dataclass
    class C:
        a: datetime
        x: int
    assert guess_datetime(C(a=dd, x=435)) == dd
    # TODO not sure what to return when multiple datetime fields?
    # TODO test @property?


def is_namedtuple(thing: Any) -> bool:
    # basic check to see if this is namedtuple-like
    _asdict = getattr(thing, '_asdict', None)
    return (_asdict is not None) and callable(_asdict)


def asdict(thing: Any) -> Json:
    # todo primitive?
    # todo exception?
    if isinstance(thing, dict):
        return thing
    import dataclasses as D
    if D.is_dataclass(thing):
        return D.asdict(thing)
    if is_namedtuple(thing):
        return thing._asdict()
    raise TypeError(f'Could not convert object {thing} to dict')

@deprecated('use more_itertools.bucket instead')
def group_by_key(l, key):
    res = {}
    for i in l:
        kk = key(i)
        lst = res.get(kk, [])
        lst.append(i)
        res[kk] = lst
    return res

@deprecated('use my.core.utils.itertools.make_dict instead')
def make_dict(*args, **kwargs):
    from .utils import itertools as UI
    return UI.make_dict(*args, **kwargs)

@deprecated('use my.core.utils.itertools.listify instead')
def listify(*args, **kwargs):
    from .utils import itertools as UI
    return UI.listify(*args, **kwargs)

@deprecated('use my.core.warn_if_empty instead')
def warn_if_empty(*args, **kwargs):
    from .utils import itertools as UI
    return UI.listify(*args, **kwargs)

@deprecated('use my.core.stat instead')
def stat(*args, **kwargs):
    from . import stats
    return stats.stat(*args, **kwargs)

@deprecated('use my.core.make_logger instead')
def LazyLogger(*args, **kwargs):
    from . import logging
    return logging.LazyLogger(*args, **kwargs)

@deprecated('use my.core.types.asdict instead')
def asdict(*args, **kwargs):
    from . import types
    return types.asdict(*args, **kwargs)

# todo wrap these in deprecated decorator as well?
# TODO hmm how to deprecate these in runtime?
# tricky cause they are actually classes/types
from typing import Literal  # noqa: F401

from .cachew import mcachew  # noqa: F401

# this is kinda internal, should just use my.core.logging.setup_logger if necessary
from .logging import setup_logger
from .stats import Stats
from .types import (
    Json,
    datetime_aware,
    datetime_naive,
)

tzdatetime = datetime_aware
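# Illustrative usage sketch (not from the diff), valid for either version of asdict:
#
#     from dataclasses import dataclass
#
#     @dataclass
#     class P:
#         x: int
#
#     asdict(P(x=1))   # -> {'x': 1}
#     asdict({'x': 1}) # -> {'x': 1} (dicts pass through)
#     asdict(123)      # -> TypeError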
def assert_subpackage(name: str) -> None:
# can lead to some unexpected issues if you 'import cachew' while being in my/core directory.. so let's protect against it
# NOTE: if we use overlay, name can be smth like my.origg.my.core.cachew ...
assert name == '__main__' or 'my.core' in name, f'Expected module __name__ ({name}) to be __main__ or start with my.core'
from .compat import ParamSpec
_P = ParamSpec('_P')
_T = TypeVar('_T')
# https://stackoverflow.com/a/10436851/706389
from concurrent.futures import Future, Executor
class DummyExecutor(Executor):
def __init__(self, max_workers: Optional[int]=1) -> None:
self._shutdown = False
self._max_workers = max_workers
    if TYPE_CHECKING:
        if sys.version_info[:2] <= (3, 8):
            # 3.8 doesn't support ParamSpec as Callable arg :(
            # and any attempt to type results in incompatible supertype.. so whatever
            def submit(self, fn, *args, **kwargs): ...
        else:
            def submit(self, fn: Callable[_P, _T], /, *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]: ...
    else:
        def submit(self, fn, *args, **kwargs):
            if self._shutdown:
                raise RuntimeError('cannot schedule new futures after shutdown')

            f: Future[Any] = Future()
            try:
                result = fn(*args, **kwargs)
            except KeyboardInterrupt:
                raise
            except BaseException as e:
                f.set_exception(e)
            else:
                f.set_result(result)
            return f

    def shutdown(self, wait: bool=True, **kwargs) -> None:
        self._shutdown = True

else:
    from .compat import Never

    # make these invalid during type check while working in runtime
    Stats = Never
    tzdatetime = Never
    Json = Never
    datetime_naive = Never
    datetime_aware = Never
###
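# Illustrative usage sketch for DummyExecutor above (not from the diff):
#
#     with DummyExecutor() as pool:          # drop-in for ThreadPoolExecutor, but inline
#         fut = pool.submit(lambda x: x * 2, 21)
#         assert fut.result() == 42          # computed synchronously inside submit()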
# see https://hakibenita.com/python-mypy-exhaustive-checking#exhaustiveness-checking
def assert_never(value: NoReturn) -> NoReturn:
assert False, f'Unhandled value: {value} ({type(value).__name__})'
def _check_all_hashable(fun):
# TODO ok, take callable?
hints = get_type_hints(fun)
# TODO needs to be defensive like in cachew?
return_type = hints.get('return')
# TODO check if None
origin = get_origin(return_type) # Iterator etc?
(arg,) = get_args(return_type)
# options we wanna handle are simple type on the top level or union
arg_origin = get_origin(arg)
if sys.version_info[:2] >= (3, 10):
is_uniontype = arg_origin is types.UnionType
else:
is_uniontype = False
is_union = arg_origin is Union or is_uniontype
if is_union:
to_check = get_args(arg)
else:
to_check = (arg,)
no_hash = [
t
for t in to_check
# seems that objects that have not overridden hash have the attribute but it's set to None
if getattr(t, '__hash__', None) is None
]
assert len(no_hash) == 0, f'Types {no_hash} are not hashable, this will result in significant performance downgrade for unique_everseen'
_UET = TypeVar('_UET')
_UEU = TypeVar('_UEU')
def unique_everseen(
fun: Callable[[], Iterable[_UET]],
key: Optional[Callable[[_UET], _UEU]] = None,
) -> Iterator[_UET]:
# TODO support normal iterable as well?
import more_itertools
# NOTE: it has to take original callable, because otherwise we don't have access to generator type annotations
iterable = fun()
if key is None:
# todo check key return type as well? but it's more likely to be hashable
if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
# TODO return better error here, e.g. if there is no return type it crashes
_check_all_hashable(fun)
return more_itertools.unique_everseen(iterable=iterable, key=key)
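# Illustrative usage sketch (not from the diff); note it takes the callable
# itself so the return type annotation stays inspectable:
#
#     def events() -> Iterator[int]:
#         yield from [1, 1, 2, 3, 2]
#
#     list(unique_everseen(events))  # -> [1, 2, 3]
#     # with HPI_CHECK_UNIQUE_EVERSEEN set, also asserts that int is hashable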
## legacy imports, keeping them here for backwards compatibility
from functools import cached_property as cproperty
from typing import Literal
from .cachew import mcachew
##
@@ -2,39 +2,28 @@
Contains backwards compatibility helpers for different python versions.
If something is relevant to HPI itself, please put it in .hpi_compat instead
'''
from __future__ import annotations

import os
import sys
from typing import TYPE_CHECKING
if sys.version_info[:2] >= (3, 13):
    from warnings import deprecated
else:
    from typing_extensions import deprecated

# keeping just for backwards compatibility, used to have compat implementation for 3.6
if not TYPE_CHECKING:
    import sqlite3

    @deprecated('use .backup method on sqlite3.Connection directly instead')
    def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwargs) -> None:
        source.backup(dest, **kwargs)

    # keeping for runtime backwards compatibility (added in 3.9)
    @deprecated('use .removeprefix method on string directly instead')
    def removeprefix(text: str, prefix: str) -> str:
        return text.removeprefix(prefix)

    @deprecated('use .removesuffix method on string directly instead')
    def removesuffix(text: str, suffix: str) -> str:
        return text.removesuffix(suffix)

    ##
    ## used to have compat function before 3.8 for these, keeping for runtime back compatibility
    from functools import cached_property
    from typing import Literal, Protocol, TypedDict
    ##

windows = os.name == 'nt'

# keeping just for backwards compatibility, used to have compat implementation for 3.6
import sqlite3
def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwargs) -> None:
    # TODO warn here?
    source.backup(dest, **kwargs)

# can remove after python3.9 (although need to keep the method itself for bwd compat)
def removeprefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

## used to have compat function before 3.8 for these
from functools import cached_property
from typing import Literal, Protocol, TypedDict
##
@ -43,19 +32,27 @@ if not TYPE_CHECKING:
if sys.version_info[:2] >= (3, 10):
    from typing import ParamSpec
else:
    if TYPE_CHECKING:
        from typing_extensions import ParamSpec
    else:
        from typing import NamedTuple, Any

        # erm.. I guess as long as it's not crashing, whatever...
        class _ParamSpec:
            def __call__(self, args):
                class _res:
                    args = None
                    kwargs = None
                return _res
        ParamSpec = _ParamSpec()
# bisect_left doesn't have a 'key' parameter (which we use)
# till python3.10
if sys.version_info[:2] <= (3, 9):
    from typing import Any, Callable, List, Optional, TypeVar  # noqa: UP035

    X = TypeVar('X')

    # copied from python src
    # fmt: off
    def bisect_left(a: list[Any], x: Any, lo: int=0, hi: int | None=None, *, key: Callable[..., Any] | None=None) -> int:
        if lo < 0:
            raise ValueError('lo must be non-negative')
        if hi is None:
@@ -77,22 +74,19 @@ if sys.version_info[:2] <= (3, 9):
        else:
            hi = mid
        return lo
    # fmt: on
else:
    from bisect import bisect_left
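# Illustrative usage sketch (not from the diff): the key= parameter is what
# this backport exists for:
#
#     pairs = [(1, 'a'), (3, 'b'), (5, 'c')]
#     bisect_left(pairs, 4, key=lambda p: p[0])  # -> 2, insertion point by first element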
from datetime import datetime

if sys.version_info[:2] >= (3, 11):
    fromisoformat = datetime.fromisoformat
else:
    # fromisoformat didn't support Z as "utc" before 3.11
    # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat
    def fromisoformat(date_string: str) -> datetime:
        if date_string.endswith('Z'):
            date_string = date_string[:-1] + '+00:00'
        return datetime.fromisoformat(date_string)
@ -100,7 +94,6 @@ else:
def test_fromisoformat() -> None:
    from datetime import timezone

    # fmt: off
    # feedbin has this format
    assert fromisoformat('2020-05-01T10:32:02.925961Z') == datetime(
        2020, 5, 1, 10, 32, 2, 925961, timezone.utc,
@@ -115,7 +108,6 @@ def test_fromisoformat() -> None:
    assert fromisoformat('2020-11-30T00:53:12Z') == datetime(
        2020, 11, 30, 0, 53, 12, 0, timezone.utc,
    )
    # fmt: on

    # arbtt has this format (sometimes less/more than 6 digits in milliseconds)
    # TODO doesn't work atm, not sure if really should be supported...
@@ -123,17 +115,3 @@ def test_fromisoformat() -> None:
    # assert isoparse('2017-07-18T18:59:38.21731Z') == datetime(
    #     2017, 7, 18, 18, 59, 38, 217310, timezone.utc,
    # )
if sys.version_info[:2] >= (3, 10):
from types import NoneType
from typing import TypeAlias
else:
NoneType = type(None)
from typing_extensions import TypeAlias
if sys.version_info[:2] >= (3, 11):
from typing import Never, assert_never, assert_type
else:
from typing_extensions import Never, assert_never, assert_type
@@ -1,22 +1,16 @@
'''
Bindings for the 'core' HPI configuration
'''
from __future__ import annotations

import re
from collections.abc import Sequence
from typing import Sequence, Optional
from dataclasses import dataclass
from pathlib import Path

from . import warnings
from . import warnings, PathIsh, Path

try:
    from my.config import core as user_config  # type: ignore[attr-defined]
except Exception as e:
    try:
        from my.config import common as user_config  # type: ignore[attr-defined]

        warnings.high("'common' config section is deprecated. Please rename it to 'core'.")
    except Exception as e2:
        # make it defensive, because it's pretty commonly used and would be annoying if it breaks hpi doctor etc.
@ -27,7 +21,7 @@ except Exception as e:
_HPI_CACHE_DIR_DEFAULT = '' _HPI_CACHE_DIR_DEFAULT = ''
from dataclasses import dataclass
@dataclass @dataclass
class Config(user_config): class Config(user_config):
''' '''
@ -38,7 +32,7 @@ class Config(user_config):
cache_dir = '/your/custom/cache/path' cache_dir = '/your/custom/cache/path'
''' '''
cache_dir: Path | str | None = _HPI_CACHE_DIR_DEFAULT cache_dir: Optional[PathIsh] = _HPI_CACHE_DIR_DEFAULT
''' '''
Base directory for cachew. Base directory for cachew.
- if None , means cache is disabled - if None , means cache is disabled
@ -48,7 +42,7 @@ class Config(user_config):
NOTE: you shouldn't use this attribute in HPI modules directly, use Config.get_cache_dir()/cachew.cache_dir() instead NOTE: you shouldn't use this attribute in HPI modules directly, use Config.get_cache_dir()/cachew.cache_dir() instead
''' '''
tmp_dir: Path | str | None = None tmp_dir: Optional[PathIsh] = None
''' '''
Path to a temporary directory. Path to a temporary directory.
This can be used temporarily while extracting zipfiles etc... This can be used temporarily while extracting zipfiles etc...
@ -56,36 +50,34 @@ class Config(user_config):
- otherwise , use the specified directory as the base temporary directory - otherwise , use the specified directory as the base temporary directory
''' '''
enabled_modules: Sequence[str] | None = None enabled_modules : Optional[Sequence[str]] = None
''' '''
list of regexes/globs list of regexes/globs
- None means 'rely on disabled_modules' - None means 'rely on disabled_modules'
''' '''
disabled_modules: Sequence[str] | None = None disabled_modules: Optional[Sequence[str]] = None
''' '''
list of regexes/globs list of regexes/globs
- None means 'rely on enabled_modules' - None means 'rely on enabled_modules'
''' '''
def get_cache_dir(self) -> Path | None: def get_cache_dir(self) -> Optional[Path]:
cdir = self.cache_dir cdir = self.cache_dir
if cdir is None: if cdir is None:
return None return None
if cdir == _HPI_CACHE_DIR_DEFAULT: if cdir == _HPI_CACHE_DIR_DEFAULT:
from .cachew import _appdirs_cache_dir from .cachew import _appdirs_cache_dir
return _appdirs_cache_dir() return _appdirs_cache_dir()
else: else:
return Path(cdir).expanduser() return Path(cdir).expanduser()
def get_tmp_dir(self) -> Path: def get_tmp_dir(self) -> Path:
tdir: Path | str | None = self.tmp_dir tdir: Optional[PathIsh] = self.tmp_dir
tpath: Path tpath: Path
# use tempfile if unset # use tempfile if unset
if tdir is None: if tdir is None:
import tempfile import tempfile
tpath = Path(tempfile.gettempdir()) / 'HPI' tpath = Path(tempfile.gettempdir()) / 'HPI'
else: else:
tpath = Path(tdir) tpath = Path(tdir)
@ -93,10 +85,10 @@ class Config(user_config):
tpath.mkdir(parents=True, exist_ok=True) tpath.mkdir(parents=True, exist_ok=True)
return tpath return tpath
def _is_module_active(self, module: str) -> bool | None: def _is_module_active(self, module: str) -> Optional[bool]:
# None means the config doesn't specify anything # None means the config doesn't specify anything
# todo might be nice to return the 'reason' too? e.g. which option has matched # todo might be nice to return the 'reason' too? e.g. which option has matched
def matches(specs: Sequence[str]) -> str | None: def matches(specs: Sequence[str]) -> Optional[str]:
for spec in specs: for spec in specs:
# not sure because . (packages separate) matches anything, but I guess unlikely to clash # not sure because . (packages separate) matches anything, but I guess unlikely to clash
if re.match(spec, module): if re.match(spec, module):
@ -122,15 +114,12 @@ class Config(user_config):
from .cfg import make_config from .cfg import make_config
config = make_config(Config) config = make_config(Config)
### tests start ### tests start
from collections.abc import Iterator from typing import Iterator
from contextlib import contextmanager as ctx from contextlib import contextmanager as ctx
@ctx @ctx
def _reset_config() -> Iterator[Config]: def _reset_config() -> Iterator[Config]:
# todo maybe have this decorator for the whole of my.config? # todo maybe have this decorator for the whole of my.config?
@ -169,5 +158,4 @@ def test_active_modules() -> None:
assert cc._is_module_active("my.body.exercise") is True assert cc._is_module_active("my.body.exercise") is True
assert len(record_warnings) == 1 assert len(record_warnings) == 1
### tests end ### tests end
@@ -1,5 +1,31 @@
from . import warnings

warnings.high(f"{__name__} is deprecated, please use dataset directly if you need or switch to my.core.sqlite")

from ._deprecated.dataset import *

from __future__ import annotations
from .common import assert_subpackage; assert_subpackage(__name__)

from .common import PathIsh
from .sqlite import sqlite_connect_immutable

## sadly dataset doesn't have any type definitions
from typing import Iterable, Iterator, Dict, Optional, Any, Protocol
from contextlib import AbstractContextManager

# NOTE: may not be true in general, but will be in the vast majority of cases
row_type_T = Dict[str, Any]

class TableT(Iterable, Protocol):
    def find(self, *, order_by: Optional[str]=None) -> Iterator[row_type_T]: ...

class DatabaseT(AbstractContextManager['DatabaseT'], Protocol):
    def __getitem__(self, table: str) -> TableT: ...
##

# TODO wonder if also need to open without WAL.. test this on read-only directory/db file
def connect_readonly(db: PathIsh) -> DatabaseT:
    import dataset  # type: ignore

    # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122
    # todo not sure if mode=ro has any benefit, but it doesn't work on read-only filesystems
    # maybe it should autodetect readonly filesystems and apply this? not sure
    creator = lambda: sqlite_connect_immutable(db)
    return dataset.connect('sqlite:///', engine_kwargs={'creator': creator})
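# Illustrative usage sketch (not from the diff; db path and table name are made up):
#
#     with connect_readonly('/path/to/some.sqlite') as db:
#         for row in db['messages'].find(order_by='timestamp'):
#             ...  # rows come back as plain dicts (row_type_T)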
@@ -5,25 +5,23 @@ A helper module for defining denylists for sources programmatically
For docs, see doc/DENYLIST.md
"""
from __future__ import annotations

import functools
import json
import sys
from collections import defaultdict
from collections.abc import Iterator, Mapping
from typing import TypeVar, Set, Any, Mapping, Iterator, Dict, List
from pathlib import Path
from typing import Any, TypeVar

import click
from more_itertools import seekable
from my.core.serialize import dumps
from my.core.common import PathIsh
from my.core.warnings import medium
from .serialize import dumps
from .warnings import medium
T = TypeVar("T")

DenyMap = Mapping[str, set[Any]]

def _default_key_func(obj: T) -> str:
@ -31,9 +29,9 @@ def _default_key_func(obj: T) -> str:
class DenyList: class DenyList:
def __init__(self, denylist_file: Path | str) -> None: def __init__(self, denylist_file: PathIsh):
self.file = Path(denylist_file).expanduser().absolute() self.file = Path(denylist_file).expanduser().absolute()
self._deny_raw_list: list[dict[str, Any]] = [] self._deny_raw_list: List[Dict[str, Any]] = []
self._deny_map: DenyMap = defaultdict(set) self._deny_map: DenyMap = defaultdict(set)
# deny cli, user can override these # deny cli, user can override these
@ -47,7 +45,7 @@ class DenyList:
return return
deny_map: DenyMap = defaultdict(set) deny_map: DenyMap = defaultdict(set)
data: list[dict[str, Any]] = json.loads(self.file.read_text())
self._deny_raw_list = data self._deny_raw_list = data
for ignore in data: for ignore in data:
@ -98,7 +96,6 @@ class DenyList:
def filter( def filter(
self, self,
itr: Iterator[T], itr: Iterator[T],
*,
invert: bool = False, invert: bool = False,
) -> Iterator[T]: ) -> Iterator[T]:
denyf = functools.partial(self._allow, deny_map=self.load()) denyf = functools.partial(self._allow, deny_map=self.load())
@@ -106,7 +103,7 @@ class DenyList:
            return filter(lambda x: not denyf(x), itr)
        return filter(denyf, itr)
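# Illustrative usage sketch (not from the diff; file path and key are made up):
#
#     deny = DenyList('~/.config/my/denylist.json')
#     deny.deny('id', 'some-bad-entry', write=True)
#     clean = list(deny.filter(iter(items)))  # entries matching the denylist are dropped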
def deny(self, key: str, value: Any, *, write: bool = False) -> None: def deny(self, key: str, value: Any, write: bool = False) -> None:
''' '''
add a key/value pair to the denylist add a key/value pair to the denylist
''' '''
@ -114,7 +111,7 @@ class DenyList:
self._load() self._load()
self._deny_raw({key: self._stringify_value(value)}, write=write) self._deny_raw({key: self._stringify_value(value)}, write=write)
def _deny_raw(self, data: dict[str, Any], *, write: bool = False) -> None: def _deny_raw(self, data: Dict[str, Any], write: bool = False) -> None:
self._deny_raw_list.append(data) self._deny_raw_list.append(data)
if write: if write:
self.write() self.write()
@ -133,7 +130,7 @@ class DenyList:
def _deny_cli_remember( def _deny_cli_remember(
self, self,
items: Iterator[T], items: Iterator[T],
mem: dict[str, T], mem: Dict[str, T],
) -> Iterator[str]: ) -> Iterator[str]:
keyf = self._deny_cli_key_func or _default_key_func keyf = self._deny_cli_key_func or _default_key_func
# i.e., convert each item to a string, and map str -> item # i.e., convert each item to a string, and map str -> item
@ -159,8 +156,10 @@ class DenyList:
# reset the iterator # reset the iterator
sit.seek(0) sit.seek(0)
# so we can map the selected string from fzf back to the original objects # so we can map the selected string from fzf back to the original objects
memory_map: dict[str, T] = {}
picker = FzfPrompt(executable_path=self.fzf_path, default_options="--no-multi")
picked_l = picker.prompt( picked_l = picker.prompt(
self._deny_cli_remember(itr, memory_map), self._deny_cli_remember(itr, memory_map),
"--read0", "--read0",
@ -10,20 +10,17 @@ This potentially allows it to be:
It should be free of external modules, importlib, exec, etc. etc. It should be free of external modules, importlib, exec, etc. etc.
''' '''
from __future__ import annotations

REQUIRES = 'REQUIRES'
NOT_HPI_MODULE_VAR = '__NOT_HPI_MODULE__'

###

import ast
import logging
import os
import re
from collections.abc import Iterable, Sequence
from pathlib import Path
from typing import Any, NamedTuple, Optional, cast
from typing import Optional, Sequence, List, NamedTuple, Iterable, cast, Any
''' '''
None means that requirements weren't defined (different from empty requirements) None means that requirements weren't defined (different from empty requirements)
@ -33,11 +30,11 @@ Requires = Optional[Sequence[str]]
class HPIModule(NamedTuple): class HPIModule(NamedTuple):
name: str name: str
skip_reason: str | None skip_reason: Optional[str]
doc: str | None = None doc: Optional[str] = None
file: Path | None = None file: Optional[Path] = None
requires: Requires = None requires: Requires = None
legacy: str | None = None # contains reason/deprecation warning legacy: Optional[str] = None # contains reason/deprecation warning
def ignored(m: str) -> bool: def ignored(m: str) -> bool:
@ -147,7 +144,7 @@ def all_modules() -> Iterable[HPIModule]:
def _iter_my_roots() -> Iterable[Path]: def _iter_my_roots() -> Iterable[Path]:
import my # doesn't import any code, because of namespace package import my # doesn't import any code, because of namespace package
paths: list[str] = list(my.__path__) paths: List[str] = list(my.__path__)
if len(paths) == 0: if len(paths) == 0:
# should probably never happen?, if this code is running, it was imported # should probably never happen?, if this code is running, it was imported
# because something was added to __path__ to match this name # because something was added to __path__ to match this name
@ -245,7 +242,7 @@ def test_pure() -> None:
src = Path(__file__).read_text() src = Path(__file__).read_text()
# 'import my' is allowed, but # 'import my' is allowed, but
# dont allow anything other HPI modules # dont allow anything other HPI modules
assert re.findall('import ' + r'my\.\S+', src, re.MULTILINE) == [] assert re.findall('import ' + r'my\.\S+', src, re.M) == []
assert 'from ' + 'my' not in src assert 'from ' + 'my' not in src
@@ -3,22 +3,9 @@ Various error handling helpers
See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail
"""
from __future__ import annotations

import traceback
from collections.abc import Iterable, Iterator
from datetime import datetime
from itertools import tee
from typing import (
    Any,
    Callable,
    Literal,
    TypeVar,
    Union,
    cast,
)
from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast, Iterator, Literal

from .types import Json

T = TypeVar('T')
E = TypeVar('E', bound=Exception)  # TODO make covariant?
@@ -29,8 +16,7 @@ Res = ResT[T, Exception]
ErrorPolicy = Literal["yield", "raise", "drop"]

def notnone(x: T | None) -> T:
    assert x is not None
    return x

def unwrap(res: Res[T]) -> T:
    if isinstance(res, Exception):
        raise res
    return res

def drop_exceptions(itr: Iterator[Res[T]]) -> Iterator[T]:
    """Return non-errors from the iterable"""
    for o in itr:
@ -57,15 +43,13 @@ def raise_exceptions(itr: Iterable[Res[T]]) -> Iterator[T]:
yield o yield o
def warn_exceptions(itr: Iterable[Res[T]], warn_func: Callable[[Exception], None] | None = None) -> Iterator[T]: def warn_exceptions(itr: Iterable[Res[T]], warn_func: Optional[Callable[[Exception], None]] = None) -> Iterator[T]:
# if not provided, use the 'warnings' module # if not provided, use the 'warnings' module
if warn_func is None: if warn_func is None:
from my.core.warnings import medium from my.core.warnings import medium
def _warn_func(e: Exception) -> None: def _warn_func(e: Exception) -> None:
# TODO: print traceback? but user could always --raise-exceptions as well # TODO: print traceback? but user could always --raise-exceptions as well
medium(str(e)) medium(str(e))
warn_func = _warn_func warn_func = _warn_func
for o in itr: for o in itr:
@ -80,7 +64,7 @@ def echain(ex: E, cause: Exception) -> E:
return ex return ex
def split_errors(l: Iterable[ResT[T, E]], ET: type[E]) -> tuple[Iterable[T], Iterable[E]]: def split_errors(l: Iterable[ResT[T, E]], ET: Type[E]) -> Tuple[Iterable[T], Iterable[E]]:
# TODO would be nice to have ET=Exception default? but it causes some mypy complaints? # TODO would be nice to have ET=Exception default? but it causes some mypy complaints?
vit, eit = tee(l) vit, eit = tee(l)
# TODO ugh, not sure if I can reconcile type checking and runtime and convince mypy that ET and E are the same type? # TODO ugh, not sure if I can reconcile type checking and runtime and convince mypy that ET and E are the same type?
@ -98,9 +82,7 @@ def split_errors(l: Iterable[ResT[T, E]], ET: type[E]) -> tuple[Iterable[T], Ite
K = TypeVar('K') K = TypeVar('K')
def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> List[Res[T]]:
def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> list[Res[T]]:
""" """
Sort a sequence potentially interleaved with errors/entries on which the key can't be computed. Sort a sequence potentially interleaved with errors/entries on which the key can't be computed.
The general idea is: the error sticks to the non-error entry that follows it The general idea is: the error sticks to the non-error entry that follows it
@ -108,7 +90,7 @@ def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> list[Res[T]
group = [] group = []
groups = [] groups = []
for i in items: for i in items:
k: K | None k: Optional[K]
try: try:
k = key(i) k = key(i)
except Exception:  # error while computing key? dunno, might be nice to handle...
@ -118,8 +100,8 @@ def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> list[Res[T]
groups.append((k, group)) groups.append((k, group))
group = [] group = []
results: list[Res[T]] = [] results: List[Res[T]] = []
for _v, grp in sorted(groups, key=lambda p: p[0]): # type: ignore[return-value, arg-type] # TODO SupportsLessThan?? for v, grp in sorted(groups, key=lambda p: p[0]): # type: ignore[return-value, arg-type] # TODO SupportsLessThan??
results.extend(grp) results.extend(grp)
results.extend(group) # handle last group (it will always be errors only) results.extend(group) # handle last group (it will always be errors only)
@ -153,7 +135,7 @@ def test_sort_res_by() -> None:
Exc('last'), Exc('last'),
] ]
results2 = sort_res_by([*ress, 0], lambda x: int(x)) results2 = sort_res_by(ress + [0], lambda x: int(x))
assert results2 == [Exc('last'), 0] + results[:-1] assert results2 == [Exc('last'), 0] + results[:-1]
assert sort_res_by(['caba', 'a', 'aba', 'daba'], key=lambda x: len(x)) == ['a', 'aba', 'caba', 'daba'] assert sort_res_by(['caba', 'a', 'aba', 'daba'], key=lambda x: len(x)) == ['a', 'aba', 'caba', 'daba']
@ -162,23 +144,23 @@ def test_sort_res_by() -> None:
# helpers to associate timestamps with the errors (so something meaningful could be displayed on the plots, for example) # helpers to associate timestamps with the errors (so something meaningful could be displayed on the plots, for example)
# todo document it under 'patterns' somewhere... # todo document it under 'patterns' somewhere...
# todo proper typevar? # todo proper typevar?
from datetime import datetime

def set_error_datetime(e: Exception, dt: datetime | None) -> None:
    if dt is None:
        return
    e.args = (*e.args, dt)
    # todo not sure if should return new exception?

def attach_dt(e: Exception, *, dt: datetime | None) -> Exception:
    set_error_datetime(e, dt)
    return e
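# Illustrative usage sketch (not from the diff): tag an error with the
# timestamp of the entry it came from, recover it later for plotting:
#
#     from datetime import datetime, timezone
#     dt = datetime(2020, 5, 1, tzinfo=timezone.utc)
#     e = attach_dt(RuntimeError('parse failed'), dt=dt)
#     extract_error_datetime(e)  # -> dt (scans e.args from the end)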
# todo it might be problematic because might mess with timezones (when it's converted to string, it's converted to a shift) # todo it might be problematic because might mess with timezones (when it's converted to string, it's converted to a shift)
def extract_error_datetime(e: Exception) -> datetime | None: def extract_error_datetime(e: Exception) -> Optional[datetime]:
import re import re
from datetime import datetime
for x in reversed(e.args): for x in reversed(e.args):
if isinstance(x, datetime): if isinstance(x, datetime):
return x return x
@ -193,6 +175,8 @@ def extract_error_datetime(e: Exception) -> datetime | None:
return None return None
import traceback
from .common import Json
def error_to_json(e: Exception) -> Json: def error_to_json(e: Exception) -> Json:
estr = ''.join(traceback.format_exception(Exception, e, e.__traceback__)) estr = ''.join(traceback.format_exception(Exception, e, e.__traceback__))
return {'error': estr} return {'error': estr}
@ -200,13 +184,7 @@ def error_to_json(e: Exception) -> Json:
MODULE_SETUP_URL = 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#private-configuration-myconfig' MODULE_SETUP_URL = 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#private-configuration-myconfig'
def warn_my_config_import_error(err: Union[ImportError, AttributeError], help_url: Optional[str] = None) -> bool:
def warn_my_config_import_error(
err: ImportError | AttributeError,
*,
help_url: str | None = None,
module_name: str | None = None,
) -> bool:
""" """
If the user tried to import something from my.config but it failed, If the user tried to import something from my.config but it failed,
possibly due to missing the config block in my.config? possibly due to missing the config block in my.config?
@ -214,12 +192,10 @@ def warn_my_config_import_error(
Returns True if it matched a possible config error Returns True if it matched a possible config error
""" """
import re import re
import click import click
if help_url is None: if help_url is None:
help_url = MODULE_SETUP_URL help_url = MODULE_SETUP_URL
if type(err) is ImportError: if type(err) == ImportError:
if err.name != 'my.config': if err.name != 'my.config':
return False return False
# parse name that user attempted to import # parse name that user attempted to import
@ -231,31 +207,17 @@ You may be missing the '{section_name}' section from your config.
See {help_url}\ See {help_url}\
""", fg='yellow', err=True) """, fg='yellow', err=True)
return True return True
elif type(err) is AttributeError: elif type(err) == AttributeError:
# test if user had a nested config block missing # test if user had a nested config block missing
# https://github.com/karlicoss/HPI/issues/223 # https://github.com/karlicoss/HPI/issues/223
if hasattr(err, 'obj') and hasattr(err, "name"): if hasattr(err, 'obj') and hasattr(err, "name"):
config_obj = cast(object, getattr(err, 'obj')) # the object that caused the attribute error config_obj = cast(object, getattr(err, 'obj')) # the object that caused the attribute error
# e.g. active_browser for my.browser # e.g. active_browser for my.browser
nested_block_name = err.name nested_block_name = err.name
errmsg = f"""You're likely missing the nested config block for '{getattr(config_obj, '__name__', str(config_obj))}.{nested_block_name}'.
See {help_url} or check the corresponding module.py file for an example\
"""
if config_obj.__module__ == 'my.config': if config_obj.__module__ == 'my.config':
click.secho(errmsg, fg='yellow', err=True) click.secho(f"""You're likely missing the nested config block for '{getattr(config_obj, '__name__', str(config_obj))}.{nested_block_name}'.
return True See {help_url} or check the corresponding module.py file for an example\
if module_name is not None and nested_block_name == module_name.split('.')[-1]: """, fg='yellow', err=True)
# this tries to cover cases like these
# user config:
# class location:
# class via_ip:
# accuracy = 10_000
# then when we import it, we do something like
# from my.config import location
# user_config = location.via_ip
# so if location is present, but via_ip is not, we get
# AttributeError: type object 'location' has no attribute 'via_ip'
click.secho(errmsg, fg='yellow', err=True)
return True return True
else: else:
click.echo(f"Unexpected error... {err}", err=True) click.echo(f"Unexpected error... {err}", err=True)
@ -263,8 +225,7 @@ See {help_url} or check the corresponding module.py file for an example\
def test_datetime_errors() -> None: def test_datetime_errors() -> None:
import pytz # noqa: I001 import pytz
dt_notz = datetime.now() dt_notz = datetime.now()
dt_tz = datetime.now(tz=pytz.timezone('Europe/Amsterdam')) dt_tz = datetime.now(tz=pytz.timezone('Europe/Amsterdam'))
for dt in [dt_tz, dt_notz]: for dt in [dt_tz, dt_notz]:
@ -1,8 +1,6 @@
from __future__ import annotations
import sys import sys
from typing import Any, Dict, Optional
import types import types
from typing import Any
# The idea behind this one is to support accessing "overlaid/shadowed" modules from namespace packages # The idea behind this one is to support accessing "overlaid/shadowed" modules from namespace packages
@ -22,7 +20,7 @@ def import_original_module(
file: str, file: str,
*, *,
star: bool = False, star: bool = False,
globals: dict[str, Any] | None = None, globals: Optional[Dict[str, Any]] = None,
) -> types.ModuleType: ) -> types.ModuleType:
module_to_restore = sys.modules[module_name] module_to_restore = sys.modules[module_name]
@ -1,29 +1,27 @@
from __future__ import annotations

from .internal import assert_subpackage

assert_subpackage(__name__)

import dataclasses
import inspect
from typing import Any, Generic, TypeVar

from .common import assert_subpackage; assert_subpackage(__name__)

import dataclasses as dcl
from typing import TypeVar, Type, Any
D = TypeVar('D') D = TypeVar('D')
def _freeze_dataclass(Orig: type[D]): def _freeze_dataclass(Orig: Type[D]):
ofields = [(f.name, f.type, f) for f in dataclasses.fields(Orig)] # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115 ofields = [(f.name, f.type, f) for f in dcl.fields(Orig)] # type: ignore[arg-type] # see https://github.com/python/typing_extensions/issues/115
# extract properties along with their types # extract properties along with their types
props = list(inspect.getmembers(Orig, lambda o: isinstance(o, property))) props = list(inspect.getmembers(Orig, lambda o: isinstance(o, property)))
pfields = [(name, inspect.signature(getattr(prop, 'fget')).return_annotation) for name, prop in props] pfields = [(name, inspect.signature(getattr(prop, 'fget')).return_annotation) for name, prop in props]
# FIXME not sure about name?
# NOTE: sadly passing bases=[Orig] won't work, python won't let us override properties with fields # NOTE: sadly passing bases=[Orig] won't work, python won't let us override properties with fields
RRR = dataclasses.make_dataclass('RRR', fields=[*ofields, *pfields]) RRR = dcl.make_dataclass('RRR', fields=[*ofields, *pfields])
# todo maybe even declare as slots? # todo maybe even declare as slots?
return props, RRR return props, RRR
# todo need some decorator thingie?
from typing import Generic
class Freezer(Generic[D]): class Freezer(Generic[D]):
''' '''
Some magic which converts dataclass properties into fields. Some magic which converts dataclass properties into fields.
@ -31,13 +29,13 @@ class Freezer(Generic[D]):
For now only supports dataclasses. For now only supports dataclasses.
''' '''
def __init__(self, Orig: type[D]) -> None: def __init__(self, Orig: Type[D]) -> None:
self.Orig = Orig self.Orig = Orig
self.props, self.Frozen = _freeze_dataclass(Orig) self.props, self.Frozen = _freeze_dataclass(Orig)
def freeze(self, value: D) -> D: def freeze(self, value: D) -> D:
pvalues = {name: getattr(value, name) for name, _ in self.props} pvalues = {name: getattr(value, name) for name, _ in self.props}
return self.Frozen(**dataclasses.asdict(value), **pvalues) # type: ignore[call-overload] # see https://github.com/python/typing_extensions/issues/115 return self.Frozen(**dcl.asdict(value), **pvalues) # type: ignore[call-overload] # see https://github.com/python/typing_extensions/issues/115
### tests ### tests
@ -45,7 +43,7 @@ class Freezer(Generic[D]):
# this needs to be defined here to prevent a mypy bug # this needs to be defined here to prevent a mypy bug
# see https://github.com/python/mypy/issues/7281 # see https://github.com/python/mypy/issues/7281
@dataclasses.dataclass @dcl.dataclass
class _A: class _A:
x: Any x: Any
@ -60,10 +58,8 @@ class _A:
def test_freezer() -> None: def test_freezer() -> None:
val = _A(x={
'an_int': 123, val = _A(x=dict(an_int=123, an_any=[1, 2, 3]))
'an_any': [1, 2, 3],
})
af = Freezer(_A) af = Freezer(_A)
fval = af.freeze(val) fval = af.freeze(val)
@ -71,7 +67,6 @@ def test_freezer() -> None:
assert fd['typed'] == 123 assert fd['typed'] == 123
assert fd['untyped'] == [1, 2, 3] assert fd['untyped'] == [1, 2, 3]
### ###
# TODO shit. what to do with exceptions? # TODO shit. what to do with exceptions?
@@ -2,15 +2,11 @@
Contains various backwards compatibility/deprecation helpers relevant to HPI itself.
(as opposed to .compat module which implements compatibility between python versions)
"""
from __future__ import annotations

import inspect
import os
import re
from collections.abc import Iterator, Sequence
from types import ModuleType
from typing import TypeVar
from typing import Iterator, List, Optional, TypeVar

from . import warnings
@@ -18,7 +14,7 @@ from . import warnings
def handle_legacy_import(
    parent_module_name: str,
    legacy_submodule_name: str,
    parent_module_path: list[str],
) -> bool:
### ###
# this is to trick mypy into treating this as a proper namespace package # this is to trick mypy into treating this as a proper namespace package
@ -75,7 +71,7 @@ def pre_pip_dal_handler(
name: str, name: str,
e: ModuleNotFoundError, e: ModuleNotFoundError,
cfg, cfg,
requires: Sequence[str] = (), requires=[],
) -> ModuleType: ) -> ModuleType:
''' '''
https://github.com/karlicoss/HPI/issues/79 https://github.com/karlicoss/HPI/issues/79
@ -105,7 +101,7 @@ Please install {' '.join(requires)} as PIP packages (see the corresponding READM
def _get_dal(cfg, module_name: str): def _get_dal(cfg, module_name: str):
mpath = getattr(cfg, module_name, None) mpath = getattr(cfg, module_name, None)
if mpath is not None: if mpath is not None:
from .utils.imports import import_dir from .common import import_dir
return import_dir(mpath, '.dal') return import_dir(mpath, '.dal')
else: else:
@ -120,141 +116,32 @@ V = TypeVar('V')
# named to be kinda consistent with more_itertools, e.g. more_itertools.always_iterable
class always_supports_sequence(Iterator[V]):
    """
    Helper to make migration from Sequence/List to Iterable/Iterator type backwards compatible in runtime
    """

    def __init__(self, it: Iterator[V]) -> None:
        self._it = it
        self._list: list[V] | None = None
        self._lit: Iterator[V] | None = None

    def __iter__(self) -> Iterator[V]:  # noqa: PYI034
        if self._list is not None:
            self._lit = iter(self._list)
        return self

    def __next__(self) -> V:
        if self._list is not None:
            assert self._lit is not None
            delegate = self._lit
        else:
            delegate = self._it
        return next(delegate)

    def __getattr__(self, name):
        return getattr(self._it, name)

    @property
    def _aslist(self) -> list[V]:
        if self._list is None:
            qualname = getattr(self._it, '__qualname__', '<no qualname>')  # defensive just in case
            warnings.medium(f'Using {qualname} as list is deprecated. Migrate to iterative processing or call list() explicitly.')
            self._list = list(self._it)

            # this is necessary for list constructor to work correctly
            # since it's __iter__ first, then tries to compute length and then starts iterating...
            self._lit = iter(self._list)
        return self._list

    def __len__(self) -> int:
        return len(self._aslist)

    def __getitem__(self, i: int) -> V:
        return self._aslist[i]

class always_supports_sequence(Iterator[V]):
    """
    Helper to make migration from Sequence/List to Iterable/Iterator type backwards compatible
    """

    def __init__(self, it: Iterator[V]) -> None:
        self.it = it
        self._list: Optional[List] = None

    def __iter__(self) -> Iterator[V]:
        return self.it.__iter__()

    def __next__(self) -> V:
        return self.it.__next__()

    def __getattr__(self, name):
        return getattr(self.it, name)

    @property
    def aslist(self) -> List[V]:
        if self._list is None:
            qualname = getattr(self.it, '__qualname__', '<no qualname>')  # defensive just in case
            warnings.medium(f'Using {qualname} as list is deprecated. Migrate to iterative processing or call list() explicitly.')
            self._list = list(self.it)
        return self._list

    def __len__(self) -> int:
        return len(self.aslist)

    def __getitem__(self, i: int) -> V:
        return self.aslist[i]
def test_always_supports_sequence_list_constructor() -> None:
exhausted = 0
def it() -> Iterator[str]:
nonlocal exhausted
yield from ['a', 'b', 'c']
exhausted += 1
sit = always_supports_sequence(it())
# list constructor is a bit special... it's trying to compute length if it's available to optimize memory allocation
# so, what's happening in this case is
# - sit.__iter__ is called
# - sit.__len__ is called
# - sit.__next__ is called
res = list(sit)
assert res == ['a', 'b', 'c']
assert exhausted == 1
res = list(sit)
assert res == ['a', 'b', 'c']
assert exhausted == 1 # this will iterate over 'cached' list now, so original generator is only exhausted once
def test_always_supports_sequence_indexing() -> None:
exhausted = 0
def it() -> Iterator[str]:
nonlocal exhausted
yield from ['a', 'b', 'c']
exhausted += 1
sit = always_supports_sequence(it())
assert len(sit) == 3
assert exhausted == 1
assert sit[2] == 'c'
assert sit[1] == 'b'
assert sit[0] == 'a'
assert exhausted == 1
# a few tests to make sure list-like operations are working..
assert list(sit) == ['a', 'b', 'c']
assert [x for x in sit] == ['a', 'b', 'c'] # noqa: C416
assert list(sit) == ['a', 'b', 'c']
assert [x for x in sit] == ['a', 'b', 'c'] # noqa: C416
assert exhausted == 1
def test_always_supports_sequence_next() -> None:
exhausted = 0
def it() -> Iterator[str]:
nonlocal exhausted
yield from ['a', 'b', 'c']
exhausted += 1
sit = always_supports_sequence(it())
x = next(sit)
assert x == 'a'
assert exhausted == 0
x = next(sit)
assert x == 'b'
assert exhausted == 0
def test_always_supports_sequence_iter() -> None:
exhausted = 0
def it() -> Iterator[str]:
nonlocal exhausted
yield from ['a', 'b', 'c']
exhausted += 1
sit = always_supports_sequence(it())
for x in sit:
assert x == 'a'
break
x = next(sit)
assert x == 'b'
assert exhausted == 0
x = next(sit)
assert x == 'c'
assert exhausted == 0
for _ in sit:
raise RuntimeError # shouldn't trigger, just exhaust the iterator
assert exhausted == 1
@@ -1,22 +1,14 @@
'''
TODO doesn't really belong to 'core' morally, but can think of moving out later
'''
from __future__ import annotations

from .internal import assert_subpackage

assert_subpackage(__name__)

from collections.abc import Iterable
from typing import Any

import click

from .logging import make_logger
from .types import Json, asdict

logger = make_logger(__name__)

from .common import assert_subpackage; assert_subpackage(__name__)

from typing import Iterable, Any, Optional, Dict
from .common import LazyLogger, asdict, Json

logger = LazyLogger(__name__)
class config: class config:
@ -35,7 +27,6 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool = RESET_DEFAULT, dt
db = config.db db = config.db
from influxdb import InfluxDBClient # type: ignore from influxdb import InfluxDBClient # type: ignore
client = InfluxDBClient() client = InfluxDBClient()
# todo maybe create if not exists? # todo maybe create if not exists?
# client.create_database(db) # client.create_database(db)
@ -46,7 +37,7 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool = RESET_DEFAULT, dt
client.delete_series(database=db, measurement=measurement) client.delete_series(database=db, measurement=measurement)
# TODO need to take schema here... # TODO need to take schema here...
cache: dict[str, bool] = {} cache: Dict[str, bool] = {}
def good(f, v) -> bool: def good(f, v) -> bool:
c = cache.get(f) c = cache.get(f)
@ -65,7 +56,7 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool = RESET_DEFAULT, dt
def dit() -> Iterable[Json]: def dit() -> Iterable[Json]:
for i in it: for i in it:
d = asdict(i) d = asdict(i)
tags: Json | None = None tags: Optional[Json] = None
tags_ = d.get('tags') # meh... handle in a more robust manner tags_ = d.get('tags') # meh... handle in a more robust manner
if tags_ is not None and isinstance(tags_, dict): # FIXME meh. if tags_ is not None and isinstance(tags_, dict): # FIXME meh.
del d['tags'] del d['tags']
@ -78,19 +69,18 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool = RESET_DEFAULT, dt
fields = filter_dict(d) fields = filter_dict(d)
yield { yield dict(
'measurement': measurement, measurement=measurement,
# TODO maybe good idea to tag with database file/name? to inspect inconsistencies etc.. # TODO maybe good idea to tag with database file/name? to inspect inconsistencies etc..
# hmm, so tags are autoindexed and might be faster? # hmm, so tags are autoindexed and might be faster?
# not sure what's the big difference though # not sure what's the big difference though
# "fields are data and tags are metadata" # "fields are data and tags are metadata"
'tags': tags, tags=tags,
'time': dt, time=dt,
'fields': fields, fields=fields,
} )
from more_itertools import chunked from more_itertools import chunked
# "The optimal batch size is 5000 lines of line protocol." # "The optimal batch size is 5000 lines of line protocol."
# some chunking is def necessary, otherwise it fails # some chunking is def necessary, otherwise it fails
inserted = 0 inserted = 0
@ -104,7 +94,7 @@ def fill(it: Iterable[Any], *, measurement: str, reset: bool = RESET_DEFAULT, dt
# todo "Specify timestamp precision when writing to InfluxDB."? # todo "Specify timestamp precision when writing to InfluxDB."?
def magic_fill(it, *, name: str | None = None, reset: bool = RESET_DEFAULT) -> None: def magic_fill(it, *, name: Optional[str]=None, reset: bool=RESET_DEFAULT) -> None:
if name is None: if name is None:
assert callable(it) # generators have no name/module assert callable(it) # generators have no name/module
name = f'{it.__module__}:{it.__name__}' name = f'{it.__module__}:{it.__name__}'
@ -114,9 +104,7 @@ def magic_fill(it, *, name: str | None = None, reset: bool = RESET_DEFAULT) -> N
it = it() it = it()
from itertools import tee from itertools import tee
from more_itertools import first, one from more_itertools import first, one
it, x = tee(it) it, x = tee(it)
f = first(x, default=None) f = first(x, default=None)
if f is None: if f is None:
@ -126,17 +114,17 @@ def magic_fill(it, *, name: str | None = None, reset: bool = RESET_DEFAULT) -> N
# TODO can we reuse pandas code or something? # TODO can we reuse pandas code or something?
# #
from .pandas import _as_columns from .pandas import _as_columns
schema = _as_columns(type(f)) schema = _as_columns(type(f))
from datetime import datetime from datetime import datetime
dtex = RuntimeError(f'expected single datetime field. schema: {schema}') dtex = RuntimeError(f'expected single datetime field. schema: {schema}')
dtf = one((f for f, t in schema.items() if t == datetime), too_short=dtex, too_long=dtex) dtf = one((f for f, t in schema.items() if t == datetime), too_short=dtex, too_long=dtex)
fill(it, measurement=name, reset=reset, dt_col=dtf) fill(it, measurement=name, reset=reset, dt_col=dtf)
import click
@click.group() @click.group()
def main() -> None: def main() -> None:
pass pass
@ -145,9 +133,8 @@ def main() -> None:
@main.command(name='populate', short_help='populate influxdb') @main.command(name='populate', short_help='populate influxdb')
@click.option('--reset', is_flag=True, help='Reset Influx measurements before inserting', show_default=True) @click.option('--reset', is_flag=True, help='Reset Influx measurements before inserting', show_default=True)
@click.argument('FUNCTION_NAME', type=str, required=True) @click.argument('FUNCTION_NAME', type=str, required=True)
def populate(*, function_name: str, reset: bool) -> None: def populate(function_name: str, reset: bool) -> None:
from .__main__ import _locate_functions_or_prompt from .__main__ import _locate_functions_or_prompt
[provider] = list(_locate_functions_or_prompt([function_name])) [provider] = list(_locate_functions_or_prompt([function_name]))
# todo could have a non-interactive version which populates from all data sources for the provider? # todo could have a non-interactive version which populates from all data sources for the provider?
magic_fill(provider, reset=reset) magic_fill(provider, reset=reset)
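
For orientation, a condensed sketch of the batched write path the comments above describe (the 5000-point batching); the helper name and exact shape are assumptions, but write_points is the real influxdb-python API:

# Condensed sketch of the batched write loop; the real fill() interleaves
# this with schema filtering and tag handling.
from more_itertools import chunked

def _write_points(client, jsons, *, db: str) -> int:
    inserted = 0
    # "The optimal batch size is 5000 lines of line protocol."
    for chunk in chunked(jsons, 5000):
        client.write_points(chunk, database=db)
        inserted += len(chunk)
    return inserted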


@ -14,19 +14,18 @@ Please let me know if you are aware of a better way of dealing with this!
# separate function to prevent namespace pollution # separate function to prevent namespace pollution
def setup_config() -> None: def setup_config() -> None:
from pathlib import Path
import sys import sys
import warnings import warnings
from pathlib import Path
from .preinit import get_mycfg_dir from .preinit import get_mycfg_dir
mycfg_dir = get_mycfg_dir() mycfg_dir = get_mycfg_dir()
if not mycfg_dir.exists(): if not mycfg_dir.exists():
warnings.warn(f""" warnings.warn(f"""
'my.config' package isn't found! (expected at '{mycfg_dir}'). This is likely to result in issues. 'my.config' package isn't found! (expected at '{mycfg_dir}'). This is likely to result in issues.
See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info.
""".strip(), stacklevel=1) """.strip())
return return
mpath = str(mycfg_dir) mpath = str(mycfg_dir)
@ -44,12 +43,11 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-mo
except ImportError as ex: except ImportError as ex:
# just in case... who knows what crazy setup users have # just in case... who knows what crazy setup users have
import logging import logging
logging.exception(ex) logging.exception(ex)
warnings.warn(f""" warnings.warn(f"""
Importing 'my.config' failed! (error: {ex}). This is likely to result in issues. Importing 'my.config' failed! (error: {ex}). This is likely to result in issues.
See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info.
""", stacklevel=1) """)
else: else:
# defensive just in case -- __file__ may not be present if there is some dynamic magic involved # defensive just in case -- __file__ may not be present if there is some dynamic magic involved
used_config_file = getattr(my.config, '__file__', None) used_config_file = getattr(my.config, '__file__', None)
@ -65,7 +63,7 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-mo
Expected my.config to be located at {mycfg_dir}, but instead its path is {used_config_path}. Expected my.config to be located at {mycfg_dir}, but instead its path is {used_config_path}.
This will likely cause issues down the line -- double check {mycfg_dir} structure. This will likely cause issues down the line -- double check {mycfg_dir} structure.
See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info. See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info.
""", stacklevel=1 """,
) )
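
The surrounding hunks imply the flow: compute mycfg_dir, warn if it's missing, otherwise put it on the import path and try importing my.config. A minimal sketch of that happy path (an assumed simplification, not the actual code):

# Minimal sketch of the happy path inside setup_config (assumption):
import sys

def _expose_user_config(mpath: str) -> None:
    if mpath not in sys.path:
        sys.path.insert(0, mpath)  # prepend so the user's my.config wins
    import my.config  # noqa: F401  -- may raise ImportError, handled by the caller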


@ -1,9 +0,0 @@
"""
Utils specific to hpi core, shouldn't really be used by HPI modules
"""
def assert_subpackage(name: str) -> None:
# can lead to some unexpected issues if you 'import cachew' while inside the my/core directory.. so let's protect against it
# NOTE: if we use overlay, name can be smth like my.origg.my.core.cachew ...
assert name == '__main__' or 'my.core' in name, f'Expected module __name__ ({name}) to be __main__ or start with my.core'
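
Typical usage, as seen at the top of kompress.py and mime.py elsewhere in this diff -- the guard runs at import time so that importing a my.core submodule from an unexpected package fails fast:

# At the top of a my.core submodule:
from .internal import assert_subpackage

assert_subpackage(__name__)  # AssertionError unless __name__ is __main__ or under my.core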


@ -1,7 +1,4 @@
from .internal import assert_subpackage from .common import assert_subpackage; assert_subpackage(__name__)
assert_subpackage(__name__)
from . import warnings from . import warnings
# do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath) # do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath)
@ -11,7 +8,10 @@ try:
from kompress import * from kompress import *
except ModuleNotFoundError as e: except ModuleNotFoundError as e:
if e.name == 'kompress': if e.name == 'kompress':
warnings.high('Please install kompress (pip3 install kompress). Falling onto vendorized kompress for now.') warnings.high('Please install kompress (pip3 install kompress), it will be required in the future. Falling onto vendorized kompress for now.')
from ._deprecated.kompress import * # type: ignore[assignment] from ._deprecated.kompress import * # type: ignore[assignment]
else: else:
raise e raise e
# this is deprecated in kompress, keep here for backwards compatibility
open = kopen # noqa: F405


@ -5,21 +5,17 @@ This can potentially allow both for safer defensive parsing, and let you know if
TODO perhaps need to get some inspiration from linear logic to decide on a nice API... TODO perhaps need to get some inspiration from linear logic to decide on a nice API...
''' '''
from __future__ import annotations
from collections import OrderedDict from collections import OrderedDict
from typing import Any from typing import Any, List
def ignore(w, *keys): def ignore(w, *keys):
for k in keys: for k in keys:
w[k].ignore() w[k].ignore()
def zoom(w, *keys): def zoom(w, *keys):
return [w[k].zoom() for k in keys] return [w[k].zoom() for k in keys]
# TODO need to support lists # TODO need to support lists
class Zoomable: class Zoomable:
def __init__(self, parent, *args, **kwargs) -> None: def __init__(self, parent, *args, **kwargs) -> None:
@ -44,7 +40,7 @@ class Zoomable:
assert self.parent is not None assert self.parent is not None
self.parent._remove(self) self.parent._remove(self)
def zoom(self) -> Zoomable: def zoom(self) -> 'Zoomable':
self.consume() self.consume()
return self return self
@ -67,7 +63,6 @@ class Wdict(Zoomable, OrderedDict):
def this_consumed(self): def this_consumed(self):
return len(self) == 0 return len(self) == 0
# TODO specify mypy type for the index special method? # TODO specify mypy type for the index special method?
@ -82,7 +77,6 @@ class Wlist(Zoomable, list):
def this_consumed(self): def this_consumed(self):
return len(self) == 0 return len(self) == 0
class Wvalue(Zoomable): class Wvalue(Zoomable):
def __init__(self, parent, value: Any) -> None: def __init__(self, parent, value: Any) -> None:
super().__init__(parent) super().__init__(parent)
@ -99,9 +93,10 @@ class Wvalue(Zoomable):
return 'WValue{' + repr(self.value) + '}' return 'WValue{' + repr(self.value) + '}'
def _wrap(j, parent=None) -> tuple[Zoomable, list[Zoomable]]: from typing import Tuple
def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]:
res: Zoomable res: Zoomable
cc: list[Zoomable] cc: List[Zoomable]
if isinstance(j, dict): if isinstance(j, dict):
res = Wdict(parent) res = Wdict(parent)
cc = [res] cc = [res]
@ -125,17 +120,15 @@ def _wrap(j, parent=None) -> tuple[Zoomable, list[Zoomable]]:
raise RuntimeError(f'Unexpected type: {type(j)} {j}') raise RuntimeError(f'Unexpected type: {type(j)} {j}')
from collections.abc import Iterator
from contextlib import contextmanager from contextlib import contextmanager
from typing import Iterator
class UnconsumedError(Exception): class UnconsumedError(Exception):
pass pass
# TODO think about error policy later... # TODO think about error policy later...
@contextmanager @contextmanager
def wrap(j, *, throw=True) -> Iterator[Zoomable]: def wrap(j, throw=True) -> Iterator[Zoomable]:
w, children = _wrap(j) w, children = _wrap(j)
yield w yield w
@ -153,11 +146,8 @@ Expected {c} to be fully consumed by the parser.
from typing import cast from typing import cast
def test_unconsumed() -> None: def test_unconsumed() -> None:
import pytest import pytest
with pytest.raises(UnconsumedError): with pytest.raises(UnconsumedError):
with wrap({'a': 1234}) as w: with wrap({'a': 1234}) as w:
w = cast(Wdict, w) w = cast(Wdict, w)
@ -168,7 +158,6 @@ def test_unconsumed() -> None:
w = cast(Wdict, w) w = cast(Wdict, w)
d = w['c']['d'].zoom() d = w['c']['d'].zoom()
def test_consumed() -> None: def test_consumed() -> None:
with wrap({'a': 1234}) as w: with wrap({'a': 1234}) as w:
w = cast(Wdict, w) w = cast(Wdict, w)
@ -179,7 +168,6 @@ def test_consumed() -> None:
c = w['c'].zoom() c = w['c'].zoom()
d = c['d'].zoom() d = c['d'].zoom()
def test_types() -> None: def test_types() -> None:
# (string, number, object, array, boolean or null) # (string, number, object, array, boolean or null)
with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w: with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w:
@ -191,7 +179,6 @@ def test_types() -> None:
for x in list(w['list'].zoom()): # TODO eh. how to avoid the extra list thing? for x in list(w['list'].zoom()): # TODO eh. how to avoid the extra list thing?
x.consume() x.consume()
def test_consume_all() -> None: def test_consume_all() -> None:
with wrap({'aaa': {'bbb': {'hi': 123}}}) as w: with wrap({'aaa': {'bbb': {'hi': 123}}}) as w:
w = cast(Wdict, w) w = cast(Wdict, w)
@ -201,9 +188,11 @@ def test_consume_all() -> None:
def test_consume_few() -> None: def test_consume_few() -> None:
import pytest import pytest
pytest.skip('Will think about it later..') pytest.skip('Will think about it later..')
with wrap({'important': 123, 'unimportant': 'whatever'}) as w: with wrap({
'important': 123,
'unimportant': 'whatever'
}) as w:
w = cast(Wdict, w) w = cast(Wdict, w)
w['important'].zoom() w['important'].zoom()
w.consume_all() w.consume_all()
@ -212,7 +201,6 @@ def test_consume_few() -> None:
def test_zoom() -> None: def test_zoom() -> None:
import pytest import pytest
with wrap({'aaa': 'whatever'}) as w: with wrap({'aaa': 'whatever'}) as w:
w = cast(Wdict, w) w = cast(Wdict, w)
with pytest.raises(KeyError): with pytest.raises(KeyError):
@ -221,34 +209,3 @@ def test_zoom() -> None:
# TODO type check this... # TODO type check this...
# TODO feels like the whole thing is kind of unnecessarily complex
# - cons:
# - in most cases this is not even needed? who cares if we miss a few attributes?
# - pro: on the other hand it could be interesting to know about new attributes in data,
# and without this kind of processing we wouldn't even know
# alternatives
# - manually process data
# e.g. use asserts, dict.pop and dict.values() methods to unpack things
# - pros:
# - very simple, since uses built in syntax
# - very performant, as fast as it gets
# - very flexible, easy to adjust behaviour
# - cons:
# - can forget to assert about extra entities etc, so error prone
# - if we do something like =assert j.pop('status') == 200, j=, by the time assert happens we already popped item -- makes error handling harder
# - a bit verbose.. so probably requires some helper functions though (could be much leaner than current konsume though)
# - if we assert, parsing terminates too early; if we're defensive, the code gets inflated with if statements
# - TODO perhaps combine warnings somehow or at least only emit once per module?
# - hmm actually tbh if we carefully go through everything and don't make copies, then only requires one assert at the very end?
# - TODO this is kinda useful? https://discuss.python.org/t/syntax-for-dictionnary-unpacking-to-variables/18718
# operator.itemgetter?
# - TODO can use match operator in python for this? quite nice actually! and allows for dynamic behaviour
# only from 3.10 tho, and gonna be tricky to do dynamic defensive behaviour with this
# - TODO in a sense, bleanser already would hint if some meaningful fields aren't being processed? only if they are changing though
# - define a "schema" for data, then just recursively match data against the schema?
# possibly pydantic already does something like that? not sure about performance though
# pros:
# - much simpler to extend and understand what's going on
# cons:
# - more rigid, so it becomes tricky to do dynamic stuff (e.g. if schema actually changes)
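
One of the alternatives the notes above mention is the match statement; an illustrative sketch of what defensive unpacking could look like with it (Python 3.10+, not part of the library, field names made up around the `status == 200` example from the notes):

# Illustrative only: defensive JSON unpacking via structural pattern matching.
def parse_item(j: dict):
    match j:
        case {'status': 200, 'body': body, **rest}:
            if rest:
                # analogue of "fully consumed": complain about unprocessed fields
                raise RuntimeError(f'unexpected extra fields: {sorted(rest)}')
            return body
        case {'status': status}:
            raise RuntimeError(f'bad status: {status}')
        case _:
            raise RuntimeError(f'unexpected shape: {j}')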


@ -1,11 +1,11 @@
from __future__ import annotations from __future__ import annotations
from functools import lru_cache
import logging import logging
import os import os
import sys import sys
from typing import Union
import warnings import warnings
from functools import lru_cache
from typing import TYPE_CHECKING, Union
def test() -> None: def test() -> None:
@ -15,7 +15,7 @@ def test() -> None:
## prepare exception for later ## prepare exception for later
try: try:
None.whatever # type: ignore[attr-defined] # noqa: B018 None.whatever # type: ignore[attr-defined]
except Exception as e: except Exception as e:
ex = e ex = e
## ##
@ -146,7 +146,7 @@ def _setup_handlers_and_formatters(name: str) -> None:
# try colorlog first, so user gets nice colored logs # try colorlog first, so user gets nice colored logs
import colorlog import colorlog
except ModuleNotFoundError: except ModuleNotFoundError:
warnings.warn("You might want to 'pip install colorlog' for nice colored logs", stacklevel=1) warnings.warn("You might want to 'pip install colorlog' for nice colored logs")
formatter = logging.Formatter(FORMAT_NOCOLOR) formatter = logging.Formatter(FORMAT_NOCOLOR)
else: else:
# log_color/reset are specific to colorlog # log_color/reset are specific to colorlog
@ -222,9 +222,7 @@ def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger:
# OK, when stdout is not a tty, enlighten doesn't log anything, good # OK, when stdout is not a tty, enlighten doesn't log anything, good
def get_enlighten(): def get_enlighten():
# TODO could add env variable to disable enlighten for a module? # TODO could add env variable to disable enlighten for a module?
from unittest.mock import ( from unittest.mock import Mock # Mock to return stub so clients don't have to think about it
Mock, # Mock to return stub so clients don't have to think about it
)
# for now hidden behind the flag since it's a little experimental # for now hidden behind the flag since it's a little experimental
if os.environ.get('ENLIGHTEN_ENABLE', None) is None: if os.environ.get('ENLIGHTEN_ENABLE', None) is None:
@ -233,7 +231,7 @@ def get_enlighten():
try: try:
import enlighten # type: ignore[import-untyped] import enlighten # type: ignore[import-untyped]
except ModuleNotFoundError: except ModuleNotFoundError:
warnings.warn("You might want to 'pip install enlighten' for a nice progress bar", stacklevel=1) warnings.warn("You might want to 'pip install enlighten' for a nice progress bar")
return Mock() return Mock()
@ -250,17 +248,7 @@ if __name__ == '__main__':
test() test()
## legacy/deprecated methods for backwards compatibility ## legacy/deprecated methods for backwards compatilibity
if not TYPE_CHECKING: LazyLogger = make_logger
from .compat import deprecated logger = make_logger
@deprecated('use make_logger instead')
def LazyLogger(*args, **kwargs):
return make_logger(*args, **kwargs)
@deprecated('use make_logger instead')
def logger(*args, **kwargs):
return make_logger(*args, **kwargs)
## ##
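
The .compat.deprecated helper referenced above isn't shown in this hunk; a minimal stand-in with the same observable behaviour might look like this (an assumption, not the actual implementation):

# Minimal stand-in for a deprecated() decorator factory (assumed behaviour:
# emit DeprecationWarning on call, then delegate to the wrapped function).
import functools
import warnings

def deprecated(msg: str):
    def decorator(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            warnings.warn(f'{f.__name__} is deprecated: {msg}', DeprecationWarning, stacklevel=2)
            return f(*args, **kwargs)
        return wrapper
    return decorator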


@ -1,37 +0,0 @@
"""
Utils for mime/filetype handling
"""
from __future__ import annotations
from .internal import assert_subpackage
assert_subpackage(__name__)
import functools
from pathlib import Path
@functools.lru_cache(1)
def _magic():
import magic # type: ignore
# TODO also has uncompress=True? could be useful
return magic.Magic(mime=True)
# TODO could reuse in pdf module?
import mimetypes # todo do I need init()?
# todo wtf? fastermime thinks its mime is application/json even if the extension is xz??
# whereas magic detects correctly: application/x-zstd and application/x-xz
def fastermime(path: Path | str) -> str:
paths = str(path)
# mimetypes is faster, so try it first
(mime, _) = mimetypes.guess_type(paths)
if mime is not None:
return mime
# magic is slower but handles more types
# TODO Result type?; it's kinda racey, but perhaps better to let the caller decide?
return _magic().from_file(paths)
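
Usage is just a function call; the example values below are illustrative, and the my.core.mime module path is an assumption:

# Illustrative usage (results depend on the actual files):
from my.core.mime import fastermime  # module path assumed

print(fastermime('notes.html'))  # likely 'text/html', resolved by mimetypes from the extension
print(fastermime('/tmp/blob'))   # unknown extension: falls back to libmagic content sniffing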


@ -1,13 +1,10 @@
""" """
Various helpers for reading org-mode data Various helpers for reading org-mode data
""" """
from datetime import datetime from datetime import datetime
def parse_org_datetime(s: str) -> datetime: def parse_org_datetime(s: str) -> datetime:
s = s.strip('[]') s = s.strip('[]')
for fmt, _cls in [ for fmt, cl in [
("%Y-%m-%d %a %H:%M", datetime), ("%Y-%m-%d %a %H:%M", datetime),
("%Y-%m-%d %H:%M" , datetime), ("%Y-%m-%d %H:%M" , datetime),
# todo not sure about these... fallback on 00:00? # todo not sure about these... fallback on 00:00?
@ -18,29 +15,23 @@ def parse_org_datetime(s: str) -> datetime:
return datetime.strptime(s, fmt) return datetime.strptime(s, fmt)
except ValueError: except ValueError:
continue continue
else:
raise RuntimeError(f"Bad datetime string {s}") raise RuntimeError(f"Bad datetime string {s}")
# TODO I guess want to borrow inspiration from bs4? element type <-> tag; and similar logic for find_one, find_all # TODO I guess want to borrow inspiration from bs4? element type <-> tag; and similar logic for find_one, find_all
from collections.abc import Iterable
from typing import Callable, TypeVar
from orgparse import OrgNode from orgparse import OrgNode
from typing import Iterable, TypeVar, Callable
V = TypeVar('V') V = TypeVar('V')
def collect(n: OrgNode, cfun: Callable[[OrgNode], Iterable[V]]) -> Iterable[V]: def collect(n: OrgNode, cfun: Callable[[OrgNode], Iterable[V]]) -> Iterable[V]:
yield from cfun(n) yield from cfun(n)
for c in n.children: for c in n.children:
yield from collect(c, cfun) yield from collect(c, cfun)
from more_itertools import one from more_itertools import one
from orgparse.extra import Table from orgparse.extra import Table
def one_table(o: OrgNode) -> Table: def one_table(o: OrgNode) -> Table:
return one(collect(o, lambda n: (x for x in n.body_rich if isinstance(x, Table)))) return one(collect(o, lambda n: (x for x in n.body_rich if isinstance(x, Table))))
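
A small usage sketch of the two helpers (the org snippet is made up for illustration):

# Usage sketch; org content invented for the example.
from orgparse import loads

node = loads('''
* heading
| a | b |
| 1 | 2 |
''')
tbl = one_table(node)  # the single orgparse Table in the tree
dt = parse_org_datetime('[2024-01-01 Mon 10:30]')  # matches "%Y-%m-%d %a %H:%M"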


@ -1,31 +1,23 @@
''' '''
Various pandas helpers and convenience functions Various pandas helpers and convenience functions
''' '''
from __future__ import annotations from __future__ import annotations
# todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential # todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
# NOTE: this file is meant to be importable without Pandas installed # NOTE: this file is meant to be importable without Pandas installed
import dataclasses import dataclasses
from collections.abc import Iterable, Iterator
from datetime import datetime, timezone from datetime import datetime, timezone
from pprint import pformat from pprint import pformat
from typing import ( from typing import TYPE_CHECKING, Any, Iterable, Type, Dict, Literal, Callable, TypeVar
TYPE_CHECKING,
Any,
Callable,
Literal,
TypeVar,
)
from decorator import decorator from decorator import decorator
from . import warnings from . import warnings, Res
from .error import Res, error_to_json, extract_error_datetime from .common import LazyLogger, Json, asdict
from .logging import make_logger from .error import error_to_json, extract_error_datetime
from .types import Json, asdict
logger = make_logger(__name__)
logger = LazyLogger(__name__)
if TYPE_CHECKING: if TYPE_CHECKING:
@ -46,7 +38,7 @@ else:
S1 = Any S1 = Any
def _check_dateish(s: SeriesT[S1]) -> Iterable[str]: def check_dateish(s: SeriesT[S1]) -> Iterable[str]:
import pandas as pd # noqa: F811 not actually a redefinition import pandas as pd # noqa: F811 not actually a redefinition
ctype = s.dtype ctype = s.dtype
@ -58,7 +50,7 @@ def _check_dateish(s: SeriesT[S1]) -> Iterable[str]:
all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all() all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all()
if not all_timestamps: if not all_timestamps:
return # not sure why it would happen, but ok return # not sure why it would happen, but ok
tzs = s.map(lambda x: x.tzinfo).drop_duplicates() # type: ignore[union-attr, var-annotated, arg-type, return-value, unused-ignore] tzs = s.map(lambda x: x.tzinfo).drop_duplicates()
examples = s[tzs.index] examples = s[tzs.index]
# todo not so sure this warning is that useful... except for stuff without tz # todo not so sure this warning is that useful... except for stuff without tz
yield f''' yield f'''
@ -70,37 +62,9 @@ def _check_dateish(s: SeriesT[S1]) -> Iterable[str]:
def test_check_dateish() -> None: def test_check_dateish() -> None:
import pandas as pd import pandas as pd
from .compat import fromisoformat # todo just a dummy test to check it doesn't crash, need something meaningful
s1 = pd.Series([1, 2, 3])
# empty series shouldn't warn list(check_dateish(s1))
assert list(_check_dateish(pd.Series([]))) == []
# if no dateimes, shouldn't return any warnings
assert list(_check_dateish(pd.Series([1, 2, 3]))) == []
# all values are datetimes, shouldn't warn
# fmt: off
assert list(_check_dateish(pd.Series([
fromisoformat('2024-08-19T01:02:03'),
fromisoformat('2024-08-19T03:04:05'),
]))) == []
# fmt: on
# mixture of timezones -- should warn
# fmt: off
assert len(list(_check_dateish(pd.Series([
fromisoformat('2024-08-19T01:02:03'),
fromisoformat('2024-08-19T03:04:05Z'),
])))) == 1
# fmt: on
# TODO hmm. maybe this should actually warn?
# fmt: off
assert len(list(_check_dateish(pd.Series([
'whatever',
fromisoformat('2024-08-19T01:02:03'),
])))) == 0
# fmt: on
# fmt: off # fmt: off
@ -138,7 +102,7 @@ def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing
# makes sense to keep super defensive # makes sense to keep super defensive
try: try:
for col, data in df.reset_index().items(): for col, data in df.reset_index().items():
for w in _check_dateish(data): for w in check_dateish(data):
warnings.low(f"{tag}, column '{col}': {w}") warnings.low(f"{tag}, column '{col}': {w}")
except Exception as e: except Exception as e:
logger.exception(e) logger.exception(e)
@ -162,7 +126,8 @@ def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None
return err_dict return err_dict
def _to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]: # todo not sure about naming
def to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
for r in it: for r in it:
if isinstance(r, Exception): if isinstance(r, Exception):
yield error_to_row(r) yield error_to_row(r)
@ -175,10 +140,10 @@ def _to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
Schema = Any Schema = Any
def _as_columns(s: Schema) -> dict[str, type]: def _as_columns(s: Schema) -> Dict[str, Type]:
# todo would be nice to extract properties; add tests for this as well # todo would be nice to extract properties; add tests for this as well
if dataclasses.is_dataclass(s): if dataclasses.is_dataclass(s):
return {f.name: f.type for f in dataclasses.fields(s)} # type: ignore[misc] # ugh, why mypy thinks f.type can return str?? return {f.name: f.type for f in dataclasses.fields(s)}
# else must be NamedTuple?? # else must be NamedTuple??
# todo assert my.core.common.is_namedtuple? # todo assert my.core.common.is_namedtuple?
return getattr(s, '_field_types') return getattr(s, '_field_types')
@ -197,7 +162,7 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFr
import pandas as pd # noqa: F811 not actually a redefinition import pandas as pd # noqa: F811 not actually a redefinition
columns = None if schema is None else list(_as_columns(schema).keys()) columns = None if schema is None else list(_as_columns(schema).keys())
return pd.DataFrame(_to_jsons(it), columns=columns) return pd.DataFrame(to_jsons(it), columns=columns)
# ugh. in principle this could be inside the test # ugh. in principle this could be inside the test
@ -207,76 +172,20 @@ def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFr
# see https://github.com/pytest-dev/pytest/issues/7856 # see https://github.com/pytest-dev/pytest/issues/7856
@dataclasses.dataclass @dataclasses.dataclass
class _X: class _X:
# FIXME try moving inside?
x: int x: int
def test_as_dataframe() -> None: def test_as_dataframe() -> None:
import numpy as np
import pandas as pd
import pytest import pytest
from pandas.testing import assert_frame_equal
from .compat import fromisoformat it = (dict(i=i, s=f'str{i}') for i in range(10))
it = ({'i': i, 's': f'str{i}'} for i in range(5))
with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings: # noqa: F841 with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings: # noqa: F841
df: DataFrameT = as_dataframe(it) df: DataFrameT = as_dataframe(it)
# todo test other error col policies # todo test other error col policies
assert list(df.columns) == ['i', 's', 'error']
# fmt: off assert len(as_dataframe([])) == 0
assert_frame_equal(
df,
pd.DataFrame({
'i' : [0 , 1 , 2 , 3 , 4 ],
's' : ['str0', 'str1', 'str2', 'str3', 'str4'],
# NOTE: error column is always added
'error': [None , None , None , None , None ],
}),
)
# fmt: on
assert_frame_equal(as_dataframe([]), pd.DataFrame(columns=['error']))
# makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable
df2: DataFrameT = as_dataframe([], schema=_X) df2: DataFrameT = as_dataframe([], schema=_X)
assert_frame_equal( assert list(df2.columns) == ['x', 'error']
df2,
# FIXME hmm. x column type should be an int?? and error should be string (or object??)
pd.DataFrame(columns=['x', 'error']),
)
@dataclasses.dataclass
class S:
value: str
def it2() -> Iterator[Res[S]]:
yield S(value='test')
yield RuntimeError('i failed')
df = as_dataframe(it2())
# fmt: off
assert_frame_equal(
df,
pd.DataFrame(data={
'value': ['test', np.nan ],
'error': [np.nan, 'RuntimeError: i failed\n'],
'dt' : [np.nan, np.nan ],
}).astype(dtype={'dt': 'float'}), # FIXME should be datetime64 as below
)
# fmt: on
def it3() -> Iterator[Res[S]]:
yield S(value='aba')
yield RuntimeError('whoops')
yield S(value='cde')
yield RuntimeError('exception with datetime', fromisoformat('2024-08-19T22:47:01Z'))
df = as_dataframe(it3())
# fmt: off
assert_frame_equal(df, pd.DataFrame(data={
'value': ['aba' , np.nan , 'cde' , np.nan ],
'error': [np.nan, 'RuntimeError: whoops\n', np.nan, "RuntimeError: ('exception with datetime', datetime.datetime(2024, 8, 19, 22, 47, 1, tzinfo=datetime.timezone.utc))\n"],
# note: dt column is added even if errors don't have an associated datetime
'dt' : [np.nan, np.nan , np.nan, '2024-08-19 22:47:01+00:00'],
}).astype(dtype={'dt': 'datetime64[ns, UTC]'}))
# fmt: on


@ -1,14 +1,11 @@
from pathlib import Path from pathlib import Path
# todo preinit isn't really a good name? it's only in a separate file because # todo preinit isn't really a good name? it's only in a separate file because
# - it's imported from my.core.init (so we want to keep this file as small/reliable as possible, hence not common or something) # - it's imported from my.core.init (so we want to keep this file as small/reliable as possible, hence not common or something)
# - we still need this function in __main__, so has to be separate from my/core/init.py # - we still need this function in __main__, so has to be separate from my/core/init.py
def get_mycfg_dir() -> Path: def get_mycfg_dir() -> Path:
import os
import appdirs # type: ignore[import-untyped] import appdirs # type: ignore[import-untyped]
import os
# not sure if that's necessary, i.e. could rely on PYTHONPATH instead # not sure if that's necessary, i.e. could rely on PYTHONPATH instead
# on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path? # on the other hand, by using MY_CONFIG we are guaranteed to load it from the desired path?
mvar = os.environ.get('MY_CONFIG') mvar = os.environ.get('MY_CONFIG')
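
The rest of get_mycfg_dir is outside the visible hunk; presumably it resolves MY_CONFIG first and falls back to the appdirs user config location. A sketch under that assumption (the 'my' appname is also an assumption):

# Assumed continuation of get_mycfg_dir (the hunk above is truncated):
import os
from pathlib import Path

import appdirs  # type: ignore[import-untyped]

def get_mycfg_dir_sketch() -> Path:
    mvar = os.environ.get('MY_CONFIG')
    if mvar is not None:
        return Path(mvar).expanduser()  # explicit override wins
    return Path(appdirs.user_config_dir('my'))  # e.g. ~/.config/my on Linux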


@ -1,24 +0,0 @@
"""
Helpers to avoid depending on pytest at runtime
"""
from .internal import assert_subpackage
assert_subpackage(__name__)
import sys
import typing
under_pytest = 'pytest' in sys.modules
if typing.TYPE_CHECKING or under_pytest:
import pytest
parametrize = pytest.mark.parametrize
else:
def parametrize(*_args, **_kwargs):
def wrapper(f):
return f
return wrapper
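
Usage sketch: a module can parametrize its inline tests without importing pytest at runtime -- under pytest this resolves to pytest.mark.parametrize, otherwise to the no-op wrapper above (the my.core.pytest module path is assumed):

from my.core.pytest import parametrize  # module path assumed

@parametrize('n', [1, 2, 3])
def test_square(n: int) -> None:
    assert n * n >= n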


@ -5,29 +5,21 @@ The main entrypoint to this library is the 'select' function below; try:
python3 -c "from my.core.query import select; help(select)" python3 -c "from my.core.query import select; help(select)"
""" """
from __future__ import annotations
import dataclasses import dataclasses
import importlib import importlib
import inspect import inspect
import itertools import itertools
from collections.abc import Iterable, Iterator
from datetime import datetime from datetime import datetime
from typing import ( from typing import TypeVar, Tuple, Optional, Union, Callable, Iterable, Iterator, Dict, Any, NamedTuple, List
Any,
Callable,
NamedTuple,
Optional,
TypeVar,
)
import more_itertools import more_itertools
from . import error as err import my.core.error as err
from .common import is_namedtuple
from .error import Res, unwrap from .error import Res, unwrap
from .types import is_namedtuple
from .warnings import low from .warnings import low
T = TypeVar("T") T = TypeVar("T")
ET = Res[T] ET = Res[T]
@ -48,7 +40,6 @@ class Unsortable(NamedTuple):
class QueryException(ValueError): class QueryException(ValueError):
"""Used to differentiate query-related errors, so the CLI interface is more expressive""" """Used to differentiate query-related errors, so the CLI interface is more expressive"""
pass pass
@ -61,7 +52,7 @@ def locate_function(module_name: str, function_name: str) -> Callable[[], Iterab
""" """
try: try:
mod = importlib.import_module(module_name) mod = importlib.import_module(module_name)
for fname, f in inspect.getmembers(mod, inspect.isfunction): for (fname, f) in inspect.getmembers(mod, inspect.isfunction):
if fname == function_name: if fname == function_name:
return f return f
# in case the function is defined dynamically, # in case the function is defined dynamically,
@ -70,7 +61,7 @@ def locate_function(module_name: str, function_name: str) -> Callable[[], Iterab
if func is not None and callable(func): if func is not None and callable(func):
return func return func
except Exception as e: except Exception as e:
raise QueryException(str(e)) # noqa: B904 raise QueryException(str(e))
raise QueryException(f"Could not find function '{function_name}' in '{module_name}'") raise QueryException(f"Could not find function '{function_name}' in '{module_name}'")
@ -84,7 +75,7 @@ def locate_qualified_function(qualified_name: str) -> Callable[[], Iterable[ET]]
return locate_function(qualified_name[:rdot_index], qualified_name[rdot_index + 1:]) return locate_function(qualified_name[:rdot_index], qualified_name[rdot_index + 1:])
def attribute_func(obj: T, where: Where, default: U | None = None) -> OrderFunc | None: def attribute_func(obj: T, where: Where, default: Optional[U] = None) -> Optional[OrderFunc]:
""" """
Attempts to find an attribute which matches the 'where_function' on the object, Attempts to find an attribute which matches the 'where_function' on the object,
using some getattr/dict checks. Returns a function which when called with using some getattr/dict checks. Returns a function which when called with
@ -112,7 +103,7 @@ def attribute_func(obj: T, where: Where, default: U | None = None) -> OrderFunc
if where(v): if where(v):
return lambda o: o.get(k, default) # type: ignore[union-attr] return lambda o: o.get(k, default) # type: ignore[union-attr]
elif dataclasses.is_dataclass(obj): elif dataclasses.is_dataclass(obj):
for field_name in obj.__annotations__.keys(): for (field_name, _annotation) in obj.__annotations__.items():
if where(getattr(obj, field_name)): if where(getattr(obj, field_name)):
return lambda o: getattr(o, field_name, default) return lambda o: getattr(o, field_name, default)
elif is_namedtuple(obj): elif is_namedtuple(obj):
@ -130,12 +121,11 @@ def attribute_func(obj: T, where: Where, default: U | None = None) -> OrderFunc
def _generate_order_by_func( def _generate_order_by_func(
obj_res: Res[T], obj_res: Res[T],
*, key: Optional[str] = None,
key: str | None = None, where_function: Optional[Where] = None,
where_function: Where | None = None, default: Optional[U] = None,
default: U | None = None,
force_unsortable: bool = False, force_unsortable: bool = False,
) -> OrderFunc | None: ) -> Optional[OrderFunc]:
""" """
Accepts an object Res[T] (Instance of some class or Exception) Accepts an object Res[T] (Instance of some class or Exception)
@ -200,7 +190,7 @@ pass 'drop_exceptions' to ignore exceptions""")
# user must provide either a key or a where predicate # user must provide either a key or a where predicate
if where_function is not None: if where_function is not None:
func: OrderFunc | None = attribute_func(obj, where_function, default) func: Optional[OrderFunc] = attribute_func(obj, where_function, default)
if func is not None: if func is not None:
return func return func
@ -216,13 +206,15 @@ pass 'drop_exceptions' to ignore exceptions""")
return None # couldn't compute a OrderFunc for this class/instance return None # couldn't compute a OrderFunc for this class/instance
# currently using the 'key set' as a proxy for 'this is the same type of thing' # currently using the 'key set' as a proxy for 'this is the same type of thing'
def _determine_order_by_value_key(obj_res: ET) -> Any: def _determine_order_by_value_key(obj_res: ET) -> Any:
""" """
Returns either the class, or a tuple of the dictionary keys Returns either the class, or a tuple of the dictionary keys
""" """
key = obj_res.__class__ key = obj_res.__class__
if key is dict: if key == dict:
# assuming same keys signify same way to determine ordering # assuming same keys signify same way to determine ordering
return tuple(obj_res.keys()) # type: ignore[union-attr] return tuple(obj_res.keys()) # type: ignore[union-attr]
return key return key
@ -240,7 +232,7 @@ def _drop_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> Iterator[ET]:
# try getting the first value from the iterator # try getting the first value from the iterator
# similar to my.core.common.warn_if_empty? this doesn't go through the whole iterator though # similar to my.core.common.warn_if_empty? this doesn't go through the whole iterator though
def _peek_iter(itr: Iterator[ET]) -> tuple[ET | None, Iterator[ET]]: def _peek_iter(itr: Iterator[ET]) -> Tuple[Optional[ET], Iterator[ET]]:
itr = more_itertools.peekable(itr) itr = more_itertools.peekable(itr)
try: try:
first_item = itr.peek() first_item = itr.peek()
@ -251,9 +243,9 @@ def _peek_iter(itr: Iterator[ET]) -> tuple[ET | None, Iterator[ET]]:
# similar to 'my.core.error.sort_res_by'? # similar to 'my.core.error.sort_res_by'?
def _wrap_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> tuple[Iterator[Unsortable], Iterator[ET]]: def _wrap_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> Tuple[Iterator[Unsortable], Iterator[ET]]:
unsortable: list[Unsortable] = [] unsortable: List[Unsortable] = []
sortable: list[ET] = [] sortable: List[ET] = []
for o in itr: for o in itr:
# if input to select was another select # if input to select was another select
if isinstance(o, Unsortable): if isinstance(o, Unsortable):
@ -271,11 +263,10 @@ def _wrap_unsorted(itr: Iterator[ET], orderfunc: OrderFunc) -> tuple[Iterator[Un
# the second being items for which orderfunc returned a non-none value # the second being items for which orderfunc returned a non-none value
def _handle_unsorted( def _handle_unsorted(
itr: Iterator[ET], itr: Iterator[ET],
*,
orderfunc: OrderFunc, orderfunc: OrderFunc,
drop_unsorted: bool, drop_unsorted: bool,
wrap_unsorted: bool wrap_unsorted: bool
) -> tuple[Iterator[Unsortable], Iterator[ET]]: ) -> Tuple[Iterator[Unsortable], Iterator[ET]]:
# prefer drop_unsorted to wrap_unsorted, if both were present # prefer drop_unsorted to wrap_unsorted, if both were present
if drop_unsorted: if drop_unsorted:
return iter([]), _drop_unsorted(itr, orderfunc) return iter([]), _drop_unsorted(itr, orderfunc)
@ -290,16 +281,16 @@ def _handle_unsorted(
# different types. ***This consumes the iterator***, so # different types. ***This consumes the iterator***, so
# you should definitely itertools.tee it beforehand # you should definitely itertools.tee it beforehand
# as to not exhaust the values # as to not exhaust the values
def _generate_order_value_func(itr: Iterator[ET], order_value: Where, default: U | None = None) -> OrderFunc: def _generate_order_value_func(itr: Iterator[ET], order_value: Where, default: Optional[U] = None) -> OrderFunc:
# TODO: add a kwarg to force lookup for every item? would sort of be like core.common.guess_datetime then # TODO: add a kwarg to force lookup for every item? would sort of be like core.common.guess_datetime then
order_by_lookup: dict[Any, OrderFunc] = {} order_by_lookup: Dict[Any, OrderFunc] = {}
# need to go through a copy of the whole iterator here to # need to go through a copy of the whole iterator here to
# pre-generate functions to support sorting mixed types # pre-generate functions to support sorting mixed types
for obj_res in itr: for obj_res in itr:
key: Any = _determine_order_by_value_key(obj_res) key: Any = _determine_order_by_value_key(obj_res)
if key not in order_by_lookup: if key not in order_by_lookup:
keyfunc: OrderFunc | None = _generate_order_by_func( keyfunc: Optional[OrderFunc] = _generate_order_by_func(
obj_res, obj_res,
where_function=order_value, where_function=order_value,
default=default, default=default,
@ -320,12 +311,12 @@ def _generate_order_value_func(itr: Iterator[ET], order_value: Where, default: U
def _handle_generate_order_by( def _handle_generate_order_by(
itr, itr,
*, *,
order_by: OrderFunc | None = None, order_by: Optional[OrderFunc] = None,
order_key: str | None = None, order_key: Optional[str] = None,
order_value: Where | None = None, order_value: Optional[Where] = None,
default: U | None = None, default: Optional[U] = None,
) -> tuple[OrderFunc | None, Iterator[ET]]: ) -> Tuple[Optional[OrderFunc], Iterator[ET]]:
order_by_chosen: OrderFunc | None = order_by # if the user just supplied a function themselves order_by_chosen: Optional[OrderFunc] = order_by # if the user just supplied a function themselves
if order_by is not None: if order_by is not None:
return order_by, itr return order_by, itr
if order_key is not None: if order_key is not None:
@ -350,19 +341,19 @@ def _handle_generate_order_by(
def select( def select(
src: Iterable[ET] | Callable[[], Iterable[ET]], src: Union[Iterable[ET], Callable[[], Iterable[ET]]],
*, *,
where: Where | None = None, where: Optional[Where] = None,
order_by: OrderFunc | None = None, order_by: Optional[OrderFunc] = None,
order_key: str | None = None, order_key: Optional[str] = None,
order_value: Where | None = None, order_value: Optional[Where] = None,
default: U | None = None, default: Optional[U] = None,
reverse: bool = False, reverse: bool = False,
limit: int | None = None, limit: Optional[int] = None,
drop_unsorted: bool = False, drop_unsorted: bool = False,
wrap_unsorted: bool = True, wrap_unsorted: bool = True,
warn_exceptions: bool = False, warn_exceptions: bool = False,
warn_func: Callable[[Exception], None] | None = None, warn_func: Optional[Callable[[Exception], None]] = None,
drop_exceptions: bool = False, drop_exceptions: bool = False,
raise_exceptions: bool = False, raise_exceptions: bool = False,
) -> Iterator[ET]: ) -> Iterator[ET]:
@ -464,7 +455,7 @@ Will attempt to call iter() on the value""")
try: try:
itr: Iterator[ET] = iter(it) itr: Iterator[ET] = iter(it)
except TypeError as t: except TypeError as t:
raise QueryException("Could not convert input src to an Iterator: " + str(t)) # noqa: B904 raise QueryException("Could not convert input src to an Iterator: " + str(t))
# if both drop_exceptions and drop_exceptions are provided for some reason, # if both drop_exceptions and drop_exceptions are provided for some reason,
# should raise exceptions before dropping them # should raise exceptions before dropping them
@ -501,12 +492,7 @@ Will attempt to call iter() on the value""")
# note: can't just attach sort unsortable values in the same iterable as the # note: can't just attach sort unsortable values in the same iterable as the
# other items because they don't have any lookups for order_key or functions # other items because they don't have any lookups for order_key or functions
# to handle items in the order_by_lookup dictionary # to handle items in the order_by_lookup dictionary
unsortable, itr = _handle_unsorted( unsortable, itr = _handle_unsorted(itr, order_by_chosen, drop_unsorted, wrap_unsorted)
itr,
orderfunc=order_by_chosen,
drop_unsorted=drop_unsorted,
wrap_unsorted=wrap_unsorted,
)
# run the sort, with the computed order by function # run the sort, with the computed order by function
itr = iter(sorted(itr, key=order_by_chosen, reverse=reverse)) # type: ignore[arg-type] itr = iter(sorted(itr, key=order_by_chosen, reverse=reverse)) # type: ignore[arg-type]
@ -597,7 +583,7 @@ def test_couldnt_determine_order() -> None:
res = list(select(iter([object()]), order_value=lambda o: isinstance(o, datetime))) res = list(select(iter([object()]), order_value=lambda o: isinstance(o, datetime)))
assert len(res) == 1 assert len(res) == 1
assert isinstance(res[0], Unsortable) assert isinstance(res[0], Unsortable)
assert type(res[0].obj) is object assert type(res[0].obj) == object
# same value type, different keys, with clashing keys # same value type, different keys, with clashing keys
@ -613,7 +599,7 @@ class _B(NamedTuple):
# move these to tests/? They are re-used so much in the tests below, # move these to tests/? They are re-used so much in the tests below,
# not sure where the best place for these is # not sure where the best place for these is
def _mixed_iter() -> Iterator[_A | _B]: def _mixed_iter() -> Iterator[Union[_A, _B]]:
yield _A(x=datetime(year=2009, month=5, day=10, hour=4, minute=10, second=1), y=5, z=10) yield _A(x=datetime(year=2009, month=5, day=10, hour=4, minute=10, second=1), y=5, z=10)
yield _B(y=datetime(year=2015, month=5, day=10, hour=4, minute=10, second=1)) yield _B(y=datetime(year=2015, month=5, day=10, hour=4, minute=10, second=1))
yield _A(x=datetime(year=2005, month=5, day=10, hour=4, minute=10, second=1), y=10, z=2) yield _A(x=datetime(year=2005, month=5, day=10, hour=4, minute=10, second=1), y=10, z=2)
@ -622,7 +608,7 @@ def _mixed_iter() -> Iterator[_A | _B]:
yield _A(x=datetime(year=2005, month=4, day=10, hour=4, minute=10, second=1), y=2, z=-5) yield _A(x=datetime(year=2005, month=4, day=10, hour=4, minute=10, second=1), y=2, z=-5)
def _mixed_iter_errors() -> Iterator[Res[_A | _B]]: def _mixed_iter_errors() -> Iterator[Res[Union[_A, _B]]]:
m = _mixed_iter() m = _mixed_iter()
yield from itertools.islice(m, 0, 3) yield from itertools.islice(m, 0, 3)
yield RuntimeError("Unhandled error!") yield RuntimeError("Unhandled error!")
@ -658,7 +644,7 @@ def test_wrap_unsortable() -> None:
# by default, wrap unsortable # by default, wrap unsortable
res = list(select(_mixed_iter(), order_key="z")) res = list(select(_mixed_iter(), order_key="z"))
assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "Unsortable": 2}) assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4, "Unsortable": 2})
def test_disabled_wrap_unsorted() -> None: def test_disabled_wrap_unsorted() -> None:
@ -677,7 +663,7 @@ def test_drop_unsorted() -> None:
# test drop unsortable, should remove them before the 'sorted' call # test drop unsortable, should remove them before the 'sorted' call
res = list(select(_mixed_iter(), order_key="z", wrap_unsorted=False, drop_unsorted=True)) res = list(select(_mixed_iter(), order_key="z", wrap_unsorted=False, drop_unsorted=True))
assert len(res) == 4 assert len(res) == 4
assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4}) assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4})
def test_drop_exceptions() -> None: def test_drop_exceptions() -> None:
@ -701,16 +687,15 @@ def test_raise_exceptions() -> None:
def test_wrap_unsortable_with_error_and_warning() -> None: def test_wrap_unsortable_with_error_and_warning() -> None:
from collections import Counter
import pytest import pytest
from collections import Counter
# by default should wrap unsortable (error) # by default should wrap unsortable (error)
with pytest.warns(UserWarning, match=r"encountered exception"): with pytest.warns(UserWarning, match=r"encountered exception"):
res = list(select(_mixed_iter_errors(), order_value=lambda o: isinstance(o, datetime))) res = list(select(_mixed_iter_errors(), order_value=lambda o: isinstance(o, datetime)))
assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "_B": 2, "Unsortable": 1}) assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4, "_B": 2, "Unsortable": 1})
# compare the returned error wrapped in the Unsortable # compare the returned error wrapped in the Unsortable
returned_error = next(o for o in res if isinstance(o, Unsortable)).obj returned_error = next((o for o in res if isinstance(o, Unsortable))).obj
assert "Unhandled error!" == str(returned_error) assert "Unhandled error!" == str(returned_error)
@ -720,7 +705,7 @@ def test_order_key_unsortable() -> None:
# both unsortable and items which don't match the order_by (order_key) in this case should be classified unsorted # both unsortable and items which don't match the order_by (order_key) in this case should be classified unsorted
res = list(select(_mixed_iter_errors(), order_key="z")) res = list(select(_mixed_iter_errors(), order_key="z"))
assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "Unsortable": 3}) assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4, "Unsortable": 3})
def test_order_default_param() -> None: def test_order_default_param() -> None:
@ -740,7 +725,7 @@ def test_no_recursive_unsortables() -> None:
# select to select as input, wrapping unsortables the first time, second should drop them # select to select as input, wrapping unsortables the first time, second should drop them
# reverse=True to send errors to the end, so the below order_key works # reverse=True to send errors to the end, so the below order_key works
res = list(select(_mixed_iter_errors(), order_key="z", reverse=True)) res = list(select(_mixed_iter_errors(), order_key="z", reverse=True))
assert Counter(type(t).__name__ for t in res) == Counter({"_A": 4, "Unsortable": 3}) assert Counter(map(lambda t: type(t).__name__, res)) == Counter({"_A": 4, "Unsortable": 3})
# drop_unsorted # drop_unsorted
dropped = list(select(res, order_key="z", drop_unsorted=True)) dropped = list(select(res, order_key="z", drop_unsorted=True))
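
To ground the select signature shown above, a small usage sketch (made-up data; dict inputs rely on the dict handling in attribute_func):

# Usage sketch for select: order dicts by a key, newest first, keep one.
from datetime import datetime

rows = [
    {'dt': datetime(2024, 1, 2), 'x': 1},
    {'dt': datetime(2024, 1, 1), 'x': 2},
]
newest = list(select(rows, order_key='dt', reverse=True, limit=1))
assert newest[0]['x'] == 1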


@ -7,30 +7,27 @@ filtered iterator
See the select_range function below See the select_range function below
""" """
from __future__ import annotations
import re import re
import time import time
from collections.abc import Iterator from functools import lru_cache
from datetime import date, datetime, timedelta from datetime import datetime, timedelta, date
from functools import cache from typing import Callable, Iterator, NamedTuple, Optional, Any, Type
from typing import Any, Callable, NamedTuple
import more_itertools import more_itertools
from .compat import fromisoformat
from .query import ( from .query import (
ET,
OrderFunc,
QueryException, QueryException,
select,
OrderFunc,
Where, Where,
_handle_generate_order_by, _handle_generate_order_by,
select, ET,
) )
timedelta_regex = re.compile( from .compat import fromisoformat
r"^((?P<weeks>[\.\d]+?)w)?((?P<days>[\.\d]+?)d)?((?P<hours>[\.\d]+?)h)?((?P<minutes>[\.\d]+?)m)?((?P<seconds>[\.\d]+?)s)?$"
)
timedelta_regex = re.compile(r"^((?P<weeks>[\.\d]+?)w)?((?P<days>[\.\d]+?)d)?((?P<hours>[\.\d]+?)h)?((?P<minutes>[\.\d]+?)m)?((?P<seconds>[\.\d]+?)s)?$")
# https://stackoverflow.com/a/51916936 # https://stackoverflow.com/a/51916936
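
Per the Stack Overflow link, this regex is typically paired with a parser along these lines (a sketch; the actual helper in this file sits outside the visible hunks):

# Sketch of a timedelta-string parser built on timedelta_regex:
# '5d2h30m' -> timedelta(days=5, hours=2, minutes=30)
from datetime import timedelta

def parse_timedelta_string_sketch(s: str) -> timedelta:
    m = timedelta_regex.match(s)
    # every group is optional, so the empty string matches -- guard against that
    if m is None or not any(m.groupdict().values()):
        raise ValueError(f'Could not parse timedelta: {s!r}')
    parts = {k: float(v) for k, v in m.groupdict().items() if v is not None}
    return timedelta(**parts)  # group names match timedelta's keyword arguments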
@ -93,7 +90,7 @@ def parse_datetime_float(date_str: str) -> float:
# dateparser is a bit more lenient than the above, lets you type # dateparser is a bit more lenient than the above, lets you type
# all sorts of dates as inputs # all sorts of dates as inputs
# https://github.com/scrapinghub/dateparser#how-to-use # https://github.com/scrapinghub/dateparser#how-to-use
res: datetime | None = dateparser.parse(ds, settings={"DATE_ORDER": "YMD"}) res: Optional[datetime] = dateparser.parse(ds, settings={"DATE_ORDER": "YMD"})
if res is not None: if res is not None:
return res.timestamp() return res.timestamp()
@ -103,7 +100,7 @@ def parse_datetime_float(date_str: str) -> float:
# probably DateLike input? but a user could specify an order_key # probably DateLike input? but a user could specify an order_key
# which is an epoch timestamp or a float value which they # which is an epoch timestamp or a float value which they
# expect to be converted to a datetime to compare # expect to be converted to a datetime to compare
@cache @lru_cache(maxsize=None)
def _datelike_to_float(dl: Any) -> float: def _datelike_to_float(dl: Any) -> float:
if isinstance(dl, datetime): if isinstance(dl, datetime):
return dl.timestamp() return dl.timestamp()
@ -114,7 +111,7 @@ def _datelike_to_float(dl: Any) -> float:
try: try:
return parse_datetime_float(dl) return parse_datetime_float(dl)
except QueryException as q: except QueryException as q:
raise QueryException(f"While attempting to extract datetime from {dl}, to order by datetime:\n\n" + str(q)) # noqa: B904 raise QueryException(f"While attempting to extract datetime from {dl}, to order by datetime:\n\n" + str(q))
class RangeTuple(NamedTuple): class RangeTuple(NamedTuple):
@ -135,12 +132,11 @@ class RangeTuple(NamedTuple):
of the timeframe -- 'before' of the timeframe -- 'before'
- before and after - anything after 'after' and before 'before', acts as a time range - before and after - anything after 'after' and before 'before', acts as a time range
""" """
# technically doesn't need to be Optional[Any], # technically doesn't need to be Optional[Any],
# just to make it more clear these can be None # just to make it more clear these can be None
after: Any | None after: Optional[Any]
before: Any | None before: Optional[Any]
within: Any | None within: Optional[Any]
Converter = Callable[[Any], Any] Converter = Callable[[Any], Any]
@ -151,9 +147,9 @@ def _parse_range(
unparsed_range: RangeTuple, unparsed_range: RangeTuple,
end_parser: Converter, end_parser: Converter,
within_parser: Converter, within_parser: Converter,
parsed_range: RangeTuple | None = None, parsed_range: Optional[RangeTuple] = None,
error_message: str | None = None, error_message: Optional[str] = None
) -> RangeTuple | None: ) -> Optional[RangeTuple]:
if parsed_range is not None: if parsed_range is not None:
return parsed_range return parsed_range
@ -182,11 +178,11 @@ def _create_range_filter(
end_parser: Converter, end_parser: Converter,
within_parser: Converter, within_parser: Converter,
attr_func: Where, attr_func: Where,
parsed_range: RangeTuple | None = None, parsed_range: Optional[RangeTuple] = None,
default_before: Any | None = None, default_before: Optional[Any] = None,
value_coercion_func: Converter | None = None, value_coercion_func: Optional[Converter] = None,
error_message: str | None = None, error_message: Optional[str] = None,
) -> Where | None: ) -> Optional[Where]:
""" """
Handles: Handles:
- parsing the user input into values that are comparable to items the iterable returns - parsing the user input into values that are comparable to items the iterable returns
@ -278,17 +274,17 @@ def _create_range_filter(
def select_range( def select_range(
itr: Iterator[ET], itr: Iterator[ET],
*, *,
where: Where | None = None, where: Optional[Where] = None,
order_key: str | None = None, order_key: Optional[str] = None,
order_value: Where | None = None, order_value: Optional[Where] = None,
order_by_value_type: type | None = None, order_by_value_type: Optional[Type] = None,
unparsed_range: RangeTuple | None = None, unparsed_range: Optional[RangeTuple] = None,
reverse: bool = False, reverse: bool = False,
limit: int | None = None, limit: Optional[int] = None,
drop_unsorted: bool = False, drop_unsorted: bool = False,
wrap_unsorted: bool = False, wrap_unsorted: bool = False,
warn_exceptions: bool = False, warn_exceptions: bool = False,
warn_func: Callable[[Exception], None] | None = None, warn_func: Optional[Callable[[Exception], None]] = None,
drop_exceptions: bool = False, drop_exceptions: bool = False,
raise_exceptions: bool = False, raise_exceptions: bool = False,
) -> Iterator[ET]: ) -> Iterator[ET]:
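A hedged usage sketch of select_range (the iterable and its 'dt' attribute are assumptions for illustration, not from this diff):

# keep items whose 'dt' datetime falls within the last 2 days, newest first
recent = list(select_range(
    my_items(),  # hypothetical: an iterator of objects with a 'dt' datetime attribute
    order_key="dt",
    unparsed_range=RangeTuple(after=None, before="now", within="2d"),
    reverse=True,
))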
@ -323,10 +319,9 @@ def select_range(
drop_exceptions=drop_exceptions, drop_exceptions=drop_exceptions,
raise_exceptions=raise_exceptions, raise_exceptions=raise_exceptions,
warn_exceptions=warn_exceptions, warn_exceptions=warn_exceptions,
warn_func=warn_func, warn_func=warn_func)
)
order_by_chosen: OrderFunc | None = None order_by_chosen: Optional[OrderFunc] = None
# if the user didn't specify an attribute to order value, but specified a type # if the user didn't specify an attribute to order value, but specified a type
# we should search for on each value in the iterator # we should search for on each value in the iterator
@ -337,8 +332,6 @@ def select_range(
# if the user supplied a order_key, and/or we've generated an order_value, create # if the user supplied a order_key, and/or we've generated an order_value, create
# the function that accesses that type on each value in the iterator # the function that accesses that type on each value in the iterator
if order_key is not None or order_value is not None: if order_key is not None or order_value is not None:
# _generate_order_value_func internally here creates a copy of the iterator, which has to
# be consumed in case we're sorting by mixed types
order_by_chosen, itr = _handle_generate_order_by(itr, order_key=order_key, order_value=order_value) order_by_chosen, itr = _handle_generate_order_by(itr, order_key=order_key, order_value=order_value)
# signifies that itr is empty -- can early return here # signifies that itr is empty -- can early return here
if order_by_chosen is None: if order_by_chosen is None:
@ -350,11 +343,11 @@ def select_range(
if order_by_chosen is None: if order_by_chosen is None:
raise QueryException("""Can't order by range if we have no way to order_by! raise QueryException("""Can't order by range if we have no way to order_by!
Specify a type or a key to order the value by""") Specify a type or a key to order the value by""")
else:
# force drop_unsorted=True so we can use _create_range_filter # force drop_unsorted=True so we can use _create_range_filter
# sort the iterable by the generated order_by_chosen function # sort the iterable by the generated order_by_chosen function
itr = select(itr, order_by=order_by_chosen, drop_unsorted=True) itr = select(itr, order_by=order_by_chosen, drop_unsorted=True)
filter_func: Where | None filter_func: Optional[Where]
if order_by_value_type in [datetime, date]: if order_by_value_type in [datetime, date]:
filter_func = _create_range_filter( filter_func = _create_range_filter(
unparsed_range=unparsed_range, unparsed_range=unparsed_range,
@ -362,8 +355,7 @@ Specify a type or a key to order the value by""")
within_parser=parse_timedelta_float, within_parser=parse_timedelta_float,
attr_func=order_by_chosen, # type: ignore[arg-type] attr_func=order_by_chosen, # type: ignore[arg-type]
default_before=time.time(), default_before=time.time(),
value_coercion_func=_datelike_to_float, value_coercion_func=_datelike_to_float)
)
elif order_by_value_type in [int, float]: elif order_by_value_type in [int, float]:
# allow primitives to be converted using the default int(), float() callables # allow primitives to be converted using the default int(), float() callables
filter_func = _create_range_filter( filter_func = _create_range_filter(
@ -372,8 +364,7 @@ Specify a type or a key to order the value by""")
within_parser=order_by_value_type, within_parser=order_by_value_type,
attr_func=order_by_chosen, # type: ignore[arg-type] attr_func=order_by_chosen, # type: ignore[arg-type]
default_before=None, default_before=None,
value_coercion_func=order_by_value_type, value_coercion_func=order_by_value_type)
)
else: else:
# TODO: add additional kwargs to let the user sort by other values, by specifying the parsers? # TODO: add additional kwargs to let the user sort by other values, by specifying the parsers?
# would need to allow passing the end_parser, within parser, default before and value_coercion_func... # would need to allow passing the end_parser, within parser, default before and value_coercion_func...
@ -400,7 +391,7 @@ Specify a type or a key to order the value by""")
return itr return itr
# reuse items from query for testing # re-use items from query for testing
from .query import _A, _B, _Float, _mixed_iter_errors from .query import _A, _B, _Float, _mixed_iter_errors
@ -480,8 +471,8 @@ def test_range_predicate() -> None:
) )
# filter from 0 to 5 # filter from 0 to 5
rn: RangeTuple = RangeTuple("0", "5", None) rn: Optional[RangeTuple] = RangeTuple("0", "5", None)
zero_to_five_filter: Where | None = int_filter_func(unparsed_range=rn) zero_to_five_filter: Optional[Where] = int_filter_func(unparsed_range=rn)
assert zero_to_five_filter is not None assert zero_to_five_filter is not None
# this is just a Where function, given some input it returns True/False if the value is allowed # this is just a Where function, given some input it returns True/False if the value is allowed
assert zero_to_five_filter(3) is True assert zero_to_five_filter(3) is True
@ -494,7 +485,6 @@ def test_range_predicate() -> None:
rn = RangeTuple(None, 3, "3.5") rn = RangeTuple(None, 3, "3.5")
assert list(filter(int_filter_func(unparsed_range=rn, attr_func=identity), src())) == ["0", "1", "2"] assert list(filter(int_filter_func(unparsed_range=rn, attr_func=identity), src())) == ["0", "1", "2"]
def test_parse_range() -> None: def test_parse_range() -> None:
from functools import partial from functools import partial
@ -538,8 +528,9 @@ def test_parse_timedelta_string() -> None:
def test_parse_datetime_float() -> None: def test_parse_datetime_float() -> None:
pnow = parse_datetime_float("now") pnow = parse_datetime_float("now")
sec_diff = abs(pnow - datetime.now().timestamp()) sec_diff = abs((pnow - datetime.now().timestamp()))
# should probably never fail? could mock time.time # should probably never fail? could mock time.time
# but there seem to be issues with mocking things that use C libraries (as time.time does) # but there seem to be issues with mocking things that use C libraries (as time.time does)
# https://docs.python.org/3/library/unittest.mock-examples.html#partial-mocking # https://docs.python.org/3/library/unittest.mock-examples.html#partial-mocking
View file
@ -1,15 +1,12 @@
from __future__ import annotations
import datetime import datetime
from dataclasses import asdict, is_dataclass import dataclasses
from decimal import Decimal
from functools import cache
from pathlib import Path from pathlib import Path
from typing import Any, Callable, NamedTuple from decimal import Decimal
from typing import Any, Optional, Callable, NamedTuple
from functools import lru_cache
from .common import is_namedtuple
from .error import error_to_json from .error import error_to_json
from .pytest import parametrize
from .types import is_namedtuple
# note: it would be nice to combine the 'asdict' and _default_encode to some function # note: it would be nice to combine the 'asdict' and _default_encode to some function
# that takes a complex python object and returns JSON-compatible fields, while still # that takes a complex python object and returns JSON-compatible fields, while still
@ -19,8 +16,6 @@ from .types import is_namedtuple
DefaultEncoder = Callable[[Any], Any] DefaultEncoder = Callable[[Any], Any]
Dumps = Callable[[Any], str]
def _default_encode(obj: Any) -> Any: def _default_encode(obj: Any) -> Any:
""" """
@ -38,9 +33,8 @@ def _default_encode(obj: Any) -> Any:
# convert paths to their string representation # convert paths to their string representation
if isinstance(obj, Path): if isinstance(obj, Path):
return str(obj) return str(obj)
if is_dataclass(obj): if dataclasses.is_dataclass(obj):
assert not isinstance(obj, type) # to help mypy return dataclasses.asdict(obj)
return asdict(obj)
if isinstance(obj, Exception): if isinstance(obj, Exception):
return error_to_json(obj) return error_to_json(obj)
# if something was stored as 'decimal', you likely # if something was stored as 'decimal', you likely
@ -59,12 +53,12 @@ def _default_encode(obj: Any) -> Any:
# could possibly run multiple times/raise warning if you provide different 'default' # could possibly run multiple times/raise warning if you provide different 'default'
# functions or change the kwargs? The alternative is to maintain all of this at the module # functions or change the kwargs? The alternative is to maintain all of this at the module
# level, which is just as annoying # level, which is just as annoying
@cache @lru_cache(maxsize=None)
def _dumps_factory(**kwargs) -> Callable[[Any], str]: def _dumps_factory(**kwargs) -> Callable[[Any], str]:
use_default: DefaultEncoder = _default_encode use_default: DefaultEncoder = _default_encode
# if the user passed an additional 'default' parameter, # if the user passed an additional 'default' parameter,
# try using that to serialize before _default_encode # try using that to serialize before _default_encode
_additional_default: DefaultEncoder | None = kwargs.get("default") _additional_default: Optional[DefaultEncoder] = kwargs.get("default")
if _additional_default is not None and callable(_additional_default): if _additional_default is not None and callable(_additional_default):
def wrapped_default(obj: Any) -> Any: def wrapped_default(obj: Any) -> Any:
@ -80,29 +74,22 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]:
kwargs["default"] = use_default kwargs["default"] = use_default
prefer_factory: str | None = kwargs.pop('_prefer_factory', None)
def orjson_factory() -> Dumps | None:
try: try:
import orjson import orjson
except ModuleNotFoundError:
return None
# todo: add orjson.OPT_NON_STR_KEYS? would require some bitwise ops # todo: add orjson.OPT_NON_STR_KEYS? would require some bitwise ops
# most keys are typically attributes from a NT/Dataclass, # most keys are typically attributes from a NT/Dataclass,
# so most seem to work: https://github.com/ijl/orjson#opt_non_str_keys # so most seem to work: https://github.com/ijl/orjson#opt_non_str_keys
def _orjson_dumps(obj: Any) -> str: # TODO rename? def _orjson_dumps(obj: Any) -> str:
# orjson returns json as bytes, encode to string # orjson returns json as bytes, encode to string
return orjson.dumps(obj, **kwargs).decode('utf-8') return orjson.dumps(obj, **kwargs).decode('utf-8')
return _orjson_dumps return _orjson_dumps
except ModuleNotFoundError:
pass
def simplejson_factory() -> Dumps | None:
try: try:
from simplejson import dumps as simplejson_dumps from simplejson import dumps as simplejson_dumps
except ModuleNotFoundError:
return None
# if orjson couldn't be imported, try simplejson # if orjson couldn't be imported, try simplejson
# This is included for compatibility reasons because orjson # This is included for compatibility reasons because orjson
# is rust-based and compiling on rarer architectures may not work # is rust-based and compiling on rarer architectures may not work
@ -117,42 +104,23 @@ def _dumps_factory(**kwargs) -> Callable[[Any], str]:
return _simplejson_dumps return _simplejson_dumps
def stdlib_factory() -> Dumps | None: except ModuleNotFoundError:
import json pass
import json
from .warnings import high from .warnings import high
high( high("You might want to install 'orjson' to support serialization for lots more types! If that does not work for you, you can install 'simplejson' instead")
"You might want to install 'orjson' to support serialization for lots more types! If that does not work for you, you can install 'simplejson' instead"
)
def _stdlib_dumps(obj: Any) -> str: def _stdlib_dumps(obj: Any) -> str:
return json.dumps(obj, **kwargs) return json.dumps(obj, **kwargs)
return _stdlib_dumps return _stdlib_dumps
factories = {
'orjson': orjson_factory,
'simplejson': simplejson_factory,
'stdlib': stdlib_factory,
}
if prefer_factory is not None:
factory = factories[prefer_factory]
res = factory()
assert res is not None, prefer_factory
return res
for factory in factories.values():
res = factory()
if res is not None:
return res
raise RuntimeError("Should not happen!")
def dumps( def dumps(
obj: Any, obj: Any,
default: DefaultEncoder | None = None, default: Optional[DefaultEncoder] = None,
**kwargs, **kwargs,
) -> str: ) -> str:
""" """
@ -185,17 +153,8 @@ def dumps(
return _dumps_factory(default=default, **kwargs)(obj) return _dumps_factory(default=default, **kwargs)(obj)
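A sketch of how dumps behaves given the default encoder above; the import path is an assumption (in HPI this module lives at my.core.serialize), and _prefer_factory is the testing kwarg introduced in this diff:

from dataclasses import dataclass
from pathlib import Path

from my.core.serialize import dumps  # assumed import path

@dataclass
class Point:
    x: int

print(dumps({"p": Point(1), "f": Path("/tmp/x")}))  # dataclasses -> dicts, Paths -> strings
print(dumps([1, 2, 3], _prefer_factory="stdlib"))   # force the stdlib backend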
@parametrize('factory', ['orjson', 'simplejson', 'stdlib']) def test_serialize_fallback() -> None:
def test_dumps(factory: str) -> None: import json as jsn # don't cause possible conflicts with module code
import pytest
orig_dumps = globals()['dumps'] # hack to prevent error from using local variable before declaring
def dumps(*args, **kwargs) -> str:
kwargs['_prefer_factory'] = factory
return orig_dumps(*args, **kwargs)
import json as json_builtin # don't cause possible conflicts with module code
# can't use a namedtuple here, since the default json.dump serializer # can't use a namedtuple here, since the default json.dump serializer
# serializes namedtuples as tuples, which become arrays # serializes namedtuples as tuples, which become arrays
@ -206,12 +165,36 @@ def test_dumps(factory: str) -> None:
# the lru_cache'd warning may have already been sent, # the lru_cache'd warning may have already been sent,
# so checking may be nondeterministic? # so checking may be nondeterministic?
import warnings import warnings
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore") warnings.simplefilter("ignore")
res = json_builtin.loads(dumps(X)) res = jsn.loads(dumps(X))
assert res == [5, 5.0] assert res == [5, 5.0]
# this needs to be defined here to prevent a mypy bug
# see https://github.com/python/mypy/issues/7281
class _A(NamedTuple):
x: int
y: float
def test_nt_serialize() -> None:
import json as jsn # don't cause possible conflicts with module code
import orjson # import to make sure this is installed
res: str = dumps(_A(x=1, y=2.0))
assert res == '{"x":1,"y":2.0}'
# test orjson option kwarg
data = {datetime.date(year=1970, month=1, day=1): 5}
res2 = jsn.loads(dumps(data, option=orjson.OPT_NON_STR_KEYS))
assert res2 == {'1970-01-01': 5}
def test_default_serializer() -> None:
import pytest
import json as jsn # don't cause possible conflicts with module code
class Unserializable: class Unserializable:
def __init__(self, x: int): def __init__(self, x: int):
self.x = x self.x = x
@ -225,7 +208,7 @@ def test_dumps(factory: str) -> None:
def _serialize(self) -> Any: def _serialize(self) -> Any:
return {"x": self.x, "y": self.y} return {"x": self.x, "y": self.y}
res = json_builtin.loads(dumps(WithUnderscoreSerialize(6))) res = jsn.loads(dumps(WithUnderscoreSerialize(6)))
assert res == {"x": 6, "y": 6.0} assert res == {"x": 6, "y": 6.0}
# test passing additional 'default' func # test passing additional 'default' func
@ -237,25 +220,5 @@ def test_dumps(factory: str) -> None:
# this serializes both Unserializable, which is a custom type otherwise # this serializes both Unserializable, which is a custom type otherwise
# not handled, and timedelta, which is handled by the '_default_encode' # not handled, and timedelta, which is handled by the '_default_encode'
# in the 'wrapped_default' function # in the 'wrapped_default' function
res2 = json_builtin.loads(dumps(Unserializable(10), default=_serialize_with_default)) res2 = jsn.loads(dumps(Unserializable(10), default=_serialize_with_default))
assert res2 == {"x": 10, "y": 10.0} assert res2 == {"x": 10, "y": 10.0}
if factory == 'orjson':
import orjson
# test orjson option kwarg
data = {datetime.date(year=1970, month=1, day=1): 5}
res2 = json_builtin.loads(dumps(data, option=orjson.OPT_NON_STR_KEYS))
assert res2 == {'1970-01-01': 5}
@parametrize('factory', ['orjson', 'simplejson'])
def test_dumps_namedtuple(factory: str) -> None:
import json as json_builtin # don't cause possible conflicts with module code
class _A(NamedTuple):
x: int
y: float
res: str = dumps(_A(x=1, y=2.0), _prefer_factory=factory)
assert json_builtin.loads(res) == {'x': 1, 'y': 2.0}
View file
@ -3,12 +3,9 @@ Decorator to gracefully handle importing a data source, or warning
and yielding nothing (or a default) when its not available and yielding nothing (or a default) when its not available
""" """
from __future__ import annotations
import warnings
from collections.abc import Iterable, Iterator
from functools import wraps from functools import wraps
from typing import Any, Callable, TypeVar from typing import Any, Iterator, TypeVar, Callable, Optional, Iterable
import warnings
from .warnings import medium from .warnings import medium
@ -29,8 +26,8 @@ _DEFAULT_ITR = ()
def import_source( def import_source(
*, *,
default: Iterable[T] = _DEFAULT_ITR, default: Iterable[T] = _DEFAULT_ITR,
module_name: str | None = None, module_name: Optional[str] = None,
help_url: str | None = None, help_url: Optional[str] = None,
) -> Callable[..., Callable[..., Iterator[T]]]: ) -> Callable[..., Callable[..., Iterator[T]]]:
""" """
doesn't really play well with types, but is used to catch doesn't really play well with types, but is used to catch
@ -53,7 +50,6 @@ def import_source(
except (ImportError, AttributeError) as err: except (ImportError, AttributeError) as err:
from . import core_config as CC from . import core_config as CC
from .error import warn_my_config_import_error from .error import warn_my_config_import_error
suppressed_in_conf = False suppressed_in_conf = False
if module_name is not None and CC.config._is_module_active(module_name) is False: if module_name is not None and CC.config._is_module_active(module_name) is False:
suppressed_in_conf = True suppressed_in_conf = True
@ -65,18 +61,16 @@ def import_source(
warnings.warn(f"""If you don't want to use this module, to hide this message, add '{module_name}' to your core config disabled_modules in your config, like: warnings.warn(f"""If you don't want to use this module, to hide this message, add '{module_name}' to your core config disabled_modules in your config, like:
class core: class core:
disabled_modules = [{module_name!r}] disabled_modules = [{repr(module_name)}]
""", stacklevel=1) """)
# try to check if this is a config error or based on dependencies not being installed # try to check if this is a config error or based on dependencies not being installed
if isinstance(err, (ImportError, AttributeError)): if isinstance(err, (ImportError, AttributeError)):
matched_config_err = warn_my_config_import_error(err, module_name=module_name, help_url=help_url) matched_config_err = warn_my_config_import_error(err, help_url=help_url)
# if we determined this wasn't a config error, and it was an attribute error # if we determined this wasn't a config error, and it was an attribute error
# it could be *any* attribute error -- we should raise this since its otherwise a fatal error # it could be *any* attribute error -- we should raise this since its otherwise a fatal error
# from some code in the module failing # from some code in the module failing
if not matched_config_err and isinstance(err, AttributeError): if not matched_config_err and isinstance(err, AttributeError):
raise err raise err
yield from default yield from default
return wrapper return wrapper
return decorator return decorator
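A hedged usage sketch of the decorator above (the module and function names are hypothetical):

from collections.abc import Iterator
from typing import Any

@import_source(module_name="my.somesource")
def events() -> Iterator[Any]:
    from my.somesource import events as upstream  # hypothetical data source
    yield from upstream()

# if my.somesource (or its config) is missing, this warns and yields nothing
# instead of raising -- unless the failure was an unrelated AttributeError
list(events())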
View file
@ -1,19 +1,15 @@
from __future__ import annotations from .common import assert_subpackage; assert_subpackage(__name__)
from .internal import assert_subpackage # noqa: I001
assert_subpackage(__name__)
import shutil
import sqlite3
from collections.abc import Iterator
from contextlib import contextmanager from contextlib import contextmanager
from pathlib import Path from pathlib import Path
import shutil
import sqlite3
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from typing import Any, Callable, Literal, Union, overload from typing import Tuple, Any, Iterator, Callable, Optional, Union, Literal
from .common import PathIsh
from .compat import assert_never from .common import PathIsh, assert_never
def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection: def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection:
@ -26,7 +22,6 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None:
conn.execute('CREATE TABLE testtable (col)') conn.execute('CREATE TABLE testtable (col)')
import pytest import pytest
with pytest.raises(sqlite3.OperationalError, match='readonly database'): with pytest.raises(sqlite3.OperationalError, match='readonly database'):
with sqlite_connect_immutable(db) as conn: with sqlite_connect_immutable(db) as conn:
conn.execute('DROP TABLE testtable') conn.execute('DROP TABLE testtable')
@ -38,17 +33,15 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None:
SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any] SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]
def dict_factory(cursor, row): def dict_factory(cursor, row):
fields = [column[0] for column in cursor.description] fields = [column[0] for column in cursor.description]
return dict(zip(fields, row)) return {key: value for key, value in zip(fields, row)}
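For context, a hedged sketch of how a row factory gets used with sqlite_connection (defined just below; the database path is made up):

with sqlite_connection("/tmp/some.db", immutable=True, row_factory="dict") as conn:
    for row in conn.execute("SELECT name, type FROM sqlite_master"):
        print(row["name"], row["type"])  # rows come back as dicts thanks to dict_factory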
Factory = Union[SqliteRowFactory, Literal['row', 'dict']] Factory = Union[SqliteRowFactory, Literal['row', 'dict']]
@contextmanager @contextmanager
def sqlite_connection(db: PathIsh, *, immutable: bool = False, row_factory: Factory | None = None) -> Iterator[sqlite3.Connection]: def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]:
dbp = f'file:{db}' dbp = f'file:{db}'
# https://www.sqlite.org/draft/uri.html#uriimmutable # https://www.sqlite.org/draft/uri.html#uriimmutable
if immutable: if immutable:
@ -104,76 +97,32 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
# and then the return type ends up as Iterator[Tuple[str, ...]], which isn't desirable :( # and then the return type ends up as Iterator[Tuple[str, ...]], which isn't desirable :(
# a bit annoying to have this copy-pasting, but hopefully not a big issue # a bit annoying to have this copy-pasting, but hopefully not a big issue
# fmt: off from typing import overload
@overload @overload
def select(cols: tuple[str ], rest: str, *, db: sqlite3.Connection) -> \ def select(cols: Tuple[str ], rest: str, *, db: sqlite3.Connection) -> \
Iterator[tuple[Any ]]: ... Iterator[Tuple[Any ]]: ...
@overload @overload
def select(cols: tuple[str, str ], rest: str, *, db: sqlite3.Connection) -> \ def select(cols: Tuple[str, str ], rest: str, *, db: sqlite3.Connection) -> \
Iterator[tuple[Any, Any ]]: ... Iterator[Tuple[Any, Any ]]: ...
@overload @overload
def select(cols: tuple[str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ def select(cols: Tuple[str, str, str ], rest: str, *, db: sqlite3.Connection) -> \
Iterator[tuple[Any, Any, Any ]]: ... Iterator[Tuple[Any, Any, Any ]]: ...
@overload @overload
def select(cols: tuple[str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ def select(cols: Tuple[str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \
Iterator[tuple[Any, Any, Any, Any ]]: ... Iterator[Tuple[Any, Any, Any, Any ]]: ...
@overload @overload
def select(cols: tuple[str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ def select(cols: Tuple[str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \
Iterator[tuple[Any, Any, Any, Any, Any ]]: ... Iterator[Tuple[Any, Any, Any, Any, Any ]]: ...
@overload @overload
def select(cols: tuple[str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ def select(cols: Tuple[str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \
Iterator[tuple[Any, Any, Any, Any, Any, Any ]]: ... Iterator[Tuple[Any, Any, Any, Any, Any, Any ]]: ...
@overload @overload
def select(cols: tuple[str, str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \ def select(cols: Tuple[str, str, str, str, str, str, str ], rest: str, *, db: sqlite3.Connection) -> \
Iterator[tuple[Any, Any, Any, Any, Any, Any, Any ]]: ... Iterator[Tuple[Any, Any, Any, Any, Any, Any, Any ]]: ...
@overload @overload
def select(cols: tuple[str, str, str, str, str, str, str, str], rest: str, *, db: sqlite3.Connection) -> \ def select(cols: Tuple[str, str, str, str, str, str, str, str], rest: str, *, db: sqlite3.Connection) -> \
Iterator[tuple[Any, Any, Any, Any, Any, Any, Any, Any]]: ... Iterator[Tuple[Any, Any, Any, Any, Any, Any, Any, Any]]: ...
# fmt: on
def select(cols, rest, *, db): def select(cols, rest, *, db):
# db arg is last cause that results in nicer code formatting.. # db arg is last cause that results in nicer code formatting..
return db.execute('SELECT ' + ','.join(cols) + ' ' + rest) return db.execute('SELECT ' + ','.join(cols) + ' ' + rest)
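The overloads above only exist to give the result tuple the same arity as the cols tuple; a hedged usage sketch (conn is a connection from sqlite_connection above):

# mypy infers Iterator[tuple[Any, Any]] here, since cols has two elements
for name, type_ in select(("name", "type"), "FROM sqlite_master", db=conn):
    print(name, type_)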
class SqliteTool:
def __init__(self, connection: sqlite3.Connection) -> None:
self.connection = connection
def _get_sqlite_master(self) -> dict[str, str]:
res = {}
for c in self.connection.execute('SELECT name, type FROM sqlite_master'):
[name, type_] = c
assert type_ in {'table', 'index', 'view', 'trigger'}, (name, type_) # just in case
res[name] = type_
return res
def get_table_names(self) -> list[str]:
master = self._get_sqlite_master()
res = []
for name, type_ in master.items():
if type_ != 'table':
continue
res.append(name)
return res
def get_table_schema(self, name: str) -> dict[str, str]:
"""
Returns map from column name to column type
NOTE: Sometimes this doesn't work if the db has some extensions (e.g. happens for facebook apps)
In this case you might still be able to use get_table_names
"""
schema: dict[str, str] = {}
for row in self.connection.execute(f'PRAGMA table_info(`{name}`)'):
col = row[1]
type_ = row[2]
# hmm, somewhere between 3.34.1 and 3.37.2, sqlite started normalising type names to uppercase
# let's do this just in case since python < 3.10 are using the old version
# e.g. it could have returned 'blob' and that would confuse blob check (see _check_allowed_blobs)
type_ = type_.upper()
schema[col] = type_
return schema
def get_table_schemas(self) -> dict[str, dict[str, str]]:
return {name: self.get_table_schema(name) for name in self.get_table_names()}
View file
@ -1,178 +1,23 @@
''' '''
Helpers for hpi doctor/stats functionality. Helpers for hpi doctor/stats functionality.
''' '''
import collections
from __future__ import annotations
import collections.abc
import importlib import importlib
import inspect import inspect
import typing import typing
from collections.abc import Iterable, Iterator, Sequence from typing import Optional, Callable, Any, Iterator, Sequence, Dict, List
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
from types import ModuleType
from typing import (
Any,
Callable,
Protocol,
cast,
)
from .types import asdict from .common import StatsFun, Stats, stat
Stats = dict[str, Any]
class StatsFun(Protocol):
def __call__(self, *, quick: bool = False) -> Stats: ...
# global state that turns on/off quick stats
# can use the 'quick_stats' contextmanager
# to enable/disable this in cli so that module 'stats'
# functions don't have to implement custom 'quick' logic
QUICK_STATS = False
# in case user wants to use the stats functions/quick option
# elsewhere -- can use this decorator instead of editing
# the global state directly
@contextmanager
def quick_stats():
global QUICK_STATS
prev = QUICK_STATS
try:
QUICK_STATS = True
yield
finally:
QUICK_STATS = prev
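A hedged sketch of the toggle above: inside the context manager, stat (defined just below) only consumes the first 100 items of the iterable:

with quick_stats():
    res = stat(lambda: iter(range(10_000)), name="numbers")
print(res)  # something like {'numbers': {'count': '100+'}} -- the iterator isn't exhausted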
def stat(
func: Callable[[], Iterable[Any]] | Iterable[Any],
*,
quick: bool = False,
name: str | None = None,
) -> Stats:
"""
Extracts various statistics from a passed iterable/callable, e.g.:
- number of items
- first/last item
- timestamps associated with first/last item
If quick is set, then only the first 100 items of the iterable will be processed
"""
if callable(func):
fr = func()
if hasattr(fr, '__enter__') and hasattr(fr, '__exit__'):
# context managers have Iterable type, but they aren't data providers
# sadly doesn't look like there is a way to tell from typing annotations
# Ideally we'd detect this in is_data_provider...
# but there is no way of knowing without actually calling it first :(
return {}
fname = func.__name__
else:
# meh. means it's just a list.. not sure how to generate a name then
fr = func
fname = f'unnamed_{id(fr)}'
type_name = type(fr).__name__
extras = {}
if type_name == 'DataFrame':
# dynamic, because pandas is an optional dependency..
df = cast(Any, fr) # todo ugh, not sure how to annotate properly
df = df.reset_index()
fr = df.to_dict(orient='records')
dtypes = df.dtypes.to_dict()
extras['dtypes'] = dtypes
res = _stat_iterable(fr, quick=quick)
res.update(extras)
stat_name = name if name is not None else fname
return {
stat_name: res,
}
def test_stat() -> None:
# the bulk of testing is in test_stat_iterable
# works with 'anonymous' lists
res = stat([1, 2, 3])
[(name, v)] = res.items()
# note: name will be a little funny since anonymous list doesn't have one
assert v == {'count': 3}
#
# works with functions:
def fun():
return [4, 5, 6]
assert stat(fun) == {'fun': {'count': 3}}
#
# context managers are technically iterable
# , but usually we wouldn't want to compute stats for them
# this is mainly intended for guess_stats,
# since it can't tell whether the function is a ctx manager without calling it
@contextmanager
def cm():
yield 1
yield 3
assert stat(cm) == {} # type: ignore[arg-type]
#
# works with pandas dataframes
import numpy as np
import pandas as pd
def df() -> pd.DataFrame:
dates = pd.date_range(start='2024-02-10 08:00', end='2024-02-11 16:00', freq='5h')
return pd.DataFrame([f'value{i}' for i, _ in enumerate(dates)], index=dates, columns=['value'])
assert stat(df) == {
'df': {
'count': 7,
'dtypes': {
'index': np.dtype('<M8[ns]'),
'value': np.dtype('O'),
},
'first': pd.Timestamp('2024-02-10 08:00'),
'last': pd.Timestamp('2024-02-11 14:00'),
},
}
#
def get_stats(module_name: str, *, guess: bool = False) -> StatsFun | None:
stats: StatsFun | None = None
try:
module = importlib.import_module(module_name)
except Exception:
return None
stats = getattr(module, 'stats', None)
if stats is None:
stats = guess_stats(module)
return stats
# TODO maybe could be enough to annotate OUTPUTS or something like that? # TODO maybe could be enough to annotate OUTPUTS or something like that?
# then stats could just use them as hints? # then stats could just use them as hints?
def guess_stats(module: ModuleType) -> StatsFun | None: def guess_stats(module_name: str, quick: bool = False) -> Optional[StatsFun]:
""" providers = guess_data_providers(module_name)
If the module doesn't have explicitly defined 'stat' function,
this is used to try to guess what could be included in stats automatically
"""
providers = _guess_data_providers(module)
if len(providers) == 0: if len(providers) == 0:
return None return None
def auto_stats(*, quick: bool = False) -> Stats: def auto_stats() -> Stats:
res = {} res = {}
for k, v in providers.items(): for k, v in providers.items():
res.update(stat(v, quick=quick, name=k)) res.update(stat(v, quick=quick, name=k))
@ -182,11 +27,12 @@ def guess_stats(module: ModuleType) -> StatsFun | None:
def test_guess_stats() -> None: def test_guess_stats() -> None:
from datetime import datetime
import my.core.tests.auto_stats as M import my.core.tests.auto_stats as M
auto_stats = guess_stats(M) auto_stats = guess_stats(M.__name__)
assert auto_stats is not None assert auto_stats is not None
res = auto_stats(quick=False) res = auto_stats()
assert res == { assert res == {
'inputs': { 'inputs': {
@ -202,15 +48,15 @@ def test_guess_stats() -> None:
} }
def _guess_data_providers(module: ModuleType) -> dict[str, Callable]: def guess_data_providers(module_name: str) -> Dict[str, Callable]:
module = importlib.import_module(module_name)
mfunctions = inspect.getmembers(module, inspect.isfunction) mfunctions = inspect.getmembers(module, inspect.isfunction)
return {k: v for k, v in mfunctions if is_data_provider(v)} return {k: v for k, v in mfunctions if is_data_provider(v)}
# todo how to exclude deprecated data providers? # todo how to exclude deprecated stuff?
def is_data_provider(fun: Any) -> bool: def is_data_provider(fun: Any) -> bool:
""" """
Criteria for being a "data provider":
1. returns iterable or something like that 1. returns iterable or something like that
2. takes no arguments? (otherwise not callable by stats anyway?) 2. takes no arguments? (otherwise not callable by stats anyway?)
3. doesn't start with an underscore (those are probably helper functions?) 3. doesn't start with an underscore (those are probably helper functions?)
@ -226,7 +72,7 @@ def is_data_provider(fun: Any) -> bool:
return False return False
# has at least one argument without default values # has at least one argument without default values
if len(list(_sig_required_params(sig))) > 0: if len(list(sig_required_params(sig))) > 0:
return False return False
if hasattr(fun, '__name__'): if hasattr(fun, '__name__'):
@ -242,7 +88,7 @@ def is_data_provider(fun: Any) -> bool:
if return_type is None: if return_type is None:
return False return False
return _type_is_iterable(return_type) return type_is_iterable(return_type)
def test_is_data_provider() -> None: def test_is_data_provider() -> None:
@ -253,42 +99,34 @@ def test_is_data_provider() -> None:
def no_return_type(): def no_return_type():
return [1, 2, 3] return [1, 2, 3]
assert not idp(no_return_type) assert not idp(no_return_type)
lam = lambda: [1, 2] lam = lambda: [1, 2]
assert not idp(lam) assert not idp(lam)
def has_extra_args(count) -> list[int]: def has_extra_args(count) -> List[int]:
return list(range(count)) return list(range(count))
assert not idp(has_extra_args) assert not idp(has_extra_args)
def has_return_type() -> Sequence[str]: def has_return_type() -> Sequence[str]:
return ['a', 'b', 'c'] return ['a', 'b', 'c']
assert idp(has_return_type) assert idp(has_return_type)
def _helper_func() -> Iterator[Any]: def _helper_func() -> Iterator[Any]:
yield 1 yield 1
assert not idp(_helper_func) assert not idp(_helper_func)
def inputs() -> Iterator[Any]: def inputs() -> Iterator[Any]:
yield 1 yield 1
assert idp(inputs) assert idp(inputs)
def producer_inputs() -> Iterator[Any]: def producer_inputs() -> Iterator[Any]:
yield 1 yield 1
assert idp(producer_inputs) assert idp(producer_inputs)
def _sig_required_params(sig: inspect.Signature) -> Iterator[inspect.Parameter]: # return any parameters the user is required to provide - those which don't have default values
""" def sig_required_params(sig: inspect.Signature) -> Iterator[inspect.Parameter]:
Returns parameters the user is required to provide - e.g. ones that don't have default values
"""
for param in sig.parameters.values(): for param in sig.parameters.values():
if param.default == inspect.Parameter.empty: if param.default == inspect.Parameter.empty:
yield param yield param
@ -298,24 +136,21 @@ def test_sig_required_params() -> None:
def x() -> int: def x() -> int:
return 5 return 5
assert len(list(sig_required_params(inspect.signature(x)))) == 0
assert len(list(_sig_required_params(inspect.signature(x)))) == 0
def y(arg: int) -> int: def y(arg: int) -> int:
return arg return arg
assert len(list(sig_required_params(inspect.signature(y)))) == 1
assert len(list(_sig_required_params(inspect.signature(y)))) == 1
# from stats perspective, this should be treated as a data provider as well # from stats perspective, this should be treated as a data provider as well
# could be that the default value to the data provider is the 'default' # could be that the default value to the data provider is the 'default'
# path to use for inputs/a function to provide input data # path to use for inputs/a function to provide input data
def z(arg: int = 5) -> int: def z(arg: int = 5) -> int:
return arg return arg
assert len(list(sig_required_params(inspect.signature(z)))) == 0
assert len(list(_sig_required_params(inspect.signature(z)))) == 0
def _type_is_iterable(type_spec) -> bool: def type_is_iterable(type_spec) -> bool:
origin = typing.get_origin(type_spec) origin = typing.get_origin(type_spec)
if origin is None: if origin is None:
return False return False
@ -332,139 +167,14 @@ def _type_is_iterable(type_spec) -> bool:
# todo docstring test? # todo docstring test?
def test_type_is_iterable() -> None: def test_type_is_iterable() -> None:
fun = _type_is_iterable from typing import List, Sequence, Iterable, Dict, Any
fun = type_is_iterable
assert not fun(None) assert not fun(None)
assert not fun(int) assert not fun(int)
assert not fun(Any) assert not fun(Any)
assert not fun(dict[int, int]) assert not fun(Dict[int, int])
assert fun(list[int]) assert fun(List[int])
assert fun(Sequence[dict[str, str]]) assert fun(Sequence[Dict[str, str]])
assert fun(Iterable[Any]) assert fun(Iterable[Any])
def _stat_item(item):
if item is None:
return None
if isinstance(item, Path):
return str(item)
return _guess_datetime(item)
def _stat_iterable(it: Iterable[Any], *, quick: bool = False) -> Stats:
from more_itertools import first, ilen, take
# todo not sure if there is something in more_itertools to compute this?
total = 0
errors = 0
first_item = None
last_item = None
def funcit():
nonlocal errors, first_item, last_item, total
for x in it:
total += 1
if isinstance(x, Exception):
errors += 1
else:
last_item = x
if first_item is None:
first_item = x
yield x
eit = funcit()
count: Any
if quick or QUICK_STATS:
initial = take(100, eit)
count = len(initial)
if first(eit, None) is not None: # todo can actually be none...
# haven't exhausted
count = f'{count}+'
else:
count = ilen(eit)
res = {
'count': count,
}
if total == 0:
# not sure but I guess a good balance? wouldn't want to throw early here?
res['warning'] = 'THE ITERABLE RETURNED NO DATA'
if errors > 0:
res['errors'] = errors
if (stat_first := _stat_item(first_item)) is not None:
res['first'] = stat_first
if (stat_last := _stat_item(last_item)) is not None:
res['last'] = stat_last
return res
def test_stat_iterable() -> None:
from datetime import datetime, timedelta, timezone
from typing import NamedTuple
dd = datetime.fromtimestamp(123, tz=timezone.utc)
day = timedelta(days=3)
class X(NamedTuple):
x: int
d: datetime
def it():
yield RuntimeError('oops!')
for i in range(2):
yield X(x=i, d=dd + day * i)
yield RuntimeError('bad!')
for i in range(3):
yield X(x=i * 10, d=dd + day * (i * 10))
yield X(x=123, d=dd + day * 50)
res = _stat_iterable(it())
assert res['count'] == 1 + 2 + 1 + 3 + 1
assert res['errors'] == 1 + 1
assert res['last'] == dd + day * 50
# experimental, not sure about it..
def _guess_datetime(x: Any) -> datetime | None:
# todo hmm implement without exception..
try:
d = asdict(x)
except: # noqa: E722 bare except
return None
for v in d.values():
if isinstance(v, datetime):
return v
return None
def test_guess_datetime() -> None:
from dataclasses import dataclass
from typing import NamedTuple
from .compat import fromisoformat
dd = fromisoformat('2021-02-01T12:34:56Z')
class A(NamedTuple):
x: int
class B(NamedTuple):
x: int
created: datetime
assert _guess_datetime(A(x=4)) is None
assert _guess_datetime(B(x=4, created=dd)) == dd
@dataclass
class C:
a: datetime
x: int
assert _guess_datetime(C(a=dd, x=435)) == dd
# TODO not sure what to return when multiple datetime fields?
# TODO test @property?
View file
@ -1,22 +1,20 @@
from __future__ import annotations
import atexit
import os import os
import shutil import shutil
import sys
import tarfile
import tempfile import tempfile
import zipfile import zipfile
from collections.abc import Generator, Sequence import atexit
from typing import Sequence, Generator, List, Union, Tuple
from contextlib import contextmanager from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from .logging import make_logger from .common import LazyLogger
logger = make_logger(__name__, level="info")
def _structure_exists(base_dir: Path, paths: Sequence[str], *, partial: bool = False) -> bool: logger = LazyLogger(__name__, level="info")
def _structure_exists(base_dir: Path, paths: Sequence[str], partial: bool = False) -> bool:
""" """
Helper function for match_structure to check if Helper function for match_structure to check if
all subpaths exist at some base directory all subpaths exist at some base directory
@ -38,18 +36,17 @@ def _structure_exists(base_dir: Path, paths: Sequence[str], *, partial: bool = F
ZIP_EXT = {".zip"} ZIP_EXT = {".zip"}
TARGZ_EXT = {".tar.gz"}
@contextmanager @contextmanager
def match_structure( def match_structure(
base: Path, base: Path,
expected: str | Sequence[str], expected: Union[str, Sequence[str]],
*, *,
partial: bool = False, partial: bool = False,
) -> Generator[tuple[Path, ...], None, None]: ) -> Generator[Tuple[Path, ...], None, None]:
""" """
Given a 'base' directory or archive (zip/tar.gz), recursively search for one or more paths that match the Given a 'base' directory or zipfile, recursively search for one or more paths that match the
pattern described in 'expected'. That can be a single string, or a list pattern described in 'expected'. That can be a single string, or a list
of relative paths (as strings) you expect at the same directory. of relative paths (as strings) you expect at the same directory.
@ -57,12 +54,12 @@ def match_structure(
expected be present, not all of them. expected be present, not all of them.
This reduces the chances of the user misconfiguring gdpr exports, e.g. This reduces the chances of the user misconfiguring gdpr exports, e.g.
if they archived the folders instead of the parent directory or vice-versa if they zipped the folders instead of the parent directory or vice-versa
When this finds a matching directory structure, it stops searching in that subdirectory When this finds a matching directory structure, it stops searching in that subdirectory
and continues onto other possible subdirectories which could match and continues onto other possible subdirectories which could match
If base is an archive, this extracts it into a temporary directory If base is a zipfile, this extracts the zipfile into a temporary directory
(configured by core_config.config.get_tmp_dir), and then searches the extracted (configured by core_config.config.get_tmp_dir), and then searches the extracted
folder for matching structures folder for matching structures
@ -72,21 +69,21 @@ def match_structure(
export_dir export_dir
    exp_2020     exp_2020
        channel_data         channel_data
            data1             data1
            data2             data2
        index.json         index.json
        messages         messages
            messages.csv             messages.csv
        profile         profile
            settings.json             settings.json
    exp_2021     exp_2021
        channel_data         channel_data
            data1             data1
            data2             data2
        index.json         index.json
        messages         messages
            messages.csv             messages.csv
        profile         profile
            settings.json             settings.json
@ -98,12 +95,12 @@ def match_structure(
This doesn't require an exhaustive list of expected values, but it's a good idea to supply This doesn't require an exhaustive list of expected values, but it's a good idea to supply
a complete picture of the expected structure to avoid false-positives a complete picture of the expected structure to avoid false-positives
This does not recursively decompress archives in the subdirectories, This does not recursively unzip zipfiles in the subdirectories,
it only unpacks into a temporary directory if 'base' is an archive it only unzips into a temporary directory if 'base' is a zipfile
A common pattern for using this might be to use get_files to get a list A common pattern for using this might be to use get_files to get a list
of archives or top-level gdpr export directories, and use match_structure of zipfiles or top-level gdpr export directories, and use match_structure
to search the resulting paths for an export structure you're expecting to search the resulting paths for a export structure you're expecting
""" """
from . import core_config as CC from . import core_config as CC
@ -113,37 +110,29 @@ def match_structure(
expected = (expected,) expected = (expected,)
is_zip: bool = base.suffix in ZIP_EXT is_zip: bool = base.suffix in ZIP_EXT
is_targz: bool = any(base.name.endswith(suffix) for suffix in TARGZ_EXT)
searchdir: Path = base.absolute() searchdir: Path = base.absolute()
try: try:
# if the file given by the user is an archive, create a temporary # if the file given by the user is a zipfile, create a temporary
# directory and extract it to that temporary directory # directory and extract the zipfile to that temporary directory
# #
# this temporary directory is removed in the finally block # this temporary directory is removed in the finally block
if is_zip or is_targz: if is_zip:
# sanity check before we start creating directories/rm-tree'ing things # sanity check before we start creating directories/rm-tree'ing things
assert base.exists(), f"archive at {base} doesn't exist" assert base.exists(), f"zipfile at {base} doesn't exist"
searchdir = Path(tempfile.mkdtemp(dir=tdir)) searchdir = Path(tempfile.mkdtemp(dir=tdir))
if is_zip:
# base might already be a ZipPath, and str(base) would end with / # base might already be a ZipPath, and str(base) would end with /
zf = zipfile.ZipFile(str(base).rstrip('/')) zf = zipfile.ZipFile(str(base).rstrip('/'))
zf.extractall(path=str(searchdir)) zf.extractall(path=str(searchdir))
elif is_targz:
with tarfile.open(str(base)) as tar:
# filter is a security feature, will be required param in later python version
mfilter = {'filter': 'data'} if sys.version_info[:2] >= (3, 12) else {}
tar.extractall(path=str(searchdir), **mfilter) # type: ignore[arg-type]
else:
raise RuntimeError("can't happen")
else: else:
if not searchdir.is_dir(): if not searchdir.is_dir():
raise NotADirectoryError(f"Expected either a zip/tar.gz archive or a directory, received {searchdir}") raise NotADirectoryError(f"Expected either a zipfile or a directory, received {searchdir}")
matches: list[Path] = [] matches: List[Path] = []
possible_targets: list[Path] = [searchdir] possible_targets: List[Path] = [searchdir]
while len(possible_targets) > 0: while len(possible_targets) > 0:
p = possible_targets.pop(0) p = possible_targets.pop(0)
@ -163,9 +152,9 @@ def match_structure(
finally: finally:
if is_zip or is_targz: if is_zip:
# make sure we're not mistakenly deleting data # make sure we're not mistakenly deleting data
assert str(searchdir).startswith(str(tdir)), f"Expected the temporary directory for extracting archive to start with the temporary directory prefix ({tdir}), found {searchdir}" assert str(searchdir).startswith(str(tdir)), f"Expected the temporary directory for extracting zip to start with the temporary directory prefix ({tdir}), found {searchdir}"
shutil.rmtree(str(searchdir)) shutil.rmtree(str(searchdir))
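A hedged usage sketch of match_structure (the archive path and expected layout are made up):

from pathlib import Path

with match_structure(Path("takeout.zip"), expected=("messages/index.csv",), partial=True) as matches:
    for export_root in matches:
        print(export_root / "messages" / "index.csv")
# the temporary extraction directory is removed once the block exits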
@ -174,7 +163,7 @@ def warn_leftover_files() -> None:
from . import core_config as CC from . import core_config as CC
base_tmp: Path = CC.config.get_tmp_dir() base_tmp: Path = CC.config.get_tmp_dir()
leftover: list[Path] = list(base_tmp.iterdir()) leftover: List[Path] = list(base_tmp.iterdir())
if leftover: if leftover:
logger.debug(f"at exit warning: Found leftover files in temporary directory '{leftover}'. this may be because you have multiple hpi processes running -- if so this can be ignored") logger.debug(f"at exit warning: Found leftover files in temporary directory '{leftover}'. this may be because you have multiple hpi processes running -- if so this can be ignored")
View file
@ -1,12 +1,11 @@
""" """
Helper 'module' for test_guess_stats Helper 'module' for test_guess_stats
""" """
from collections.abc import Iterable, Iterator, Sequence
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timedelta from datetime import datetime, timedelta
from pathlib import Path from pathlib import Path
from typing import Iterable, Sequence, Iterator
@dataclass @dataclass
View file
@ -1,32 +0,0 @@
from __future__ import annotations
import os
from collections.abc import Iterator
from contextlib import contextmanager
import pytest
V = 'HPI_TESTS_USES_OPTIONAL_DEPS'
# TODO use it for serialize tests that are using simplejson/orjson?
skip_if_uses_optional_deps = pytest.mark.skipif(
V not in os.environ,
reason=f'test only works when optional dependencies are installed. Set env variable {V}=true to override.',
)
# TODO maybe move to hpi core?
@contextmanager
def tmp_environ_set(key: str, value: str | None) -> Iterator[None]:
prev_value = os.environ.get(key)
if value is None:
os.environ.pop(key, None)
else:
os.environ[key] = value
try:
yield
finally:
if prev_value is None:
os.environ.pop(key, None)
else:
os.environ[key] = prev_value
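A hedged usage sketch of the helper above:

import os

with tmp_environ_set("MY_CONFIG", "/tmp/my_config"):
    assert os.environ["MY_CONFIG"] == "/tmp/my_config"
# the previous value (or absence) of MY_CONFIG is restored here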
View file
@ -1,9 +1,8 @@
import json
import warnings
from collections.abc import Iterator
from datetime import datetime from datetime import datetime
import json
from pathlib import Path from pathlib import Path
from typing import NamedTuple from typing import NamedTuple, Iterator
import warnings
from ..denylist import DenyList from ..denylist import DenyList
@ -92,7 +91,8 @@ def test_denylist(tmp_path: Path) -> None:
assert "59.40.113.87" not in [i.addr for i in filtered] assert "59.40.113.87" not in [i.addr for i in filtered]
data_json = json.loads(tf.read_text()) with open(tf, "r") as f:
data_json = json.loads(f.read())
assert data_json == [ assert data_json == [
{ {
View file
@ -1,7 +1,7 @@
import shutil
import sqlite3
from concurrent.futures import ProcessPoolExecutor from concurrent.futures import ProcessPoolExecutor
from pathlib import Path from pathlib import Path
import shutil
import sqlite3
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from ..sqlite import sqlite_connect_immutable, sqlite_copy_and_open from ..sqlite import sqlite_connect_immutable, sqlite_copy_and_open
View file
@ -1,8 +1,9 @@
from pathlib import Path from pathlib import Path
from ..structure import match_structure
import pytest import pytest
from ..structure import match_structure
structure_data: Path = Path(__file__).parent / "structure_data" structure_data: Path = Path(__file__).parent / "structure_data"
@ -14,9 +15,8 @@ def test_gdpr_structure_exists() -> None:
assert results == (structure_data / "gdpr_subdirs" / "gdpr_export",) assert results == (structure_data / "gdpr_subdirs" / "gdpr_export",)
@pytest.mark.parametrize("archive", ["gdpr_export.zip", "gdpr_export.tar.gz"]) def test_gdpr_unzip() -> None:
def test_gdpr_unpack(archive: str) -> None: with match_structure(structure_data / "gdpr_export.zip", expected=gdpr_expected) as results:
with match_structure(structure_data / archive, expected=gdpr_expected) as results:
assert len(results) == 1 assert len(results) == 1
extracted = results[0] extracted = results[0]
index_file = extracted / "messages" / "index.csv" index_file = extracted / "messages" / "index.csv"
@ -33,6 +33,6 @@ def test_match_partial() -> None:
def test_not_directory() -> None: def test_not_directory() -> None:
with pytest.raises(NotADirectoryError, match=r"Expected either a zip/tar.gz archive or a directory"): with pytest.raises(NotADirectoryError, match=r"Expected either a zipfile or a directory"):
with match_structure(structure_data / "messages/index.csv", expected=gdpr_expected): with match_structure(structure_data / "messages/index.csv", expected=gdpr_expected):
pass pass
View file
@ -1,52 +0,0 @@
from __future__ import annotations
from .common import skip_if_uses_optional_deps as pytestmark
# TODO ugh, this is very messy.. need to sort out config overriding here
def test_cachew() -> None:
from cachew import settings
settings.ENABLE = True # by default it's off in tests (see conftest.py)
from my.core.cachew import mcachew
called = 0
# TODO ugh. need doublewrap or something to avoid having to pass parens
@mcachew()
def cf() -> list[int]:
nonlocal called
called += 1
return [1, 2, 3]
list(cf())
cc = called
# todo ugh. how to clean cache?
# assert called == 1 # precondition, to avoid turds from previous tests
assert list(cf()) == [1, 2, 3]
assert called == cc
def test_cachew_dir_none() -> None:
from cachew import settings
settings.ENABLE = True # by default it's off in tests (see conftest.py)
from my.core.cachew import cache_dir, mcachew
from my.core.core_config import _reset_config as reset
with reset() as cc:
cc.cache_dir = None
called = 0
@mcachew(cache_path=cache_dir() / 'ctest')
def cf() -> list[int]:
nonlocal called
called += 1
return [called, called, called]
assert list(cf()) == [1, 1, 1]
assert list(cf()) == [2, 2, 2]
View file
@ -1,6 +1,6 @@
import os import os
import sys
from subprocess import check_call from subprocess import check_call
import sys
def test_lists_modules() -> None: def test_lists_modules() -> None:
View file
@ -1,178 +0,0 @@
"""
Various tests that are checking behaviour of user config wrt to various things
"""
import os
import sys
from pathlib import Path
import pytest
import pytz
import my.config
from my.core import notnone
from my.demo import items, make_config
from .common import tmp_environ_set
# TODO would be nice to randomize test order here to catch various config issues
# run the same test multiple times to make sure there are not issues with import order etc
@pytest.mark.parametrize('run_id', ['1', '2'])
def test_override_config(tmp_path: Path, run_id: str) -> None:
class user_config:
username = f'user_{run_id}'
data_path = f'{tmp_path}/*.json'
my.config.demo = user_config # type: ignore[misc, assignment]
[item1, item2] = items()
assert item1.username == f'user_{run_id}'
assert item2.username == f'user_{run_id}'
@pytest.mark.skip(reason="won't work at the moment because of inheritance")
def test_dynamic_config_simplenamespace(tmp_path: Path) -> None:
from types import SimpleNamespace
user_config = SimpleNamespace(
username='user3',
data_path=f'{tmp_path}/*.json',
)
my.config.demo = user_config # type: ignore[misc, assignment]
cfg = make_config()
assert cfg.username == 'user3'
def test_mixin_attribute_handling(tmp_path: Path) -> None:
"""
Tests that arbitrary mixin attributes work with our config handling pattern
"""
nytz = pytz.timezone('America/New_York')
class user_config:
# check that override is taken into the account
timezone = nytz
irrelevant = 'hello'
username = 'UUU'
data_path = f'{tmp_path}/*.json'
my.config.demo = user_config # type: ignore[misc, assignment]
cfg = make_config()
assert cfg.username == 'UUU'
# mypy doesn't know about it, but the attribute is there
assert getattr(cfg, 'irrelevant') == 'hello'
# check that overridden default attribute is actually getting overridden
assert cfg.timezone == nytz
[item1, item2] = items()
assert item1.username == 'UUU'
assert notnone(item1.dt.tzinfo).zone == nytz.zone # type: ignore[attr-defined]
assert item2.username == 'UUU'
assert notnone(item2.dt.tzinfo).zone == nytz.zone # type: ignore[attr-defined]
# use multiple identical tests to make sure there are no issues with cached imports etc
@pytest.mark.parametrize('run_id', ['1', '2'])
def test_dynamic_module_import(tmp_path: Path, run_id: str) -> None:
"""
Test for dynamic hackery in config properties
e.g. importing some external modules
"""
ext = tmp_path / 'external'
ext.mkdir()
(ext / '__init__.py').write_text(
'''
def transform(x):
from .submodule import do_transform
return do_transform(x)
'''
)
(ext / 'submodule.py').write_text(
f'''
def do_transform(x):
return {{"total_{run_id}": sum(x.values())}}
'''
)
class user_config:
username = 'someuser'
data_path = f'{tmp_path}/*.json'
external = f'{ext}'
my.config.demo = user_config # type: ignore[misc, assignment]
[item1, item2] = items()
assert item1.raw == {f'total_{run_id}': 1 + 123}, item1
assert item2.raw == {f'total_{run_id}': 2 + 456}, item2
# need to reset these modules, otherwise they get cached
# kind of relevant to my.core.cfg.tmp_config
sys.modules.pop('external', None)
sys.modules.pop('external.submodule', None)
@pytest.mark.parametrize('run_id', ['1', '2'])
def test_my_config_env_variable(tmp_path: Path, run_id: str) -> None:
"""
Tests handling of MY_CONFIG variable
"""
# ugh. so by this point, my.config is already loaded (default stub), so we need to unload it
sys.modules.pop('my.config', None)
# but my.config itself relies on my.core.init hook, so unless it's reloaded too it wouldn't help
sys.modules.pop('my.core', None)
sys.modules.pop('my.core.init', None)
# it's a bit of a mouthful of course, but in most cases MY_CONFIG would be set once
# , and before hpi runs, so hopefully it's not a huge deal
cfg_dir = tmp_path / 'my'
cfg_file = cfg_dir / 'config.py'
cfg_dir.mkdir()
cfg_file.write_text(
f'''
# print("IMPORTING CONFIG {run_id}")
class demo:
username = 'xxx_{run_id}'
data_path = r'{tmp_path}{os.sep}*.json' # need raw string for windows...
'''
)
with tmp_environ_set('MY_CONFIG', str(tmp_path)):
[item1, item2] = items()
assert item1.username == f'xxx_{run_id}'
assert item2.username == f'xxx_{run_id}'
# sigh.. so this is cached in sys.path
# so it takes precedence later during the next import, not giving the MY_CONFIG hook
# (imported from builtin my.config) a chance to kick in
sys.path.remove(str(tmp_path))
# FIXME ideally this shouldn't be necessary?
# remove this after we fixup my.tests.reddit and my.tests.commits
# (they were failing ci when running all tests)
sys.modules.pop('my.config', None)
@pytest.fixture(autouse=True)
def prepare_data(tmp_path: Path):
(tmp_path / 'data.json').write_text(
'''
[
{"key": 1, "value": 123},
{"key": 2, "value": 456}
]
'''
)

View file

@ -1,15 +1,16 @@
import os import os
from pathlib import Path
import shutil import shutil
import tempfile import tempfile
import zipfile
from pathlib import Path
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import zipfile
import pytest
from ..common import get_files from ..common import get_files
from ..compat import windows
from ..kompress import CPath, ZipPath from ..kompress import CPath, ZipPath
import pytest
# hack to replace all /tmp with 'real' tmp dir # hack to replace all /tmp with 'real' tmp dir
# not ideal, but makes tests more concise # not ideal, but makes tests more concise
@ -55,9 +56,8 @@ def test_single_file() -> None:
''' '''
assert get_files('/tmp/hpi_test/file.ext') == (Path('/tmp/hpi_test/file.ext'),) assert get_files('/tmp/hpi_test/file.ext') == (Path('/tmp/hpi_test/file.ext'),)
is_windows = os.name == 'nt'
"if the path starts with ~, we expand it" "if the path starts with ~, we expand it"
if not is_windows: # windows doesn't have bashrc.. ugh if not windows: # windows doesn't have bashrc.. ugh
assert get_files('~/.bashrc') == (Path('~').expanduser() / '.bashrc',) assert get_files('~/.bashrc') == (Path('~').expanduser() / '.bashrc',)
@ -175,17 +175,12 @@ TMP = tempfile.gettempdir()
test_path = Path(TMP) / 'hpi_test' test_path = Path(TMP) / 'hpi_test'
@pytest.fixture(autouse=True) def setup():
def prepare():
teardown() teardown()
test_path.mkdir() test_path.mkdir()
try:
yield
finally:
teardown()
def teardown() -> None: def teardown():
if test_path.is_dir(): if test_path.is_dir():
shutil.rmtree(test_path) shutil.rmtree(test_path)

View file

@ -12,7 +12,7 @@ def _init_default_config() -> None:
def test_tmp_config() -> None: def test_tmp_config() -> None:
## ugh. ideally this would be on the top level (would be a better test) ## ugh. ideally this would be on the top level (would be a better test)
## but pytest imports everything first, executes hooks, and some reset_modules() fixtures mess stuff up ## but pytest imports eveything first, executes hooks, and some reset_modules() fixtures mess stuff up
## later would be nice to be a bit more careful about them ## later would be nice to be a bit more careful about them
_init_default_config() _init_default_config()
from my.simple import items from my.simple import items

View file

@ -1,11 +1,9 @@
from __future__ import annotations from functools import lru_cache
from typing import Sequence, Dict
from collections.abc import Sequence
from functools import cache, lru_cache
import pytz import pytz
from .types import datetime_aware, datetime_naive from .common import datetime_aware, datetime_naive
def user_forced() -> Sequence[str]: def user_forced() -> Sequence[str]:
@ -13,21 +11,19 @@ def user_forced() -> Sequence[str]:
# https://stackoverflow.com/questions/36067621/python-all-possible-timezone-abbreviations-for-given-timezone-name-and-vise-ve # https://stackoverflow.com/questions/36067621/python-all-possible-timezone-abbreviations-for-given-timezone-name-and-vise-ve
try: try:
from my.config import time as user_config from my.config import time as user_config
return user_config.tz.force_abbreviations # type: ignore[attr-defined]
return user_config.tz.force_abbreviations # type: ignore[attr-defined] # noqa: TRY300
# note: noqa since we're catching case where config doesn't have attribute here as well
except: except:
# todo log/apply policy # todo log/apply policy
return [] return []
@lru_cache(1) @lru_cache(1)
def _abbr_to_timezone_map() -> dict[str, pytz.BaseTzInfo]: def _abbr_to_timezone_map() -> Dict[str, pytz.BaseTzInfo]:
# also force UTC to always correspond to utc # also force UTC to always correspond to utc
# this makes more sense than the Zulu it ends up as by default # this makes more sense than the Zulu it ends up as by default
timezones = [*pytz.all_timezones, 'UTC', *user_forced()] timezones = pytz.all_timezones + ['UTC'] + list(user_forced())
res: dict[str, pytz.BaseTzInfo] = {} res: Dict[str, pytz.BaseTzInfo] = {}
for tzname in timezones: for tzname in timezones:
tz = pytz.timezone(tzname) tz = pytz.timezone(tzname)
infos = getattr(tz, '_tzinfos', []) # not sure if can rely on attr always present? infos = getattr(tz, '_tzinfos', []) # not sure if can rely on attr always present?
@ -46,7 +42,7 @@ def _abbr_to_timezone_map() -> dict[str, pytz.BaseTzInfo]:
return res return res
@cache @lru_cache(maxsize=None)
def abbr_to_timezone(abbr: str) -> pytz.BaseTzInfo: def abbr_to_timezone(abbr: str) -> pytz.BaseTzInfo:
return _abbr_to_timezone_map()[abbr] return _abbr_to_timezone_map()[abbr]
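For context, a condensed runnable sketch of the abbreviation map being built here. As the original notes, it leans on pytz's private _tzinfos attribute (whose keys are (utcoffset, dst, abbreviation) tuples), so treat this as best-effort, not guaranteed API:

import pytz

def abbr_map() -> dict[str, pytz.BaseTzInfo]:
    res: dict[str, pytz.BaseTzInfo] = {}
    for tzname in pytz.all_timezones:
        tz = pytz.timezone(tzname)
        for key in getattr(tz, '_tzinfos', {}):  # private pytz attr, may be absent
            res[key[2]] = tz  # key[2] is the abbreviation, e.g. 'CET'
    res['UTC'] = pytz.utc  # force UTC to mean utc, not whichever zone claimed it last
    return res

assert abbr_map()['UTC'] is pytz.utc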

View file

@ -1,37 +0,0 @@
from __future__ import annotations
from .internal import assert_subpackage
assert_subpackage(__name__)
from dataclasses import asdict as dataclasses_asdict
from dataclasses import is_dataclass
from datetime import datetime
from typing import Any
Json = dict[str, Any]
# for now just serves documentation purposes... but one day might make it statically verifiable where possible?
# TODO e.g. maybe use opaque mypy alias?
datetime_naive = datetime
datetime_aware = datetime
def is_namedtuple(thing: Any) -> bool:
# basic check to see if this is namedtuple-like
_asdict = getattr(thing, '_asdict', None)
return (_asdict is not None) and callable(_asdict)
def asdict(thing: Any) -> Json:
# todo primitive?
# todo exception?
if isinstance(thing, dict):
return thing
if is_dataclass(thing):
assert not isinstance(thing, type) # to help mypy
return dataclasses_asdict(thing)
if is_namedtuple(thing):
return thing._asdict()
raise TypeError(f'Could not convert object {thing} to dict')
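Usage of the asdict helper above across the three supported shapes (a quick sketch):

from dataclasses import dataclass
from typing import NamedTuple

@dataclass
class P:
    x: int

class Q(NamedTuple):
    y: str

assert asdict(P(x=1)) == {'x': 1}      # dataclass branch
assert asdict(Q(y='a')) == {'y': 'a'}  # namedtuple branch (via _asdict)
assert asdict({'z': 2}) == {'z': 2}    # dict passes through unchanged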

View file

@ -1,35 +1,41 @@
from __future__ import annotations from pathlib import Path
from itertools import chain
from importlib import import_module
import os import os
import pkgutil import pkgutil
import sys import sys
from collections.abc import Iterable from typing import List, Iterable, Optional
from itertools import chain
from pathlib import Path
from types import ModuleType
from .discovery_pure import HPIModule, _is_not_module_src, has_stats, ignored from .discovery_pure import HPIModule, ignored, _is_not_module_src, has_stats
def modules() -> Iterable[HPIModule]: def modules() -> Iterable[HPIModule]:
import my import my
for m in _iter_all_importables(my):
yield m
yield from _iter_all_importables(my)
from .common import StatsFun
def get_stats(module: str) -> Optional[StatsFun]:
# todo detect via ast?
try:
mod = import_module(module)
except Exception:
return None
return getattr(mod, 'stats', None)
__NOT_HPI_MODULE__ = 'Import this to mark a python file as a helper, not an actual HPI module' __NOT_HPI_MODULE__ = 'Import this to mark a python file as a helper, not an actual HPI module'
from .discovery_pure import NOT_HPI_MODULE_VAR from .discovery_pure import NOT_HPI_MODULE_VAR
assert NOT_HPI_MODULE_VAR in globals() # check name consistency assert NOT_HPI_MODULE_VAR in globals() # check name consistency
def is_not_hpi_module(module: str) -> Optional[str]:
def is_not_hpi_module(module: str) -> str | None:
''' '''
None if a module, otherwise returns reason None if a module, otherwise returns reason
''' '''
import importlib.util import importlib
path: Optional[str] = None
path: str | None = None
try: try:
# TODO annoying, this can cause import of the parent module? # TODO annoying, this can cause import of the parent module?
spec = importlib.util.find_spec(module) spec = importlib.util.find_spec(module)
@ -47,6 +53,7 @@ def is_not_hpi_module(module: str) -> str | None:
return None return None
from types import ModuleType
# todo reuse in readme/blog post # todo reuse in readme/blog post
# borrowed from https://github.com/sanitizers/octomachinery/blob/24288774d6dcf977c5033ae11311dbff89394c89/tests/circular_imports_test.py#L22-L55 # borrowed from https://github.com/sanitizers/octomachinery/blob/24288774d6dcf977c5033ae11311dbff89394c89/tests/circular_imports_test.py#L22-L55
def _iter_all_importables(pkg: ModuleType) -> Iterable[HPIModule]: def _iter_all_importables(pkg: ModuleType) -> Iterable[HPIModule]:
@ -60,10 +67,9 @@ def _iter_all_importables(pkg: ModuleType) -> Iterable[HPIModule]:
def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModule]: def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModule]:
from .core_config import config
"""Yield all importables under a given path and package.""" """Yield all importables under a given path and package."""
from .core_config import config # noqa: F401
for dir_path, dirs, file_names in os.walk(pkg_pth): for dir_path, dirs, file_names in os.walk(pkg_pth):
file_names.sort() file_names.sort()
# NOTE: sorting dirs in place is intended, it's the way you're supposed to do it with os.walk # NOTE: sorting dirs in place is intended, it's the way you're supposed to do it with os.walk
@ -78,7 +84,7 @@ def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModu
continue continue
rel_pt = pkg_dir_path.relative_to(pkg_pth) rel_pt = pkg_dir_path.relative_to(pkg_pth)
pkg_pref = '.'.join((pkg_name, *rel_pt.parts)) pkg_pref = '.'.join((pkg_name, ) + rel_pt.parts)
yield from _walk_packages( yield from _walk_packages(
(str(pkg_dir_path), ), prefix=f'{pkg_pref}.', (str(pkg_dir_path), ), prefix=f'{pkg_pref}.',
@ -86,7 +92,6 @@ def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModu
# TODO might need to make it defensive and yield Exception (otherwise hpi doctor might fail for no good reason) # TODO might need to make it defensive and yield Exception (otherwise hpi doctor might fail for no good reason)
# use onerror=? # use onerror=?
# ignored explicitly -> not HPI # ignored explicitly -> not HPI
# if enabled in config -> HPI # if enabled in config -> HPI
# if disabled in config -> HPI # if disabled in config -> HPI
@ -98,14 +103,14 @@ def _discover_path_importables(pkg_pth: Path, pkg_name: str) -> Iterable[HPIModu
def _walk_packages(path: Iterable[str], prefix: str='', onerror=None) -> Iterable[HPIModule]: def _walk_packages(path: Iterable[str], prefix: str='', onerror=None) -> Iterable[HPIModule]:
""" """
Modified version of https://github.com/python/cpython/blob/d50a0700265536a20bcce3fb108c954746d97625/Lib/pkgutil.py#L53, Modified version of https://github.com/python/cpython/blob/d50a0700265536a20bcce3fb108c954746d97625/Lib/pkgutil.py#L53,
to avoid importing modules that are skipped to alvoid importing modules that are skipped
""" """
from .core_config import config from .core_config import config
def seen(p, m={}): # noqa: B006 def seen(p, m={}):
if p in m: if p in m:
return True return True
m[p] = True # noqa: RET503 m[p] = True
for info in pkgutil.iter_modules(path, prefix): for info in pkgutil.iter_modules(path, prefix):
mname = info.name mname = info.name
@ -158,9 +163,8 @@ def _walk_packages(path: Iterable[str], prefix: str = '', onerror=None) -> Itera
path = [p for p in path if not seen(p)] path = [p for p in path if not seen(p)]
yield from _walk_packages(path, mname + '.', onerror) yield from _walk_packages(path, mname + '.', onerror)
# deprecate? # deprecate?
def get_modules() -> list[HPIModule]: def get_modules() -> List[HPIModule]:
return list(modules()) return list(modules())
@ -200,7 +204,6 @@ from my.core import __NOT_HPI_MODULE__
''') ''')
import sys import sys
orig_path = list(sys.path) orig_path = list(sys.path)
try: try:
sys.path.insert(0, str(badp)) sys.path.insert(0, str(badp))
@ -235,7 +238,6 @@ def stats():
''') ''')
import sys import sys
orig_path = list(sys.path) orig_path = list(sys.path)
try: try:
sys.path.insert(0, str(badp)) sys.path.insert(0, str(badp))
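The core of the discovery above, minus the skip/config logic, is what stdlib pkgutil already provides. A minimal sketch (assumes the my package is importable):

import pkgutil

import my  # namespace package; __path__ spans all installed locations

for info in pkgutil.walk_packages(my.__path__, prefix='my.'):
    print(info.name)  # e.g. my.core, my.demo, ...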

View file

@ -1,40 +0,0 @@
from __future__ import annotations
from concurrent.futures import Executor, Future
from typing import Any, Callable, TypeVar
from ..compat import ParamSpec
_P = ParamSpec('_P')
_T = TypeVar('_T')
# https://stackoverflow.com/a/10436851/706389
class DummyExecutor(Executor):
"""
This is useful if you're already using Executor for parallelising,
but also want to provide an option to run the code serially (e.g. for debugging)
"""
def __init__(self, max_workers: int | None = 1) -> None:
self._shutdown = False
self._max_workers = max_workers
def submit(self, fn: Callable[_P, _T], /, *args: _P.args, **kwargs: _P.kwargs) -> Future[_T]:
if self._shutdown:
raise RuntimeError('cannot schedule new futures after shutdown')
f: Future[Any] = Future()
try:
result = fn(*args, **kwargs)
except KeyboardInterrupt:
raise
except BaseException as e:
f.set_exception(e)
else:
f.set_result(result)
return f
def shutdown(self, wait: bool = True, **kwargs) -> None: # noqa: FBT001,FBT002,ARG002
self._shutdown = True
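A usage sketch for DummyExecutor: call sites stay identical whether you parallelise or run serially for debugging:

from concurrent.futures import ThreadPoolExecutor

def compute(parallel: bool) -> list[int]:
    pool_cls = ThreadPoolExecutor if parallel else DummyExecutor
    with pool_cls(max_workers=4) as pool:  # Executor's context manager calls shutdown()
        futures = [pool.submit(len, s) for s in ('a', 'bb', 'ccc')]
        return [f.result() for f in futures]

assert compute(parallel=False) == [1, 2, 3]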

View file

@ -1,37 +0,0 @@
from __future__ import annotations
import importlib
import importlib.util
import sys
from pathlib import Path
from types import ModuleType
# TODO only used in tests? not sure if useful at all.
def import_file(p: Path | str, name: str | None = None) -> ModuleType:
p = Path(p)
if name is None:
name = p.stem
spec = importlib.util.spec_from_file_location(name, p)
assert spec is not None, f"Fatal error; Could not create module spec from {name} {p}"
foo = importlib.util.module_from_spec(spec)
loader = spec.loader
assert loader is not None
loader.exec_module(foo)
return foo
def import_from(path: Path | str, name: str) -> ModuleType:
path = str(path)
sys.path.append(path)
try:
return importlib.import_module(name)
finally:
sys.path.remove(path)
def import_dir(path: Path | str, extra: str = '') -> ModuleType:
p = Path(path)
if p.parts[0] == '~':
p = p.expanduser() # TODO eh. not sure about this..
return import_from(p.parent, p.name + extra)
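A self-contained usage sketch for the import_file helper above:

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as td:
    script = Path(td) / 'ad_hoc.py'
    script.write_text('ANSWER = 42')
    mod = import_file(script)  # module name defaults to the file stem, 'ad_hoc'
    assert mod.ANSWER == 42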

View file

@ -1,369 +0,0 @@
"""
Various helpers/transforms of iterators
Ideally this should be as small as possible and we should rely on stdlib itertools or more_itertools
"""
from __future__ import annotations
import warnings
from collections.abc import Hashable, Iterable, Iterator, Sized
from typing import (
TYPE_CHECKING,
Callable,
TypeVar,
Union,
cast,
)
import more_itertools
from decorator import decorator
from .. import warnings as core_warnings
from ..compat import ParamSpec
T = TypeVar('T')
K = TypeVar('K')
V = TypeVar('V')
def _identity(v: T) -> V: # type: ignore[type-var]
return cast(V, v)
# ugh. nothing in more_itertools?
# perhaps duplicates_everseen? but it doesn't yield non-unique elements?
def ensure_unique(it: Iterable[T], *, key: Callable[[T], K]) -> Iterable[T]:
key2item: dict[K, T] = {}
for i in it:
k = key(i)
pi = key2item.get(k, None)
if pi is not None:
raise RuntimeError(f"Duplicate key: {k}. Previous value: {pi}, new value: {i}")
key2item[k] = i
yield i
def test_ensure_unique() -> None:
import pytest
assert list(ensure_unique([1, 2, 3], key=lambda i: i)) == [1, 2, 3]
dups = [1, 2, 1, 4]
# this works because it's lazy
it = ensure_unique(dups, key=lambda i: i)
# but forcing throws
with pytest.raises(RuntimeError, match='Duplicate key'):
list(it)
# hacky way to force distinct objects?
list(ensure_unique(dups, key=lambda _: object()))
def make_dict(
it: Iterable[T],
*,
key: Callable[[T], K],
# TODO make value optional instead? but then will need a typing override for it?
value: Callable[[T], V] = _identity,
) -> dict[K, V]:
with_keys = ((key(i), i) for i in it)
uniques = ensure_unique(with_keys, key=lambda p: p[0])
res: dict[K, V] = {}
for k, i in uniques:
res[k] = i if value is None else value(i)
return res
def test_make_dict() -> None:
import pytest
it = range(5)
d = make_dict(it, key=lambda i: i, value=lambda i: i % 2)
assert d == {0: 0, 1: 1, 2: 0, 3: 1, 4: 0}
it = range(5)
with pytest.raises(RuntimeError, match='Duplicate key'):
d = make_dict(it, key=lambda i: i % 2, value=lambda i: i)
# check type inference
d2: dict[str, int] = make_dict(it, key=lambda i: str(i))
d3: dict[str, bool] = make_dict(it, key=lambda i: str(i), value=lambda i: i % 2 == 0)
LFP = ParamSpec('LFP')
LV = TypeVar('LV')
@decorator
def _listify(func: Callable[LFP, Iterable[LV]], *args: LFP.args, **kwargs: LFP.kwargs) -> list[LV]:
"""
Wraps a function's return value in a wrapper (e.g. list)
Useful when an algorithm can be expressed more cleanly as a generator
"""
return list(func(*args, **kwargs))
# ugh. decorator library has stub types, but they are way too generic?
# tried implementing my own stub, but failed -- not sure if it's possible at all?
# so seems easiest to just use specialized instantiations of decorator instead
if TYPE_CHECKING:
def listify(func: Callable[LFP, Iterable[LV]]) -> Callable[LFP, list[LV]]: ... # noqa: ARG001
else:
listify = _listify
def test_listify() -> None:
from ..compat import assert_type
@listify
def it() -> Iterator[int]:
yield 1
yield 2
res = it()
assert_type(res, list[int])
assert res == [1, 2]
@decorator
def _warn_if_empty(func, *args, **kwargs):
# so there is a more_itertools.peekable which could work nicely for these purposes
# the downside is that it would start advancing the generator right after it's created
# , which can be somewhat confusing
iterable = func(*args, **kwargs)
if isinstance(iterable, Sized):
sz = len(iterable)
if sz == 0:
core_warnings.medium(f"Function {func} returned empty container, make sure your config paths are correct")
return iterable
else: # must be an iterator
def wit():
empty = True
for i in iterable:
yield i
empty = False
if empty:
core_warnings.medium(f"Function {func} didn't emit any data, make sure your config paths are correct")
return wit()
if TYPE_CHECKING:
FF = TypeVar('FF', bound=Callable[..., Iterable])
def warn_if_empty(func: FF) -> FF: ... # noqa: ARG001
else:
warn_if_empty = _warn_if_empty
def test_warn_if_empty_iterator() -> None:
from ..compat import assert_type
@warn_if_empty
def nonempty() -> Iterator[str]:
yield 'a'
yield 'aba'
with warnings.catch_warnings(record=True) as w:
res1 = nonempty()
assert len(w) == 0 # warning isn't emitted until iterator is consumed
assert_type(res1, Iterator[str])
assert list(res1) == ['a', 'aba']
assert len(w) == 0
@warn_if_empty
def empty() -> Iterator[int]:
yield from []
with warnings.catch_warnings(record=True) as w:
res2 = empty()
assert len(w) == 0 # warning isn't emitted until iterator is consumed
assert_type(res2, Iterator[int])
assert list(res2) == []
assert len(w) == 1
def test_warn_if_empty_list() -> None:
from ..compat import assert_type
ll = [1, 2, 3]
@warn_if_empty
def nonempty() -> list[int]:
return ll
with warnings.catch_warnings(record=True) as w:
res1 = nonempty()
assert len(w) == 0
assert_type(res1, list[int])
assert isinstance(res1, list)
assert res1 is ll # object should be unchanged!
@warn_if_empty
def empty() -> list[str]:
return []
with warnings.catch_warnings(record=True) as w:
res2 = empty()
assert len(w) == 1
assert_type(res2, list[str])
assert isinstance(res2, list)
assert res2 == []
def test_warn_if_empty_unsupported() -> None:
# these should be rejected by mypy! (will show "unused type: ignore" if we break it)
@warn_if_empty # type: ignore[type-var]
def bad_return_type() -> float:
return 0.00
_HT = TypeVar('_HT', bound=Hashable)
# NOTE: ideally we'd do It = TypeVar('It', bound=Iterable[_HT]), and the function would be It -> It
# Sadly this doesn't work in mypy, doesn't look like we can have double bound TypeVar
# Not a huge deal, since this function is for unique_everseen and
# we need to pass an iterator to unique_everseen anyway
# TODO maybe contribute to more_itertools? https://github.com/more-itertools/more-itertools/issues/898
def check_if_hashable(iterable: Iterable[_HT]) -> Iterable[_HT]:
"""
NOTE: Despite Hashable bound, typing annotation doesn't guarantee runtime safety
Consider hashable type X, and Y that inherits from X but isn't hashable
Then l: list[X] = [Y(...)] is a valid expression, and type checks against Hashable,
but isn't runtime hashable
"""
# Sadly this doesn't work 100% correctly with dataclasses atm...
# they all are considered hashable: https://github.com/python/mypy/issues/11463
if isinstance(iterable, Iterator):
def res() -> Iterator[_HT]:
for i in iterable:
assert isinstance(i, Hashable), i
# ugh. need a cast due to https://github.com/python/mypy/issues/10817
yield cast(_HT, i)
return res()
else:
# hopefully, iterable that can be iterated over multiple times?
# not sure if should have 'allowlist' of types that don't have to be transformed instead?
for i in iterable:
assert isinstance(i, Hashable), i
return iterable
# TODO different policies -- error/warn/ignore?
def test_check_if_hashable() -> None:
from dataclasses import dataclass
import pytest
from ..compat import assert_type
x1: list[int] = [1, 2]
r1 = check_if_hashable(x1)
assert_type(r1, Iterable[int])
assert r1 is x1
x2: Iterator[int | str] = iter((123, 'aba'))
r2 = check_if_hashable(x2)
assert_type(r2, Iterable[Union[int, str]])
assert list(r2) == [123, 'aba']
x3: tuple[object, ...] = (789, 'aba')
r3 = check_if_hashable(x3)
assert_type(r3, Iterable[object])
assert r3 is x3 # object should be unchanged
x4: list[set[int]] = [{1, 2, 3}, {4, 5, 6}]
with pytest.raises(Exception):
# should be rejected by mypy since set isn't Hashable, but also throws at runtime
r4 = check_if_hashable(x4) # type: ignore[type-var]
x5: Iterator[object] = iter([{1, 2}, {3, 4}])
# here, we hide behind object, which is hashable
# so mypy can't really help us here
r5 = check_if_hashable(x5)
with pytest.raises(Exception):
# note: this only throws when iterator is advanced
list(r5)
# dataclass is unhashable by default! unless frozen=True and eq=True, or unsafe_hash=True
@dataclass(unsafe_hash=True)
class X:
a: int
x6: list[X] = [X(a=123)]
r6 = check_if_hashable(x6)
assert x6 is r6
# inherited dataclass will not be hashable!
@dataclass
class Y(X):
b: str
x7: list[Y] = [Y(a=123, b='aba')]
with pytest.raises(Exception):
# ideally that would also be rejected by mypy, but currently there is a bug
# which treats all dataclasses as hashable: https://github.com/python/mypy/issues/11463
check_if_hashable(x7)
_UET = TypeVar('_UET')
_UEU = TypeVar('_UEU')
# NOTE: for historic reasons, this function had to accept Callable that returns iterator
# instead of just iterator
# TODO maybe deprecate Callable support? not sure
def unique_everseen(
fun: Callable[[], Iterable[_UET]] | Iterable[_UET],
key: Callable[[_UET], _UEU] | None = None,
) -> Iterator[_UET]:
import os
if callable(fun):
iterable = fun()
else:
iterable = fun
if key is None:
# todo check key return type as well? but it's more likely to be hashable
if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
iterable = check_if_hashable(iterable)
return more_itertools.unique_everseen(iterable=iterable, key=key)
def test_unique_everseen() -> None:
import pytest
from ..tests.common import tmp_environ_set
def fun_good() -> Iterator[int]:
yield 123
def fun_bad():
return [{1, 2}, {1, 2}, {1, 3}]
with tmp_environ_set('HPI_CHECK_UNIQUE_EVERSEEN', 'yes'):
assert list(unique_everseen(fun_good)) == [123]
with pytest.raises(Exception):
# since function returns a list rather than iterator, check happens immediately
# , even without advancing the iterator
unique_everseen(fun_bad)
good_list = [4, 3, 2, 1, 2, 3, 4]
assert list(unique_everseen(good_list)) == [4, 3, 2, 1]
with tmp_environ_set('HPI_CHECK_UNIQUE_EVERSEEN', None):
assert list(unique_everseen(fun_bad)) == [{1, 2}, {1, 3}]

View file

@ -5,16 +5,14 @@ since who looks at the terminal output?
E.g. would be nice to propagate the warnings in the UI (it's even a subclass of Exception!) E.g. would be nice to propagate the warnings in the UI (it's even a subclass of Exception!)
''' '''
from __future__ import annotations
import sys import sys
from typing import Optional
import warnings import warnings
from typing import TYPE_CHECKING
import click import click
def _colorize(x: str, color: str | None = None) -> str: def _colorize(x: str, color: Optional[str]=None) -> str:
if color is None: if color is None:
return x return x
@ -26,10 +24,10 @@ def _colorize(x: str, color: str | None = None) -> str:
return click.style(x, fg=color) return click.style(x, fg=color)
def _warn(message: str, *args, color: str | None = None, **kwargs) -> None: def _warn(message: str, *args, color: Optional[str]=None, **kwargs) -> None:
stacklevel = kwargs.get('stacklevel', 1) stacklevel = kwargs.get('stacklevel', 1)
kwargs['stacklevel'] = stacklevel + 2 # +1 for this function, +1 for medium/high wrapper kwargs['stacklevel'] = stacklevel + 2 # +1 for this function, +1 for medium/high wrapper
warnings.warn(_colorize(message, color=color), *args, **kwargs) # noqa: B028 warnings.warn(_colorize(message, color=color), *args, **kwargs)
def low(message: str, *args, **kwargs) -> None: def low(message: str, *args, **kwargs) -> None:
@ -50,11 +48,5 @@ def high(message: str, *args, **kwargs) -> None:
_warn(message, *args, **kwargs) _warn(message, *args, **kwargs)
if not TYPE_CHECKING: # NOTE: deprecated -- legacy import
from .compat import deprecated from warnings import warn
@deprecated('use warnings.warn directly instead')
def warn(*args, **kwargs):
import warnings
return warnings.warn(*args, **kwargs) # noqa: B028
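The stacklevel arithmetic above in isolation, as a sketch: +1 for _warn itself and +1 for the low/medium/high wrapper, so the reported source line is the wrapper's caller:

import warnings

def _warn(message: str, **kwargs) -> None:
    kwargs['stacklevel'] = kwargs.get('stacklevel', 1) + 2
    warnings.warn(message, **kwargs)

def medium(message: str) -> None:
    _warn(message)

medium('make sure your config paths are correct')  # warning points at this line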

View file

@ -1,77 +1,69 @@
''' '''
Just a demo module for testing and documentation purposes Just a demo module for testing and documentation purposes
''' '''
from __future__ import annotations
import json from .core import Paths, PathIsh
from collections.abc import Iterable, Sequence
from typing import Optional
from datetime import tzinfo, timezone
from my.config import demo as user_config
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timezone, tzinfo
from pathlib import Path
from typing import Protocol
from my.core import Json, PathIsh, Paths, get_files
class config(Protocol): @dataclass
class demo(user_config):
data_path: Paths data_path: Paths
# this is to check required attribute handling
username: str username: str
# this is to check optional attribute handling
timezone: tzinfo = timezone.utc timezone: tzinfo = timezone.utc
external: PathIsh | None = None external: Optional[PathIsh] = None
@property @property
def external_module(self): def external_module(self):
rpath = self.external rpath = self.external
if rpath is not None: if rpath is not None:
from my.core.utils.imports import import_dir from .core.common import import_dir
return import_dir(rpath) return import_dir(rpath)
import my.config.repos.external as m # type: ignore import my.config.repos.external as m # type: ignore
return m return m
def make_config() -> config: from .core import make_config
from my.config import demo as user_config config = make_config(demo)
class combined_config(user_config, config): ... # TODO not sure about type checking?
external = config.external_module
return combined_config()
from pathlib import Path
from typing import Sequence, Iterable
from datetime import datetime
from .core import Json, get_files
@dataclass @dataclass
class Item: class Item:
''' '''
Some completely arbitrary artificial stuff, just for testing Some completely arbitrary artificial stuff, just for testing
''' '''
username: str username: str
raw: Json raw: Json
dt: datetime dt: datetime
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
cfg = make_config() return get_files(config.data_path)
return get_files(cfg.data_path)
import json
def items() -> Iterable[Item]: def items() -> Iterable[Item]:
cfg = make_config()
transform = (lambda i: i) if cfg.external is None else cfg.external_module.transform
for f in inputs(): for f in inputs():
dt = datetime.fromtimestamp(f.stat().st_mtime, tz=cfg.timezone) dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config.timezone)
j = json.loads(f.read_text()) j = json.loads(f.read_text())
for raw in j: for raw in j:
yield Item( yield Item(
username=cfg.username, username=config.username,
raw=transform(raw), raw=external.identity(raw),
dt=dt, dt=dt,
) )
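The left-hand (master) version of this module demonstrates the Protocol-plus-mixin config pattern; here is a stripped-down, runnable sketch of just that mechanism (attribute names are illustrative):

from typing import Protocol

class config(Protocol):
    username: str            # required attribute
    timezone: str = 'UTC'    # optional attribute with a default

class user_config:           # what a user would declare in their config
    username = 'someuser'

def make_config() -> config:
    class combined_config(user_config, config): ...
    return combined_config()

cfg = make_config()
assert cfg.username == 'someuser'
assert cfg.timezone == 'UTC'  # default kicks in since the user didn't set it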

View file

@ -4,33 +4,31 @@
Consumes data exported by https://github.com/karlicoss/emfitexport Consumes data exported by https://github.com/karlicoss/emfitexport
""" """
from __future__ import annotations
REQUIRES = [ REQUIRES = [
'git+https://github.com/karlicoss/emfitexport', 'git+https://github.com/karlicoss/emfitexport',
] ]
import dataclasses
import inspect
from collections.abc import Iterable, Iterator
from contextlib import contextmanager from contextlib import contextmanager
import dataclasses
from datetime import datetime, time, timedelta from datetime import datetime, time, timedelta
import inspect
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any, Dict, Iterable, Iterator, List, Optional
import emfitexport.dal as dal
from my.core import ( from my.core import (
Res,
Stats,
get_files, get_files,
stat, stat,
Res,
Stats,
) )
from my.core.cachew import cache_dir, mcachew from my.core.common import mcachew
from my.core.error import extract_error_datetime, set_error_datetime from my.core.cachew import cache_dir
from my.core.error import set_error_datetime, extract_error_datetime
from my.core.pandas import DataFrameT from my.core.pandas import DataFrameT
from my.config import emfit as config # isort: skip from my.config import emfit as config
import emfitexport.dal as dal
Emfit = dal.Emfit Emfit = dal.Emfit
@ -88,7 +86,7 @@ def datas() -> Iterable[Res[Emfit]]:
# TODO should be used for jawbone data as well? # TODO should be used for jawbone data as well?
def pre_dataframe() -> Iterable[Res[Emfit]]: def pre_dataframe() -> Iterable[Res[Emfit]]:
# TODO shit. I need some sort of interrupted sleep detection? # TODO shit. I need some sort of interrupted sleep detection?
g: list[Emfit] = [] g: List[Emfit] = []
def flush() -> Iterable[Res[Emfit]]: def flush() -> Iterable[Res[Emfit]]:
if len(g) == 0: if len(g) == 0:
@ -115,10 +113,10 @@ def pre_dataframe() -> Iterable[Res[Emfit]]:
def dataframe() -> DataFrameT: def dataframe() -> DataFrameT:
dicts: list[dict[str, Any]] = [] dicts: List[Dict[str, Any]] = []
last: Emfit | None = None last: Optional[Emfit] = None
for s in pre_dataframe(): for s in pre_dataframe():
d: dict[str, Any] d: Dict[str, Any]
if isinstance(s, Exception): if isinstance(s, Exception):
edt = extract_error_datetime(s) edt = extract_error_datetime(s)
d = { d = {
@ -158,9 +156,9 @@ def dataframe() -> DataFrameT:
last = s # meh last = s # meh
dicts.append(d) dicts.append(d)
import pandas as pd import pandas
return pd.DataFrame(dicts) return pandas.DataFrame(dicts)
def stats() -> Stats: def stats() -> Stats:
@ -169,12 +167,11 @@ def stats() -> Stats:
@contextmanager @contextmanager
def fake_data(nights: int = 500) -> Iterator: def fake_data(nights: int = 500) -> Iterator:
from my.core.cfg import tmp_config
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
import pytz import pytz
from my.core.cfg import tmp_config
with TemporaryDirectory() as td: with TemporaryDirectory() as td:
tdir = Path(td) tdir = Path(td)
gen = dal.FakeData() gen = dal.FakeData()
@ -191,9 +188,9 @@ def fake_data(nights: int = 500) -> Iterator:
# TODO remove/deprecate it? I think used by timeline # TODO remove/deprecate it? I think used by timeline
def get_datas() -> list[Emfit]: def get_datas() -> List[Emfit]:
# todo ugh. run lint properly # todo ugh. run lint properly
return sorted(datas(), key=lambda e: e.start) # type: ignore return list(sorted(datas(), key=lambda e: e.start)) # type: ignore
# TODO move away old entries if there is a diff?? # TODO move away old entries if there is a diff??
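The pre_dataframe/dataframe split above hinges on one pandas behaviour worth spelling out: error rows simply carry different keys, and pandas fills the gaps with NaN. A sketch with toy data:

import pandas as pd

dicts = [
    {'date': '2024-01-01', 'hr_avg': 52.0},
    {'date': '2024-01-02', 'error': 'failed to parse sleep'},
]
df = pd.DataFrame(dicts)  # columns date, hr_avg, error; missing cells become NaN
print(df)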

View file

@ -7,14 +7,13 @@ REQUIRES = [
] ]
# todo use ast in setup.py or doctor to extract the corresponding pip packages? # todo use ast in setup.py or doctor to extract the corresponding pip packages?
from collections.abc import Iterable, Sequence
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Sequence, Iterable
from my.config import endomondo as user_config
from .core import Paths, get_files from .core import Paths, get_files
from my.config import endomondo as user_config
@dataclass @dataclass
class endomondo(user_config): class endomondo(user_config):
@ -32,22 +31,20 @@ def inputs() -> Sequence[Path]:
# todo add a doctor check for pip endoexport module # todo add a doctor check for pip endoexport module
import endoexport.dal as dal import endoexport.dal as dal
from endoexport.dal import Point, Workout # noqa: F401 from endoexport.dal import Point, Workout
from .core import Res from .core import Res
# todo cachew? # todo cachew?
def workouts() -> Iterable[Res[Workout]]: def workouts() -> Iterable[Res[Workout]]:
_dal = dal.DAL(inputs()) _dal = dal.DAL(inputs())
yield from _dal.workouts() yield from _dal.workouts()
from .core.pandas import DataFrameT, check_dataframe from .core.pandas import check_dataframe, DataFrameT
@check_dataframe @check_dataframe
def dataframe(*, defensive: bool=True) -> DataFrameT: def dataframe(defensive: bool=True) -> DataFrameT:
def it(): def it():
for w in workouts(): for w in workouts():
if isinstance(w, Exception): if isinstance(w, Exception):
@ -78,9 +75,7 @@ def dataframe(*, defensive: bool=True) -> DataFrameT:
return df return df
from .core import Stats, stat from .core import stat, Stats
def stats() -> Stats: def stats() -> Stats:
return { return {
# todo pretty print stats? # todo pretty print stats?
@ -91,16 +86,13 @@ def stats() -> Stats:
# TODO make sure it's possible to 'advise' functions and override stuff # TODO make sure it's possible to 'advise' functions and override stuff
from collections.abc import Iterator
from contextlib import contextmanager from contextlib import contextmanager
from typing import Iterator
@contextmanager @contextmanager
def fake_data(count: int=100) -> Iterator: def fake_data(count: int=100) -> Iterator:
import json
from tempfile import TemporaryDirectory
from my.core.cfg import tmp_config from my.core.cfg import tmp_config
from tempfile import TemporaryDirectory
import json
with TemporaryDirectory() as td: with TemporaryDirectory() as td:
tdir = Path(td) tdir = Path(td)
fd = dal.FakeData() fd = dal.FakeData()
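The fake_data helpers in this and the emfit module share one shape; a generic, self-contained sketch of it (the data layout is hypothetical):

import json
import tempfile
from contextlib import contextmanager
from pathlib import Path

@contextmanager
def fake_data(count: int = 100):
    with tempfile.TemporaryDirectory() as td:
        f = Path(td) / 'data.json'
        f.write_text(json.dumps([{'id': i} for i in range(count)]))
        yield f  # point the module's config (e.g. via tmp_config) at this path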

View file

@ -1,6 +1,6 @@
from .core.warnings import high from .core.warnings import high
high("DEPRECATED! Please use my.core.error instead.") high("DEPRECATED! Please use my.core.error instead.")
from .core import __NOT_HPI_MODULE__ from .core import __NOT_HPI_MODULE__
from .core.error import * from .core.error import *

View file

@ -1,60 +0,0 @@
from collections.abc import Iterator
from dataclasses import dataclass
from typing import Any
from my.core.compat import NoneType, assert_never
# TODO Popper? not sure
@dataclass
class Helper:
manager: 'Manager'
item: Any # todo realistically, list or dict? could at least type as indexable or something
path: tuple[str, ...]
def pop_if_primitive(self, *keys: str) -> None:
"""
The idea that primitive TODO
"""
item = self.item
for k in keys:
v = item[k]
if isinstance(v, (str, bool, float, int, NoneType)):
item.pop(k) # todo kinda unfortunate to get dict item twice.. but not sure if can avoid?
def check(self, key: str, expected: Any) -> None:
actual = self.item.pop(key)
assert actual == expected, (key, actual, expected)
def zoom(self, key: str) -> 'Helper':
return self.manager.helper(item=self.item.pop(key), path=(*self.path, key))
def is_empty(x) -> bool:
if isinstance(x, dict):
return len(x) == 0
elif isinstance(x, list):
return all(map(is_empty, x))
else:
assert_never(x) # noqa: RET503
class Manager:
def __init__(self) -> None:
self.helpers: list[Helper] = []
def helper(self, item: Any, *, path: tuple[str, ...] = ()) -> Helper:
res = Helper(manager=self, item=item, path=path)
self.helpers.append(res)
return res
def check(self) -> Iterator[Exception]:
remaining = []
for h in self.helpers:
# TODO recursively check it's primitive?
if is_empty(h.item):
continue
remaining.append((h.path, h.item))
if len(remaining) == 0:
return
yield RuntimeError(f'Unparsed items remaining: {remaining}')
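A usage sketch for the Manager/Helper pattern above: consume a JSON object piecewise, then ask the manager what was left unparsed:

m = Manager()
h = m.helper({'type': 'event', 'payload': {'id': 1, 'items': []}})
h.check('type', 'event')          # pops 'type' and asserts its value
inner = h.zoom('payload')         # pops 'payload', returns a nested Helper
inner.pop_if_primitive('id')
inner.check('items', [])
assert list(m.check()) == []      # everything was consumed, nothing remains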

View file

@ -9,7 +9,7 @@ since that allows for easier overriding using namespace packages
See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info. See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info.
""" """
# prevent it from appearing in modules list/doctor # prevent it from apprearing in modules list/doctor
from ..core import __NOT_HPI_MODULE__ from ..core import __NOT_HPI_MODULE__
# kinda annoying to keep it, but it's so legacy 'hpi module install my.fbmessenger' works # kinda annoying to keep it, but it's so legacy 'hpi module install my.fbmessenger' works
@ -20,7 +20,6 @@ REQUIRES = [
from my.core.hpi_compat import handle_legacy_import from my.core.hpi_compat import handle_legacy_import
is_legacy_import = handle_legacy_import( is_legacy_import = handle_legacy_import(
parent_module_name=__name__, parent_module_name=__name__,
legacy_submodule_name='export', legacy_submodule_name='export',

View file

@ -1,10 +1,10 @@
from collections.abc import Iterator from typing import Iterator
from my.core import Res, stat, Stats
from my.core import Res, Stats
from my.core.source import import_source from my.core.source import import_source
from .common import Message, _merge_messages from .common import Message, _merge_messages
src_export = import_source(module_name='my.fbmessenger.export') src_export = import_source(module_name='my.fbmessenger.export')
src_android = import_source(module_name='my.fbmessenger.android') src_android = import_source(module_name='my.fbmessenger.android')

View file

@ -1,23 +1,20 @@
""" """
Messenger data from Android app database (in =/data/data/com.facebook.orca/databases/threads_db2=) Messenger data from Android app database (in =/data/data/com.facebook.orca/databases/threads_db2=)
""" """
from __future__ import annotations from __future__ import annotations
import sqlite3
from collections.abc import Iterator, Sequence
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Union import sqlite3
from typing import Iterator, Sequence, Optional, Dict, Union, List
from my.core import LazyLogger, Paths, Res, datetime_aware, get_files, make_config from my.core import get_files, Paths, datetime_aware, Res, assert_never, LazyLogger, make_config
from my.core.common import unique_everseen from my.core.common import unique_everseen
from my.core.compat import assert_never
from my.core.error import echain from my.core.error import echain
from my.core.sqlite import sqlite_connection, SqliteTool from my.core.sqlite import sqlite_connection
from my.config import fbmessenger as user_config # isort: skip from my.config import fbmessenger as user_config
logger = LazyLogger(__name__) logger = LazyLogger(__name__)
@ -28,7 +25,7 @@ class Config(user_config.android):
# paths[s]/glob to the exported sqlite databases # paths[s]/glob to the exported sqlite databases
export_path: Paths export_path: Paths
facebook_id: str | None = None facebook_id: Optional[str] = None
# hmm. this is necessary for default value (= None) to work # hmm. this is necessary for default value (= None) to work
@ -43,13 +40,13 @@ def inputs() -> Sequence[Path]:
@dataclass(unsafe_hash=True) @dataclass(unsafe_hash=True)
class Sender: class Sender:
id: str id: str
name: str | None name: Optional[str]
@dataclass(unsafe_hash=True) @dataclass(unsafe_hash=True)
class Thread: class Thread:
id: str id: str
name: str | None # isn't set for groups or one to one messages name: Optional[str] # isn't set for groups or one to one messages
# todo not sure about order of fields... # todo not sure about order of fields...
@ -57,14 +54,14 @@ class Thread:
class _BaseMessage: class _BaseMessage:
id: str id: str
dt: datetime_aware dt: datetime_aware
text: str | None text: Optional[str]
@dataclass(unsafe_hash=True) @dataclass(unsafe_hash=True)
class _Message(_BaseMessage): class _Message(_BaseMessage):
thread_id: str thread_id: str
sender_id: str sender_id: str
reply_to_id: str | None reply_to_id: Optional[str]
# todo hmm, on the one hand would be kinda nice to inherit common.Message protocol here # todo hmm, on the one hand would be kinda nice to inherit common.Message protocol here
@ -73,7 +70,7 @@ class _Message(_BaseMessage):
class Message(_BaseMessage): class Message(_BaseMessage):
thread: Thread thread: Thread
sender: Sender sender: Sender
reply_to: Message | None reply_to: Optional[Message]
Entity = Union[Sender, Thread, _Message] Entity = Union[Sender, Thread, _Message]
@ -86,8 +83,8 @@ def _entities() -> Iterator[Res[Entity]]:
for idx, path in enumerate(paths): for idx, path in enumerate(paths):
logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
with sqlite_connection(path, immutable=True, row_factory='row') as db: with sqlite_connection(path, immutable=True, row_factory='row') as db:
use_msys = "logging_events_v2" in SqliteTool(db).get_table_names()
try: try:
use_msys = len(list(db.execute('SELECT * FROM sqlite_master WHERE name = "logging_events_v2"'))) > 0
if use_msys: if use_msys:
yield from _process_db_msys(db) yield from _process_db_msys(db)
else: else:
@ -111,7 +108,7 @@ def _normalise_thread_id(key) -> str:
# NOTE: this is sort of copy pasted from other _process_db method # NOTE: this is sort of copy pasted from other _process_db method
# maybe later could unify them # maybe later could unify them
def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]: def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
senders: dict[str, Sender] = {} senders: Dict[str, Sender] = {}
for r in db.execute('SELECT CAST(id AS TEXT) AS id, name FROM contacts'): for r in db.execute('SELECT CAST(id AS TEXT) AS id, name FROM contacts'):
s = Sender( s = Sender(
id=r['id'], # looks like it's server id? same used on facebook site id=r['id'], # looks like it's server id? same used on facebook site
@ -128,21 +125,18 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
# TODO can we get it from db? could infer as the most common id perhaps? # TODO can we get it from db? could infer as the most common id perhaps?
self_id = config.facebook_id self_id = config.facebook_id
thread_users: dict[str, list[Sender]] = {} thread_users: Dict[str, List[Sender]] = {}
for r in db.execute('SELECT CAST(thread_key AS TEXT) AS thread_key, CAST(contact_id AS TEXT) AS contact_id FROM participants'): for r in db.execute('SELECT CAST(thread_key AS TEXT) AS thread_key, CAST(contact_id AS TEXT) AS contact_id FROM participants'):
thread_key = r['thread_key'] thread_key = r['thread_key']
user_key = r['contact_id'] user_key = r['contact_id']
if self_id is not None and user_key == self_id:
# exclude yourself, otherwise it's just spammy to show up in all participants
continue
ll = thread_users.get(thread_key) ll = thread_users.get(thread_key)
if ll is None: if ll is None:
ll = [] ll = []
thread_users[thread_key] = ll thread_users[thread_key] = ll
if self_id is not None and user_key == self_id:
# exclude yourself, otherwise it's just spammy to show up in all participants
# TODO not sure about that, maybe change later
continue
ll.append(senders[user_key]) ll.append(senders[user_key])
# 15 is a weird thread that doesn't have any participants and messages # 15 is a weird thread that doesn't have any participants and messages
@ -169,15 +163,6 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
CAST(sender_id AS TEXT) AS sender_id, CAST(sender_id AS TEXT) AS sender_id,
reply_source_id reply_source_id
FROM messages FROM messages
WHERE
/* Regular message_id conforms to mid.* regex.
However it seems that when a message is not sent yet, it doesn't have this server id
(happened only once, but could be just luck of course!)
We exclude these messages to avoid duplication.
However a positive filter (e.g. message_id LIKE 'mid%') feels a bit wrong, e.g. what if message ids change or something
So instead this excludes only such unsent messages.
*/
message_id != offline_threading_id
ORDER BY timestamp_ms /* they aren't in order in the database, so need to sort */ ORDER BY timestamp_ms /* they aren't in order in the database, so need to sort */
''' '''
): ):
@ -194,7 +179,7 @@ def _process_db_msys(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
def _process_db_threads_db2(db: sqlite3.Connection) -> Iterator[Res[Entity]]: def _process_db_threads_db2(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
senders: dict[str, Sender] = {} senders: Dict[str, Sender] = {}
for r in db.execute('''SELECT * FROM thread_users'''): for r in db.execute('''SELECT * FROM thread_users'''):
# for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None # for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None
# but they are still referenced, so need to keep # but they are still referenced, so need to keep
@ -208,7 +193,7 @@ def _process_db_threads_db2(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
yield s yield s
self_id = config.facebook_id self_id = config.facebook_id
thread_users: dict[str, list[Sender]] = {} thread_users: Dict[str, List[Sender]] = {}
for r in db.execute('SELECT * from thread_participants'): for r in db.execute('SELECT * from thread_participants'):
thread_key = r['thread_key'] thread_key = r['thread_key']
user_key = r['user_key'] user_key = r['user_key']
@ -268,9 +253,9 @@ def contacts() -> Iterator[Res[Sender]]:
def messages() -> Iterator[Res[Message]]: def messages() -> Iterator[Res[Message]]:
senders: dict[str, Sender] = {} senders: Dict[str, Sender] = {}
msgs: dict[str, Message] = {} msgs: Dict[str, Message] = {}
threads: dict[str, Thread] = {} threads: Dict[str, Thread] = {}
for x in unique_everseen(_entities): for x in unique_everseen(_entities):
if isinstance(x, Exception): if isinstance(x, Exception):
yield x yield x
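For context, a stdlib-only sketch of what opening the database immutably with a row factory amounts to (sqlite_connection is HPI's own helper; this only shows the underlying idea):

import sqlite3

def connect_readonly(path: str) -> sqlite3.Connection:
    # immutable=1 tells sqlite the file cannot change: no locks, no -wal lookup
    conn = sqlite3.connect(f'file:{path}?immutable=1', uri=True)
    conn.row_factory = sqlite3.Row  # rows become addressable by name, like r['id']
    return conn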

View file

@ -1,9 +1,6 @@
from __future__ import annotations from my.core import __NOT_HPI_MODULE__
from my.core import __NOT_HPI_MODULE__ # isort: skip from typing import Iterator, Optional, Protocol
from collections.abc import Iterator
from typing import Protocol
from my.core import datetime_aware from my.core import datetime_aware
@ -13,7 +10,7 @@ class Thread(Protocol):
def id(self) -> str: ... def id(self) -> str: ...
@property @property
def name(self) -> str | None: ... def name(self) -> Optional[str]: ...
class Sender(Protocol): class Sender(Protocol):
@ -21,7 +18,7 @@ class Sender(Protocol):
def id(self) -> str: ... def id(self) -> str: ...
@property @property
def name(self) -> str | None: ... def name(self) -> Optional[str]: ...
class Message(Protocol): class Message(Protocol):
@ -32,7 +29,7 @@ class Message(Protocol):
def dt(self) -> datetime_aware: ... def dt(self) -> datetime_aware: ...
@property @property
def text(self) -> str | None: ... def text(self) -> Optional[str]: ...
@property @property
def thread(self) -> Thread: ... def thread(self) -> Thread: ...
@ -42,11 +39,8 @@ class Message(Protocol):
from itertools import chain from itertools import chain
from more_itertools import unique_everseen from more_itertools import unique_everseen
from my.core import warn_if_empty, Res
from my.core import Res, warn_if_empty
@warn_if_empty @warn_if_empty
def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]: def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
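The merge body is cut off in this hunk; the general shape is chain-then-dedup. The key below is my guess for illustration, not necessarily what _merge_messages actually uses:

from itertools import chain
from more_itertools import unique_everseen

def merge(*sources):
    # key chosen so the same message seen via two exports is emitted once
    yield from unique_everseen(chain(*sources), key=lambda m: (m.dt, m.id))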

View file

@ -7,15 +7,16 @@ REQUIRES = [
'git+https://github.com/karlicoss/fbmessengerexport', 'git+https://github.com/karlicoss/fbmessengerexport',
] ]
from collections.abc import Iterator
from contextlib import ExitStack, contextmanager from contextlib import ExitStack, contextmanager
from dataclasses import dataclass from dataclasses import dataclass
from typing import Iterator
from my.core import PathIsh, Res, stat, Stats
from my.core.warnings import high
from my.config import fbmessenger as user_config
import fbmessengerexport.dal as messenger import fbmessengerexport.dal as messenger
from my.config import fbmessenger as user_config
from my.core import PathIsh, Res, Stats, stat
from my.core.warnings import high
### ###
# support old style config # support old style config

View file

@ -2,16 +2,18 @@
Foursquare/Swarm checkins Foursquare/Swarm checkins
''' '''
import json from datetime import datetime, timezone, timedelta
from datetime import datetime, timedelta, timezone
from itertools import chain from itertools import chain
from pathlib import Path
from my.config import foursquare as config import json
# TODO pytz for timezone??? # TODO pytz for timezone???
from my.core import get_files, make_logger
logger = make_logger(__name__) from .core.common import get_files, LazyLogger
from my.config import foursquare as config
logger = LazyLogger(__name__)
def inputs(): def inputs():

View file

@ -3,7 +3,8 @@ Unified Github data (merged from GDPR export and periodic API updates)
""" """
from . import gdpr, ghexport from . import gdpr, ghexport
from .common import Results, merge_events
from .common import merge_events, Results
def events() -> Results: def events() -> Results:

View file

@ -1,20 +1,17 @@
""" """
Github events and their metadata: comments/issues/pull requests Github events and their metadata: comments/issues/pull requests
""" """
from ..core import __NOT_HPI_MODULE__
from __future__ import annotations
from my.core import __NOT_HPI_MODULE__ # isort: skip
from collections.abc import Iterable
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import NamedTuple, Optional from typing import Optional, NamedTuple, Iterable, Set, Tuple
from my.core import make_logger, warn_if_empty from ..core import warn_if_empty, LazyLogger
from my.core.error import Res from ..core.error import Res
logger = make_logger(__name__)
logger = LazyLogger(__name__)
class Event(NamedTuple): class Event(NamedTuple):
dt: datetime dt: datetime
@ -30,7 +27,7 @@ Results = Iterable[Res[Event]]
@warn_if_empty @warn_if_empty
def merge_events(*sources: Results) -> Results: def merge_events(*sources: Results) -> Results:
from itertools import chain from itertools import chain
emitted: set[tuple[datetime, str]] = set() emitted: Set[Tuple[datetime, str]] = set()
for e in chain(*sources): for e in chain(*sources):
if isinstance(e, Exception): if isinstance(e, Exception):
yield e yield e
@ -55,7 +52,7 @@ def parse_dt(s: str) -> datetime:
# experimental way of supporting event ids... not sure # experimental way of supporting event ids... not sure
class EventIds: class EventIds:
@staticmethod @staticmethod
def repo_created(*, dts: str, name: str, ref_type: str, ref: str | None) -> str: def repo_created(*, dts: str, name: str, ref_type: str, ref: Optional[str]) -> str:
return f'{dts}_repocreated_{name}_{ref_type}_{ref}' return f'{dts}_repocreated_{name}_{ref_type}_{ref}'
@staticmethod @staticmethod

Some files were not shown because too many files have changed in this diff