Compare commits: v0.2.20201...master
459 commits
291 changed files with 20,222 additions and 5,026 deletions
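
For reference, this comparison can be reproduced locally with plain git; a minimal sketch, assuming a local clone and the tag name exactly as shown in the header above:

# commits reachable from master but not from the tag (the "459 commits" list)
git clone https://github.com/karlicoss/HPI && cd HPI
git log --oneline v0.2.20201..master | wc -l

# changed-files summary; the three-dot form diffs against the merge base, matching GitHub's compare view
git diff --stat v0.2.20201...master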

.ci/release

@@ -21,7 +21,7 @@ import shutil

 is_ci = os.environ.get('CI') is not None

-def main():
+def main() -> None:
     import argparse
     p = argparse.ArgumentParser()
     p.add_argument('--test', action='store_true', help='use test pypi')
@@ -29,7 +29,7 @@ def main():

     extra = []
     if args.test:
-        extra.extend(['--repository-url', 'https://test.pypi.org/legacy/'])
+        extra.extend(['--repository', 'testpypi'])

     root = Path(__file__).absolute().parent.parent
     os.chdir(root) # just in case
@@ -42,7 +42,7 @@ def main():
     if dist.exists():
         shutil.rmtree(dist)

-    check_call('python3 setup.py sdist bdist_wheel', shell=True)
+    check_call(['python3', '-m', 'build'])

     TP = 'TWINE_PASSWORD'
     password = os.environ.get(TP)

.ci/run (new executable file, 48 lines)

@@ -0,0 +1,48 @@
+#!/bin/bash
+set -eu
+
+cd "$(dirname "$0")"
+cd .. # git root
+
+if ! command -v sudo; then
+    # CI or Docker sometimes doesn't have it, so useful to have a dummy
+    function sudo {
+        "$@"
+    }
+fi
+
+# --parallel-live to show outputs while it's running
+tox_cmd='run-parallel --parallel-live'
+if [ -n "${CI-}" ]; then
+    # install OS specific stuff here
+    case "$OSTYPE" in
+    darwin*)
+        # macos
+        brew install fd
+        ;;
+    cygwin* | msys* | win*)
+        # windows
+        # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that
+        tox_cmd='run'
+        ;;
+    *)
+        # must be linux?
+        sudo apt update
+        sudo apt install fd-find
+        ;;
+    esac
+fi
+
+
+PY_BIN="python3"
+# some systems might have python pointing to python3
+if ! command -v python3 &> /dev/null; then
+    PY_BIN="python"
+fi
+
+
+# TODO hmm for some reason installing uv with pip and then running
+# "$PY_BIN" -m uv tool fails with missing setuptools error??
+# just uvx directly works, but it's not present in PATH...
+"$PY_BIN" -m pip install --user pipx
+"$PY_BIN" -m pipx run uv tool run --with=tox-uv tox $tox_cmd "$@"

.github/workflows/main.yml (vendored)

@@ -5,43 +5,74 @@ on:
   push:
     branches: '*'
     tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi
-    # TODO not sure if need 'pull_request'??
+    # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug:
+  pull_request: # needed to trigger on others' PRs
+  # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them".
   workflow_dispatch: # needed to trigger workflows manually
+    # todo cron?
+    inputs:
+      debug_enabled:
+        type: boolean
+        description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
+        required: false
+        default: false

-env:
-  # useful for scripts & sometimes tests to know
-  CI: true

 jobs:
   build:
     strategy:
+      fail-fast: false
       matrix:
-        platform: [ubuntu-latest, macos-latest] # TODO windows-latest??
-        python-version: [3.6, 3.7, 3.8]
+        platform: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        exclude: [
+          # windows runners are pretty scarce, so let's only run lowest and highest python version
+          {platform: windows-latest, python-version: '3.10'},
+          {platform: windows-latest, python-version: '3.11'},
+          {platform: windows-latest, python-version: '3.12'},
+
+          # same, macos is a bit too slow and ubuntu covers python quirks well
+          {platform: macos-latest , python-version: '3.10' },
+          {platform: macos-latest , python-version: '3.11' },
+          {platform: macos-latest , python-version: '3.12' },
+        ]

     runs-on: ${{ matrix.platform }}

+    # useful for 'optional' pipelines
+    # continue-on-error: ${{ matrix.platform == 'windows-latest' }}
+
     steps:
     # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
     - run: echo "$HOME/.local/bin" >> $GITHUB_PATH

-    - uses: actions/setup-python@v1
+    - uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
      with:
        submodules: recursive
+        fetch-depth: 0 # nicer to have all git history when debugging/for tests

-    # uncomment for SSH debugging
-    # - uses: mxschmitt/action-tmate@v3
+    - uses: mxschmitt/action-tmate@v3
+      if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}

-    - run: scripts/ci/run
+    # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd...
+    - run: bash .ci/run

-    - uses: actions/upload-artifact@v2
+    - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms
+      uses: actions/upload-artifact@v4
      with:
-        name: .coverage.mypy_${{ matrix.platform }}_${{ matrix.python-version }}
-        path: .coverage.mypy/
+        include-hidden-files: true
+        name: .coverage.mypy-misc_${{ matrix.platform }}_${{ matrix.python-version }}
+        path: .coverage.mypy-misc/
+    - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms
+      uses: actions/upload-artifact@v4
+      with:
+        include-hidden-files: true
+        name: .coverage.mypy-core_${{ matrix.platform }}_${{ matrix.python-version }}
+        path: .coverage.mypy-core/

   pypi:
     runs-on: ubuntu-latest
@@ -51,26 +82,25 @@ jobs:
     # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
     - run: echo "$HOME/.local/bin" >> $GITHUB_PATH

-    - uses: actions/setup-python@v1
+    - uses: actions/setup-python@v5
      with:
-        python-version: 3.7
+        python-version: '3.10'

-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
      with:
        submodules: recursive

     - name: 'release to test pypi'
       # always deploy merged master to test pypi
-      if: github.event.ref == 'refs/heads/master'
+      if: github.event_name != 'pull_request' && github.event.ref == 'refs/heads/master'
       env:
         TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD_TEST }}
-      run: pip3 install --user wheel twine && scripts/release --test
-      # TODO run pip install just to test?
+      run: pip3 install --user --upgrade build twine && .ci/release --test

     - name: 'release to pypi'
       # always deploy tags to release pypi
       # NOTE: release tags are guarded by on: push: tags on the top
-      if: startsWith(github.event.ref, 'refs/tags')
+      if: github.event_name != 'pull_request' && startsWith(github.event.ref, 'refs/tags')
       env:
         TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
-      run: pip3 install --user wheel twine && scripts/release
+      run: pip3 install --user --upgrade build twine && .ci/release
.gitignore
vendored
4
.gitignore
vendored
|
@ -12,6 +12,7 @@
|
||||||
auto-save-list
|
auto-save-list
|
||||||
tramp
|
tramp
|
||||||
.\#*
|
.\#*
|
||||||
|
*.gpx
|
||||||
|
|
||||||
# Org-mode
|
# Org-mode
|
||||||
.org-id-locations
|
.org-id-locations
|
||||||
|
@ -154,6 +155,9 @@ celerybeat-schedule
|
||||||
.dmypy.json
|
.dmypy.json
|
||||||
dmypy.json
|
dmypy.json
|
||||||
|
|
||||||
|
# linters
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
# Pyre type checker
|
# Pyre type checker
|
||||||
.pyre/
|
.pyre/
|
||||||
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
- /.mypy_cache/
|
|

CHANGELOG.md (new file, +58)

@@ -0,0 +1,58 @@
+# `v0.3.20210220`
+
+General/my.core changes:
+
+- a3305677b24694391a247fc4cb6cc1237e57f840 **deprecate** my.cfg, instead my.config can (and should be) used directly
+- 0534c5c57dc420f9a01387b58a7098823e54277e new cli feature: **module management**
+
+  cli: add `hpi module install` and `hpi module requires`
+
+  relevant: https://github.com/karlicoss/HPI/issues/12, https://github.com/karlicoss/HPI/issues/79
+
+- 97650adf3b48c653651b31c78cefe24ecae5ed4f add discovery_pure module to get modules and their dependencies via `ast` module
+- f90599d7e4463e936c8d95196ff767c730207202 make module discovery rely on `ast` module
+
+  Hopefully it will make it more robust & much faster.
+
+- 07f901e1e5fb2bd3009561c84cc4efd311c94733 helpers for **automatic dataframes** from sequences of NamedTuple/dataclass
+- 4012f9b7c2a429170df8600591ec8d1e1407b162 more generic functions to jsonify data
+- 746c3da0cadcba3b179688783186d8a0bd0999c5 core.pandas: allow specifying schema; add tests
+- 5313984d8fea2b6eef6726b7b346c1f4316acd01 add `tmp_config` context manager for test & adhoc patching
+- df9a7f7390aee6c69f1abf1c8d1fc7659ebb957c core.pandas: add check for 'error' column + add empty one by default
+- e81dddddf083ffd81aa7e2b715bd34f59949479c properly resolve class properties in make_config + add test
+
+Modules:
+- some initial work on filling **InfluxDB** with HPI data
+
+- pinboard
+  - 42399f6250d9901d93dcedcfe05f7857babcf834: **breaking backwards compatibility**, use pinbexport module directly
+
+    Use 'hpi module install my.pinboard' to install it
+
+    relevant: https://github.com/karlicoss/HPI/issues/79
+
+- stackexchange
+  - 63c825ab81bb561e912655e423c6b332fb6fd1b4 use GDPR data for votes
+  - ddea816a49f5da79fd6332e7f6b879b1955838af use proper pip package, add stat
+
+- bluemaestro
+  - 6d9bc2964b24cfe6187945f4634940673dfe9c27 populate grafana
+  - 1899b006de349140303110ca98a21d918d9eb049 investigation of data quality + more sanity checks
+  - d77ab92d8634d0863d2b966cb448bbfcc8a8d565 get rid of unnecessary file, move to top level
+
+- runnerup
+  - 6b451336ed5df2b893c9e6387175edba50b0719b Initial parser for RunnerUp data which I'm now using instead of Endomondo
+
+Misc:
+- f102101b3917e8a38511faa5e4fd9dd33d284d7e core/windows: fix get_files and its tests
+- 56d5587c209dcbd27c7802d60c0bc8e8e2391672 CI: clean up tox config a bit, get rid of custom lint script
+- d562f00dca720fd4f6736377a41168e9a796c122
+
+  tests: run all tests, but exclude tests specific to my computer from CI
+  controllable via `HPI_TESTS_KARLICOSS=true`
+
+- improved mypy coverage
+
+# before `v0.2.20201125`
+
+I used to keep it in [Github releases](https://github.com/karlicoss/HPI/releases).
+However I realized it means promoting a silo, so now it's reflected in this file (and only copied to the github releases page).
304
README.org
304
README.org
|
@ -1,3 +1,5 @@
|
||||||
|
# TODO ugh. my blog generator dumps links as file: ....
|
||||||
|
# so used smeth like :s/file:\(.*\)\.org/https:\/\/beepb00p.xyz\/\1.html/gc -- steal leaves ::# links etc. ugh
|
||||||
#+summary: My life in a Python package
|
#+summary: My life in a Python package
|
||||||
#+created: [2019-11-14 Thu]
|
#+created: [2019-11-14 Thu]
|
||||||
#+filetags: :infra:pkm:quantifiedself:hpi:
|
#+filetags: :infra:pkm:quantifiedself:hpi:
|
||||||
|
@ -9,10 +11,14 @@ If you're in a hurry, feel free to jump straight to the [[#usecases][demos]].
|
||||||
|
|
||||||
- see [[https://github.com/karlicoss/HPI/tree/master/doc/SETUP.org][SETUP]] for the *installation/configuration guide*
|
- see [[https://github.com/karlicoss/HPI/tree/master/doc/SETUP.org][SETUP]] for the *installation/configuration guide*
|
||||||
- see [[https://github.com/karlicoss/HPI/tree/master/doc/DEVELOPMENT.org][DEVELOPMENT]] for the *development guide*
|
- see [[https://github.com/karlicoss/HPI/tree/master/doc/DEVELOPMENT.org][DEVELOPMENT]] for the *development guide*
|
||||||
|
- see [[https://github.com/karlicoss/HPI/tree/master/doc/DESIGN.org][DESIGN]] for the *design goals*
|
||||||
|
- see [[https://github.com/karlicoss/HPI/tree/master/doc/MODULES.org][MODULES]] for *module-specific setup*
|
||||||
|
- see [[https://github.com/karlicoss/HPI/tree/master/doc/MODULE_DESIGN.org][MODULE_DESIGN]] for some thoughts on structuring modules, and possibly *extending HPI*
|
||||||
|
- see [[https://beepb00p.xyz/exobrain/projects/hpi.html][exobrain/HPI]] for some of my raw thoughts and todos on the project
|
||||||
|
|
||||||
*TLDR*: I'm using [[https://github.com/karlicoss/HPI][HPI]] (Human Programming Interface) package as a means of unifying, accessing and interacting with all of my personal data.
|
*TLDR*: I'm using [[https://github.com/karlicoss/HPI][HPI]] (Human Programming Interface) package as a means of unifying, accessing and interacting with all of my personal data.
|
||||||
|
|
||||||
It's a Python library (named ~my~), a collection of modules for:
|
HPI is a Python package (named ~my~), a collection of modules for:
|
||||||
|
|
||||||
- social networks: posts, comments, favorites
|
- social networks: posts, comments, favorites
|
||||||
- reading: e-books and pdfs
|
- reading: e-books and pdfs
|
||||||
|
@ -30,9 +36,9 @@ You simply 'import' your data and get to work with familiar Python types and dat
|
||||||
- Here's a short example to give you an idea: "which subreddits I find the most interesting?"
|
- Here's a short example to give you an idea: "which subreddits I find the most interesting?"
|
||||||
|
|
||||||
#+begin_src python
|
#+begin_src python
|
||||||
import my.reddit
|
import my.reddit.all
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
return Counter(s.subreddit for s in my.reddit.saved()).most_common(4)
|
return Counter(s.subreddit for s in my.reddit.all.saved()).most_common(4)
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
| orgmode | 62 |
|
| orgmode | 62 |
|
||||||
|
@ -42,10 +48,10 @@ You simply 'import' your data and get to work with familiar Python types and dat
|
||||||
|
|
||||||
|
|
||||||
I consider my digital trace an important part of my identity. ([[https://beepb00p.xyz/tags.html#extendedmind][#extendedmind]])
|
I consider my digital trace an important part of my identity. ([[https://beepb00p.xyz/tags.html#extendedmind][#extendedmind]])
|
||||||
The fact that the data is siloed, and accessing it is inconvenient and borderline frustrating feels very wrong.
|
Usually the data is siloed, accessing it is inconvenient and borderline frustrating. This feels very wrong.
|
||||||
|
|
||||||
Once the data is available as Python objects, I can easily plug it into existing tools, libraries and frameworks.
|
In contrast, once the data is available as Python objects, I can easily plug it into existing tools, libraries and frameworks.
|
||||||
It makes building new tools considerably easier and allows creating new ways of interacting with the data.
|
It makes building new tools considerably easier and opens up new ways of interacting with the data.
|
||||||
|
|
||||||
I tried different things over the years and I think I'm getting to the point where other people can also benefit from my code by 'just' plugging in their data,
|
I tried different things over the years and I think I'm getting to the point where other people can also benefit from my code by 'just' plugging in their data,
|
||||||
and that's why I'm sharing this.
|
and that's why I'm sharing this.
|
||||||
|
@ -53,6 +59,7 @@ and that's why I'm sharing this.
|
||||||
Imagine if all your life was reflected digitally and available at your fingertips.
|
Imagine if all your life was reflected digitally and available at your fingertips.
|
||||||
This library is my attempt to achieve this vision.
|
This library is my attempt to achieve this vision.
|
||||||
|
|
||||||
|
|
||||||
#+toc: headlines 2
|
#+toc: headlines 2
|
||||||
|
|
||||||
|
|
||||||
|
@ -69,6 +76,7 @@ This library is my attempt to achieve this vision.
|
||||||
- Accessing exercise data
|
- Accessing exercise data
|
||||||
- Book reading progress
|
- Book reading progress
|
||||||
- Messenger stats
|
- Messenger stats
|
||||||
|
- Which month in 2020 did I make the most git commits in?
|
||||||
- Querying Roam Research database
|
- Querying Roam Research database
|
||||||
- How does it get input data?
|
- How does it get input data?
|
||||||
- Q & A
|
- Q & A
|
||||||
|
@ -79,11 +87,16 @@ This library is my attempt to achieve this vision.
|
||||||
- But /should/ I use it?
|
- But /should/ I use it?
|
||||||
- Would it suit /me/?
|
- Would it suit /me/?
|
||||||
- What it isn't?
|
- What it isn't?
|
||||||
|
- HPI Repositories
|
||||||
- Related links
|
- Related links
|
||||||
- --
|
- --
|
||||||
:END:
|
:END:
|
||||||
|
|
||||||
* Why?
|
* Why?
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: motivation
|
||||||
|
:END:
|
||||||
|
|
||||||
The main reason that led me to develop this is the dissatisfaction of the current situation:
|
The main reason that led me to develop this is the dissatisfaction of the current situation:
|
||||||
|
|
||||||
- Our personal data is siloed and trapped across cloud services and various devices
|
- Our personal data is siloed and trapped across cloud services and various devices
|
||||||
|
@ -94,7 +107,7 @@ The main reason that led me to develop this is the dissatisfaction of the curren
|
||||||
|
|
||||||
Integrations of data across silo boundaries are almost non-existent. There is so much potential and it's all wasted.
|
Integrations of data across silo boundaries are almost non-existent. There is so much potential and it's all wasted.
|
||||||
|
|
||||||
- I'm not willing to wait till some vaporwave project reinvents the whole computing model from scratch
|
- I'm not willing to wait till some vaporware project reinvents the whole computing model from scratch
|
||||||
|
|
||||||
As a programmer, I am in capacity to do something *right now*, even though it's not necessarily perfect and consistent.
|
As a programmer, I am in capacity to do something *right now*, even though it's not necessarily perfect and consistent.
|
||||||
|
|
||||||
|
@ -174,6 +187,10 @@ But the major reason I want to solve these problems is to be better at learning
|
||||||
so I could be better at solving the real problems.
|
so I could be better at solving the real problems.
|
||||||
|
|
||||||
* How does a Python package help?
|
* How does a Python package help?
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: package
|
||||||
|
:END:
|
||||||
|
|
||||||
When I started solving some of these problems for myself, I've noticed a common pattern: the [[https://beepb00p.xyz/sad-infra.html#exports_are_hard][hardest bit]] is actually getting your data in the first place.
|
When I started solving some of these problems for myself, I've noticed a common pattern: the [[https://beepb00p.xyz/sad-infra.html#exports_are_hard][hardest bit]] is actually getting your data in the first place.
|
||||||
It's inherently error-prone and frustrating.
|
It's inherently error-prone and frustrating.
|
||||||
|
|
||||||
|
@ -183,6 +200,9 @@ This package knows how to find data on your filesystem, deserialize it and norma
|
||||||
You have the full power of the programming language to transform the data and do whatever comes to your mind.
|
You have the full power of the programming language to transform the data and do whatever comes to your mind.
|
||||||
|
|
||||||
** Why don't you just put everything in a massive database?
|
** Why don't you just put everything in a massive database?
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: database
|
||||||
|
:END:
|
||||||
Glad you've asked! I wrote a whole [[https://beepb00p.xyz/unnecessary-db.html][post]] about it.
|
Glad you've asked! I wrote a whole [[https://beepb00p.xyz/unnecessary-db.html][post]] about it.
|
||||||
|
|
||||||
In short: while databases are efficient and easy to read from, often they aren't flexible enough to fit your data.
|
In short: while databases are efficient and easy to read from, often they aren't flexible enough to fit your data.
|
||||||
|
@ -193,42 +213,61 @@ That's where a Python package comes in.
|
||||||
|
|
||||||
|
|
||||||
* What's inside?
|
* What's inside?
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: modules
|
||||||
|
:END:
|
||||||
|
|
||||||
Here's the (incomplete) list of the modules:
|
Here's the (incomplete) list of the modules:
|
||||||
|
|
||||||
|
|
||||||
:results:
|
:results:
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/bluemaestro][my.bluemaestro]] | [[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/bluemaestro.py][=my.bluemaestro=]] | [[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/body/blood.py][my.body.blood]] | Blood tracking |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/body/blood.py][=my.body.blood=]] | Blood tracking (manual org-mode entries) |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/body/weight.py][my.body.weight]] | Weight data (manually logged) |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/body/exercise/all.py][=my.body.exercise.all=]] | Combined exercise data |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/books/kobo.py][my.books.kobo]] | [[https://uk.kobobooks.com/products/kobo-aura-one][Kobo]] e-ink reader: annotations and reading stats |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/body/exercise/cardio.py][=my.body.exercise.cardio=]] | Cardio data, filtered from various data sources |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/calendar/holidays.py][my.calendar.holidays]] | Public holidays (automatic) and days off work (manual inputs) |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/body/exercise/cross_trainer.py][=my.body.exercise.cross_trainer=]] | My cross trainer exercise data, arbitrated from different sources (mainly, Endomondo and manual text notes) |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/coding/commits.py][my.coding.commits]] | Git commits data for repositories on your filesystem |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/body/weight.py][=my.body.weight=]] | Weight data (manually logged) |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/coding/github.py][my.coding.github]] | Github events and their metadata: comments/issues/pull requests |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/calendar/holidays.py][=my.calendar.holidays=]] | Holidays and days off work |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/emfit][my.emfit]] | [[https://shop-eu.emfit.com/products/emfit-qs][Emfit QS]] sleep tracker |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/coding/commits.py][=my.coding.commits=]] | Git commits data for repositories on your filesystem |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/fbmessenger.py][my.fbmessenger]] | Facebook Messenger messages |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/demo.py][=my.demo=]] | Just a demo module for testing and documentation purposes |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/feedbin.py][my.feedbin]] | Feedbin RSS reader |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/emfit/__init__.py][=my.emfit=]] | [[https://shop-eu.emfit.com/products/emfit-qs][Emfit QS]] sleep tracker |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/feedly.py][my.feedly]] | Feedly RSS reader |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/endomondo.py][=my.endomondo=]] | Endomondo exercise data |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/foursquare.py][my.foursquare]] | Foursquare/Swarm checkins |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/fbmessenger.py][=my.fbmessenger=]] | Facebook Messenger messages |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/google/takeout/html.py][my.google.takeout.html]] | Google Takeout exports: browsing history, search/youtube/google play activity |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/foursquare.py][=my.foursquare=]] | Foursquare/Swarm checkins |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/hypothesis.py][my.hypothesis]] | [[https://hypothes.is][Hypothes.is]] highlights and annotations |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/github/all.py][=my.github.all=]] | Unified Github data (merged from GDPR export and periodic API updates) |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/instapaper.py][my.instapaper]] | Instapaper bookmarks, highlights and annotations |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/github/gdpr.py][=my.github.gdpr=]] | Github data (uses [[https://github.com/settings/admin][official GDPR export]]) |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/lastfm][my.lastfm]] | Last.fm scrobbles |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/github/ghexport.py][=my.github.ghexport=]] | Github data: events, comments, etc. (API data) |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/location/takeout.py][my.location.takeout]] | Location data from Google Takeout |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/hypothesis.py][=my.hypothesis=]] | [[https://hypothes.is][Hypothes.is]] highlights and annotations |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/materialistic.py][my.materialistic]] | [[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/instapaper.py][=my.instapaper=]] | [[https://www.instapaper.com][Instapaper]] bookmarks, highlights and annotations |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/notes/orgmode.py][my.notes.orgmode]] | Programmatic access and queries to org-mode files on the filesystem |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/kobo.py][=my.kobo=]] | [[https://uk.kobobooks.com/products/kobo-aura-one][Kobo]] e-ink reader: annotations and reading stats |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/pdfs.py][my.pdfs]] | PDF documents and annotations on your filesystem |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/lastfm.py][=my.lastfm=]] | Last.fm scrobbles |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/photos][my.photos]] | Photos and videos on your filesystem, their GPS and timestamps |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/location/google.py][=my.location.google=]] | Location data from Google Takeout |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/pinboard.py][my.pinboard]] | [[https://pinboard.in][Pinboard]] bookmarks |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/location/home.py][=my.location.home=]] | Simple location provider, serving as a fallback when more detailed data isn't available |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/reading/polar.py][my.reading.polar]] | [[https://github.com/burtonator/polar-books][Polar]] articles and highlights |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/materialistic.py][=my.materialistic=]] | [[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/reddit.py][my.reddit]] | Reddit data: saved items/comments/upvotes/etc. |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/orgmode.py][=my.orgmode=]] | Programmatic access and queries to org-mode files on the filesystem |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/rescuetime.py][my.rescuetime]] | Rescuetime (activity tracking) data |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/pdfs.py][=my.pdfs=]] | PDF documents and annotations on your filesystem |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/roamresearch.py][my.roamresearch]] | [[https://roamresearch.com][Roam]] data |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/photos/main.py][=my.photos.main=]] | Photos and videos on your filesystem, their GPS and timestamps |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/rtm.py][my.rtm]] | [[https://rememberthemilk.com][Remember The Milk]] tasks and notes |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/pinboard.py][=my.pinboard=]] | [[https://pinboard.in][Pinboard]] bookmarks |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/smscalls.py][my.smscalls]] | Phone calls and SMS messages |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/pocket.py][=my.pocket=]] | [[https://getpocket.com][Pocket]] bookmarks and highlights |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/stackexchange.py][my.stackexchange]] | Stackexchange data |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/polar.py][=my.polar=]] | [[https://github.com/burtonator/polar-bookshelf][Polar]] articles and highlights |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/twitter/all.py][my.twitter.all]] | Unified Twitter data (merged from the archive and periodic updates) |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/reddit.py][=my.reddit=]] | Reddit data: saved items/comments/upvotes/etc. |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/twitter/archive.py][my.twitter.archive]] | Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]]) |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/rescuetime.py][=my.rescuetime=]] | Rescuetime (phone activity tracking) data. |
|
||||||
| [[https://github.com/karlicoss/my/tree/master/my/twitter/twint.py][my.twitter.twint]] | Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export. |
|
| [[https://github.com/karlicoss/HPI/tree/master/my/roamresearch.py][=my.roamresearch=]] | [[https://roamresearch.com][Roam]] data |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/rss/all.py][=my.rss.all=]] | Unified RSS data, merged from different services I used historically |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/rss/feedbin.py][=my.rss.feedbin=]] | Feedbin RSS reader |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/rss/feedly.py][=my.rss.feedly=]] | Feedly RSS reader |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/rtm.py][=my.rtm=]] | [[https://rememberthemilk.com][Remember The Milk]] tasks and notes |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/runnerup.py][=my.runnerup=]] | [[https://github.com/jonasoreland/runnerup][Runnerup]] exercise data (TCX format) |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/smscalls.py][=my.smscalls=]] | Phone calls and SMS messages |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/stackexchange/gdpr.py][=my.stackexchange.gdpr=]] | Stackexchange data (uses [[https://stackoverflow.com/legal/gdpr/request][official GDPR export]]) |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/stackexchange/stexport.py][=my.stackexchange.stexport=]] | Stackexchange data (uses API via [[https://github.com/karlicoss/stexport][stexport]]) |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/taplog.py][=my.taplog=]] | [[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/time/tz/main.py][=my.time.tz.main=]] | Timezone data provider, used to localize timezone-unaware timestamps for other modules |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/time/tz/via_location.py][=my.time.tz.via_location=]] | Timezone data provider, guesses timezone based on location data (e.g. GPS) |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/twitter/all.py][=my.twitter.all=]] | Unified Twitter data (merged from the archive and periodic updates) |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/twitter/archive.py][=my.twitter.archive=]] | Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]]) |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/twitter/twint.py][=my.twitter.twint=]] | Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export. |
|
||||||
|
| [[https://github.com/karlicoss/HPI/tree/master/my/vk/vk_messages_backup.py][=my.vk.vk_messages_backup=]] | VK data (exported by [[https://github.com/Totktonada/vk_messages_backup][Totktonada/vk_messages_backup]]) |
|
||||||
:END:
|
:END:
|
||||||
|
|
||||||
Some modules are private, and need a bit of cleanup before merging:
|
Some modules are private, and need a bit of cleanup before merging:
|
||||||
|
@ -244,17 +283,25 @@ Some modules are private, and need a bit of cleanup before merging:
|
||||||
#+html: <div id="usecases"></div>
|
#+html: <div id="usecases"></div>
|
||||||
|
|
||||||
* How do you use it?
|
* How do you use it?
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: usecases
|
||||||
|
:END:
|
||||||
Mainly I use it as a data provider for my scripts, tools, and dashboards.
|
Mainly I use it as a data provider for my scripts, tools, and dashboards.
|
||||||
|
|
||||||
Also, check out [[https://beepb00p.xyz/myinfra.html#mypkg][my infrastructure map]].
|
Also, check out [[https://beepb00p.xyz/myinfra.html#mypkg][my infrastructure map]]. It might be helpful for understanding what's my vision on HPI.
|
||||||
It's a draft at the moment, but it might be helpful for understanding what's my vision on HPI.
|
|
||||||
** Instant search
|
** Instant search
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: search
|
||||||
|
:END:
|
||||||
Typical search interfaces make me unhappy as they are *siloed, slow, awkward to use and don't work offline*.
|
Typical search interfaces make me unhappy as they are *siloed, slow, awkward to use and don't work offline*.
|
||||||
So I built my own ways around it! I write about it in detail [[https://beepb00p.xyz/pkm-search.html#personal_information][here]].
|
So I built my own ways around it! I write about it in detail [[https://beepb00p.xyz/pkm-search.html#personal_information][here]].
|
||||||
|
|
||||||
In essence, I'm mirroring most of my online data like chat logs, comments, etc., as plaintext.
|
In essence, I'm mirroring most of my online data like chat logs, comments, etc., as plaintext.
|
||||||
I can overview it in any text editor, and incrementally search over *all of it* in a single keypress.
|
I can overview it in any text editor, and incrementally search over *all of it* in a single keypress.
|
||||||
** orger
|
** orger
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: orger
|
||||||
|
:END:
|
||||||
[[https://github.com/karlicoss/orger][orger]] is a tool that helps you generate an org-mode representation of your data.
|
[[https://github.com/karlicoss/orger][orger]] is a tool that helps you generate an org-mode representation of your data.
|
||||||
|
|
||||||
It lets you benefit from the existing tooling and infrastructure around org-mode, the most famous being Emacs.
|
It lets you benefit from the existing tooling and infrastructure around org-mode, the most famous being Emacs.
|
||||||
|
@ -265,18 +312,29 @@ I'm using it for:
|
||||||
- creating tasks straight from the apps (e.g. Reddit/Telegram)
|
- creating tasks straight from the apps (e.g. Reddit/Telegram)
|
||||||
- spaced repetition via [[https://orgmode.org/worg/org-contrib/org-drill.html][org-drill]]
|
- spaced repetition via [[https://orgmode.org/worg/org-contrib/org-drill.html][org-drill]]
|
||||||
|
|
||||||
Orger comes with some existing [[https://github.com/orger/tree/master/modules][modules]], but it should be easy to adapt your own data source if you need something else.
|
Orger comes with some existing [[https://github.com/karlicoss/orger/tree/master/modules][modules]], but it should be easy to adapt your own data source if you need something else.
|
||||||
|
|
||||||
I write about it in detail [[https://beepb00p.xyz/orger.html][here]] and [[https://beepb00p.xyz/orger-todos.html][here]].
|
I write about it in detail [[http://beepb00p.xyz/orger.html][here]] and [[http://beepb00p.xyz/orger-todos.html][here]].
|
||||||
** promnesia
|
** promnesia
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: promnesia
|
||||||
|
:END:
|
||||||
[[https://github.com/karlicoss/promnesia#demo][promnesia]] is a browser extension I'm working on to escape silos by *unifying annotations and browsing history* from different data sources.
|
[[https://github.com/karlicoss/promnesia#demo][promnesia]] is a browser extension I'm working on to escape silos by *unifying annotations and browsing history* from different data sources.
|
||||||
|
|
||||||
I've been using it for more than a year now and working on final touches to properly release it for other people.
|
I've been using it for more than a year now and working on final touches to properly release it for other people.
|
||||||
** dashboard
|
** dashboard
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: dashboard
|
||||||
|
:END:
|
||||||
|
|
||||||
As a big fan of [[https://beepb00p.xyz/tags.html#quantified-self][#quantified-self]], I'm working on personal health, sleep and exercise dashboard, built from various data sources.
|
As a big fan of [[https://beepb00p.xyz/tags.html#quantified-self][#quantified-self]], I'm working on personal health, sleep and exercise dashboard, built from various data sources.
|
||||||
|
|
||||||
I'm working on making it public, you can see some screenshots [[https://www.reddit.com/r/QuantifiedSelf/comments/cokt4f/what_do_you_all_do_with_your_data/ewmucgk][here]].
|
I'm working on making it public, you can see some screenshots [[https://www.reddit.com/r/QuantifiedSelf/comments/cokt4f/what_do_you_all_do_with_your_data/ewmucgk][here]].
|
||||||
** timeline
|
** timeline
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: timeline
|
||||||
|
:END:
|
||||||
|
|
||||||
Timeline is a [[https://beepb00p.xyz/tags.html#lifelogging][#lifelogging]] project I'm working on.
|
Timeline is a [[https://beepb00p.xyz/tags.html#lifelogging][#lifelogging]] project I'm working on.
|
||||||
|
|
||||||
I want to see all my digital history, search in it, filter, easily jump at a specific point in time and see the context when it happened.
|
I want to see all my digital history, search in it, filter, easily jump at a specific point in time and see the context when it happened.
|
||||||
|
@ -286,15 +344,20 @@ Ideally, it would look similar to Andrew Louis's [[https://hyfen.net/memex][Meme
|
||||||
he open sources it. I highly recommend watching his talk for inspiration.
|
he open sources it. I highly recommend watching his talk for inspiration.
|
||||||
|
|
||||||
* Ad-hoc and interactive
|
* Ad-hoc and interactive
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: interactive
|
||||||
|
:END:
|
||||||
|
|
||||||
** What were my music listening stats for 2018?
|
** What were my music listening stats for 2018?
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: lastfm
|
||||||
|
:END:
|
||||||
|
|
||||||
Single import away from getting tracks you listened to:
|
Single import away from getting tracks you listened to:
|
||||||
|
|
||||||
#+begin_src python
|
#+begin_src python
|
||||||
from my.lastfm import get_scrobbles
|
from my.lastfm import scrobbles
|
||||||
scrobbles = get_scrobbles()
|
list(scrobbles())[200: 205]
|
||||||
scrobbles[200: 205]
|
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
|
|
||||||
|
@ -305,16 +368,15 @@ Single import away from getting tracks you listened to:
|
||||||
: Scrobble(raw={'album': 'Rolled Gold +', 'artist': 'The Rolling Stones', 'date': '1282494161', 'name': "You Can't Always Get What You Want"})]
|
: Scrobble(raw={'album': 'Rolled Gold +', 'artist': 'The Rolling Stones', 'date': '1282494161', 'name': "You Can't Always Get What You Want"})]
|
||||||
|
|
||||||
|
|
||||||
Or, as a pandas frame to make it pretty:
|
Or, as a pretty Pandas frame:
|
||||||
|
|
||||||
#+begin_src python
|
#+begin_src python
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
df = pd.DataFrame([{
|
df = pd.DataFrame([{
|
||||||
'dt': s.dt,
|
'dt': s.dt,
|
||||||
'track': s.track,
|
'track': s.track,
|
||||||
} for s in scrobbles])
|
} for s in scrobbles()]).set_index('dt')
|
||||||
cdf = df.set_index('dt')
|
df[200: 205]
|
||||||
cdf[200: 205]
|
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
|
|
||||||
|
@ -334,29 +396,33 @@ We can use [[https://github.com/martijnvermaat/calmap][calmap]] library to plot
|
||||||
plt.figure(figsize=(10, 2.3))
|
plt.figure(figsize=(10, 2.3))
|
||||||
|
|
||||||
import calmap
|
import calmap
|
||||||
cdf = cdf.set_index(cdf.index.tz_localize(None)) # calmap expects tz-unaware dates
|
df = df.set_index(df.index.tz_localize(None)) # calmap expects tz-unaware dates
|
||||||
calmap.yearplot(cdf['track'], how='count', year=2018)
|
calmap.yearplot(df['track'], how='count', year=2018)
|
||||||
|
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
plt.title('My music listening activity for 2018')
|
plt.title('My music listening activity for 2018')
|
||||||
plot_file = 'lastfm_2018.png'
|
plot_file = 'hpi_files/lastfm_2018.png'
|
||||||
plt.savefig(plot_file)
|
plt.savefig(plot_file)
|
||||||
plot_file
|
plot_file
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
|
|
||||||
[[https://beepb00p.xyz/lastfm_2018.png]]
|
[[https://beepb00p.xyz/hpi_files/lastfm_2018.png]]
|
||||||
|
|
||||||
This isn't necessarily very insightful data, but fun to look at now and then!
|
This isn't necessarily very insightful data, but fun to look at now and then!
|
||||||
|
|
||||||
** What are the most interesting Slate Star Codex posts I've read?
|
** What are the most interesting Slate Star Codex posts I've read?
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: hypothesis_stats
|
||||||
|
:END:
|
||||||
|
|
||||||
My friend asked me if I could recommend them posts I found interesting on [[https://slatestarcodex.com][Slate Star Codex]].
|
My friend asked me if I could recommend them posts I found interesting on [[https://slatestarcodex.com][Slate Star Codex]].
|
||||||
With few lines of Python I can quickly recommend them posts I engaged most with, i.e. the ones I annotated most on [[https://hypothes.is][Hypothesis]].
|
With few lines of Python I can quickly recommend them posts I engaged most with, i.e. the ones I annotated most on [[https://hypothes.is][Hypothesis]].
|
||||||
|
|
||||||
#+begin_src python
|
#+begin_src python
|
||||||
from my.hypothesis import get_pages
|
from my.hypothesis import pages
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
cc = Counter({(p.title + ' ' + p.url): len(p.highlights) for p in get_pages() if 'slatestarcodex' in p.url})
|
cc = Counter({(p.title + ' ' + p.url): len(p.highlights) for p in pages() if 'slatestarcodex' in p.url})
|
||||||
return cc.most_common(10)
|
return cc.most_common(10)
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
|
@ -373,9 +439,16 @@ With few lines of Python I can quickly recommend them posts I engaged most with,
|
||||||
| I Can Tolerate Anything Except The Outgroup https://slatestarcodex.com/2014/09/30/i-can-tolerate-anything-except-the-outgroup/ | 9 |
|
| I Can Tolerate Anything Except The Outgroup https://slatestarcodex.com/2014/09/30/i-can-tolerate-anything-except-the-outgroup/ | 9 |
|
||||||
|
|
||||||
** Accessing exercise data
|
** Accessing exercise data
|
||||||
E.g. see use of ~my.workouts~ [[https://beepb00p.xyz/./heartbeats_vs_kcals.html][here]].
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: exercise
|
||||||
|
:END:
|
||||||
|
E.g. see use of ~my.workouts~ [[https://beepb00p.xyz/heartbeats_vs_kcals.html][here]].
|
||||||
|
|
||||||
** Book reading progress
|
** Book reading progress
|
||||||
|
:PROPERTIES:
|
||||||
|
:CUSTOM_ID: kobo_progress
|
||||||
|
:END:
|
||||||
|
|
||||||
I publish my reading stats on [[https://www.goodreads.com/user/show/22191391-dima-gerasimov][Goodreads]] so other people can see what I'm reading/have read, but Kobo [[https://beepb00p.xyz/ideas.html#kobo2goodreads][lacks integration]] with Goodreads.
|
I publish my reading stats on [[https://www.goodreads.com/user/show/22191391-dima-gerasimov][Goodreads]] so other people can see what I'm reading/have read, but Kobo [[https://beepb00p.xyz/ideas.html#kobo2goodreads][lacks integration]] with Goodreads.
|
||||||
I'm using [[https://github.com/karlicoss/kobuddy][kobuddy]] to access my my Kobo data, and I've got a regular task that reminds me to sync my progress once a month.
|
I'm using [[https://github.com/karlicoss/kobuddy][kobuddy]] to access my my Kobo data, and I've got a regular task that reminds me to sync my progress once a month.
|
||||||
|
|
||||||
|
@ -384,7 +457,7 @@ The task looks like this:
|
||||||
#+begin_src org
|
#+begin_src org
|
||||||
,* TODO [#C] sync [[https://goodreads.com][reading progress]] with kobo
|
,* TODO [#C] sync [[https://goodreads.com][reading progress]] with kobo
|
||||||
DEADLINE: <2019-11-24 Sun .+4w -0d>
|
DEADLINE: <2019-11-24 Sun .+4w -0d>
|
||||||
[[eshell: with_my python3 -c 'import my.books.kobo as kobo; kobo.print_progress()']]
|
[[eshell: python3 -c 'import my.kobo; my.kobo.print_progress()']]
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
With a single Enter keypress on the inlined =eshell:= command I can print the progress and fill in the completed books on Goodreads, e.g.:
|
With a single Enter keypress on the inlined =eshell:= command I can print the progress and fill in the completed books on Goodreads, e.g.:
|
@ -414,6 +487,9 @@ With a single Enter keypress on the inlined =eshell:= command I can print the pr

#+end_example

** Messenger stats
:PROPERTIES:
:CUSTOM_ID: messenger_stats
:END:

How much do I chat on Facebook Messenger?

#+begin_src python
@ -435,23 +511,63 @@ How much do I chat on Facebook Messenger?

x_labels = df.index.strftime('%Y %b')
ax.set_xticklabels(x_labels)

plot_file = 'hpi_files/messenger_2016_to_2019.png'
plt.tight_layout()
plt.savefig(plot_file)
return plot_file
#+end_src

[[https://beepb00p.xyz/hpi_files/messenger_2016_to_2019.png]]

** Which month in 2020 did I make the most git commits in?
:PROPERTIES:
:CUSTOM_ID: hpi_query_git
:END:

If you like the shell or just want to quickly convert/grab some information from HPI, it also comes with a JSON query interface - so you can export the data, or just pipeline to your heart's content:

#+begin_src bash
$ hpi query my.coding.commits.commits --stream  # stream JSON objects as they're read
    --order-type datetime                       # find the 'datetime' attribute and order by that
    --after '2020-01-01' --before '2021-01-01'  # in 2020
    | jq '.committed_dt' -r                     # extract the datetime
    # mangle the output a bit to group by month and graph it
    | cut -d'-' -f-2 | sort | uniq -c | awk '{print $2,$1}' | sort -n | termgraph
#+end_src

#+begin_src
2020-01: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 458.00
2020-02: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 440.00
2020-03: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 545.00
2020-04: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 585.00
2020-05: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 518.00
2020-06: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 755.00
2020-07: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 467.00
2020-08: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 449.00
2020-09: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 1.03 K
2020-10: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 791.00
2020-11: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 474.00
2020-12: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 383.00
#+end_src

See the [[https://github.com/karlicoss/HPI/blob/master/doc/QUERY.md][query docs]] for more examples.
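
As a sketch on top of this (not from the original README; same documented flags, plain unix tools), you could rank the busiest individual days instead of months:

#+begin_src bash
$ hpi query my.coding.commits.commits --stream \
    --order-type datetime --after '2020-01-01' --before '2021-01-01' \
  | jq '.committed_dt' -r \
  | cut -d'T' -f1 | sort | uniq -c | sort -rn | head -3  # top 3 days by commit count
#+end_src

This assumes ~committed_dt~ is an ISO-style datetime, as the ~cut -d'-'~ trick in the example above also does.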

** Querying Roam Research database
:PROPERTIES:
:CUSTOM_ID: roamresearch
:END:

I've got some code examples [[https://beepb00p.xyz/myinfra-roam.html#interactive][here]].

* How does it get input data?
:PROPERTIES:
:CUSTOM_ID: input_data
:END:

If you're curious about any specific data sources I'm using, I've written it up [[https://beepb00p.xyz/my-data.html][in detail]].

Also see the [[https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#data-flow]["Data flow"]] documentation, with some nice diagrams walking through specific examples.

In short:

@ -473,8 +589,15 @@ I consider it a necessary sacrifice to make everything fast and resilient.

In theory, it's possible to make the system almost realtime by having a service that sucks in data continuously (rather than periodically), but that's harder to implement as well.
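
For illustration, a minimal sketch of the periodic approach being traded off here (hypothetical; ~export_once~ stands in for whatever exporter you use):

#+begin_src python
import time

def periodic_sync(export_once, interval_s: int = 3600) -> None:
    # periodic: wake up, append the latest data to local storage, go back to sleep;
    # an 'almost realtime' service would subscribe to a stream/webhook instead
    while True:
        export_once()
        time.sleep(interval_s)
#+end_src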

* Q & A
:PROPERTIES:
:CUSTOM_ID: q_and_a
:END:

** Why Python?
:PROPERTIES:
:CUSTOM_ID: why_python
:END:

I don't consider Python unique as a language suitable for such a project.
It just happens to be the one I'm most comfortable with.
I do have some reasons that I think make it /specifically/ good, but explaining them is out of this post's scope.

@ -487,6 +610,9 @@ I've heard LISPs are great for data? ;)

Overall, I wish [[https://en.wikipedia.org/wiki/Foreign_function_interface][FFIs]] were a bit more mature, so we didn't have to think about specific programming languages at all.

** Can anyone use it?
:PROPERTIES:
:CUSTOM_ID: can_anyone_use_it
:END:

Yes!

- you can plug in *your own data*

@ -496,6 +622,9 @@ Yes!

Starting from simply adding new modules to any dynamic hackery you can possibly imagine within Python.

** How easy is it to use?
:PROPERTIES:
:CUSTOM_ID: how_easy_to_use
:END:

The whole setup requires some basic programmer literacy:

- installing/running and potentially modifying Python code
@ -505,6 +634,9 @@ The whole setup requires some basic programmer literacy:
|
||||||
If you have any ideas on making the setup simpler, please let me know!
|
If you have any ideas on making the setup simpler, please let me know!
|
||||||
|
|
||||||
** What about privacy?
:PROPERTIES:
:CUSTOM_ID: privacy
:END:

The modules contain *no data, only code* to operate on the data.

Everything is [[https://beepb00p.xyz/tags.html#offline][*local first*]]: the input data is on your filesystem.

@ -515,6 +647,10 @@ There is still a question of whether you trust yourself at even keeping all the

If you'd rather keep some code private too, it's also trivial to achieve with a private subpackage.
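
As a sketch (hypothetical names, not from the project), such a private subpackage could simply extend the ~my~ package from its own repository:

#+begin_src
my/
├── hypothesis.py       -- public module, shipped with HPI
└── private/            -- your own subpackage, kept in a private repository
    └── some_source.py  -- code (and config) you don't want to publish
#+end_src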

** But /should/ I use it?
:PROPERTIES:
:CUSTOM_ID: should_i_use_it
:END:

#+begin_quote
Sure, maybe you can achieve a perfect system where you can instantly find and recall anything that you've done. Do you really want it?
Wouldn't that, like, make you less human?

@ -532,10 +668,14 @@ I can clearly delegate some tasks, like long term memory, information lookup, an

What about these people who have perfect recall and wish they hadn't?
#+end_quote

Sure, maybe it sucks. At the moment though, my recall is far from perfect, and this only annoys me.
I want to have a choice at least, and digital tools give me this choice.

** Would it suit /me/?
:PROPERTIES:
:CUSTOM_ID: would_it_suit_me
:END:

Probably, at least to some extent.

First, our lives are different, so our APIs might be different too.

@ -555,7 +695,11 @@ but I still feel that wouldn't be enough.

I'm not sure whether it's a solvable problem at this point, but I'm happy to hear any suggestions!

** What it isn't
:PROPERTIES:
:CUSTOM_ID: what_it_isnt
:END:

- It's not vaporware

  The project is a little crude, but it's real and working. I've been using it for a long time now, and find it fairly sustainable to keep using for the foreseeable future.

@ -568,23 +712,51 @@ I'm not sure whether it's a solvable problem at this point, but happy to hear an

Please take my ideas and code and build something cool from it!

* HPI Repositories
:PROPERTIES:
:CUSTOM_ID: hpi_repos
:END:

One of HPI's core goals is to be as extendable as possible. The goal here isn't to become a monorepo and support every possible data source/website to the point that this isn't maintainable anymore, but hopefully you get a few modules 'for free'.

If you want to write modules for personal use but don't want to merge them into here, you're free to maintain modules locally in a separate directory to avoid any merge conflicts, and entire HPI repositories can even be published separately and installed into the single ~my~ python package (for more info on this, see [[https://github.com/karlicoss/HPI/tree/master/doc/MODULE_DESIGN.org][MODULE_DESIGN]])

Other HPI Repositories:

- [[https://github.com/purarue/HPI][purarue/HPI]]
- [[https://github.com/madelinecameron/hpi][madelinecameron/HPI]]

If you want to create your own repository to host modules or override something here, you can use the [[https://github.com/purarue/HPI-template][template]].
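
To make the 'installed into the single ~my~ package' point concrete, a sketch using the repositories listed above (any HPI-style repository installs the same way):

#+begin_src bash
# both packages contribute modules to the same 'my' namespace
python3 -m pip install --user git+https://github.com/karlicoss/HPI
python3 -m pip install --user git+https://github.com/purarue/HPI
#+end_src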

* Related links
:PROPERTIES:
:CUSTOM_ID: links
:END:

Similar projects:

- [[https://hyfen.net/memex][Memex]] by Andrew Louis
- [[https://github.com/novoid/Memacs][Memacs]] by Karl Voit
- [[https://news.ycombinator.com/item?id=9615901][Me API - turn yourself into an open API (HN)]]
- [[https://github.com/markwk/qs_ledger][QS ledger]] from Mark Koester
- [[https://dogsheep.github.io][Dogsheep]]: a collection of tools for personal analytics using SQLite and Datasette
- [[https://github.com/tehmantra/my][tehmantra/my]]: directly inspired by this package
- [[https://github.com/bcongdon/bolero][bcongdon/bolero]]: exposes your personal data as a REST API
- [[https://en.wikipedia.org/wiki/Solid_(web_decentralization_project)#Design][Solid project]]: personal data pod, which websites pull data from
- [[https://remotestorage.io][remoteStorage]]: open protocol for apps to write data to your own storage
- [[https://perkeep.org][Perkeep]]: a tool with [[https://perkeep.org/doc/principles][principles]] and esp. [[https://perkeep.org/doc/uses][use cases]] for self-sovereign storage of personal data
- [[https://www.openhumans.org][Open Humans]]: a community and infrastructure to analyse and share personal data

Other links:

- NetOpWibby: [[https://news.ycombinator.com/item?id=21684949][A Personal API (HN)]]
- [[https://beepb00p.xyz/sad-infra.html][The sad state of personal data and infrastructure]]: here I go into the motivation and the difficulties arising in the implementation
- [[https://beepb00p.xyz/myinfra-roam.html][Extending my personal infrastructure]]: a follow-up, where I demonstrate how to integrate a new data source (Roam Research)

* --
:PROPERTIES:
:CUSTOM_ID: fin
:END:

Open to any feedback and thoughts!

Also, don't hesitate to raise an issue, or reach out to me personally if you want to try using it and find the instructions confusing. Your questions would help me make it simpler!

47
conftest.py
Normal file
@ -0,0 +1,47 @@

# this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly
# without it, pytest can't discover the package root for some reason
# also see https://github.com/karlicoss/pytest_namespace_pkgs for more

import os
import pathlib
from typing import Optional

import _pytest.main
import _pytest.pathlib

# we consider all dirs in repo/ to be namespace packages
root_dir = pathlib.Path(__file__).absolute().parent.resolve()  # / 'src'
assert root_dir.exists(), root_dir

# TODO assert it contains package name?? maybe get it via setuptools..

namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()]

# resolve_package_path is called from _pytest.pathlib.import_path
# takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem
resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path
def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]:
    result = path  # search from the test file upwards
    for parent in result.parents:
        if str(parent) in namespace_pkg_dirs:
            return parent
    if os.name == 'nt':
        # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx
        if path.name == 'conftest.py':
            return resolve_pkg_path_orig(path)
    raise RuntimeError("Couldn't determine path for ", path)
_pytest.pathlib.resolve_package_path = resolve_package_path


# without patching, the orig function returns just a package name for some reason
# (I think it's used as a sort of fallback)
# so we need to point it at the absolute path properly
# not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure..
search_pypath_orig = _pytest.main.search_pypath
def search_pypath(module_name: str) -> str:
    mpath = root_dir / module_name.replace('.', os.sep)
    if not mpath.is_dir():
        mpath = mpath.with_suffix('.py')
        assert mpath.exists(), mpath  # just in case
    return str(mpath)
_pytest.main.search_pypath = search_pypath

28
demo.py
@ -1,29 +1,35 @@

#!/usr/bin/env python3
from subprocess import check_call, DEVNULL
from shutil import copytree, ignore_patterns
import os
from os.path import abspath
from sys import executable as python
from pathlib import Path

my_repo = Path(__file__).absolute().parent


def run() -> None:
    # uses fixed paths; worth it for the sake of demonstration
    # assumes we're in /tmp/my_demo now

    # 1. clone git@github.com:karlicoss/my.git
    copytree(
        my_repo,
        'my_repo',
        symlinks=True,
        ignore=ignore_patterns('.tox*'),  # tox dir might have broken symlinks while tests are running in parallel
    )

    # 2. prepare repositories you'd be using. For this demo we only set up Hypothesis
    tox = 'TOX' in os.environ
    if tox:  # tox doesn't like --user flag
        check_call(f'{python} -m pip install git+https://github.com/karlicoss/hypexport.git'.split())
    else:
        try:
            import hypexport
        except ModuleNotFoundError:
            check_call(f'{python} -m pip install --user git+https://github.com/karlicoss/hypexport.git'.split())

    # 3. prepare some demo Hypothesis data

@ -48,7 +54,7 @@ def run():

    # 4. now we can use it!
    os.chdir(my_repo)

    check_call([python, '-c', '''
import my.hypothesis

pages = my.hypothesis.pages()

@ -106,13 +112,17 @@ def named_temp_dir(name: str):

    """
    Fixed name tmp dir
    """
    import tempfile
    td = Path(tempfile.gettempdir()) / name
    try:
        td.mkdir(exist_ok=False)
        yield td
    finally:
        import os, shutil
        skip_cleanup = 'CI' in os.environ and os.name == 'nt'
        # TODO hmm for some reason cleanup on windows causes AccessError
        if not skip_cleanup:
            shutil.rmtree(str(td))


def main():

doc/CONFIGURING.org

@ -1,5 +1,6 @@

This doc describes the technical decisions behind the HPI configuration system.
It's more of a 'design doc' than a usage guide.
If you just want to know how to set up HPI or configure it, see [[file:SETUP.org][SETUP]].

I feel like it's good to keep the rationales in the documentation,
but happy to [[https://github.com/karlicoss/HPI/issues/46][discuss]] it here.

@ -16,8 +17,6 @@ At the moment, it uses the following config attributes:

Cache is extremely useful to speed up some queries. But it's *optional*, everything should work without it.

I'll refer to this config as *specific* further in the doc, and give examples for each point. Note that they only illustrate the specific requirement, potentially ignoring the other ones.
Now, the requirements as I see them:

@ -41,9 +40,9 @@ Now, the requirements as I see it:

- keeping it overly flexible and powerful means it's potentially less accessible to people less familiar with programming

  But see the further point about keeping it simple. I claim that simple programs look as easy as simple JSON.
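
  As an illustration (a sketch; ~hypothesis.export_path~ is borrowed from the module configs elsewhere in this repo), the Python config

  #+begin_src python
  class hypothesis:
      export_path = '/data/hypothesis/*.json'
  #+end_src

  reads about the same as the equivalent JSON ~{"hypothesis": {"export_path": "/data/hypothesis/*.json"}}~.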

- Python is 'less safe' than a plain JSON/YAML config

  But at the moment the whole thing is running potentially untrusted Python code anyway.
  It's not a tool you're going to install across your organization, run under root privileges, and let employees tweak it.

@ -51,7 +50,7 @@ Now, the requirements as I see it:

Ultimately, you set it up for yourself, and the config has exactly the same permissions as the code you're installing.
Thinking that a plain config would give you more security is deceptive; it's a false sense of security (at this stage of the project).

# TODO I don't mind having JSON/TOML/whatever, but only as an additional interface

I also write more about all this [[https://beepb00p.xyz/configs-suck.html][here]].

@ -294,12 +293,9 @@ Some of TODO rexport?

To some extent, this is an experiment. I'm not sure how much value is in it.

One thing is software (TODO?) libraries that have fairly well defined APIs, so you can reasonably version them.

Another thing is the modules for accessing data, where you'd hopefully have everything backwards compatible.
Maybe in the future.

I'm just not sure, happy to hear people's opinions on this.

12
doc/CONTRIBUTING.org
Normal file
@ -0,0 +1,12 @@

doc in progress

- I don't use automatic code formatters (like =black=)

  I don't mind if you do, e.g. when you're adding new code or formatting some code you modified, but please don't reformat the whole repository or slip in unrelated code style changes.

  In particular I can't stand when formatters mess with vertically aligned code (thus making it less readable!), or conform the code to some arbitrary line length (like 80 symbols).
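
  For instance (an illustrative sketch, modeled on the module tables in this repo's docs), alignment like this is deliberate:

  #+begin_src python
  modules = [
      ('google' , 'my.google.takeout.parser'),
      ('kobo'   , 'my.kobo'                 ),
  ]
  #+end_src

  An auto-formatter would collapse the padding and lose the column structure.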

  Of course reasonable formatting improvements (like fixing obvious typos, missing spaces or too dense code) are welcome.
  And of course, if we end up collaborating a lot on the project, I'm open to discussion if automatic code style is really important to you.

- See [[file:MODULE_DESIGN.org][MODULE_DESIGN.org]] for common practices in HPI
130
doc/DENYLIST.md
Normal file
@ -0,0 +1,130 @@

For code reference, see: [`my.core.denylist.py`](../my/core/denylist.py)

A helper module for defining denylists for sources programmatically (in layman's terms, this lets you remove some particular output from a module you don't want)

Lets you specify a class, an attribute to match on, and a JSON file containing a list of values to deny/filter out

As an example, this will use the `my.ip` module, as filtering incorrect IPs was the original use case for this module:

```python
class IP(NamedTuple):
    addr: str
    dt: datetime
```

A possible denylist file would contain:

```json
[
    {
        "addr": "192.168.1.1"
    },
    {
        "dt": "2020-06-02T03:12:00+00:00"
    }
]
```

Note that if the value being compared to is not a single (non-array/object) JSON primitive (str, int, float, bool, None), it will be converted to a string before comparison

To use this in code:

```python
from my.ip.all import ips
from my.core.denylist import DenyList

filtered = DenyList("~/data/ip_denylist.json").filter(ips())
```

To add items to the denylist, in python (in a one-off script):

```python
from my.ip.all import ips
from my.core.denylist import DenyList

d = DenyList("~/data/ip_denylist.json")

for ip in ips():
    # some custom code you define
    if ip.addr == ...:
        d.deny(key="addr", value=ip.addr)
d.write()
```

... or interactively, which requires [`fzf`](https://github.com/junegunn/fzf) and [`pyfzf-iter`](https://pypi.org/project/pyfzf-iter/) (`python3 -m pip install pyfzf-iter`) to be installed:

```python
from my.ip.all import ips
from my.core.denylist import DenyList

d = DenyList("~/data/ip_denylist.json")
d.deny_cli(ips())  # automatically writes after each selection
```

That will open up an interactive `fzf` prompt, where you can select an item to add to the denylist

This is meant for relatively simple filters, where you want to filter items out based on a single attribute of a namedtuple/dataclass. If you want to do something more complex, I would recommend overriding the `all.py` file for that source and writing your own filter function there.

For more info on all.py:

https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy

This would typically be used in an overridden `all.py` file, or in a one-off script in which you want to filter out some items from a source, progressively adding more items to the denylist as you go.

A potential `my/ip/all.py` file might look like (Sidenote: `discord` module from [here](https://github.com/purarue/HPI)):

```python
from typing import Iterator

from my.ip.common import IP
from my.core.denylist import DenyList

deny = DenyList("~/data/ip_denylist.json")


# all possible data from the source
def _ips() -> Iterator[IP]:
    from my.ip import discord
    # could add other imports here

    yield from discord.ips()


# filtered data
def ips() -> Iterator[IP]:
    yield from deny.filter(_ips())
```

To add items to the denylist, you could create a `__main__.py` in your namespace package (in this case, `my/ip/__main__.py`), with contents like:

```python
from my.ip import all

if __name__ == "__main__":
    all.deny.deny_cli(all.ips())
```

Which could then be called like: `python3 -m my.ip`

Or, you could just run it from the command line:

```
python3 -c 'from my.ip import all; all.deny.deny_cli(all.ips())'
```

To edit the `all.py`, you could either:

- install it as editable (`python3 -m pip install --user -e ./HPI`), and then edit the file directly
- or, create a namespace package, which splits the package across multiple directories. For info on that see [`MODULE_DESIGN`](https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#namespace-packages), [`reorder_editable`](https://github.com/purarue/reorder_editable), and possibly the [`HPI-template`](https://github.com/purarue/HPI-template) to create your own HPI namespace package with its own `all.py` file.

For a real example of this, see [purarue/HPI-personal](https://github.com/purarue/HPI-personal/blob/master/my/ip/all.py)

Sidenote: the reason why we want to specifically override the `all.py` and not just create a script that filters out the items you're not interested in is because we want to be able to import from `my.ip.all` or `my.location.all` from other modules and get the filtered results, without having to mix data filtering logic with parsing/loading/caching (the stuff HPI does)
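
As a follow-up sketch (same names as the example above), the payoff of overriding `all.py` is that downstream consumers keep working unchanged:

```python
# any other module importing the source now transparently gets filtered data
from my.ip.all import ips

for ip in ips():
    ...  # denied entries never show up here
```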
55
doc/DESIGN.org
Normal file
@ -0,0 +1,55 @@

note: this doc is in progress

* main design principles

- interoperable

  # note: this link doesn't work in org, but does for the github preview
  This is the main motivation and [[file:../README.org#why][why]] I created HPI in the first place.

  Ideally it should be possible to hook into anything you can imagine -- regardless of the database/programming language/etc.

  Check out [[https://beepb00p.xyz/myinfra.html#mypkg][my infrastructure map]] to see how I'm using it.

- extensible

  It should be possible for anyone to modify/extend HPI to their own needs, e.g.

  - adding new data providers
  - patching existing ones
  - mixing in custom data sources

  See the guide to [[file:SETUP.org::#addingmodifying-modules][extending/modifying HPI]]

- local first/offline

  The main idea is to work against data on your disk to provide convenient, fast and robust access.
  See [[file:../README.org::#how-does-it-get-input-data]["How does it get input data?"]]

  Although in principle there is nothing wrong if you want to hook it to some online API, it's just python code after all!

- reasonably defensive

  Data is inherently messy, and it's inevitable to get parsing errors and missing fields now and then.

  I'm trying to combat this with [[https://beepb00p.xyz/mypy-error-handling.html][mypy assisted error handling]],
  so you are aware of errors, but can still work with the 'good' subset of data (see the sketch after this list).

- robust

  The code is extensively covered with tests & ~mypy~ to make sure it doesn't rot.
  I also try to keep everything as backwards compatible as possible.

- (almost) no magic

  While I do use Python's dynamic features where it's inevitable or too convenient, I try to keep everything as close to standard Python as possible.

  This allows it to:

  - be at least as extensible as other Python software
  - use mature tools like =pip= or =mypy=
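
A minimal sketch of that error-handling pattern (illustrative only; ~Item~, ~parse~ and the input are hypothetical stand-ins, not actual HPI helpers):

#+begin_src python
from typing import Iterator, Union

class Item:  # hypothetical parsed-data type
    def __init__(self, raw: str) -> None:
        self.raw = raw

Res = Union[Item, Exception]  # each entry: parsed data, or the error it raised

def parse(raw: str) -> Item:  # hypothetical parser; raises on bad input
    if not raw:
        raise ValueError('empty entry')
    return Item(raw)

def items(raw_entries: Iterator[str]) -> Iterator[Res]:
    for raw in raw_entries:
        try:
            yield parse(raw)
        except Exception as e:
            # yield the error instead of crashing, so the consumer can
            # log/filter it and keep working with the 'good' subset of data
            yield e
#+end_src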

* other docs
- [[file:CONFIGURING.org][some decisions around HPI configuration 'system']]
- [[file:MODULE_DESIGN.org][some thoughts on the modules, their design, and adding new ones]]

@ -26,6 +26,11 @@ However, Pycharm/Emacs or whatever IDE you are using won't be able to figure tha

i.e. create a new interpreter configuration (e.g. name it "Python 3.7 (for HPI)"), and add =~/.config/my=.

* Linting
~tox~ should run all tests, mypy, etc.

If you want to run some specific parts/tests, consult [[file:tox.ini]].

Some useful flags (look them up):

- ~-e~ flag for tox
- ~-k~ flag for pytest

208
doc/MODULES.org
@ -4,7 +4,7 @@ There are many more, see:

- [[file:../README.org::#whats-inside]["What's inside"]] for the full list of modules.
- you can also run =hpi modules= to list what's available on your system
- [[https://github.com/karlicoss/HPI][source code]] is always the primary source of truth

If you have some issues with the setup, see [[file:SETUP.org::#troubleshooting]["Troubleshooting"]].

@ -16,14 +16,17 @@ If you have some issues with the setup, see [[file:SETUP.org::#troubleshooting][

- [[#toc][TOC]]
- [[#intro][Intro]]
- [[#configs][Configs]]
- [[#mygoogletakeoutparser][my.google.takeout.parser]]
- [[#myhypothesis][my.hypothesis]]
- [[#myreddit][my.reddit]]
- [[#mybrowser][my.browser]]
- [[#mylocation][my.location]]
- [[#mytimetzvia_location][my.time.tz.via_location]]
- [[#mypocket][my.pocket]]
- [[#mytwittertwint][my.twitter.twint]]
- [[#mytwitterarchive][my.twitter.archive]]
- [[#mylastfm][my.lastfm]]
- [[#mypolar][my.polar]]
- [[#myinstapaper][my.instapaper]]
- [[#mygithubgdpr][my.github.gdpr]]
- [[#mygithubghexport][my.github.ghexport]]

@ -55,11 +58,147 @@ Some explanations:

- if the field has a default value, you can omit it from your private config altogether

For more thoughts on modules and their structure, see [[file:MODULE_DESIGN.org][MODULE_DESIGN]]

* all.py

Some modules have lots of different sources for data. For example, ~my.location~ (location data) has lots of possible sources -- from ~my.google.takeout.parser~, using the ~gpslogger~ android app, or through geolocating ~my.ip~ addresses. If you only plan on using one of the modules, you can just import from the individual module (e.g. ~my.google.takeout.parser~), or you can disable the others using the ~core~ config (see the sketch below) -- see the [[https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy][MODULE_DESIGN]] docs for more details.
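
A sketch of that ~core~ config option (the pattern is spelled out in MODULE_DESIGN; the module name here is just an example):

#+begin_src python
class core:
    # sources listed here are skipped by *.all modules without warnings
    disabled_modules = (
        "my.location.via_ip",
    )
#+end_src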

* Configs

The config snippets below are meant to be modified accordingly and *pasted into your private configuration*, e.g =$MY_CONFIG/my/config.py=.

You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.

For an extensive/complex example, you can check out ~@purarue~'s [[https://github.com/purarue/dotfiles/blob/master/.config/my/my/config/__init__.py][config]]

# Nested Configurations before the doc generation using the block below
** [[file:../my/reddit][my.reddit]]

Reddit data: saved items/comments/upvotes/etc.

# Note: can't be generated as easily since this is a nested configuration object
#+begin_src python
class reddit:
    class rexport:
        '''
        Uses [[https://github.com/karlicoss/rexport][rexport]] output.
        '''

        # path[s]/glob to the exported JSON data
        export_path: Paths

    class pushshift:
        '''
        Uses [[https://github.com/purarue/pushshift_comment_export][pushshift]] to get access to old comments
        '''

        # path[s]/glob to the exported JSON data
        export_path: Paths
#+end_src

** [[file:../my/browser/][my.browser]]

Parses browser history using [[http://github.com/purarue/browserexport][browserexport]]

#+begin_src python
class browser:
    class export:
        # path[s]/glob to your backed up browser history sqlite files
        export_path: Paths

    class active_browser:
        # paths to sqlite database files which you use actively
        # to read from. For example:
        # from browserexport.browsers.all import Firefox
        # export_path = Firefox.locate_database()
        export_path: Paths
#+end_src

** [[file:../my/location][my.location]]

Merged location history from lots of sources.

The main sources here are [[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (XML) files, and google takeout (using =my.google.takeout.parser=), with a fallback on manually defined home locations.

You might also be able to use [[file:../my/location/via_ip.py][my.location.via_ip]], which uses =my.ip.all= to provide geolocation data for IPs (though no IPs are provided from any of the sources here). For an example of usage, see [[https://github.com/purarue/HPI/tree/master/my/ip][here]]

#+begin_src python
class location:
    home = (
        # supports ISO strings
        ('2005-12-04'                                       , (42.697842, 23.325973)), # Bulgaria, Sofia
        # supports date/datetime objects
        (date(year=1980, month=2, day=15)                   , (40.7128  , -74.0060 )), # NY
        (datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558  , 37.6173  )), # Moscow, Russia
    )
    # note: order doesn't matter, will be sorted in the data provider

    class gpslogger:
        # path[s]/glob to the exported gpx files
        export_path: Paths

        # default accuracy for gpslogger
        accuracy: float = 50.0

    class via_ip:
        # guess ~15km accuracy for IP addresses
        accuracy: float = 15_000
#+end_src

** [[file:../my/time/tz/via_location.py][my.time.tz.via_location]]

Uses the =my.location= module to determine the timezone for a location.

This can be used to 'localize' timezones. Most modules here return datetimes in UTC, to prevent confusion about whether a datetime is in your local timezone, UTC, or some other timezone.

Depending on the specific data provider and your level of paranoia, you might expect different behaviour. E.g.:
- if your objects already have tz info, you might not need to call localize() at all
- it's safer when either all of your objects are tz aware or all are tz unaware, not a mixture
- you might trust your original timezone, or it might just be UTC, and you want to use something more reasonable

#+begin_src python
TzPolicy = Literal[
    'keep'   , # if datetime is tz aware, just preserve it
    'convert', # if datetime is tz aware, convert to provider's tz
    'throw'  , # if datetime is tz aware, throw exception
]
#+end_src

This is still a work in progress; the plan is to integrate it with =hpi query= so that you can easily convert/localize timezones for some module/data

#+begin_src python
class time:
    class tz:
        policy = 'keep'

        class via_location:
            # less precise, but faster
            fast: bool = True

            # sort locations by date
            # in case multiple sources provide them out of order
            sort_locations: bool = True

            # if the accuracy for the location is more than 5km (this
            # isn't an accurate location, so shouldn't use it to determine
            # timezone), don't use
            require_accuracy: float = 5_000
#+end_src


# TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.

@ -70,18 +209,17 @@ import importlib

# from lint import all_modules # meh
# TODO figure out how to discover configs automatically...
modules = [
    ('google'         , 'my.google.takeout.parser'),
    ('hypothesis'     , 'my.hypothesis'           ),
    ('pocket'         , 'my.pocket'               ),
    ('twint'          , 'my.twitter.twint'        ),
    ('twitter_archive', 'my.twitter.archive'      ),
    ('lastfm'         , 'my.lastfm'               ),
    ('polar'          , 'my.polar'                ),
    ('instapaper'     , 'my.instapaper'           ),
    ('github'         , 'my.github.gdpr'          ),
    ('github'         , 'my.github.ghexport'      ),
    ('kobo'           , 'my.kobo'                 ),
]

def indent(s, spaces=4):

@ -116,14 +254,29 @@ for cls, p in modules:

#+RESULTS:

** [[file:../my/google/takeout/parser.py][my.google.takeout.parser]]

Parses Google Takeout using [[https://github.com/purarue/google_takeout_parser][google_takeout_parser]]

See [[https://github.com/purarue/google_takeout_parser][google_takeout_parser]] for more information about how to export and organize your takeouts

If the =DISABLE_TAKEOUT_CACHE= environment variable is set, this won't cache individual exports in =~/.cache/google_takeout_parser=

The directory set as takeout_path can be unpacked directories, or zip files of the exports, which are temporarily unpacked while creating the cachew cache

#+begin_src python
class google(user_config):
    # directory which includes unpacked/zipped takeouts
    takeout_path: Paths

    error_policy: ErrorPolicy = 'yield'

    # experimental flag to use core.kompress.ZipPath
    # instead of unpacking to a tmp dir via match_structure
    _use_zippath: bool = False
#+end_src
** [[file:../my/hypothesis.py][my.hypothesis]]

@ -138,19 +291,6 @@ for cls, p in modules:

# paths[s]/glob to the exported JSON data
export_path: Paths
#+end_src
** [[file:../my/pocket.py][my.pocket]]

[[https://getpocket.com][Pocket]] bookmarks and highlights

@ -195,7 +335,7 @@ for cls, p in modules:

"""
export_path: Paths
#+end_src
** [[file:../my/polar.py][my.polar]]

[[https://github.com/burtonator/polar-bookshelf][Polar]] articles and highlights

331
doc/MODULE_DESIGN.org
Normal file
@ -0,0 +1,331 @@
|
||||||
|
Some thoughts on modules, how to structure them, and adding your own/extending HPI
|
||||||
|
|
||||||
|
This is slightly more advanced, and would be useful if you're trying to extend HPI by developing your own modules, or contributing back to HPI
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
:PROPERTIES:
|
||||||
|
:TOC: :include all :depth 1 :force (nothing) :ignore (this) :local (nothing)
|
||||||
|
:END:
|
||||||
|
:CONTENTS:
|
||||||
|
- [[#allpy][all.py]]
|
||||||
|
- [[#module-count][module count]]
|
||||||
|
- [[#single-file-modules][single file modules]]
|
||||||
|
- [[#adding-new-modules][Adding new modules]]
|
||||||
|
- [[#an-extendable-module-structure][An Extendable module structure]]
|
||||||
|
- [[#logging-guidelines][Logging guidelines]]
|
||||||
|
:END:
|
||||||
|
|
||||||
|
* all.py
|
||||||
|
|
||||||
|
Some modules have lots of different sources for data. For example, ~my.location~ (location data) has lots of possible sources -- from ~my.google.takeout.parser~, using the ~gpslogger~ android app, or through geo locating ~my.ip~ addresses. For a module with multiple possible sources, its common to split it into files like:
|
||||||
|
|
||||||
|
#+begin_src
|
||||||
|
my/location
|
||||||
|
├── all.py -- specifies all possible sources/combines/merges data
|
||||||
|
├── common.py -- defines shared code, e.g. to merge data from across entries, a shared model (namedtuple/dataclass) or protocol
|
||||||
|
├── google_takeout.py -- source for data using my.google.takeout.parser
|
||||||
|
├── gpslogger.py -- source for data using gpslogger
|
||||||
|
├── home.py -- fallback source
|
||||||
|
└── via_ip.py -- source using my.ip
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
Its common for each of those sources to have their own file, like ~my.location.google_takeout~, ~my.location.gpslogger~ and ~my.location.via_ip~, and then they all get merged into a single function in ~my.location.all~, like:
|
||||||
|
|
||||||
|
#+begin_src python
|
||||||
|
from .common import Location
|
||||||
|
|
||||||
|
def locations() -> Iterator[Location]:
|
||||||
|
# can add/comment out sources here to enable/disable them
|
||||||
|
yield from _takeout_locations()
|
||||||
|
yield from _gpslogger_locations()
|
||||||
|
|
||||||
|
|
||||||
|
@import_source(module_name="my.location.google_takeout")
|
||||||
|
def _takeout_locations() -> Iterator[Location]:
|
||||||
|
from . import google_takeout
|
||||||
|
yield from google_takeout.locations()
|
||||||
|
|
||||||
|
|
||||||
|
@import_source(module_name="my.location.gpslogger")
|
||||||
|
def _gpslogger_locations() -> Iterator[Location]:
|
||||||
|
from . import gpslogger
|
||||||
|
yield from gpslogger.locations()
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
If you want to disable a source, you have a few options.
|
||||||
|
|
||||||
|
- If you're using a local editable install or just want to quickly troubleshoot, you can just comment out the line in the ~locations~ function
|
||||||
|
- Since these are decorated behind ~import_source~, they automatically catch import/config errors, so instead of fatally erroring and crashing if you don't have a module setup, it'll warn you and continue to process the other sources. To get rid of the warnings, you can add the module you're not planning on using to your core config, like:
|
||||||
|
|
||||||
|
#+begin_src python
|
||||||
|
class core:
|
||||||
|
disabled_modules = (
|
||||||
|
"my.location.gpslogger",
|
||||||
|
"my.location.via_ip",
|
||||||
|
)
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
... that suppresses the warning message and lets you use ~my.location.all~ without having to change any lines of code
|
||||||
|
|
||||||
|
Another benefit is that all the custom sources/data is localized to the ~all.py~ file, so a user can override the ~all.py~ (see the sections below on ~namespace packages~) file in their own HPI repository, adding additional sources without having to maintain a fork and patching in changes as things eventually change. For a 'real world' example of that, see [[https://github.com/purarue/HPI#partially-in-usewith-overrides][purarue]]s location and ip modules.
|
||||||
|
|
||||||
|
This is of course not required for personal or single file modules, its just the pattern that seems to have the least amount of friction for the user, while being extendable, and without using a bulky plugin system to let users add additional sources.
|
||||||
|
|
||||||
|
Another common way an ~all.py~ file is used is to merge data from a periodic export, and a GDPR export (e.g. see the ~stackexchange~, or ~github~ modules)
|
||||||
|
|
||||||
|
* module count
|
||||||
|
|
||||||
|
Having way too many modules could end up being an issue. For now, I'm basically happy to merge new modules - With the current module count, things don't seem to break much, and most of them are modules I use myself, so they get tested with my own data.
|
||||||
|
|
||||||
|
For services I don't use, I would prefer if they had tests/example data somewhere, else I can't guarantee they're still working...
|
||||||
|
|
||||||
|
Its great if when you start using HPI, you get a few modules 'for free' (perhaps ~github~ and ~reddit~), but its likely not everyone uses the same services
|
||||||
|
|
||||||
|
This shouldn't end up becoming a monorepo (a la [[https://www.spacemacs.org/][Spacemacs]]) with hundreds of modules supporting every use case. Its hard to know what the common usecase is for everyone, and new services/companies which silo your data appear all the time...
|
||||||
|
|
||||||
|
Its also not obvious how people want to access their data. This problem is often mitigated by the output of HPI being python functions -- one can always write a small script to take the output data from a module and wrangle it into some format you want
|
||||||
|
|
||||||
|
This is why HPI aims to be as extendable as possible. If you have some programming know-how, hopefully you're able to create some basic modules for yourself - plug in your own data and gain the benefits of using the functions in ~my.core~, the configuration layer and possibly libraries like [[https://github.com/karlicoss/cachew][cachew]] to 'automatically' cache your data
|
||||||
|
|
||||||
|
In some ways it may make sense to think of HPI as akin to emacs or a ones 'dotfiles'. This provides a configuration layer and structure for you to access your data, and you can extend it to your own use case.
|
||||||
|
|
||||||
|
* single file modules
|
||||||
|
|
||||||
|
... or, the question 'should we split code from individual HPI files into setuptools packages'
|
||||||
|
|
||||||
|
It's possible for a single HPI module or file to handle *everything*. Most of the python files in ~my/~ are 'single file' modules
|
||||||
|
|
||||||
|
By everything, I mean:
|
||||||
|
|
||||||
|
- Exporting data from an API/locating data on your disk/maybe saving data so you don't lose it
|
||||||
|
- Parsing data from some raw (JSON/SQLite/HTML) format
|
||||||
|
- Merging different data sources into some common =NamedTuple=-like schema
|
||||||
|
- caching expensive computation/merge results
|
||||||
|
- configuration through ~my.config~
|
||||||
|
|
||||||
|
For short modules which aren't that complex, while developing your own personal modules, or while bootstrapping modules - this is actually fine.
|
||||||
|
|
||||||
|
From a users perspective, the ability to clone and install HPI as editable, add an new python file into ~my/~, and it immediately be accessible as ~my.modulename~ is a pattern that should always be supported

However, as modules get more and more complex, especially if they include backing up/locating data from some location on your filesystem or interacting with a live API -- ideally they should be split off into their own repositories. There are trade-offs to doing this, but they are typically worth it.

As an example of this, take a look at the [[https://github.com/karlicoss/HPI/tree/5ef277526577daaa115223e79a07a064ffa9bc85/my/github][my.github]] module and the corresponding [[https://github.com/karlicoss/ghexport][ghexport]] data exporter which saves github data.

- Pros:
  - This allows someone to install and use ~ghexport~ without having to set up HPI at all -- it's a standalone tool, which means there's less barrier to entry
  - It being a separate repository means issues relating to exporting data and the [[https://beepb00p.xyz/exports.html#dal][DAL]] (loading the data) can be handled there, instead of in HPI
  - This reduces complexity for someone looking at the ~my.github~ files trying to debug issues related to HPI. The functionality for ~ghexport~ can be tested independently of someone new to HPI trying to debug a configuration issue
  - It's easier to combine additional data sources, like ~my.github.gdpr~, which includes additional data from the GDPR export

- Cons:
  - It leads to some code duplication, as you can no longer use helper functions from ~my.core~ in the new repository
  - Additional boilerplate -- instructions, installation scripts, testing. It's not required, but typically you want to leverage ~setuptools~ to allow ~pip install git+https...~ type installs, which are used in ~hpi module install~
  - It's difficult to convert to a namespace module/directory down the road

Not all HPI modules are currently at that level of complexity -- some are simple enough that one can understand the file by just reading it top to bottom. Some wouldn't make sense to split off into separate modules for one reason or another.

A related concern is how to structure namespace packages to allow users to easily extend them, and how this conflicts with single file modules (keep reading below for more information on namespace packages/extension). If a module is converted from a single file module to a namespace with multiple files, that is a breaking change -- see [[https://github.com/karlicoss/HPI/issues/89][#89]] for an example. The current workaround is to leave it a regular python package with an =__init__.py= for some amount of time, sending a deprecation warning, and then eventually remove the =__init__.py= file to convert it into a namespace package. For an example, see the [[https://github.com/karlicoss/HPI/blob/8422c6e420f5e274bd1da91710663be6429c666c/my/reddit/__init__.py][reddit init file]].

It's quite a pain to have to convert a file from a single file module to a namespace module, so if there's *any* possibility that you might convert it to a namespace package, you might as well just start it off as one, to avoid the pain down the road. As an example, say you were creating something to parse ~zsh~ history. Instead of creating ~my/zsh.py~, it would be better to create ~my/zsh/parser.py~. That lets users override the file using editable/namespace packages, and it also means in the future it's much easier to extend it to something like:

#+begin_src
my/zsh
├── all.py -- e.g. combined/unique/sorted zsh history
├── aliases.py -- parse zsh alias files
├── common.py -- shared models/merging code
├── compdump.py -- parse zsh compdump files
└── parser.py -- parse individual zsh history files
#+end_src

There's no requirement to follow this entire structure when you start off; the entire module could live in ~my/zsh/parser.py~, including all the merging/parsing/locating code. It just avoids the trouble in the future, and the only downside is having to type a bit more when importing from it.
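
To make that concrete, a minimal sketch of what ~my/zsh/parser.py~ could start out as (the =Entry= schema is made up, and this assumes zsh's =EXTENDED_HISTORY= format, where lines look like =: 1612345678:0;ls -la=):

#+begin_src python
from datetime import datetime
from pathlib import Path
from typing import Iterator, NamedTuple


class Entry(NamedTuple):
    dt: datetime
    command: str


def parse_file(histfile: Path) -> Iterator[Entry]:
    for line in histfile.read_text(errors='surrogateescape').splitlines():
        if not line.startswith(': '):
            continue
        # strip the leading ': ', then split the '<timestamp>:<duration>' prefix from the command
        meta, _, cmd = line[2:].partition(';')
        timestamp = int(meta.split(':')[0])
        yield Entry(dt=datetime.fromtimestamp(timestamp), command=cmd)
#+end_src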

#+html: <div id="addingmodules"></div>

* Adding new modules

As always, if the changes you wish to make are small, or you just want to add a few modules, you can clone and edit an editable install of HPI. See [[file:SETUP.org][SETUP]] for more information.

The "proper way" (unless you want to contribute to the upstream) is to create a separate file hierarchy and add your module to =PYTHONPATH= (or use 'editable namespace packages' as described below, which also modifies your computed ~sys.path~).

# TODO link to 'overlays' documentation?
You can check my own [[https://github.com/karlicoss/hpi-personal-overlay][personal overlay]] as a reference.

For example, if you want to add an =awesomedatasource=, it could be:

: custom_module
: └── my
:     └── awesomedatasource.py

You can use all existing HPI modules in =awesomedatasource.py=, including =my.config= and everything from =my.core=.
=hpi modules= or =hpi doctor= commands should also detect your extra modules.
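
For instance (assuming =custom_module= has been added to your =PYTHONPATH=), something like this should pick the new module up:

: hpi modules | grep awesomedatasource
: hpi doctor my.awesomedatasource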

- In addition, you can *override* the builtin HPI modules too:

: custom_lastfm_overlay
: └── my
:     └── lastfm.py

Now if you add =custom_lastfm_overlay= [[https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH][*in front* of ~PYTHONPATH~]], all the downstream scripts using =my.lastfm= will load it from =custom_lastfm_overlay= instead.
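
For instance (assuming the overlay lives at =/path/to/custom_lastfm_overlay=):

: PYTHONPATH="/path/to/custom_lastfm_overlay:$PYTHONPATH" python3 -c 'import my.lastfm'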

This could be useful to monkey patch some behaviours, or dynamically add some extra data sources -- anything that comes to your mind.
You can check [[https://github.com/karlicoss/hpi-personal-overlay/blob/7fca8b1b6031bf418078da2d8be70fd81d2d8fa0/src/my/calendar/holidays.py#L1-L14][my.calendar.holidays]] in my personal overlay as a reference.

** Namespace Packages

Note: this section covers some of the complexities and benefits of this being a namespace package and/or editable install, so it assumes some familiarity with python/imports.

HPI is installed as a namespace package, which allows an additional way to add your own modules. For the details on namespace packages, see [[https://www.python.org/dev/peps/pep-0420/][PEP420]], or the [[https://packaging.python.org/guides/packaging-namespace-packages][packaging docs for a summary]], but for our use case, a sufficient description might be: namespace packages let you split a package across multiple directories on disk.

Instead of adding a bulky/boilerplate-y plugin framework to HPI, which would increase the barrier to entry, [[https://packaging.python.org/guides/creating-and-discovering-plugins/#using-namespace-packages][namespace packages offer an alternative]] with few downsides.

Creating a separate file hierarchy still allows you to keep up to date with any changes from this repository by periodically running ~git pull~ on your local clone of HPI (assuming you've installed it as an editable package (~pip install -e .~)), while creating your own modules, and possibly overwriting any files you wish to override/overlay.

In order to do that, as stated above, you could edit the ~PYTHONPATH~ variable, which in turn modifies your computed ~sys.path~, which is how python [[https://docs.python.org/3/library/sys.html?highlight=pythonpath#sys.path][determines the search path for modules]]. This is sort of what [[file:../with_my][with_my]] allows you to do.

In the context of HPI, it being a namespace package means you can have a local clone of this repository, and your own 'HPI' modules in a separate folder, which then get combined into the ~my~ package.

As an example, say you were trying to override the ~my.lastfm~ file, to include some new feature. You could create a new file hierarchy like:

: .
: ├── my
: │   ├── lastfm.py
: │   └── some_new_module.py
: └── setup.py

Where ~lastfm.py~ is your version of ~my.lastfm~, which you've copied from this repository and applied your changes to. The ~setup.py~ would be something like:

#+begin_src python
from setuptools import setup, find_namespace_packages

# should use a different name,
# so it's possible to differentiate between HPI installs
setup(
    name="my-HPI-overlay",
    zip_safe=False,
    packages=find_namespace_packages(".", include=("my*",)),
)
#+end_src

Then, running ~python3 -m pip install -e .~ in that directory would install that as part of the namespace package, and assuming (see below for possible issues) this appears on ~sys.path~ before the upstream repository, your ~lastfm.py~ file overrides the upstream. Adding more files, like ~my.some_new_module~, into that directory immediately updates the global ~my~ package -- allowing you to quickly add new modules without having to re-install.

If you install both directories as editable packages (which has the benefit of any changes you make in either repository immediately updating the globally installed ~my~ package), there are some concerns about which editable install appears on your ~sys.path~ first. If you wanted your modules to override the upstream modules, yours would have to appear on the ~sys.path~ first (this is the same reason that =custom_lastfm_overlay= must be at the front of your ~PYTHONPATH~). For more details and examples on dealing with editable namespace packages in the context of HPI, see the [[https://github.com/purarue/reorder_editable][reorder_editable]] repository.
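
A quick way to check the resolution order is to print the combined package's ~__path__~ -- your overlay's directory should come first:

: python3 -c 'import my; print(my.__path__)'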

There is no limit to how many directories you could install into a single namespace package, which could be a possible way for people to install additional HPI modules without worrying about the module count here becoming too large to manage.

There are some other users [[https://github.com/hpi/hpi][who have begun publishing their own modules]] as namespace packages, which you could potentially install and use, in addition to this repository, if any of those interest you. If you want to create your own, you can use the [[https://github.com/purarue/HPI-template][template]] to get started.

Though, enabling this many modules may make ~hpi doctor~ look pretty busy. You can explicitly choose to enable/disable modules with a list of modules/regexes in your [[https://github.com/karlicoss/HPI/blob/f559e7cb899107538e6c6bbcf7576780604697ef/my/core/core_config.py#L24-L55][core config]], see [[https://github.com/purarue/dotfiles/blob/a1a77c581de31bd55a6af3d11b8af588614a207e/.config/my/my/config/__init__.py#L42-L72][here]] for an example.
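
As a rough sketch of what that could look like in your =my.config= (the module names are made up; see the links above for real examples and the exact semantics):

#+begin_src python
class core:
    enabled_modules = [
        'my.github.ghexport',
        'my.browser.*',  # regexes are allowed
    ]
    disabled_modules = [
        'my.stackexchange.*',
    ]
#+end_src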

You may use the other modules or [[https://github.com/karlicoss/hpi-personal-overlay][my overlay]] as reference, but python packaging is already a complicated issue, before adding complexities like namespace packages and editable installs on top of it... If you're having trouble extending HPI in this fashion, you can open an issue here, preferably with a link to your code/repository and/or the ~setup.py~ you're trying to use.

* An Extendable module structure

In this context, 'overlay'/'override' means you create your own namespace package/file structure like described above, and since your files are in front of the upstream repository files in the computed ~sys.path~ (either by using namespace modules, the ~PYTHONPATH~ or ~with_my~), your file overrides the one from the upstream repository.

Related issues: [[https://github.com/karlicoss/HPI/issues/102][#102]], [[https://github.com/karlicoss/HPI/issues/89][#89]], [[https://github.com/karlicoss/HPI/issues/154][#154]]

The main goals are:

- low effort: ideally it should be a matter of a few lines of code to override something.
- good interop: e.g. the ability to keep up with the upstream, use modules coming from separate repositories, etc.
- ideally mypy friendly. This kind of means 'not too dynamic and magical', which is ultimately a good thing even if you don't care about mypy.

~all.py~ using modules/sources behind ~import_source~ is the solution we've arrived at in HPI (see the sketch after this list), because it meets all of these goals:

- it doesn't require an additional plugin system, it's just python imports and namespace packages
- it's generally mypy friendly (the only exception is the ~import_source~ decorator, but that typically returns nothing if the import failed)
- it doesn't require you to maintain a fork of this repository, though you can maintain a separate HPI repository (so no patching/merge conflicts)
- it allows you to easily add/remove sources to the ~all.py~ module, either by:
  - overriding an ~all.py~ in your own repository
  - just commenting out the source/adding 2 lines to import and ~yield from~ your new source
  - doing nothing! (~import_source~ will catch the error, just warn you, and continue to work without changing any code)
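
As a sketch of that pattern, continuing the ~zsh~ example from above and modeled on how modules like ~my.reddit.all~ use ~import_source~ (the =Entry= schema and =merge_entries= helper are hypothetical):

#+begin_src python
# my/zsh/all.py
from typing import Iterator

from my.core.source import import_source

from .common import Entry, merge_entries

src_parser = import_source(module_name='my.zsh.parser')


# if my.zsh.parser fails to import (e.g. missing config or dependencies),
# import_source warns the user and this source simply yields nothing
@src_parser
def _parser_history() -> Iterator[Entry]:
    from . import parser

    yield from parser.history()


def history() -> Iterator[Entry]:
    yield from merge_entries(_parser_history())
#+end_src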

It could be argued that namespace packages and editable installs are a bit complex for a new user to get the hang of, and this is true. But fortunately, ~import_source~ means any user just using HPI only needs to follow the instructions when a warning is printed, or peruse the docs here a bit -- there's no need to clone or create your own override just to use the ~all.py~ file.

There's no requirement to use this for individual modules, it just seems to be the best solution we've arrived at so far.

* Logging guidelines

HPI doesn't enforce any specific logging mechanism; you're free to use whatever you prefer in your modules.

However, there are some general guidelines for developing modules that can make them more pleasant to use.

- each module should have its own unique logger; the easiest way to ensure that is to simply use the module's ~__name__~ attribute as the logger name

  In addition, this ensures the logger hierarchy reflects the package hierarchy.
  For instance, if you initialize the logger for =my.module= with specific settings, the logger for =my.module.helper= would inherit these settings. See more on that [[https://docs.python.org/3/library/logging.html?highlight=logging#logger-objects][in python docs]].

  As a bonus, if you use the module ~__name__~, this logger will automatically be picked up and used by ~cachew~.

- often modules are processing multiple files, extracting data from each one ([[https://beepb00p.xyz/exports.html#types][incremental/synthetic exports]])

  It's nice to log each file name you're processing as =logger.info= so the user of the module gets a sense of progress.
  If possible, add the index of the file you're processing and the total count.

  #+begin_src python
  def process_all_data():
      paths = inputs()
      total = len(paths)
      width = len(str(total))
      for idx, path in enumerate(paths):
          # :>{width} to align the logs vertically
          logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
          yield from process_path(path)
  #+end_src

  If there is a lot of logging happening related to a specific path, instead of adding the path to each logging message manually, consider using [[https://docs.python.org/3/library/logging.html?highlight=loggeradapter#logging.LoggerAdapter][LoggerAdapter]].
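
  A minimal sketch of that, using only the stdlib (=PathAdapter= is a made up name):

  #+begin_src python
  import logging


  class PathAdapter(logging.LoggerAdapter):
      def process(self, msg, kwargs):
          # prefix every message with the path this adapter was created for
          return f"[{self.extra['path']}] {msg}", kwargs


  logger = logging.getLogger(__name__)


  def process_path(path):
      plogger = PathAdapter(logger, {'path': path})
      plogger.info('starting')  # logs: [<path>] starting
  #+end_src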

- log exceptions, but sparingly

  Generally it's a good practice to call ~logging.exception~ from the ~except~ clause, so it's immediately visible where the errors are happening.

  However, in HPI, instead of crashing on exceptions we often behave defensively and ~yield~ them instead (see [[https://beepb00p.xyz/mypy-error-handling.html][mypy assisted error handling]]).

  In that case logging every exception may become a bit spammy, so use exception logging sparingly.
  Typically it's best to rely on the downstream data consumer to handle the exceptions properly.

- instead of =logging.getLogger=, it's best to use =my.core.make_logger=

  #+begin_src python
  from my.core import make_logger

  logger = make_logger(__name__)

  # or to set a custom level
  logger = make_logger(__name__, level='warning')
  #+end_src

  This sets up some nicer defaults over the standard =logging= module:

  - colored logs (via the =colorlog= library)
  - =INFO= as the initial logging level (instead of the default =WARNING=)
  - logging the full exception trace even when logging outside of the exception handler

    This is particularly useful for [[https://beepb00p.xyz/mypy-error-handling.html][mypy assisted error handling]].
    By default, =logging= only logs the exception message (without the trace) in this case, which makes errors harder to debug.

  - control over the logging level from the shell via the ~LOGGING_LEVEL_*~ env variable

    This can be useful to suppress logging output if it's too spammy, or to show more output for debugging.

    E.g. ~LOGGING_LEVEL_my_instagram_gdpr=DEBUG hpi query my.instagram.gdpr.messages~

  - experimental: passing the env variable ~LOGGING_COLLAPSE=<loglevel>~ will "collapse" logging with the same level

    Instead of printing a new logging line each time, it will 'redraw' the last logged line with a new logging message.

    This can be convenient if there are too many logs and you just need logging to get a sense of progress.

  - experimental: passing the env variable ~ENLIGHTEN_ENABLE=yes~ will display TUI progress bars in some cases

    See [[https://github.com/Rockhopper-Technologies/enlighten#readme][https://github.com/Rockhopper-Technologies/enlighten#readme]]

    This can be convenient for showing the progress of parallel processing of different files from HPI:

    #+BEGIN_EXAMPLE
    ghexport.dal[111] 29%|████████████████████ | 29/100 [00:03<00:07, 10.03 files/s]
    rexport.dal[comments] 17%|████████ | 115/682 [00:03<00:14, 39.15 files/s]
    my.instagram.android 0%|▎ | 3/2631 [00:02<34:50, 1.26 files/s]
    #+END_EXAMPLE

doc/OVERLAYS.org (new file, 322 lines)

NOTE this kinda overlaps with [[file:MODULE_DESIGN.org][the module design doc]], should be unified in the future.

Relevant discussion about overlays: https://github.com/karlicoss/HPI/issues/102

# This is describing TODO
# TODO goals
# - overrides
# - proper mypy support
# - TODO reusing parent modules?

# You can see them TODO in overlays dir

Consider a toy package/module structure with minimal code, without any actual data parsing, just for demonstration purposes.

- =main= package structure
  # TODO do links

  - =my/twitter/gdpr.py=
    Extracts Twitter data from the GDPR archive.
  - =my/twitter/all.py=
    Merges twitter data from multiple sources (only =gdpr= in this case), so data consumers are agnostic of the specific data sources used.
    This will be overridden by =overlay=.
  - =my/twitter/common.py=
    Contains helper functions to merge data, so they can be reused by the overlay's =all.py=.
  - =my/reddit.py=
    Extracts Reddit data -- this won't be overridden by the overlay, we just keep it for demonstration purposes.

- =overlay= package structure

  - =my/twitter/talon.py=
    Extracts Twitter data from the Talon android app.
  - =my/twitter/all.py=
    Override for =all.py= from the =main= package -- it merges together data from the =gdpr= and =talon= modules (sketched just below).
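
For reference, the overlay's =all.py= is presumably something like this (a reconstruction based on the runtime and mypy output further below, so treat it as a sketch):

#+begin_src python
print(f'[overlay] {__name__} hello')

from .common import merge

def tweets() -> list[str]:
    from . import gdpr
    from . import talon
    return merge(gdpr, talon)
#+end_src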

# TODO mention resolution? reorder_editable

* Installing (editable install)

NOTE: this was tested with =python 3.10= and =pip 23.3.2=.

To install, we run:

: pip3 install --user -e overlay/
: pip3 install --user -e main/

# TODO mention non-editable installs (this bit will still work with non-editable install)

As a result, we get:

: pip3 list | grep hpi
: hpi-main 0.0.0 /project/main/src
: hpi-overlay 0.0.0 /project/overlay/src

: cat ~/.local/lib/python3.10/site-packages/easy-install.pth
: /project/overlay/src
: /project/main/src

(the order above is important, so =overlay= takes precedence over =main= TODO link)

Verify the setup:

: $ python3 -c 'import my; print(my.__path__)'
: _NamespacePath(['/project/overlay/src/my', '/project/main/src/my'])

This basically means that modules will be searched in both paths, with the overlay taking precedence.

** Installing with =--use-pep517=

See here for discussion: https://github.com/purarue/reorder_editable/issues/2, but TLDR it should work similarly.

* Testing runtime behaviour (editable install)

: $ python3 -c 'import my.reddit as R; print(R.upvotes())'
: [main] my.reddit hello
: ['reddit upvote1', 'reddit upvote2']

Just as expected here, =my.reddit= is imported from the =main= package, since it doesn't exist in =overlay=.

Let's check twitter now:

: $ python3 -c 'import my.twitter.all as T; print(T.tweets())'
: [overlay] my.twitter.all hello
: [main] my.twitter.common hello
: [main] my.twitter.gdpr hello
: [overlay] my.twitter.talon hello
: ['gdpr tweet 1', 'gdpr tweet 2', 'talon tweet 1', 'talon tweet 2']

As expected, =my.twitter.all= was imported from the =overlay=.
As you can see, it merged data from =gdpr= (from the =main= package) and =talon= (from the =overlay= package).

So far so good, let's see how it works with mypy.

* Mypy support (editable install)

To check that mypy works as expected, I injected some statements into the modules that have no impact on runtime, but should trigger mypy, like this: =trigger_mypy_error: str = 123=

Let's run it:

: $ mypy --namespace-packages --strict -p my
: overlay/src/my/twitter/talon.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str")
: [assignment]
: trigger_mypy_error: str = 123
: ^
: Found 1 error in 1 file (checked 4 source files)

Hmm, this did find the statement in the =overlay=, but missed everything from =main= (e.g. =reddit.py= and =gdpr.py= should have also triggered the check).

First, let's check which sources mypy is processing:

: $ mypy --namespace-packages --strict -p my -v 2>&1 | grep BuildSource
: LOG: Found source: BuildSource(path='/project/overlay/src/my', module='my', has_text=False, base_dir=None)
: LOG: Found source: BuildSource(path='/project/overlay/src/my/twitter', module='my.twitter', has_text=False, base_dir=None)
: LOG: Found source: BuildSource(path='/project/overlay/src/my/twitter/all.py', module='my.twitter.all', has_text=False, base_dir=None)
: LOG: Found source: BuildSource(path='/project/overlay/src/my/twitter/talon.py', module='my.twitter.talon', has_text=False, base_dir=None)

So it seems like mypy is not processing anything from the =main= package at all?

At this point I cloned mypy, put a breakpoint, and found out this is the culprit: https://github.com/python/mypy/blob/1dd8e7fe654991b01bd80ef7f1f675d9e3910c3a/mypy/modulefinder.py#L288

This basically returns the first path where it finds the =my= package, which happens to be the overlay in this case.
So everything else is ignored?

It even seems to have a test for a similar use case, which is quite sad.
https://github.com/python/mypy/blob/1dd8e7fe654991b01bd80ef7f1f675d9e3910c3a/mypy/test/testmodulefinder.py#L64-L71

For now, I opened an issue in the mypy repository: https://github.com/python/mypy/issues/16683

But ok, maybe mypy treats =main= as an external package somehow, but still type checks it properly?
Let's see what's going on with imports:

: $ mypy --namespace-packages --strict -p my --follow-imports=error
: overlay/src/my/twitter/talon.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str")
: [assignment]
: trigger_mypy_error: str = 123
: ^
: overlay/src/my/twitter/all.py:3: error: Import of "my.twitter.common" ignored [misc]
: from .common import merge
: ^
: overlay/src/my/twitter/all.py:6: error: Import of "my.twitter.gdpr" ignored [misc]
: from . import gdpr
: ^
: overlay/src/my/twitter/all.py:6: note: (Using --follow-imports=error, module not passed on command line)
: overlay/src/my/twitter/all.py: note: In function "tweets":
: overlay/src/my/twitter/all.py:8: error: Returning Any from function declared to return "List[str]" [no-any-return]
: return merge(gdpr, talon)
: ^
: Found 4 errors in 2 files (checked 4 source files)

Nope -- looks like it's completely unaware of =main=, and what's worse, by default (without tweaking =--follow-imports=), these errors would be suppressed.

What if we check =my.twitter= directly?

: $ mypy --namespace-packages --strict -p my.twitter --follow-imports=error
: overlay/src/my/twitter/talon.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str")
: [assignment]
: trigger_mypy_error: str = 123
: ^~~
: overlay/src/my/twitter: error: Ancestor package "my" ignored [misc]
: overlay/src/my/twitter: note: (Using --follow-imports=error, submodule passed on command line)
: overlay/src/my/twitter/all.py:3: error: Import of "my.twitter.common" ignored [misc]
: from .common import merge
: ^
: overlay/src/my/twitter/all.py:3: note: (Using --follow-imports=error, module not passed on command line)
: overlay/src/my/twitter/all.py:6: error: Import of "my.twitter.gdpr" ignored [misc]
: from . import gdpr
: ^
: overlay/src/my/twitter/all.py: note: In function "tweets":
: overlay/src/my/twitter/all.py:8: error: Returning Any from function declared to return "list[str]" [no-any-return]
: return merge(gdpr, talon)
: ^~~~~~~~~~~~~~~~~~~~~~~~~
: Found 5 errors in 3 files (checked 3 source files)

Now we're also getting =error: Ancestor package "my" ignored [misc]= ... not ideal.

* What if we don't install at all?

Instead of an editable install, let's try running mypy directly over the source files.

First, let's only check the =main= package:

: $ MYPYPATH=main/src mypy --namespace-packages --strict -p my
: main/src/my/twitter/gdpr.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str") [assignment]
: trigger_mypy_error: str = 123
: ^~~
: main/src/my/reddit.py:11: error: Incompatible types in assignment (expression has type "int", variable has type "str") [assignment]
: trigger_mypy_error: str = 123
: ^~~
: Found 2 errors in 2 files (checked 6 source files)

As expected, it found both errors.

Now with the overlay as well:

: $ MYPYPATH=overlay/src:main/src mypy --namespace-packages --strict -p my
: overlay/src/my/twitter/all.py:6: note: In module imported here:
: main/src/my/twitter/gdpr.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str") [assignment]
: trigger_mypy_error: str = 123
: ^~~
: overlay/src/my/twitter/talon.py:9: error: Incompatible types in assignment (expression has type "int", variable has type "str")
: [assignment]
: trigger_mypy_error: str = 123
: ^~~
: Found 2 errors in 2 files (checked 4 source files)

Interestingly enough, this is slightly better than the editable install (it detected the error in =gdpr.py= as well).
But still no =reddit.py= error.

TODO possibly worth submitting to the mypy issue tracker as well...

Overall it seems that properly type checking an HPI setup as a whole is kinda problematic, especially if the modules actually override/extend the base modules.

* Modifying (monkey patching) original module in the overlay

Let's say we want to modify/monkey patch the =my.twitter.gdpr= module from =main=, for example, to convert "gdpr" to uppercase, i.e. =tweet.replace('gdpr', 'GDPR')=.

# TODO see overlay2/

I think our options are:

- symlink to the 'parent' packages, e.g. =main= in this case

  Alternatively, somehow install =main= under a different name/alias (managed by pip).

  This is discussed here: https://github.com/karlicoss/HPI/issues/102

  The main upside is that it's relatively simple and (sort of) works with mypy.

  There are a few big downsides:

  - it creates a parallel package hierarchy (to the one maintained by pip); the symlinks will need to be carefully managed manually

    This may not be such a huge deal if you don't have too many overlays.
    However, this results in problems if you're trying to switch between two different HPI checkouts (e.g. stable and development). If you have symlinks into "stable" from the overlay, then stable modules will sometimes be picked up when you're expecting the "development" package.

  - symlinks pointing outside of the source tree might cause pip install to go into an infinite loop

  - it modifies the package name

    This may potentially result in some confusing behaviours.

    One thing I noticed, for example, is that cachew caches might get duplicated.

  - it might not work in all cases, or might result in recursive imports

- do not shadow the original module

  Basically, instead of shadowing via the namespace package mechanism and creating an identically named module, create some sort of hook that would patch the original =my.twitter.gdpr= module from =main=.

  The downside is that it's a bit unclear where to do that; we need some sort of entry point?

  - it could be some global dynamic hook defined in the overlay, and then executed from =my.core=

    However, it's a bit intrusive, and it's unclear how to handle errors. E.g. what if we're monkey patching a module that we weren't intending to use, don't have its dependencies installed, and it's crashing?

    Perhaps core could support something like =_hook= in each of HPI's modules?
    Note that it can't be =my.twitter.all=, since we might want to override =.all= itself.

    The downside is that this is probably not going to work well with =tmp_config= and such -- we'll need to somehow execute the hook again on reloading the module?

  - ideally we'd have something that integrates with =importlib= and is executed automatically when the module is imported?

    TODO explore these:

    - https://stackoverflow.com/questions/43571737/how-to-implement-an-import-hook-that-can-modify-the-source-code-on-the-fly-using
    - https://github.com/brettlangdon/importhook

    The latter is pretty intrusive, and has some issues, e.g. https://github.com/brettlangdon/importhook/issues/4

    Let's try it:

    : $ PYTHONPATH=overlay3/src:main/src python3 -c 'import my.twitter._hook; import my.twitter.all as M; print(M.tweets())'
    : [main] my.twitter.all hello
    : [main] my.twitter.common hello
    : [main] my.twitter.gdpr hello
    : EXECUTING IMPORT HOOK!
    : ['GDPR tweet 1', 'GDPR tweet 2']

    Ok, it worked, and it seems pretty neat.
    However, sadly it doesn't work with =tmp_config= (TODO add a proper demo?)
    Not sure if it's more of an issue with the =tmp_config= implementation (which is very hacky), or with =importhook= itself?

    In addition, the question remains where to put the hook itself, but in that case even a global one could be fine.

- define the hook in =my/twitter/__init__.py=

  Basically, use =extend_path= to make it behave like a namespace package, but in addition, patch the original =my.twitter.gdpr=?

  : $ cat overlay2/src/my/twitter/__init__.py
  : print(f'[overlay2] {__name__} hello')
  :
  : from pkgutil import extend_path
  : __path__ = extend_path(__path__, __name__)
  :
  : def hack_gdpr_module() -> None:
  :     from . import gdpr
  :     tweets_orig = gdpr.tweets
  :     def tweets_patched():
  :         return [t.replace('gdpr', 'GDPR') for t in tweets_orig()]
  :     gdpr.tweets = tweets_patched
  :
  : hack_gdpr_module()

  This actually seems to work??

  : PYTHONPATH=overlay2/src:main/src python3 -c 'import my.twitter.all as M; print(M.tweets())'
  : [overlay2] my.twitter hello
  : [main] my.twitter.gdpr hello
  : [main] my.twitter.all hello
  : [main] my.twitter.common hello
  : ['GDPR tweet 1', 'GDPR tweet 2']

  However, this doesn't stack, i.e. if the 'parent' overlay had its own =__init__.py=, it won't get called.

- shadow the original module and temporarily modify =__path__= before importing the same module from the parent overlay

  This approach is implemented in =my.core.experimental.import_original_module=

  TODO demonstrate it properly, but I think that also works in a 'chain' of overlays

  Seems like that option is the most promising so far, albeit very hacky.

Note that none of these options work well with mypy (since it's all dynamic hackery), even if you disregard the issues described in the previous sections.

# TODO .pkg files? somewhat interesting... https://github.com/python/cpython/blob/3.12/Lib/pkgutil.py#L395-L410

doc/QUERY.md (new file, 304 lines)

`hpi query` is a command line tool for querying the output of any `hpi` function.

```
Usage: hpi query [OPTIONS] FUNCTION_NAME...

  This allows you to query the results from one or more functions in HPI

  By default this runs with '-o json', converting the results to JSON and
  printing them to STDOUT

  You can specify '-o pprint' to just print the objects using their repr, or
  '-o repl' to drop into a ipython shell with access to the results

  While filtering using --order-key datetime, the --after, --before and
  --within flags parse the input to their datetime and timedelta equivalents.
  datetimes can be epoch time, the string 'now', or an date formatted in the
  ISO format. timedelta (durations) are parsed from a similar format to the
  GNU 'sleep' command, e.g. 1w2d8h5m20s -> 1 week, 2 days, 8 hours, 5 minutes,
  20 seconds

  As an example, to query reddit comments I've made in the last month

  hpi query --order-type datetime --before now --within 4w my.reddit.all.comments
  or...
  hpi query --recent 4w my.reddit.all.comments

  Can also query within a range. To filter comments between 2016 and 2018:
  hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments

Options:
  -o, --output [json|pprint|repl|gpx]
                                  what to do with the result [default: json]
  -s, --stream                    stream objects from the data source instead
                                  of printing a list at the end
  -k, --order-key TEXT            order by an object attribute or dict key on
                                  the individual objects returned by the HPI
                                  function
  -t, --order-type [datetime|date|int|float]
                                  order by searching for some type on the
                                  iterable
  -a, --after TEXT                while ordering, filter items for the key or
                                  type larger than or equal to this
  -b, --before TEXT               while ordering, filter items for the key or
                                  type smaller than this
  -w, --within TEXT               a range 'after' or 'before' to filter items
                                  by. see above for further explanation
  -r, --recent TEXT               a shorthand for '--order-type datetime
                                  --reverse --before now --within'. e.g.
                                  --recent 5d
  --reverse / --no-reverse        reverse the results returned from the
                                  functions
  -l, --limit INTEGER             limit the number of items returned from the
                                  (functions)
  --drop-unsorted                 if the order of an item can't be determined
                                  while ordering, drop those items from the
                                  results
  --wrap-unsorted                 if the order of an item can't be determined
                                  while ordering, wrap them into an
                                  'Unsortable' object
  --warn-exceptions               if any errors are returned, print them as
                                  errors on STDERR
  --raise-exceptions              if any errors are returned (as objects, not
                                  raised) from the functions, raise them
  --drop-exceptions               ignore any errors returned as objects from
                                  the functions
  --help                          Show this message and exit.
```

This works with any function which returns an iterable, for example `my.coding.commits`, which searches for `git commit`s on your computer:

```bash
hpi query my.coding.commits
```

When run with a module, this does some analysis of the functions in that module and tries to find ones that look like data sources. If it can't figure out which, it prompts you like:

```
Which function should be used from 'my.coding.commits'?

1. commits
2. repos
```

You select the one you want by pressing `1` or `2` on your keyboard. Otherwise, you can provide a fully qualified path, like:

```
hpi query my.coding.commits.repos
```

The corresponding `repos` function this queries is defined in [`my/coding/commits.py`](../my/coding/commits.py)

### Ordering/Filtering/Streaming

By default, this just returns the items in the order they were returned by the function. You can sort and filter by specifying a `--order-key` or `--order-type`. `--order-type datetime` will try to automatically figure out which attribute to use; if it chooses the wrong one (since `Commit`s have both a `committed_dt` and `authored_dt`), you can tell it which to use with `--order-key`. For example, to scan my computer and find the most recent commit I made:

```
hpi query my.coding.commits.commits --order-key committed_dt --limit 1 --reverse --output pprint --stream
Commit(committed_dt=datetime.datetime(2023, 4, 14, 23, 9, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))),
       authored_dt=datetime.datetime(2023, 4, 14, 23, 4, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))),
       message='sources.smscalls: propagate errors if there are breaking '
               'schema changes',
       repo='/home/username/Repos/promnesia-fork',
       sha='22a434fca9a28df9b0915ccf16368df129d2c9ce',
       ref='refs/heads/smscalls-handle-result')
```

To instead filter to some range, you can use `--before` and `--within`. For example, to get all the commits I made in the last day:

```
hpi query my.coding.commits.commits --order-type datetime --before now --within 1d
```

That prints a list of `Commit`s as JSON objects. You could also use `--output pprint` to pretty-print the objects or `--output repl` to drop into a REPL.

To process the JSON, you can pipe it to [`jq`](https://github.com/stedolan/jq). I often use `jq length` to get the count of some output:

```
hpi query my.coding.commits.commits --order-type datetime --before now --within 1d | jq length
6
```

Because grabbing data `--before now` is such a common use case, the `--recent` flag is a shorthand for `--order-type datetime --reverse --before now --within`. The same as above, to get the commits from the last day:

```
hpi query my.coding.commits.commits --recent 1d | jq length
6
```

To select a range of commits, you can use `--after` and `--before`, passing ISO or epoch timestamps. Those can be full `datetime`s (`2021-01-01T00:05:30`) or just dates (`2021-01-01`). For example, to get all the commits I made on January 1st, 2021:

```
hpi query my.coding.commits.commits --order-type datetime --after 2021-01-01 --before 2021-01-02 | jq length
1
```

If you have [`dateparser`](https://github.com/scrapinghub/dateparser#how-to-use) installed, this supports dozens more natural language formats:

```
hpi query my.coding.commits.commits --order-type datetime --after 'last week' --before 'day before yesterday' | jq length
28
```

If you're having issues ordering because there are exceptions in your results, or because not all data is sortable (it may have `None` for some attributes), you can use `--drop-unsorted` to drop those items from the results, or `--drop-exceptions` to remove the exceptions.

You can also stream the results, which is useful for functions that take a while to process or have a lot of data. For example, if you wanted to pick a sha hash from a particular repo, you could use `jq` to `select` and pick that attribute from the JSON:

```
hpi query my.coding.commits.commits --recent 30d --stream | jq 'select(.repo | contains("HPI"))' | jq '.sha' -r
4afa899c8b365b3c10e468f6279c02e316d3b650
40de162fab741df594b4d9651348ee46ee021e9b
e1cb229913482074dc5523e57ef0acf6e9ec2bb2
87c13defd131e39292b93dcea661d3191222dace
02c738594f2cae36ca4fab43cf9533fe6aa89396
0b3a2a6ef3a9e4992771aaea0252fb28217b814a
84817ce72d208038b66f634d4ceb6e3a4c7ec5e9
47992b8e046d27fc5141839179f06f925c159510
425615614bd508e28ccceb56f43c692240e429ab
eed8f949460d768fb1f1c4801e9abab58a5f9021
d26ad7d9ce6a4718f96346b994c3c1cd0d74380c
aec517e53c6ac022f2b4cc91261daab5651cebf0
44b75a88fdfc7af132f61905232877031ce32fcb
b0ff6f29dd2846e97f8aa85a2ca73736b03254a8
```

`jq`'s `select` function acts on a stream of JSON objects, not a list, so it filters the output of `hpi query` as the objects are generated (the goal here is to conserve memory, as items which aren't needed are filtered out). The alternative would be to print the entire JSON list at the end, like:

`hpi query my.coding.commits.commits --recent 30d | jq '.[] | select(.repo | contains("Repos/HPI"))' | jq '.sha' -r`, using `jq '.[]'` to convert the JSON list into a stream of JSON objects.

## Usage on non-HPI code

The command can accept any qualified function name, so this could for example be used to check the output of [`promnesia`](https://github.com/karlicoss/promnesia) sources:

```
hpi query promnesia.sources.smscalls | jq length
371
```

This can be used on any function that produces an `Iterator`/`Generator`-like output, as long as it can be called with no arguments.

## GPX

The `hpi query` command can also be used with the `--output gpx` flag to generate gpx files from a list of locations, like the ones defined in the `my.location` package. This could be used to extract some date range and create a `gpx` file which can then be visualized by a GUI application.

This prints the contents of the `gpx` file to STDOUT, and prints warnings for any objects it could not convert to locations to STDERR, so pipe STDOUT to an output file, like `>out.gpx`

```
hpi query my.location.all --after '2021-07-01T00:00:00' --before '2021-07-05T00:00:00' --order-type datetime --output gpx >out.gpx
```

If you want to ignore any errors, you can use `--drop-exceptions`.

To preview, you can use something like [`qgis`](https://qgis.org/en/site/) or, for something more lightweight, [`gpxsee`](https://github.com/tumic0/GPXSee):

`gpxsee out.gpx`:

<img src="https://user-images.githubusercontent.com/7804791/232249184-7e203ee6-a3ec-4053-800c-751d2c28e690.png" width=500 alt="chicago trip" />

(Sidenote: these are [`@purarue`](https://github.com/purarue/)'s locations, on a trip to Chicago)

## Python reference

The `hpi query` command is a CLI wrapper around the code in [`query.py`](../my/core/query.py) and [`query_range.py`](../my/core/query_range.py). The `select` function is the core of this, and `select_range` lets you specify dates, timedeltas, start-end ranges, and other CLI-specific code.

`my.core.query.select`:

```
A function to query, order, sort and filter items from one or more sources

This supports iterables and lists of mixed types (including handling errors),
by allowing you to provide custom predicates (functions) which can sort
by a function, an attribute, dict key, or by the attributes values.

Since this supports mixed types, there's always a possibility
of KeyErrors or AttributeErrors while trying to find some value to order by,
so this provides multiple mechanisms to deal with that

'where' lets you filter items before ordering, to remove possible errors
or filter the iterator by some condition

There are multiple ways to instruct select on how to order items. The most
flexible is to provide an 'order_by' function, which takes an item in the
iterator, does any custom checks you may want and then returns the value to sort by

'order_key' is best used on items which have a similar structure, or have
the same attribute name for every item in the iterator. If you have an
iterator of objects whose datetime is accessed by the 'timestamp' attribute,
supplying order_key='timestamp' would sort by that (dictionary or attribute) key

'order_value' is the most confusing, but often the most useful. Instead of
testing against the keys of an item, this allows you to write a predicate
(function) to test against its values (dictionary, NamedTuple, dataclass, object).
If you had an iterator of mixed types and wanted to sort by the datetime,
but the attribute to access the datetime is different on each type, you can
provide `order_value=lambda v: isinstance(v, datetime)`, and this will
try to find that value for each type in the iterator, to sort it by
the value which is received when the predicate is true

'order_value' is often used in the 'hpi query' interface, because of its brevity.
Just given the input function, this can typically sort it by timestamp with
no human intervention. It can sort of be thought of as an educated guess,
but it can always be improved by providing a more complete guess function

Note that 'order_value' is also the most computationally expensive, as it has
to copy the iterator in memory (using itertools.tee) to determine how to order it
in memory

The 'drop_exceptions', 'raise_exceptions', 'warn_exceptions' let you ignore or raise
when the src contains exceptions. The 'warn_func' lets you provide a custom function
to call when an exception is encountered instead of using the 'warnings' module

src: an iterable of mixed types, or a function to be called,
    as the input to this function

where: a predicate which filters the results before sorting

order_by: a function which when given an item in the src,
    returns the value to sort by. Similar to the 'key' value
    typically passed directly to 'sorted'

order_key: a string which represents a dict key or attribute name
    to use as the key to sort by

order_value: predicate which determines which attribute on an ADT-like item to sort by,
    when given its value. lambda o: isinstance(o, datetime) is commonly passed to sort
    by datetime, without knowing the attributes or interface for the items in the src

default: while ordering, if the order for an object cannot be determined,
    use this as the default value

reverse: reverse the order of the resulting iterable

limit: limit the results to this many items

drop_unsorted: before ordering, drop any items from the iterable for which an
    order could not be determined. False by default

wrap_unsorted: before ordering, wrap any items into an 'Unsortable' object. Place
    them at the front of the list. True by default

drop_exceptions: ignore any exceptions from the src

raise_exceptions: raise exceptions when received from the input src
```

`my.core.query_range.select_range`:

```
A specialized select function which offers generating functions
to filter/query ranges from an iterable

order_key and order_value are used in the same way they are in select

If you specify order_by_value_type, it tries to search for an attribute
on each object/type which has that type, ordering the iterable by that value

unparsed_range is a tuple of length 3, specifying 'after', 'before', 'duration',
i.e. some start point to allow the computed value we're ordering by, some
end point and a duration (can use the RangeTuple NamedTuple to construct one)

(this is typically parsed/created in my.core.__main__, from CLI flags)

If you specify a range, drop_unsorted is forced to be True
```

Those can be imported and accept any sort of iterator; `hpi query` just defaults to the output of functions here. As an example, see [`listens`](https://github.com/purarue/HPI-personal/blob/master/scripts/listens) which just passes a generator (iterator) as the first argument to `query_range`.
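
For instance, a rough sketch of calling `select` from python directly (the `created` attribute here is just for illustration -- use whatever attribute your items actually have):

```python
from my.core.query import select

from my.reddit.all import comments

# the 10 most recent comments, ordered by their 'created' attribute,
# silently dropping any errors yielded by the source
recent = list(
    select(
        comments,
        order_key='created',
        reverse=True,
        limit=10,
        drop_exceptions=True,
    )
)
```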

doc/SETUP.org (modified, 120 lines)
@ -20,6 +20,7 @@ You'd be really helping me, I want to make the setup as straightforward as possi
|
||||||
- [[#private-configuration-myconfig][private configuration (my.config)]]
|
- [[#private-configuration-myconfig][private configuration (my.config)]]
|
||||||
- [[#module-dependencies][module dependencies]]
|
- [[#module-dependencies][module dependencies]]
|
||||||
- [[#troubleshooting][Troubleshooting]]
|
- [[#troubleshooting][Troubleshooting]]
|
||||||
|
- [[#common-issues][common issues]]
|
||||||
- [[#usage-examples][Usage examples]]
|
- [[#usage-examples][Usage examples]]
|
||||||
- [[#end-to-end-roam-research-setup][End-to-end Roam Research setup]]
|
- [[#end-to-end-roam-research-setup][End-to-end Roam Research setup]]
|
||||||
- [[#polar][Polar]]
|
- [[#polar][Polar]]
|
||||||
|
@@ -39,13 +40,15 @@ You'd be really helping me, I want to make the setup as straightforward as possible!
 
 * Few notes
-I understand that people who'd like to use this may not be super familiar with Python, PIP or generally unix, so here are some useful notes:
+I understand that people who'd like to use this may not be super familiar with Python, pip or generally unix, so here are some useful notes:
 
-- only ~python >= 3.6~ is supported
+- only ~python >= 3.7~ is supported
 - I'm using ~pip3~ command, but on your system you might only have ~pip~.
 
   If your ~pip --version~ says python 3, feel free to use ~pip~.
 
+- If you have issues getting ~pip~ or ~pip3~ to work, it may be worth invoking the module instead using a fully qualified path, like ~python3 -m pip~ (e.g. ~python3 -m pip install --user ..~)
 
 - similarly, I'm using =python3= in the documentation, but if your =python --version= says python3, it's okay to use =python=
 
 - when you are using ~pip install~, [[https://stackoverflow.com/a/42989020/706389][always pass]] =--user=, and *never install third party packages with sudo* (unless you know what you are doing)
@@ -96,15 +99,17 @@ This is less convenient, but gives you more control.
 The benefit of this way is that you get a bit more control, explicitly allowing your scripts to use your data.
 
 ** appendix: optional packages
-You can also install some opional packages
+You can also install some optional packages
 
 : pip3 install 'HPI[optional]'
 
 They aren't necessary, but will improve your experience. At the moment these are:
 
+- [[https://github.com/ijl/orjson][orjson]]: a library for serializing data to JSON, used in ~my.core.serialize~ and the ~hpi query~ interface
 - [[https://github.com/karlicoss/cachew][cachew]]: automatic caching library, which can greatly speedup data access
-- [[https://github.com/metachris/logzero][logzero]]: a nice logging library, supporting colors
 - [[https://github.com/python/mypy][mypy]]: mypy is used for checking configs and troubleshooting
+- [[https://github.com/borntyping/python-colorlog][colorlog]]: colored formatter for ~logging~ module
+- [[https://github.com/Rockhopper-Technologies/enlighten]]: console progress bar library
 
 * Setting up modules
 This is an *optional step* as few modules work without extra setup.
@@ -117,11 +122,12 @@ elaborating on some technical rationales behind the current configuration system
 
 ** private configuration (=my.config=)
 # TODO write about dynamic configuration
-# TODO add a command to edit config?? e.g. HPI config edit
+# todo add a command to edit config?? e.g. HPI config edit
 If you're not planning to use private configuration (some modules don't need it) you can skip straight to the next step. Still, I'd recommend you to read anyway.
 
 The configuration usually contains paths to the data on your disk, and some modules have extra settings.
 The config is simply a *python package* (named =my.config=), expected to be in =~/.config/my=.
+If you'd like to change the location of the =my.config= directory, you can set the =MY_CONFIG= environment variable. e.g. in your .bashrc add: ~export MY_CONFIG=$HOME/.my/~
 
 Since it's a Python package, generally it's very *flexible* and there are many ways to set it up.
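To make that concrete, a private configuration can start as small as a single file. A minimal sketch (the ~hypothesis~ section name and the path are placeholders -- use whatever modules and paths you actually have):

#+begin_src python
# ~/.config/my/my/config/__init__.py -- a minimal example; placeholder values
class hypothesis:
    export_path = '/data/hypothesis/*.json'
#+end_src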
@@ -158,11 +164,21 @@ Since it's a Python package, generally it's very *flexible* and there are many ways to set it up.
 
 - or you can just try running them and fill in the attributes Python complains about!
 
+  or run =hpi doctor my.modulename=
 
 # TODO link to post about exports?
 ** module dependencies
-Dependencies are different for specific modules you're planning to use, so it's hard to specify.
+Dependencies are different for specific modules you're planning to use, so it's hard to tell in advance what you'll need.
 
-Generally you can just try using the module and then install missing packages via ~pip3 install --user~, should be fairly straightforward.
+First thing you should try is just using the module; if it works -- great! If it doesn't (i.e. you get something like =ImportError=):
 
+- try using =hpi module install <modulename>= (where =<modulename>= is something like =my.hypothesis=, etc.)
 
+  This command uses the [[https://github.com/karlicoss/HPI/search?l=Python&q=REQUIRES][REQUIRES]] declaration to install the dependencies.
 
+- otherwise manually install missing packages via ~pip3 install --user~
 
+  Also please feel free to report if the command above didn't install some dependencies!
 
 
 * Troubleshooting
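(For module authors: =hpi module install= works because each module can declare its dependencies in a module-level =REQUIRES= list. See e.g. =my/arbtt.py=, added further down in this diff:)

#+begin_src python
# excerpt in the style of my/arbtt.py:
# 'hpi module install my.arbtt' pip-installs whatever is listed here
REQUIRES = ['ijson', 'cffi']
#+end_src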
@@ -174,14 +190,35 @@ HPI comes with a command line tool that can help you detect potential issues. Run:
 : # alternatively, for more output:
 : hpi doctor --verbose
 
-If you only have few modules set up, lots of them will error for you, which is expected, so check the ones you expect to work.
+If you only have a few modules set up, lots of them will error for you, which is expected, so check the ones you expect to work.
 
+If you're having issues with ~cachew~ or want to show logs to troubleshoot what may be happening, you can pass the debug flag (e.g., ~hpi --debug doctor my.module_name~) or set the ~LOGGING_LEVEL_HPI~ environment variable (e.g., ~LOGGING_LEVEL_HPI=debug hpi query my.module_name~) to print all logs, including the ~cachew~ dependencies. ~LOGGING_LEVEL_HPI~ could also be used to silence ~info~ logs, like ~LOGGING_LEVEL_HPI=warning hpi ...~
 
+If you want to enable logs for a particular module, you can use the
+~LOGGING_LEVEL_~ prefix and then the module name with underscores, like
+~LOGGING_LEVEL_my_hypothesis=debug hpi query my.hypothesis~
 
+If you want ~HPI~ to autocomplete the module names for you, this comes with shell completion, see [[../misc/completion/][misc/completion]]
 
 If you have any ideas on how to improve it, please let me know!
 
 Here's a screenshot of how it looks when everything is mostly good: [[https://user-images.githubusercontent.com/291333/82806066-f7dfe400-9e7c-11ea-8763-b3bee8ada308.png][link]].
 
+If you experience issues, feel free to report, but please attach your:
+
+- OS version
+- python version: =python3 --version=
+- HPI version: =pip3 show HPI=
+- if you see some exception, attach a full log (just make sure there is no private information in it)
+- if you think it can help, attach screenshots
+
+** common issues
+- run =hpi config check=, it helps to spot certain errors
+
+  Also, it's really recommended to install =mypy= first; it helps to spot various trivial errors
+
+- if =hpi= shows you something like 'command not found', try using =python3 -m my.core= instead
+
+  This likely means that your =$HOME/.local/bin= directory isn't in your =$PATH=
 
 * Usage examples
-If you run your script with ~with_my~ wrapper, you'd have ~my~ in ~PYTHONPATH~ which gives you access to your data from within the script.
 
 ** End-to-end Roam Research setup
 In [[https://beepb00p.xyz/myinfra-roam.html#export][this]] post you can trace all steps:
@@ -201,7 +238,7 @@ Polar doesn't require any setup as it accesses the highlights on your filesystem
 
 You can try if it works with:
 
-: python3 -c 'import my.reading.polar as polar; print(polar.get_entries())'
+: python3 -c 'import my.polar as polar; print(polar.get_entries())'
 
 ** Google Takeout
 If you have zip Google Takeout archives, you can use HPI to access it:
@@ -229,7 +266,7 @@ It uses exports provided by [[https://github.com/karlicoss/kobuddy][kobuddy]] package
 - prepare the config
 
 1. Install =kobuddy= from PIP
-2. Add kobo config to =~/.config/my/my/config/__init__.py=
+2. Add kobo config to =~/.config/my/my/config.py=
 #+begin_src python
 class kobo:
     export_dir = '/backups/to/kobo/'
@@ -272,20 +309,18 @@ Polar keeps the data:
 - as a bunch of *JSON files*
 
 It's excellent from all perspectives, except one -- you can only meaningfully use it through the Polar app.
-Which is, by all means, great!
-
-But you might want to integrate your data elsewhere and use it in ways that Polar developers never even anticipated!
+However, you might want to integrate your data elsewhere and use it in ways that Polar developers never even anticipated!
 
 If you check the data layout ([[https://github.com/TheCedarPrince/KnowledgeRepository][example]]), you can see it's messy: scattered across multiple directories, contains raw HTML, obscure entities, etc.
 It's understandable from the app developer's perspective, but it makes things frustrating when you want to work with this data.
 
 # todo hmm what if I could share deserialization with Polar app?
 
-Here comes the HPI [[file:../my/reading/polar.py][polar module]]!
+Here comes the HPI [[file:../my/polar.py][polar module]]!
 
 : |💾 ~/.polar (raw JSON data) |
 :     ⇓⇓⇓
-:   HPI (my.reading.polar)
+:   HPI (my.polar)
 :     ⇓⇓⇓
 : < python interface >
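Once the module knows where your data is, the "python interface" layer in the diagram is just ordinary iteration; a quick sketch using =get_entries= from the snippet above:

#+begin_src python
import my.polar as polar

for entry in polar.get_entries():
    # each entry is parsed from the raw JSON under ~/.polar
    print(entry)
#+end_src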
@@ -323,14 +358,13 @@ Of course, HPI helps you here by encapsulating all this parsing logic and exposing
 The only thing you need to do is to tell it where to find the files on your disk, via [[file:MODULES.org::#mygoogletakeoutpaths][the config]], because different people use different paths for backups.
 
 # TODO how to emphasize config?
-# TODO python is just one of the interfaces?
 
 ** Reddit
 
 Reddit has a proper API, so in theory HPI could talk directly to Reddit and retrieve the latest data. But that's not what it does!
 
 - first, there are excellent programmatic APIs for Reddit out there already, for example, [[https://github.com/praw-dev/praw][praw]]
-- more importantly, this is the [[https://beepb00p.xyz/exports.html#design][design decision]] of HP
+- more importantly, this is the [[https://beepb00p.xyz/exports.html#design][design decision]] of HPI
 
 It doesn't deal with all the complexities of API interactions.
 Instead, it relies on other tools to put *intermediate, raw data* on your disk and then transforms this data into something nice.
@@ -343,19 +377,18 @@ As an example, for [[file:../my/reddit.py][Reddit]], HPI is relying on data fetched
 :     ⇓⇓⇓
 : |💾 /backups/reddit/*.json |
 :     ⇓⇓⇓
-:   HPI (my.reddit)
+:   HPI (my.reddit.rexport)
 :     ⇓⇓⇓
 : < python interface >
 
 So, in your [[file:MODULES.org::#myreddit][reddit config]], similarly to Takeout, you need =export_path=, so HPI knows how to find your Reddit data on the disk.
 
 But there is an extra caveat: rexport is already coming with nice [[https://github.com/karlicoss/rexport/blob/master/dal.py][data bindings]] to parse its outputs.
-Another *design decision* of HPI is to use existing code and libraries as much as possible, so we also specify a path to =rexport= repository in the config.
-
-(note: in the future it's possible that rexport will be installed via PIP, I just haven't had time for it so far).
 
 Several other HPI modules are following a similar pattern: hypothesis, instapaper, pinboard, kobo, etc.
 
+Since the [[https://github.com/karlicoss/rexport#api-limitations][reddit API has limited results]], you can use [[https://github.com/purarue/pushshift_comment_export][my.reddit.pushshift]] to access older reddit comments, which both then get merged into =my.reddit.all.comments=
 
|
** Twitter
|
||||||
|
|
||||||
Twitter is interesting, because it's an example of an HPI module that *arbitrates* between several data sources from the same service.
|
Twitter is interesting, because it's an example of an HPI module that *arbitrates* between several data sources from the same service.
|
||||||
|
@ -406,10 +439,9 @@ Since you have two different sources of raw data, you need to specify two bits o
|
||||||
: export_path = '/backups/twitter-archives/*.zip'
|
: export_path = '/backups/twitter-archives/*.zip'
|
||||||
|
|
||||||
Note that you can also just use =my.twitter.archive= or =my.twitter.twint= directly, or set either of paths to empty string: =''=
|
Note that you can also just use =my.twitter.archive= or =my.twitter.twint= directly, or set either of paths to empty string: =''=
|
||||||
# (TODO mypy-safe?)
|
|
||||||
|
|
||||||
# #addingmodifying-modules
|
# #addingmodifying-modules
|
||||||
# Now, say you prefer to use a different library for your Twitter data instead of twint (for whatever reason), and you want to use it TODO
|
# Now, say you prefer to use a different library for your Twitter data instead of twint (for whatever reason), and you want to use it
|
||||||
# TODO docs on overlays?
|
# TODO docs on overlays?
|
||||||
|
|
||||||
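The "two bits of config" might look like the sketch below; the section names are assumptions (the archive path comes from the snippet above), so check the config docs for your version:

#+begin_src python
# sketch only: section names are assumptions
class twitter_archive:
    export_path = '/backups/twitter-archives/*.zip'

class twint:
    export_path = '/data/twint/twitter.db'  # hypothetical path
#+end_src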
 ** Connecting to other apps
 
@@ -425,40 +457,22 @@ connect the data with other apps and libraries!
 
 See more in [[file:../README.org::#how-do-you-use-it]["How do you use it?"]] section.
 
-# TODO memacs module would be nice
-# todo dashboard?
-# todo more examples?
+Also check out [[https://beepb00p.xyz/myinfra.html#hpi][my personal infrastructure map]] to see where I'm using HPI.
 
 * Adding/modifying modules
 # TODO link to 'overlays' documentation?
 # TODO don't be afraid to TODO make sure to install in editable mode
 
-The easiest is just to run HPI via [[#use-without-installing][with_my]] wrapper or with an editable PIP install.
-That way your changes will be reflected immediately, and you will be able to quickly iterate/fix bugs/add new methods.
+- The easiest is just to clone HPI repository and run an editable PIP install (=pip3 install --user -e .=), or via [[#use-without-installing][with_my]] wrapper.
+
+  After that you can just edit the code directly, your changes will be reflected immediately, and you will be able to quickly iterate/fix bugs/add new methods.
+
+  This is great if you just want to add a few of your own personal modules, or make minimal changes to a few files. If you do much more than that, you may run into possible merge conflicts if/when you update (~git pull~) HPI
 
 # TODO eh. doesn't even have to be in 'my' namespace?? need to check it
-The "proper way" (unless you want to contribute to the upstream) is to create a separate file hierarchy and add your module to =PYTHONPATH=.
+- The "proper way" (unless you want to contribute to the upstream) is to create a separate file hierarchy and add your module to =PYTHONPATH=.
 
-For example, if you want to add an =awesomedatasource=, it could be:
-
-: custom_module
-: └── my
-:     └── awesomedatasource.py
-
-You can use all existing HPI modules in =awesomedatasource.py=, for example, =my.config=, or everything from =my.core=.
-
-But you can also *override* the builtin HPI modules too:
-
-: custom_reddit_overlay
-: └── my
-:     └── reddit.py
-
-# TODO confusing
-Now if you add =custom_reddit_overlay= *in the front* of ~PYTHONPATH~, all the downstream scripts using =my.reddit= will load it from =custom_reddit_overlay= instead.
-
-This could be useful to monkey patch some behaviours, or dynamically add some extra data sources -- anything that comes to your mind.
-
-I'll put up a better guide on this, in the meantime see [[https://packaging.python.org/guides/packaging-namespace-packages]["namespace packages"]] for more info.
-
-# TODO add example with overriding 'all'
+# hmmm seems to be no obvious way to link to a header in a separate file,
+# if you want this in both emacs and how github renders org mode
+# https://github.com/karlicoss/HPI/pull/160#issuecomment-817318076
+See [[file:MODULE_DESIGN.org#addingmodules][MODULE_DESIGN/adding modules]] for more information
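To make the "separate file hierarchy" concrete before the overlay demo files added below: a hypothetical =my/awesomedatasource.py= placed on =PYTHONPATH= can reuse =my.core= helpers just like a builtin module:

#+begin_src python
# custom_module/my/awesomedatasource.py -- hypothetical module on PYTHONPATH
from collections.abc import Iterator

from my.core import get_files

def inputs():
    # get_files expands a glob/directory into concrete paths
    return get_files('~/data/awesomedatasource/*.json')

def entries() -> Iterator[str]:
    for path in inputs():
        yield path.read_text()
#+end_src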
doc/overlays/install_packages.sh (new executable file, 4 lines)

#!/bin/bash
set -eux
pip3 install --user "$@" -e main/
pip3 install --user "$@" -e overlay/
doc/overlays/main/setup.py (new file, 17 lines)

from setuptools import setup, find_namespace_packages  # type: ignore


def main() -> None:
    pkgs = find_namespace_packages('src')
    pkg = min(pkgs)
    setup(
        name='hpi-main',
        zip_safe=False,
        packages=pkgs,
        package_dir={'': 'src'},
        package_data={pkg: ['py.typed']},
    )


if __name__ == '__main__':
    main()
doc/overlays/main/src/my/reddit.py (new file, 11 lines)

print(f'[main] {__name__} hello')


def upvotes() -> list[str]:
    return [
        'reddit upvote1',
        'reddit upvote2',
    ]


trigger_mypy_error: str = 123
doc/overlays/main/src/my/twitter/all.py (new file, 7 lines)

print(f'[main] {__name__} hello')

from .common import merge

def tweets() -> list[str]:
    from . import gdpr
    return merge(gdpr)
doc/overlays/main/src/my/twitter/common.py (new file, 11 lines)

print(f'[main] {__name__} hello')

from typing import Protocol

class Source(Protocol):
    def tweets(self) -> list[str]:
        ...

def merge(*sources: Source) -> list[str]:
    from itertools import chain
    return list(chain.from_iterable(src.tweets() for src in sources))
doc/overlays/main/src/my/twitter/gdpr.py (new file, 9 lines)

print(f'[main] {__name__} hello')

def tweets() -> list[str]:
    return [
        'gdpr tweet 1',
        'gdpr tweet 2',
    ]


trigger_mypy_error: str = 123
doc/overlays/overlay/setup.py (new file, 17 lines)

from setuptools import setup, find_namespace_packages  # type: ignore


def main() -> None:
    pkgs = find_namespace_packages('src')
    pkg = min(pkgs)
    setup(
        name='hpi-overlay',
        zip_safe=False,
        packages=pkgs,
        package_dir={'': 'src'},
        package_data={pkg: ['py.typed']},
    )


if __name__ == '__main__':
    main()
doc/overlays/overlay/src/my/twitter/all.py (new file, 8 lines)

print(f'[overlay] {__name__} hello')

from .common import merge

def tweets() -> list[str]:
    from . import gdpr
    from . import talon
    return merge(gdpr, talon)
doc/overlays/overlay/src/my/twitter/talon.py (new file, 9 lines)

print(f'[overlay] {__name__} hello')

def tweets() -> list[str]:
    return [
        'talon tweet 1',
        'talon tweet 2',
    ]


trigger_mypy_error: str = 123
doc/overlays/overlay2/setup.py (new file, 17 lines)

from setuptools import setup, find_namespace_packages  # type: ignore


def main() -> None:
    pkgs = find_namespace_packages('src')
    pkg = min(pkgs)
    setup(
        name='hpi-overlay2',
        zip_safe=False,
        packages=pkgs,
        package_dir={'': 'src'},
        package_data={pkg: ['py.typed']},
    )


if __name__ == '__main__':
    main()
doc/overlays/overlay2/src/my/twitter/__init__.py (new file, 13 lines)

print(f'[overlay2] {__name__} hello')

from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)

def hack_gdpr_module() -> None:
    from . import gdpr
    tweets_orig = gdpr.tweets
    def tweets_patched():
        return [t.replace('gdpr', 'GDPR') for t in tweets_orig()]
    gdpr.tweets = tweets_patched


hack_gdpr_module()
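A quick hypothetical check that the monkey patch above took effect: if overlay2 is installed (or sits first on `PYTHONPATH`), importing the namespace package runs `hack_gdpr_module`, so:

```python
# hypothetical check: the overlay2 __init__ above patches gdpr.tweets on import
import my.twitter.gdpr as gdpr

print(gdpr.tweets())  # expect ['GDPR tweet 1', 'GDPR tweet 2'] after patching
```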
doc/overlays/overlay3/setup.py (new file, 17 lines)

from setuptools import setup, find_namespace_packages  # type: ignore


def main() -> None:
    pkgs = find_namespace_packages('src')
    pkg = min(pkgs)
    setup(
        name='hpi-overlay3',
        zip_safe=False,
        packages=pkgs,
        package_dir={'': 'src'},
        package_data={pkg: ['py.typed']},
    )


if __name__ == '__main__':
    main()
doc/overlays/overlay3/src/my/py.typed (new empty file)

doc/overlays/overlay3/src/my/twitter/_hook.py (new file, 9 lines)

import importhook

@importhook.on_import('my.twitter.gdpr')
def on_import(gdpr):
    print("EXECUTING IMPORT HOOK!")
    tweets_orig = gdpr.tweets
    def tweets_patched():
        return [t.replace('gdpr', 'GDPR') for t in tweets_orig()]
    gdpr.tweets = tweets_patched
lint (deleted file, 105 lines)

#!/usr/bin/env python3
from pathlib import Path
from pprint import pprint
from itertools import chain
from subprocess import check_call, run, PIPE
import sys
import os
from typing import List, Optional, Iterable


def log(*args):
    print(*args, file=sys.stderr)

CI = 'CI' in os.environ

DIR = Path(__file__).absolute().parent

# hmm. I guess I need to check all subpackages separately
# otherwise pylint doesn't work and mypy doesn't discover everything

# TODO could reuse in readme??
# returns None if not a package
def package_name(p: Path) -> str:
    def mname(p: Path):
        nosuf = p.with_suffix('')
        return str(nosuf).replace('/', '.')

    has_init = (p.parent / '__init__.py').exists()
    if has_init:
        return mname(p.parent)
    else:
        return mname(p)

def subpackages(package: str) -> Iterable[str]:
    ppath = package.replace('.', '/')
    yield from sorted({
        package_name(p.relative_to(DIR)) for p in (DIR / ppath).rglob('*.py')
    })


# TODO meh.. think how to check _everything_ on CI
def core_modules() -> Iterable[str]:
    return [
        *subpackages('my.core'),
        'my.config',
        'my.cfg',
        'tests/misc.py',
        'tests/get_files.py',
        # 'tests/config.py', TODO hmm. unclear how to type check this module
    ]


def all_modules() -> Iterable[str]:
    yield from subpackages('my')
    yield from sorted(
        str(f.relative_to(DIR)) for f in (DIR / 'tests').rglob('*.py')
    )


def pylint():
    # TODO ugh. pylint still doesn't like checking my.config or my.books
    # only top level .py files seem ok??
    pass


def mypy(thing: str):
    is_package = Path(thing).suffix != '.py'
    cmd = [
        'mypy',
        '--color-output',  # TODO eh? doesn't work..
        *(['-p'] if is_package else []), thing,
    ]
    print(' '.join(cmd), file=sys.stderr)
    return run(cmd, stdout=PIPE, stderr=PIPE)


def mypy_all() -> Iterable[Exception]:
    from concurrent.futures import ThreadPoolExecutor

    pkgs = list(core_modules() if CI else all_modules())
    log(f"Checking {pkgs}")
    with ThreadPoolExecutor() as pool:
        for p, res in zip(pkgs, pool.map(mypy, pkgs)):
            ret = res.returncode
            if ret > 0:
                log(f'FAILED: {p}')
            else:
                log(f'OK: {p}')
            print(res.stdout.decode('utf8'))
            print(res.stderr.decode('utf8'), file=sys.stderr)
            try:
                res.check_returncode()
            except Exception as e:
                yield e


def main():
    errors = list(mypy_all())
    if len(errors) > 0:
        sys.exit(1)


if __name__ == '__main__':
    main()
misc/.flake8-karlicoss (new file, 37 lines)

[flake8]
ignore =
    ## these mess up vertical alignment
    E126  # continuation line over-indented
    E202  # whitespace before )
    E203  # whitespace before ':' (e.g. in dict)
    E221  # multiple spaces before operator
    E241  # multiple spaces after ,
    E251  # unexpected spaces after =
    E261  # 2 spaces before comment. I actually think it's fine so TODO enable back later (TODO or not? still alignment)
    E271  # multiple spaces after keyword
    E272  # multiple spaces before keyword
    ##
    E266  # 'too many leading # in the comment' -- this is just unnecessary pickiness, sometimes it's nice to format a comment
    E302  # 2 blank lines
    E501  # 'line too long' -- kinda annoying and the default 79 is shit anyway
    E702 E704  # multiple statements on one line -- messes with : ... type declarations + sometimes asserts
    E731  # suggests always using def instead of lambda

    E402  # FIXME module level import -- we want it later
    E252  # TODO later -- whitespace around equals?

# F541: f-string is missing placeholders -- perhaps too picky?

# F841 is pretty useful (unused variables). maybe worth making it an error on CI


# for imports: we might want to check these
# F401 good: unused imports
# E401: import order
# F811: redefinition of unused import
# todo from my.core import __NOT_HPI_MODULE__ this needs to be excluded from 'unused'
#

# as a reference:
# https://github.com/purarue/cookiecutter-template/blob/master/%7B%7Bcookiecutter.module_name%7D%7D/setup.cfg
# and this https://github.com/karlicoss/HPI/pull/151
# find ./my | entr flake8 --ignore=E402,E501,E741,W503,E266,E302,E305,E203,E261,E252,E251,E221,W291,E225,E303,E702,E202,F841,E731,E306,E127 E722,E231 my | grep -v __NOT_HPI_MODULE__
misc/check-twitter.sh (new executable file, 105 lines)

#!/bin/bash
# just a hacky script to check twitter module behaviour w.r.t. merging and normalising data
# this checks against orger output for @karlicoss data

set -eu

FILE="$1"

function check() {
    x="$1"
    if [[ $(rg --count "$x" "$FILE") != "1" ]]; then
        echo "FAILED! $x"
    fi
}

# only in old twitter archive data + test mentions
check '2010-03-24 Wed 10:02.*@GDRussia подлагивает'

# check that old twitter archive data replaces </>
check '2011-05-12 Thu 17:51.*set ><'
# this would probs be from twint or something?
check '2013-06-01 Sat 18:48.*<inputfile'


# https://twitter.com/karlicoss/status/363703394201894912
# the quoted acc was suspended and the tweet is only present in archives?
check '2013-08-03 Sat 16:50.*удивительно, как в одном человеке'
# similar
# https://twitter.com/karlicoss/status/712186968382291968
check '2016-03-22 Tue 07:59.*Очень хорошо'


# RTs are missing from twint
# https://twitter.com/karlicoss/status/925968541458759681
check '2017-11-02 Thu 06:11.*RT @dabeaz: A short esoteric Python'


# twint stopped updating at this point
# https://twitter.com/karlicoss/status/1321488603499954177
check '2020-10-28 Wed 16:26.*@jborichevskiy I feel like for me'

# https://twitter.com/karlicoss/status/808769414984331267
# archive doesn't expand links in 'text' by default, check we're doing that in HPI
# NOTE: hmm twint adds an extra whitespace here before the link?
check '2016-12-13 Tue 20:23.*TIL:.*pypi.python.org/pypi/coloredlogs'


# https://twitter.com/karlicoss/status/472151454044917761
# archive isn't expanding images by default
check '2014-05-29 Thu 23:04.*Выколол сингулярность.*pic.twitter.com/M6XRN1n7KW'


# https://twitter.com/karlicoss/status/565648186816335873
# for some reason missing from twint??
check '2015-02-11 Wed 23:06.*separation confirmed'


# mentions were missing from twint at some point, check they are still present..
# https://twitter.com/karlicoss/status/1228225797283966976
check '2020-02-14 Fri 07:53.*thomas536.*looks like a very cool blog'


# just a random timestamp check. RT then reply shortly after -- good check.
# https://twitter.com/karlicoss/status/341512959694082049
check '2013-06-03 Mon 11:13.*RT @osenin'
# https://twitter.com/karlicoss/status/341513515749736448
check '2013-06-03 Mon 11:15.*@osenin'


# def was tweeted at 00:00 MSK, so a good timezone check
# id 550396141914058752
check '2014-12-31 Wed 21:00.*2015 заебал'

# for some reason is gone, and wasn't in twidump/twint
# https://twitter.com/karlicoss/status/1393312193945513985
check '2021-05-14 Fri 21:08.*RT @SNunoPerez: Me explaining Rage.*'


# make sure there is a single occurrence (hence, correct tzs)
check 'A short esoteric Python'
# https://twitter.com/karlicoss/status/1499174823272099842
check 'It would be a really good time for countries'
# https://twitter.com/karlicoss/status/1530303537476947968
check 'so there is clearly a pattern'


# https://twitter.com/karlicoss/status/1488942357303238673
# check URL expansion for Talon
check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode'


# https://twitter.com/karlicoss/status/349168455964033024
# check link which is only in twidump
check '2013-06-24 Mon 14:13.*RT @gorod095: Нашел недавно в букинист'

# some older statuses, useful to test that all input data is properly detected
check '2010-04-01 Thu 11:34'
check '2010-06-28 Mon 23:42'

# https://twitter.com/karlicoss/status/22916704915
# this one is weird, just disappeared for no reason between 2021-12-22 and 2022-03-15
# and the account isn't suspended etc. maybe it was temporary private or something?
check '2010-09-03 Fri 20:11.*Джобс'

# TODO check likes as well
misc/check_legacy_init_py.py (new executable file, 84 lines)

#!/usr/bin/env python3
# NOTE: prerequisites for this test:
# fbmessengerexport installed
# config configured (can set it to '' though)

from pathlib import Path
from subprocess import Popen, run, PIPE
from tempfile import TemporaryDirectory


import logzero  # type: ignore[import]
logger = logzero.logger


MSG = 'my.fbmessenger is DEPRECATED'

def expect(*cmd: str, should_warn: bool=True) -> None:
    res = run(cmd, stderr=PIPE)
    errb = res.stderr; assert errb is not None
    err = errb.decode('utf8')
    if should_warn:
        assert MSG in err, res
    else:
        assert MSG not in err, res
    assert res.returncode == 0, res


def _check(*cmd: str, should_warn: bool, run_as_cmd: bool=True) -> None:
    expecter = lambda *cmd: expect(*cmd, should_warn=should_warn)
    if cmd[0] == '-c':
        [_, code] = cmd
        if run_as_cmd:
            expecter('python3', '-c', code)
        # check as a script
        with TemporaryDirectory() as tdir:
            script = Path(tdir) / 'script.py'
            script.write_text(code)
            expecter('python3', str(script))
    else:
        expecter('python3', *cmd)
    what = 'warns' if should_warn else ' '  # meh
    logger.info(f"PASSED: {what}: {repr(cmd)}")


def check_warn(*cmd: str, **kwargs) -> None:
    _check(*cmd, should_warn=True, **kwargs)

def check_ok(*cmd: str, **kwargs) -> None:
    _check(*cmd, should_warn=False, **kwargs)


# NOTE these three are actually sort of OK, they are allowed when it's a proper namespace package with all.py etc.
# but more likely it means legacy behaviour or just misusing the package?
# worst case it's just a warning I guess
check_warn('-c', 'from my import fbmessenger')
check_warn('-c', 'import my.fbmessenger')
check_warn('-c', 'from my.fbmessenger import *')

# note: dump_chat_history should really be deprecated, but it's a quick way to check we actually fell back to fbmessenger/export.py
# NOTE: this is the most common legacy usecase
check_warn('-c', 'from my.fbmessenger import messages, dump_chat_history')
check_warn('-m', 'my.core', 'query' , 'my.fbmessenger.messages', '-o', 'pprint', '--limit=10')
check_warn('-m', 'my.core', 'doctor', 'my.fbmessenger')
check_warn('-m', 'my.core', 'module', 'requires', 'my.fbmessenger')

# todo kinda annoying it doesn't work when executed as -c (but does as script!)
# presumably because doesn't have proper line number information?
# either way, it's a bit of a corner case, the script behaviour is more important
check_ok ('-c', 'from my.fbmessenger import export', run_as_cmd=False)
check_ok ('-c', 'import my.fbmessenger.export')
check_ok ('-c', 'from my.fbmessenger.export import *')
check_ok ('-c', 'from my.fbmessenger.export import messages, dump_chat_history')
check_ok ('-m', 'my.core', 'query' , 'my.fbmessenger.export.messages', '-o', 'pprint', '--limit=10')
check_ok ('-m', 'my.core', 'doctor', 'my.fbmessenger.export')
check_ok ('-m', 'my.core', 'module', 'requires', 'my.fbmessenger.export')

# NOTE:
# to check that overlays work, run something like
# PYTHONPATH=misc/overlay_for_init_py_test/ hpi query my.fbmessenger.all.messages -s -o pprint --limit=10
# you should see 1, 2, 3 from mixin.py
# TODO would be nice to add an automated test for this

# TODO with reddit, currently these don't work properly at all
# only when imported from scripts etc?
misc/completion/README.md (new file, 37 lines)

To enable completion for the `hpi` command:

If you don't want to use the files here, you can do this when you launch your shell like:

```bash
eval "$(_HPI_COMPLETE=bash_source hpi)"  # in ~/.bashrc
eval "$(_HPI_COMPLETE=zsh_source hpi)"  # in ~/.zshrc
eval "$(_HPI_COMPLETE=fish_source hpi)"  # in ~/.config/fish/config.fish
```

That is slightly slower since it's generating the completion code on the fly -- see [click docs](https://click.palletsprojects.com/en/8.0.x/shell-completion/#enabling-completion) for more info

To use the generated completion files in this repository, you need to source the file in `./bash`, `./zsh`, or `./fish` depending on your shell.

If you don't have HPI cloned locally, after installing `HPI` you can generate the file yourself using one of the commands above. For example, for `bash`: `_HPI_COMPLETE=bash_source hpi > ~/.config/hpi_bash_completion`, and then source it like `source ~/.config/hpi_bash_completion`

### bash

Put `source /path/to/hpi/repo/misc/completion/bash/_hpi` in your `~/.bashrc`

### zsh

You can either source the file:

`source /path/to/hpi/repo/misc/completion/zsh/_hpi`

..or add the directory to your `fpath` to load it lazily:

`fpath=("/path/to/hpi/repo/misc/completion/zsh/" "${fpath[@]}")` (Note: the directory, not the script `_hpi`)

If your zsh configuration doesn't automatically run `compinit`, after modifying your `fpath` you should:

`autoload -Uz compinit && compinit`

### fish

`cp ./fish/hpi.fish ~/.config/fish/completions/`, then restart your shell
misc/completion/bash/_hpi (new file, 29 lines)

_hpi_completion() {
    local IFS=$'\n'
    local response

    response=$(env COMP_WORDS="${COMP_WORDS[*]}" COMP_CWORD=$COMP_CWORD _HPI_COMPLETE=bash_complete $1)

    for completion in $response; do
        IFS=',' read type value <<< "$completion"

        if [[ $type == 'dir' ]]; then
            COMPREPLY=()
            compopt -o dirnames
        elif [[ $type == 'file' ]]; then
            COMPREPLY=()
            compopt -o default
        elif [[ $type == 'plain' ]]; then
            COMPREPLY+=($value)
        fi
    done

    return 0
}

_hpi_completion_setup() {
    complete -o nosort -F _hpi_completion hpi
}

_hpi_completion_setup;
misc/completion/fish/hpi.fish (new file, 18 lines)

function _hpi_completion;
    set -l response (env _HPI_COMPLETE=fish_complete COMP_WORDS=(commandline -cp) COMP_CWORD=(commandline -t) hpi);

    for completion in $response;
        set -l metadata (string split "," $completion);

        if test $metadata[1] = "dir";
            __fish_complete_directories $metadata[2];
        else if test $metadata[1] = "file";
            __fish_complete_path $metadata[2];
        else if test $metadata[1] = "plain";
            echo $metadata[2];
        end;
    end;
end;

complete --no-files --command hpi --arguments "(_hpi_completion)";
misc/completion/generate (new executable file, 12 lines)

#!/usr/bin/env bash
# assumes HPI is already installed
# generates the completion files

cd "$(realpath "$(dirname "${BASH_SOURCE[0]}")")"

mkdir -p ./bash ./zsh ./fish

_HPI_COMPLETE=fish_source hpi >./fish/hpi.fish
# underscores to allow these directories to be lazily loaded
_HPI_COMPLETE=zsh_source hpi >./zsh/_hpi
_HPI_COMPLETE=bash_source hpi >./bash/_hpi
misc/completion/zsh/_hpi (new file, 41 lines)

#compdef hpi

_hpi_completion() {
    local -a completions
    local -a completions_with_descriptions
    local -a response
    (( ! $+commands[hpi] )) && return 1

    response=("${(@f)$(env COMP_WORDS="${words[*]}" COMP_CWORD=$((CURRENT-1)) _HPI_COMPLETE=zsh_complete hpi)}")

    for type key descr in ${response}; do
        if [[ "$type" == "plain" ]]; then
            if [[ "$descr" == "_" ]]; then
                completions+=("$key")
            else
                completions_with_descriptions+=("$key":"$descr")
            fi
        elif [[ "$type" == "dir" ]]; then
            _path_files -/
        elif [[ "$type" == "file" ]]; then
            _path_files -f
        fi
    done

    if [ -n "$completions_with_descriptions" ]; then
        _describe -V unsorted completions_with_descriptions -U
    fi

    if [ -n "$completions" ]; then
        compadd -U -V unsorted -a completions
    fi
}

if [[ $zsh_eval_context[-1] == loadautofunc ]]; then
    # autoload from fpath, call function directly
    _hpi_completion "$@"
else
    # eval/source/. command, register function for later
    compdef _hpi_completion hpi
fi
misc/overlay_for_init_py_test/my/fbmessenger/all.py (new file, 7 lines)

from my.fbmessenger import export
from . import mixin


def messages():
    yield from mixin.messages()
    yield from export.messages()
misc/overlay_for_init_py_test/my/fbmessenger/mixin.py (new file, 2 lines)

def messages():
    yield from ['1', '2', '3']
misc/repl.py (new executable file, 63 lines)

#!/usr/bin/env python3

# M-x run-python (raise window so it doesn't hide)
# ?? python-shell-send-defun
# C-c C-r python-shell-send-region
# shit, it isn't autoscrolling??
# maybe add hook
# (setq comint-move-point-for-output t) ;; https://github.com/jorgenschaefer/elpy/issues/1641#issuecomment-528355368
#
from itertools import islice, groupby
from more_itertools import ilen, bucket

from importlib import reload
import sys

# todo function to reload hpi?
todel = [m for m in sys.modules if m.startswith('my.')]
for m in todel: del sys.modules[m]

import my
# todo add to doc?
from my.core import get_files


import my.bluemaestro as M

from my.config import bluemaestro as BC
# BC.export_path = get_files(BC.export_path)[:40]

# print(list(M.measurements())[:10])

M.fill_influxdb()

ffwf

#
from my.config import rescuetime as RC

# todo ugh. doesn't work??
# from my.core.cachew import disable_cachew
# disable_cachew()
# RC.export_path = get_files(RC.export_path)[-1:]

import my.rescuetime as M
# print(len(list(M.entries())))
M.fill_influxdb()

print(M.dataframe())

e = M.entries()
e = list(islice(e, 0, 10))


key = lambda x: 'ERROR' if isinstance(x, Exception) else x.activity

# TODO move to errors module? how to preserve type signature?
# b = bucket(e, key=key)
# for k in b:
#     g = b[k]  # meh? should maybe sort
#     print(k, ilen(g))

from collections import Counter
print(Counter(key(x) for x in e))
my/arbtt.py (new file, 116 lines)

'''
[[https://github.com/nomeata/arbtt#arbtt-the-automatic-rule-based-time-tracker][Arbtt]] time tracking
'''

from __future__ import annotations

REQUIRES = ['ijson', 'cffi']
# NOTE likely also needs libyajl2 from apt or elsewhere?


from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from pathlib import Path


def inputs() -> Sequence[Path]:
    try:
        from my.config import arbtt as user_config
    except ImportError:
        from my.core.warnings import low
        low("Couldn't find 'arbtt' config section, falling back to the default capture.log (usually in HOME dir). Add 'arbtt' section with logfiles = '' to suppress this warning.")
        return []
    else:
        from .core import get_files
        return get_files(user_config.logfiles)


from my.core import Json, PathIsh, datetime_aware
from my.core.compat import fromisoformat


@dataclass
class Entry:
    '''
    For the format reference, see
    https://github.com/nomeata/arbtt/blob/e120ad20b9b8e753fbeb02041720b7b5b271ab20/src/DumpFormat.hs#L39-L46
    '''

    json: Json
    # inactive time -- in ms

    @property
    def dt(self) -> datetime_aware:
        # contains utc already
        # TODO after python>=3.11, could just use fromisoformat
        ds = self.json['date']
        elen = 27
        lds = len(ds)
        if lds < elen:
            # ugh. sometimes contains less than 6 decimal points
            ds = ds[:-1] + '0' * (elen - lds) + 'Z'
        elif lds > elen:
            # and sometimes more...
            ds = ds[:elen - 1] + 'Z'

        return fromisoformat(ds)

    @property
    def active(self) -> str | None:
        # NOTE: WIP, might change this in the future...
        ait = (w for w in self.json['windows'] if w['active'])
        a = next(ait, None)
        if a is None:
            return None
        a2 = next(ait, None)
        assert a2 is None, a2  # hopefully only one can be active at a time?

        p = a['program']
        t = a['title']
        # todo perhaps best to keep it structured, e.g. for influx
        return f'{p}: {t}'


# todo multiple threads? not sure if would help much... (+ need to find offset somehow?)
def entries() -> Iterable[Entry]:
    inps = list(inputs())

    base: list[PathIsh] = ['arbtt-dump', '--format=json']

    cmds: list[list[PathIsh]]
    if len(inps) == 0:
        cmds = [base]  # rely on default
    else:
        # otherwise, 'merge' them
        cmds = [[*base, '--logfile', f] for f in inps]

    from subprocess import PIPE, Popen

    import ijson.backends.yajl2_cffi as ijson  # type: ignore
    for cmd in cmds:
        with Popen(cmd, stdout=PIPE) as p:
            out = p.stdout; assert out is not None
            for json in ijson.items(out, 'item'):
                yield Entry(json=json)


def fill_influxdb() -> None:
    from .core.freezer import Freezer
    from .core.influxdb import magic_fill
    freezer = Freezer(Entry)
    fit = (freezer.freeze(e) for e in entries())
    # TODO crap, influxdb doesn't like None https://github.com/influxdata/influxdb/issues/7722
    # wonder if can check it statically/warn?
    fit = (f for f in fit if f.active is not None)

    # todo could tag with computer name or something...
    # todo should probably also tag with 'program'?
    magic_fill(fit, name=f'{entries.__module__}:{entries.__name__}')


from .core import Stats, stat


def stats() -> Stats:
    return stat(entries)
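A quick smoke test for the module above (assuming `arbtt-dump` is on `PATH` and the `arbtt` config section exists; otherwise it falls back to the default capture log as described in `inputs`):

```python
from itertools import islice

import my.arbtt as arbtt

# print the first few entries; .dt and .active come from the Entry class above
for e in islice(arbtt.entries(), 5):
    print(e.dt, e.active)
```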
261
my/bluemaestro.py
Normal file
261
my/bluemaestro.py
Normal file
|
@ -0,0 +1,261 @@
"""
[[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor
"""

from __future__ import annotations

# todo most of it belongs to DAL... but considering so few people use it I didn't bother for now
import re
import sqlite3
from abc import abstractmethod
from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Protocol

import pytz

from my.core import (
    Paths,
    Res,
    Stats,
    get_files,
    make_logger,
    stat,
    unwrap,
)
from my.core.cachew import mcachew
from my.core.pandas import DataFrameT, as_dataframe
from my.core.sqlite import sqlite_connect_immutable


class config(Protocol):
    @property
    @abstractmethod
    def export_path(self) -> Paths:
        raise NotImplementedError

    @property
    def tz(self) -> pytz.BaseTzInfo:
        # fixme: later, rely on the timezone provider
        # NOTE: the timezone should be set with respect to the export date!!!
        return pytz.timezone('Europe/London')
        # TODO when I change tz, check the diff


def make_config() -> config:
    from my.config import bluemaestro as user_config

    class combined_config(user_config, config): ...

    return combined_config()


logger = make_logger(__name__)


def inputs() -> Sequence[Path]:
    cfg = make_config()
    return get_files(cfg.export_path)


Celsius = float
Percent = float
mBar = float


@dataclass
class Measurement:
    dt: datetime  # todo aware/naive
    temp: Celsius
    humidity: Percent
    pressure: mBar
    dewpoint: Celsius


def is_bad_table(name: str) -> bool:
    # todo hmm would be nice to have a hook that can patch any module up to
    delegate = getattr(config, 'is_bad_table', None)
    return False if delegate is None else delegate(name)


@mcachew(depends_on=inputs)
def measurements() -> Iterable[Res[Measurement]]:
    cfg = make_config()
    tz = cfg.tz

    # todo ideally this would be via arguments... but needs to be lazy
    paths = inputs()
    total = len(paths)
    width = len(str(total))

    last: datetime | None = None

    # tables are immutable, so can save on processing..
    processed_tables: set[str] = set()
    for idx, path in enumerate(paths):
        logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
        tot = 0
        new = 0
        # todo assert increasing timestamp?
        with sqlite_connect_immutable(path) as db:
            db_dt: datetime | None = None
            try:
                datas = db.execute(
                    f'SELECT "{path.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index'
                )
                oldfmt = True
                [(db_dts,)] = db.execute('SELECT last_download FROM info')
                if db_dts == 'N/A':
                    # ??? happens for 20180923-20180928
                    continue
                if db_dts.endswith(':'):
                    db_dts += '00'  # wtf.. happens on some day
                db_dt = tz.localize(datetime.strptime(db_dts, '%Y-%m-%d %H:%M:%S'))
            except sqlite3.OperationalError:
                # Right, this looks really bad.
                # The device doesn't have internal time & what it does is:
                # 1. every X seconds, record a datapoint, store it in the internal memory
                # 2. on sync, take the phone's datetime ('now') and then ASSIGN the timestamps to the collected data
                #    as now, now - X, now - 2X, etc
                #
                # that basically means that for example, hourly timestamps are completely useless? because their error is about 1h
                # yep, confirmed on some historic exports. seriously, what the fuck???
                #
                # The device _does_ have an internal clock, but it's basically set to 0 every time you update settings
                # So, e.g. if, say, at 17:15 you set the interval to 3600, the 'real' timestamps would be
                # 17:15, 18:15, 19:15, etc
                # But depending on when you export, you might get
                # 17:35, 18:35, 19:35; or 17:55, 18:55, 19:55, etc
                # basically all you guaranteed is that the 'correct' interval is within the frequency
                # it doesn't seem to keep the reference time in the database
                #
                # UPD: fucking hell, so you can set the reference date in the settings (calcReferenceUnix field in meta db)
                # but it's not set by default.

                log_tables = [c[0] for c in db.execute('SELECT name FROM sqlite_sequence WHERE name LIKE "%_log"')]
                log_tables = [t for t in log_tables if t not in processed_tables]
                processed_tables |= set(log_tables)

                # todo use later?
                frequencies = [list(db.execute(f'SELECT interval from {t.replace("_log", "_meta")}'))[0][0] for t in log_tables]  # noqa: RUF015

                # todo could just filter out the older datapoints?? dunno.

                # eh. a bit horrible, but seems the easiest way to do it?
                # note: for some reason everything in the new table multiplied by 10
                query = ' UNION '.join(
                    f'SELECT "{t}" AS name, unix, tempReadings / 10.0, humiReadings / 10.0, pressReadings / 10.0, dewpReadings / 10.0 FROM {t}'
                    for t in log_tables
                )
                if len(log_tables) > 0:  # ugh. otherwise end up with syntax error..
                    query = f'SELECT * FROM ({query}) ORDER BY name, unix'
                datas = db.execute(query)
                oldfmt = False
                db_dt = None

            for (name, tsc, temp, hum, pres, dewp) in datas:
                if is_bad_table(name):
                    continue

                # note: bluemaestro keeps local datetime
                if oldfmt:
                    tss = tsc.replace('Juli', 'Jul').replace('Aug.', 'Aug')
                    dt = datetime.strptime(tss, '%Y-%b-%d %H:%M')
                    dt = tz.localize(dt)
                    assert db_dt is not None
                else:
                    # todo cache?
                    m = re.search(r'_(\d+)_', name)
                    assert m is not None
                    export_ts = int(m.group(1))
                    db_dt = datetime.fromtimestamp(export_ts / 1000, tz=tz)
                    dt = datetime.fromtimestamp(tsc / 1000, tz=tz)

                ## sanity checks (todo make defensive/configurable?)
                # not sure how that happens.. but basically they'd better be excluded
                lower = timedelta(days=6000 / 24)  # ugh some time ago I only did it once in an hour.. in theory can detect from meta?
                upper = timedelta(days=10)  # kinda arbitrary
                if not (db_dt - lower < dt < db_dt + upper):
                    # todo could be more defensive??
                    yield RuntimeError('timestamp too far out', path, name, db_dt, dt)
                    continue

                # err.. sometimes my values are just interleaved with these for no apparent reason???
                if (temp, hum, pres, dewp) == (-144.1, 100.0, 1152.5, -144.1):
                    yield RuntimeError('the weird sensor bug')
                    continue

                assert -60 <= temp <= 60, (path, dt, temp)
                ##

                tot += 1
                if last is not None and last >= dt:
                    continue
                # todo for performance, pass 'last' to sqlite instead?
                last = dt
                new += 1
                p = Measurement(
                    dt=dt,
                    temp=temp,
                    pressure=pres,
                    humidity=hum,
                    dewpoint=dewp,
                )
                yield p
        logger.debug(f'{path}: new {new}/{tot}')

    # logger.info('total items: %d', len(merged))
    # for k, v in merged.items():
    #     # TODO shit. quite a few of them have varying values... how is that freaking possible????
    #     # most of them are within 0.5 degree though... so just ignore?
    #     if isinstance(v, set) and len(v) > 1:
    #         print(k, v)
    # for k, v in merged.items():
    #     yield Point(dt=k, temp=v)  # meh?


def stats() -> Stats:
    return stat(measurements)


def dataframe() -> DataFrameT:
    """
    %matplotlib gtk
    from my.bluemaestro import dataframe
    dataframe().plot()
    """
    df = as_dataframe(measurements(), schema=Measurement)
    # todo not sure how it would handle mixed timezones??
    # todo hmm, not sure about setting the index
    return df.set_index('dt')


def fill_influxdb() -> None:
    from my.core import influxdb

    influxdb.fill(measurements(), measurement=__name__)


def check() -> None:
    temps = list(measurements())
    latest = temps[:-2]

    prev = unwrap(latest[-2]).dt
    last = unwrap(latest[-1]).dt

    # todo stat should expose a dataclass?
    # TODO ugh. might need to warn about points past 'now'??
    # the default shouldn't allow points in the future...
    #
    # TODO also needs to be filtered out on processing, should be rejected on the basis of export date?

    POINTS_STORED = 6000  # on device?
    FREQ_SEC = 60
    SECS_STORED = POINTS_STORED * FREQ_SEC
    HOURS_STORED = POINTS_STORED / (60 * 60 / FREQ_SEC)  # around 4 days
    NOW = datetime.now()
    assert NOW - last < timedelta(hours=HOURS_STORED / 2), f'old backup! {last}'

    assert last - prev < timedelta(minutes=3), f'bad interval! {last - prev}'
    single = (last - prev).seconds
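The exception branch's comment block above is the crux of this module: timestamps are assigned backwards from the phone's sync time, so every reading can be off by up to the full logging interval. A small illustration of that arithmetic, with assumed numbers that are not from any real export:

from datetime import datetime, timedelta

# the device records n points, and on sync stamps them now, now - X, now - 2X, ...
def assigned_timestamps(now: datetime, interval_sec: int, n: int) -> list[datetime]:
    return [now - timedelta(seconds=interval_sec * i) for i in range(n)][::-1]

sync = datetime(2020, 1, 1, 17, 55)
print(assigned_timestamps(sync, interval_sec=3600, n=3))
# readings actually taken at 15:15, 16:15, 17:15 come out as 15:55, 16:55, 17:55 --
# the error is bounded only by the logging interval itself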
@@ -1,167 +0,0 @@
#!/usr/bin/python3
"""
[[https://bluemaestro.com/products/product-details/bluetooth-environmental-monitor-and-logger][Bluemaestro]] temperature/humidity/pressure monitor
"""

# todo eh, most of it belongs to DAL

from datetime import datetime, timedelta
from pathlib import Path
import re
import sqlite3
from typing import Iterable, NamedTuple, Sequence, Set, Optional

from ..core.common import mcachew, LazyLogger, get_files
from ..core.cachew import cache_dir
from my.config import bluemaestro as config


logger = LazyLogger('bluemaestro', level='debug')


def inputs() -> Sequence[Path]:
    return get_files(config.export_path)


class Measurement(NamedTuple):
    dt: datetime
    temp    : float  # Celsius
    humidity: float  # percent
    pressure: float  # mBar
    dewpoint: float  # Celsius


# fixme: later, rely on the timezone provider
# NOTE: the timezone should be set with respect to the export date!!!
import pytz  # type: ignore
tz = pytz.timezone('Europe/London')
# TODO when I change tz, check the diff


@mcachew(cache_path=cache_dir() / 'bluemaestro.cache')
def measurements(dbs=inputs()) -> Iterable[Measurement]:
    last: Optional[datetime] = None

    # tables are immutable, so can save on processing..
    processed_tables: Set[str] = set()
    for f in dbs:
        logger.debug('processing %s', f)
        tot = 0
        new = 0
        # todo assert increasing timestamp?
        with sqlite3.connect(f'file:{f}?immutable=1', uri=True) as db:
            try:
                datas = db.execute(f'SELECT "{f.name}" as name, Time, Temperature, Humidity, Pressure, Dewpoint FROM data ORDER BY log_index')
                oldfmt = True
            except sqlite3.OperationalError:
                # Right, this looks really bad.
                # The device doesn't have internal time & what it does is:
                # 1. every X seconds, record a datapoint, store it in the internal memory
                # 2. on sync, take the phone's datetime ('now') and then ASSIGN the timestamps to the collected data
                #    as now, now - X, now - 2X, etc
                #
                # that basically means that for example, hourly timestamps are completely useless? because their error is about 1h
                # yep, confirmed on some historic exports. seriously, what the fuck???
                #
                # The device _does_ have an internal clock, but it's basically set to 0 every time you update settings
                # So, e.g. if, say, at 17:15 you set the interval to 3600, the 'real' timestamps would be
                # 17:15, 18:15, 19:15, etc
                # But depending on when you export, you might get
                # 17:35, 18:35, 19:35; or 17:55, 18:55, 19:55, etc
                # basically all you guaranteed is that the 'correct' interval is within the frequency
                # it doesn't seem to keep the reference time in the database
                #
                # UPD: fucking hell, so you can set the reference date in the settings (calcReferenceUnix field in meta db)
                # but it's not set by default.

                log_tables = [c[0] for c in db.execute('SELECT name FROM sqlite_sequence WHERE name LIKE "%_log"')]
                log_tables = [t for t in log_tables if t not in processed_tables]
                processed_tables |= set(log_tables)

                # todo use later?
                frequencies = [list(db.execute(f'SELECT interval from {t.replace("_log", "_meta")}'))[0][0] for t in log_tables]

                # todo could just filter out the older datapoints?? dunno.

                # eh. a bit horrible, but seems the easiest way to do it?
                # note: for some reason everything in the new table multiplied by 10
                query = ' UNION '.join(
                    f'SELECT "{t}" AS name, unix, tempReadings / 10.0, humiReadings / 10.0, pressReadings / 10.0, dewpReadings / 10.0 FROM {t}'
                    for t in log_tables
                )
                if len(log_tables) > 0:  # ugh. otherwise end up with syntax error..
                    query = f'SELECT * FROM ({query}) ORDER BY name, unix'
                datas = db.execute(query)
                oldfmt = False

            for i, (name, tsc, temp, hum, pres, dewp) in enumerate(datas):
                # note: bluemaestro keeps local datetime
                if oldfmt:
                    tss = tsc.replace('Juli', 'Jul').replace('Aug.', 'Aug')
                    dt = datetime.strptime(tss, '%Y-%b-%d %H:%M')
                    dt = tz.localize(dt)
                else:
                    m = re.search(r'_(\d+)_', name)
                    assert m is not None
                    export_ts = int(m.group(1))
                    edt = datetime.fromtimestamp(export_ts / 1000, tz=tz)

                    dt = datetime.fromtimestamp(tsc / 1000, tz=tz)

                ## sanity checks (todo make defensive/configurable?)
                # not sure how that happens.. but basically they'd better be excluded
                assert dt.year >= 2015, (f, name, dt)
                assert -60 <= temp <= 60, (f, dt, temp)
                ##

                tot += 1
                if last is not None and last >= dt:
                    continue
                # todo for performance, pass 'last' to sqlite instead?
                last = dt
                new += 1
                p = Measurement(
                    dt=dt,
                    temp=temp,
                    pressure=pres,
                    humidity=hum,
                    dewpoint=dewp,
                )
                yield p
        logger.debug('%s: new %d/%d', f, new, tot)
    # logger.info('total items: %d', len(merged))
    # TODO assert frequency?
    # for k, v in merged.items():
    #     # TODO shit. quite a few of them have varying values... how is that freaking possible????
    #     # most of them are within 0.5 degree though... so just ignore?
    #     if isinstance(v, set) and len(v) > 1:
    #         print(k, v)
    # for k, v in merged.items():
    #     yield Point(dt=k, temp=v)  # meh?

from ..core import stat, Stats
def stats() -> Stats:
    return stat(measurements)


from ..core.pandas import DataFrameT, check_dataframe as cdf
@cdf
def dataframe() -> DataFrameT:
    """
    %matplotlib gtk
    from my.bluemaestro import dataframe
    dataframe().plot()
    """
    # todo not sure why x axis time ticks are weird... df[:6269] works, whereas df[:6269] breaks...
    # either way, plot is not the best representation for the temperature I guess.. maybe also use bokeh?
    import pandas as pd  # type: ignore
    df = pd.DataFrame(
        (p._asdict() for p in measurements()),
        # todo meh. otherwise fails on empty inputs...
        columns=list(Measurement._fields),
    )
    # todo not sure how it would handle mixed timezones??
    return df.set_index('dt')

# todo test against an older db?
@@ -1,29 +0,0 @@
#!/usr/bin/python3
import logging
from datetime import timedelta, datetime

from my.bluemaestro import measurements, logger

# TODO move this to backup checker?
def main() -> None:
    temps = list(measurements())
    latest = temps[:-2]

    prev = latest[-2].dt
    last = latest[-1].dt

    POINTS_STORED = 6000
    FREQ_SEC = 60
    SECS_STORED = POINTS_STORED * FREQ_SEC
    HOURS_STORED = POINTS_STORED / (60 * 60 / FREQ_SEC)  # around 4 days
    NOW = datetime.now()
    assert NOW - last < timedelta(hours=HOURS_STORED / 2), f'old backup! {last}'

    assert last - prev < timedelta(minutes=3), f'bad interval! {last - prev}'
    single = (last - prev).seconds


if __name__ == '__main__':
    main()
56
my/body/blood.py
Executable file → Normal file

@@ -1,43 +1,43 @@
 """
-Blood tracking
+Blood tracking (manual org-mode entries)
 """

+from __future__ import annotations
+
+from collections.abc import Iterable
 from datetime import datetime
-from typing import Iterable, NamedTuple, Optional
+from typing import NamedTuple

-from ..core.common import listify
-from ..core.error import Res, echain
-from ..core.orgmode import parse_org_datetime, one_table
-
-import pandas as pd # type: ignore
 import orgparse
+import pandas as pd
+
+from my.config import blood as config  # type: ignore[attr-defined]
+
-from my.config import blood as config
+from ..core.error import Res
+from ..core.orgmode import one_table, parse_org_datetime


 class Entry(NamedTuple):
     dt: datetime

-    ketones      : Optional[float]=None
+    ketones      : float | None=None
-    glucose      : Optional[float]=None
+    glucose      : float | None=None

-    vitamin_d    : Optional[float]=None
+    vitamin_d    : float | None=None
-    vitamin_b12  : Optional[float]=None
+    vitamin_b12  : float | None=None

-    hdl          : Optional[float]=None
+    hdl          : float | None=None
-    ldl          : Optional[float]=None
+    ldl          : float | None=None
-    triglycerides: Optional[float]=None
+    triglycerides: float | None=None

-    source       : Optional[str]=None
+    source       : str | None=None
-    extra        : Optional[str]=None
+    extra        : str | None=None


 Result = Res[Entry]


-def try_float(s: str) -> Optional[float]:
+def try_float(s: str) -> float | None:
     l = s.split()
     if len(l) == 0:
         return None
@@ -47,9 +47,11 @@ def try_float(s: str) -> Optional[float]:
         return None
     return float(x)


 def glucose_ketones_data() -> Iterable[Result]:
     o = orgparse.load(config.blood_log)
-    tbl = one_table(o)
+    [n] = [x for x in o if x.heading == 'glucose/ketones']
+    tbl = one_table(n)
     # todo some sort of sql-like interface for org tables might be ideal?
     for l in tbl.as_dicts:
         kets = l['ket']
@@ -74,8 +76,9 @@ def glucose_ketones_data() -> Iterable[Result]:


 def blood_tests_data() -> Iterable[Result]:
-    o = orgparse.load(config.blood_tests_log)
+    o = orgparse.load(config.blood_log)
-    tbl = one_table(o)
+    [n] = [x for x in o if x.heading == 'blood tests']
+    tbl = one_table(n)
     for d in tbl.as_dicts:
         try:
             dt = parse_org_datetime(d['datetime'])
@@ -103,6 +106,7 @@ def blood_tests_data() -> Iterable[Result]:

 def data() -> Iterable[Result]:
     from itertools import chain
+
     from ..core.error import sort_res_by
     datas = chain(glucose_ketones_data(), blood_tests_data())
     return sort_res_by(datas, key=lambda e: e.dt)
@@ -128,11 +132,3 @@ def stats():
 def test():
     print(dataframe())
     assert len(dataframe()) > 10
-
-
-def main():
-    print(data())
-
-
-if __name__ == '__main__':
-    main()
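The change above switches from running one_table over the whole org file to first selecting a node by heading, so each table is looked up explicitly. A small standalone sketch of that selection with orgparse (a toy org file; one_table itself is HPI-specific and not shown):

import orgparse

o = orgparse.loads('''
* glucose/ketones
| datetime | ket | glu |
* blood tests
| datetime | hdl |
''')
# destructuring forces exactly one matching heading -- a missing or duplicated
# heading fails loudly instead of silently picking the wrong table
[n] = [x for x in o if x.heading == 'glucose/ketones']
print(n.heading)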
17
my/body/exercise/all.py
Normal file

@@ -0,0 +1,17 @@
'''
Combined exercise data
'''
from ...core.pandas import DataFrameT, check_dataframe


@check_dataframe
def dataframe() -> DataFrameT:
    # this should be somehow more flexible...
    import pandas as pd

    from ...endomondo import dataframe as EDF
    from ...runnerup import dataframe as RDF
    return pd.concat([
        EDF(),
        RDF(),
    ])
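dataframe() above simply concatenates the per-source frames, so the column sets don't have to match across sources. A standalone sketch of that behavior, with toy frames rather than the real schemas:

import pandas as pd

edf = pd.DataFrame([{'dt': '2020-01-01', 'sport': 'running', 'kcal': 300}])
rdf = pd.DataFrame([{'dt': '2020-01-02', 'sport': 'cycling'}])
# mismatched columns are filled with NaN rather than raising
print(pd.concat([edf, rdf]))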
@@ -1,8 +1,7 @@
 '''
-Cardio data, filtered from Endomondo and inferred from other data sources
+Cardio data, filtered from various data sources
 '''
-from ...core.pandas import DataFrameT, check_dataframe as cdf
+from ...core.pandas import DataFrameT, check_dataframe


 CARDIO = {
     'Running',
@@ -21,12 +20,12 @@ NOT_CARDIO = {
 }


-@cdf
+@check_dataframe
-def endomondo_cardio() -> DataFrameT:
+def dataframe() -> DataFrameT:
     assert len(CARDIO.intersection(NOT_CARDIO)) == 0, (CARDIO, NOT_CARDIO)

-    from ...endomondo import dataframe as EDF
+    from .all import dataframe as DF
-    df = EDF()
+    df = DF()

     # not sure...
     # df = df[df['heart_rate_avg'].notna()]
@@ -42,7 +41,3 @@ def endomondo_cardio() -> DataFrameT:
     df = df[is_cardio | neither]

     return df
-
-
-def dataframe() -> DataFrameT:
-    return endomondo_cardio()
@@ -1,20 +1,22 @@
 '''
-My cross trainer exercise data, arbitrated between differen sources (mainly, Endomondo and various manual plaintext notes)
+My cross trainer exercise data, arbitrated from different sources (mainly, Endomondo and manual text notes)

 This is probably too specific to my needs, so later I will move it away to a personal 'layer'.
 For now it's worth keeping it here as an example and perhaps utility functions might be useful for other HPI modules.
 '''

-from datetime import datetime, timedelta
+from __future__ import annotations
-from typing import Optional

-from ...core.pandas import DataFrameT, check_dataframe as cdf
+from datetime import datetime, timedelta
-from ...core.orgmode import collect, Table, parse_org_datetime, TypedTable
+import pytz

 from my.config import exercise as config

+from ...core.orgmode import Table, TypedTable, collect, parse_org_datetime
+from ...core.pandas import DataFrameT
+from ...core.pandas import check_dataframe as cdf
+
-import pytz
 # FIXME how to attach it properly?
 tz = pytz.timezone('Europe/London')

@@ -78,7 +80,7 @@ def cross_trainer_manual_dataframe() -> DataFrameT:
     '''
     Only manual org-mode entries
     '''
-    import pandas as pd # type: ignore[import]
+    import pandas as pd
     df = pd.DataFrame(cross_trainer_data())
     return df

@@ -91,7 +93,7 @@ def dataframe() -> DataFrameT:
     '''
     Attaches manually logged data (which Endomondo can't capture) and attaches it to Endomondo
     '''
-    import pandas as pd # type: ignore[import]
+    import pandas as pd

     from ...endomondo import dataframe as EDF
     edf = EDF()
@@ -105,7 +107,7 @@ def dataframe() -> DataFrameT:
     rows = []
     idxs = []  # type: ignore[var-annotated]
     NO_ENDOMONDO = 'no endomondo matches'
-    for i, row in mdf.iterrows():
+    for _i, row in mdf.iterrows():
         rd = row.to_dict()
         mdate = row['date']
         if pd.isna(mdate):
@@ -114,7 +116,7 @@ def dataframe() -> DataFrameT:
             rows.append(rd)  # presumably has an error set
             continue

-        idx: Optional[int]
+        idx: int | None
         close = edf[edf['start_time'].apply(lambda t: pd_date_diff(t, mdate)).abs() < _DELTA]
         if len(close) == 0:
             idx = None
@@ -146,7 +148,7 @@ def dataframe() -> DataFrameT:
     # todo careful about 'how'? we need it to preserve the errors
     # maybe pd.merge is better suited for this??
     df = edf.join(mdf, how='outer', rsuffix='_manual')
-    # todo reindex? so we dont' have Nan leftovers
+    # todo reindex? so we don't have Nan leftovers

     # todo set date anyway? maybe just squeeze into the index??
     noendo = df['error'] == NO_ENDOMONDO
@@ -163,7 +165,9 @@ def dataframe() -> DataFrameT:
     # TODO wtf?? where is speed coming from??


-from ...core import stat, Stats
+from ...core import Stats, stat
+
+
 def stats() -> Stats:
     return stat(cross_trainer_data)
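The outer join with rsuffix='_manual' above is what lets error rows survive the merge: rows without a match on either side are kept rather than dropped. A toy sketch of that behavior:

import pandas as pd

edf = pd.DataFrame({'duration': [3600]}, index=[0])
mdf = pd.DataFrame({'duration': [3500], 'note': ['manual']}, index=[1])
# how='outer' keeps both unmatched rows; rsuffix disambiguates the shared column
print(edf.join(mdf, how='outer', rsuffix='_manual'))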
@@ -1,5 +1,6 @@
-from ...core import stat, Stats
+from ...core import Stats, stat
-from ...core.pandas import DataFrameT, check_dataframe as cdf
+from ...core.pandas import DataFrameT
+from ...core.pandas import check_dataframe as cdf


 class Combine:
@@ -7,8 +8,8 @@ class Combine:
         self.modules = modules

     @cdf
-    def dataframe(self, with_temperature: bool=True) -> DataFrameT:
+    def dataframe(self, *, with_temperature: bool=True) -> DataFrameT:
-        import pandas as pd # type: ignore
+        import pandas as pd
         # todo include 'source'?
         df = pd.concat([m.dataframe() for m in self.modules])

@@ -17,6 +18,13 @@ class Combine:
             bdf = BM.dataframe()
             temp = bdf['temp']

+            # sort index and drop nans, otherwise indexing with [start: end] gonna complain
+            temp = pd.Series(
+                temp.values,
+                index=pd.to_datetime(temp.index, utc=True)
+            ).sort_index()
+            temp = temp.loc[temp.index.dropna()]
+
             def calc_avg_temperature(row):
                 start = row['sleep_start']
                 end = row['sleep_end']
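The block added above normalizes the temperature index before slicing; pandas refuses value-based slices on an unsorted DatetimeIndex when the exact keys are absent, which is the situation calc_avg_temperature runs into. A minimal reproduction with toy data:

import pandas as pd

temp = pd.Series([17.0, 19.0, 18.0], index=pd.to_datetime(['2020-01-03', '2020-01-01', '2020-01-02']))
try:
    temp.loc['2020-01-01 12:00':'2020-01-02 12:00']
except KeyError as e:
    print('unsorted index:', e)
# after sorting, the same slice works
print(temp.sort_index().loc['2020-01-01 12:00':'2020-01-02 12:00'])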
@@ -1,7 +1,6 @@
-from ... import jawbone
+from ... import emfit, jawbone
-from ... import emfit

 from .common import Combine

 _combined = Combine([
     jawbone,
     emfit,
@@ -2,21 +2,29 @@
 Weight data (manually logged)
 '''

+from collections.abc import Iterator
+from dataclasses import dataclass
 from datetime import datetime
-from typing import NamedTuple, Iterator
+from typing import Any

-from ..core import LazyLogger
+from my import orgmode
-from ..core.error import Res, set_error_datetime, extract_error_datetime
+from my.core import make_logger
+from my.core.error import Res, extract_error_datetime, set_error_datetime

-from .. import orgmode
+config = Any

-from my.config import weight as config
-

-log = LazyLogger('my.body.weight')
+def make_config() -> config:
+    from my.config import weight as user_config  # type: ignore[attr-defined]
+
+    return user_config()
+
+
-class Entry(NamedTuple):
+log = make_logger(__name__)
+
+
+@dataclass
+class Entry:
     dt: datetime
     value: float
     # TODO comment??
@@ -26,6 +34,8 @@ Result = Res[Entry]


 def from_orgmode() -> Iterator[Result]:
+    cfg = make_config()
+
     orgs = orgmode.query()
     for o in orgmode.query().all():
         if 'weight' not in o.tags:
@@ -46,7 +56,8 @@ def from_orgmode() -> Iterator[Result]:
             yield e
             continue
         # FIXME use timezone provider
-        created = config.default_timezone.localize(created)
+        created = cfg.default_timezone.localize(created)
+        assert created is not None  # ??? somehow mypy wasn't happy?
         yield Entry(
             dt=created,
             value=w,
@@ -55,22 +66,24 @@ def from_orgmode() -> Iterator[Result]:


 def make_dataframe(data: Iterator[Result]):
-    import pandas as pd # type: ignore
+    import pandas as pd

     def it():
         for e in data:
             if isinstance(e, Exception):
                 dt = extract_error_datetime(e)
                 yield {
-                    'dt' : dt,
+                    'dt': dt,
                     'error': str(e),
                 }
             else:
                 yield {
-                    'dt' : e.dt,
+                    'dt': e.dt,
                     'weight': e.value,
                 }

     df = pd.DataFrame(it())
-    df.set_index('dt', inplace=True)
+    df = df.set_index('dt')
     # TODO not sure about UTC??
     df.index = pd.to_datetime(df.index, utc=True)
     return df
@@ -80,6 +93,7 @@ def dataframe():
     entries = from_orgmode()
     return make_dataframe(entries)


 # TODO move to a submodule? e.g. my.body.weight.orgmode?
 # so there could be more sources
 # not sure about my.body thing though
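The point of the make_config() indirection introduced above is that the user's config is only touched at call time, so merely importing the module no longer fails for users without a 'weight' section. A toy illustration of the pattern:

# module import stays cheap and safe; config resolution happens (and can fail)
# only when data is actually requested
def make_config():
    from my.config import weight as user_config  # type: ignore[attr-defined]  # resolved lazily
    return user_config()

try:
    cfg = make_config()
except ImportError:
    print('config missing -- but this module still imported fine')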
@@ -1,7 +1,6 @@
-from ..core import warnings
+from my.core import warnings

 warnings.high('my.books.kobo is deprecated! Please use my.kobo instead!')

-from ..core.util import __NOT_HPI_MODULE__
+from my.core.util import __NOT_HPI_MODULE__
+from my.kobo import *
-from ..kobo import *
54
my/browser/active_browser.py
Normal file

@@ -0,0 +1,54 @@
"""
Parses active browser history by backing it up with [[http://github.com/purarue/sqlite_backup][sqlite_backup]]
"""

REQUIRES = ["browserexport", "sqlite_backup"]

from dataclasses import dataclass

from my.config import browser as user_config
from my.core import Paths


@dataclass
class config(user_config.active_browser):
    # paths to sqlite database files which you use actively
    # to read from. For example:
    # from browserexport.browsers.all import Firefox
    # export_path = Firefox.locate_database()
    export_path: Paths


from collections.abc import Iterator, Sequence
from pathlib import Path

from browserexport.merge import Visit, read_visits
from sqlite_backup import sqlite_backup

from my.core import Stats, get_files, make_logger

logger = make_logger(__name__)

from .common import _patch_browserexport_logs

_patch_browserexport_logs(logger.level)


def inputs() -> Sequence[Path]:
    return get_files(config.export_path)


def history() -> Iterator[Visit]:
    for ad in inputs():
        conn = sqlite_backup(ad)
        assert conn is not None
        try:
            yield from read_visits(conn)
        finally:
            conn.close()


def stats() -> Stats:
    from my.core import stat

    return {**stat(history)}
35
my/browser/all.py
Normal file

@@ -0,0 +1,35 @@
from collections.abc import Iterator

from browserexport.merge import Visit, merge_visits

from my.core import Stats
from my.core.source import import_source

src_export = import_source(module_name="my.browser.export")
src_active = import_source(module_name="my.browser.active_browser")


@src_export
def _visits_export() -> Iterator[Visit]:
    from . import export
    return export.history()


@src_active
def _visits_active() -> Iterator[Visit]:
    from . import active_browser
    return active_browser.history()


# NOTE: you can comment out the sources you don't need
def history() -> Iterator[Visit]:
    yield from merge_visits([
        _visits_active(),
        _visits_export(),
    ])


def stats() -> Stats:
    from my.core import stat

    return {**stat(history)}
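import_source above presumably wraps each source so that a missing or unconfigured module degrades to an empty iterator instead of an exception; a hand-rolled equivalent of that kind of guard, for illustration only:

from collections.abc import Iterator

def _visits_active_fallback() -> Iterator:
    # roughly what a source guard needs to do: swallow the import failure
    # and contribute nothing, so history() still works with a partial setup
    try:
        from my.browser import active_browser  # may be unconfigured
    except ImportError as e:
        print(f'skipping source: {e}')
        return
    yield from active_browser.history()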
8
my/browser/common.py
Normal file

@@ -0,0 +1,8 @@
from my.core.util import __NOT_HPI_MODULE__


def _patch_browserexport_logs(level: int):
    # grab the computed level (respects LOGGING_LEVEL_ prefixes) and set it on the browserexport logger
    from browserexport.log import setup as setup_browserexport_logger

    setup_browserexport_logger(level)
48
my/browser/export.py
Normal file

@@ -0,0 +1,48 @@
"""
Parses browser history using [[http://github.com/purarue/browserexport][browserexport]]
"""

REQUIRES = ["browserexport"]

from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from pathlib import Path

from browserexport.merge import Visit, read_and_merge

from my.core import (
    Paths,
    Stats,
    get_files,
    make_logger,
    stat,
)
from my.core.cachew import mcachew

from .common import _patch_browserexport_logs

import my.config  # isort: skip


@dataclass
class config(my.config.browser.export):
    # path[s]/glob to your backed up browser history sqlite files
    export_path: Paths


logger = make_logger(__name__)
_patch_browserexport_logs(logger.level)


# all of my backed up databases
def inputs() -> Sequence[Path]:
    return get_files(config.export_path)


@mcachew(depends_on=inputs, logger=logger)
def history() -> Iterator[Visit]:
    yield from read_and_merge(inputs())


def stats() -> Stats:
    return {**stat(history)}
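Usage is just iterating history(); a minimal sketch assuming my.config.browser.export.export_path has been set up:

from my.browser import export

# visits are read from all backed up databases and merged by browserexport's read_and_merge
for visit in export.history():
    print(visit)
    break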
157
my/bumble/android.py
Normal file

@@ -0,0 +1,157 @@
"""
Bumble data from Android app database (in =/data/data/com.bumble.app/databases/ChatComDatabase=)
"""
from __future__ import annotations

from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

from more_itertools import unique_everseen

from my.core import Paths, get_files

from my.config import bumble as user_config  # isort: skip


@dataclass
class config(user_config.android):
    # paths[s]/glob to the exported sqlite databases
    export_path: Paths


def inputs() -> Sequence[Path]:
    return get_files(config.export_path)


@dataclass(unsafe_hash=True)
class Person:
    user_id: str
    user_name: str


# todo not sure about order of fields...
@dataclass
class _BaseMessage:
    id: str
    created: datetime
    is_incoming: bool
    text: str


@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
    conversation_id: str
    reply_to_id: str | None


@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
    person: Person
    reply_to: Message | None


import json
import sqlite3
from typing import Union

from my.core.compat import assert_never

from ..core import Res
from ..core.sqlite import select, sqlite_connect_immutable

EntitiesRes = Res[Union[Person, _Message]]


def _entities() -> Iterator[EntitiesRes]:
    for db_file in inputs():
        with sqlite_connect_immutable(db_file) as db:
            yield from _handle_db(db)


def _handle_db(db: sqlite3.Connection) -> Iterator[EntitiesRes]:
    # todo hmm not sure
    # on the one hand kinda nice to use dataset..
    # on the other, it's somewhat of a complication, and
    # would be nice to have something type-directed for sql queries though
    # e.g. with typeddict or something, so the number of parameter to the sql query matches?
    for (user_id, user_name) in select(
        ('user_id', 'user_name'),
        'FROM conversation_info',
        db=db,
    ):
        yield Person(
            user_id=user_id,
            user_name=user_name,
        )

    # note: has sender_name, but it's always None
    for (id, conversation_id, created, is_incoming, payload_type, payload, reply_to_id) in select(
        ('id', 'conversation_id', 'created_timestamp', 'is_incoming', 'payload_type', 'payload', 'reply_to_id'),
        'FROM message ORDER BY created_timestamp',
        db=db
    ):
        try:
            key = {'TEXT': 'text', 'QUESTION_GAME': 'text', 'IMAGE': 'url', 'GIF': 'url', 'AUDIO': 'url', 'VIDEO': 'url'}[payload_type]
            text = json.loads(payload)[key]
            yield _Message(
                id=id,
                # TODO not sure if utc??
                created=datetime.fromtimestamp(created / 1000),
                is_incoming=bool(is_incoming),
                text=text,
                conversation_id=conversation_id,
                reply_to_id=reply_to_id,
            )
        except Exception as e:
            yield e


def _key(r: EntitiesRes):
    if isinstance(r, _Message):
        if '/hidden?' in r.text:
            # ugh. seems that image URLs change all the time in the db?
            # can't access them without login anyway
            # so use a different key for such messages
            # todo maybe normalize text instead? since it's gonna always trigger diffs down the line
            return (r.id, r.created)
    return r


_UNKNOWN_PERSON = "UNKNOWN_PERSON"


def messages() -> Iterator[Res[Message]]:
    id2person: dict[str, Person] = {}
    id2msg: dict[str, Message] = {}
    for x in unique_everseen(_entities(), key=_key):
        if isinstance(x, Exception):
            yield x
            continue
        if isinstance(x, Person):
            id2person[x.user_id] = x
            continue
        if isinstance(x, _Message):
            reply_to_id = x.reply_to_id
            # hmm seems that sometimes there are messages with no corresponding conversation_info?
            # possibly if user never clicked on conversation before..
            person = id2person.get(x.conversation_id)
            if person is None:
                person = Person(user_id=x.conversation_id, user_name=_UNKNOWN_PERSON)
            try:
                reply_to = None if reply_to_id is None else id2msg[reply_to_id]
            except Exception as e:
                yield e
                continue
            m = Message(
                id=x.id,
                created=x.created,
                is_incoming=x.is_incoming,
                text=x.text,
                person=person,
                reply_to=reply_to,
            )
            id2msg[m.id] = m
            yield m
            continue
        assert_never(x)
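_key above exists because image messages reappear across exports with a fresh signed URL, so messages() deduplicates on a stable key instead of the raw record. A standalone illustration with unique_everseen and made-up data:

from more_itertools import unique_everseen

msgs = [
    {'id': '1', 'text': 'https://cdn.example/img/hidden?sig=aaa'},
    {'id': '1', 'text': 'https://cdn.example/img/hidden?sig=bbb'},  # same message, rotated url
]
# keying on the stable id collapses the two copies into one
print(list(unique_everseen(msgs, key=lambda m: m['id'])))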
@@ -9,19 +9,21 @@ from datetime import date, datetime, timedelta
 from functools import lru_cache
 from typing import Union

-from ..core.time import zone_to_countrycode
+from my.core import Stats
+from my.core.time import zone_to_countrycode


 @lru_cache(1)
 def _calendar():
     from workalendar.registry import registry  # type: ignore

     # todo switch to using time.tz.main once _get_tz stabilizes?
     from ..time.tz import via_location as LTZ
     # TODO would be nice to do it dynamically depending on the past timezones...
-    tz = LTZ._get_tz(datetime.now())
+    tz = LTZ.get_tz(datetime.now())
     assert tz is not None
+    zone = tz.zone; assert zone is not None
-    code = zone_to_countrycode(tz.zone)
+    code = zone_to_countrycode(zone)
     Cal = registry.get_calendars()[code]
     return Cal()

@@ -46,7 +48,6 @@ def is_workday(d: DateIsh) -> bool:
     return not is_holiday(d)


-from ..core.common import Stats
 def stats() -> Stats:
     # meh, but not sure what would be a better test?
     res = {}
34
my/cfg.py

@@ -1,33 +1,7 @@
-"""
-A helper to allow configuring the modules dynamically.
-
-Usage:
-
-from my.cfg import config
-
-After that, you can set config attributes:
-
-class user_config:
-    export_path = '/path/to/twitter/exports'
-config.twitter = user_config
-"""
-# todo why do we bring this into scope? don't remember..
 import my.config as config

-
-from pathlib import Path
-from typing import Union
-def set_repo(name: str, repo: Union[Path, str]) -> None:
-    from .core.init import assign_module
-    from . common import import_from
-
-    r = Path(repo)
-    module = import_from(r.parent, name)
-    assign_module('my.config.repos', name, module)
-
-
-# TODO set_repo is still useful, but perhaps move this thing away to core?
-
-# TODO ok, I need to get rid of this, better to rely on regular imports
-
 from .core import __NOT_HPI_MODULE__
+from .core import warnings as W
+
+# still used in Promnesia, maybe in dashboard?
+W.high("DEPRECATED! Please import my.config directly instead.")
78
my/codeforces.py
Normal file

@@ -0,0 +1,78 @@
import json
from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from datetime import datetime, timezone
from functools import cached_property
from pathlib import Path

from my.config import codeforces as config  # type: ignore[attr-defined]
from my.core import Res, datetime_aware, get_files


def inputs() -> Sequence[Path]:
    return get_files(config.export_path)


ContestId = int


@dataclass
class Contest:
    contest_id: ContestId
    when: datetime_aware
    name: str


@dataclass
class Competition:
    contest: Contest
    old_rating: int
    new_rating: int

    @cached_property
    def when(self) -> datetime_aware:
        return self.contest.when


# todo not sure if parser is the best name? hmm
class Parser:
    def __init__(self, *, inputs: Sequence[Path]) -> None:
        self.inputs = inputs
        self.contests: dict[ContestId, Contest] = {}

    def _parse_allcontests(self, p: Path) -> Iterator[Contest]:
        j = json.loads(p.read_text())
        for c in j['result']:
            yield Contest(
                contest_id=c['id'],
                when=datetime.fromtimestamp(c['startTimeSeconds'], tz=timezone.utc),
                name=c['name'],
            )

    def _parse_competitions(self, p: Path) -> Iterator[Competition]:
        j = json.loads(p.read_text())
        for c in j['result']:
            contest_id = c['contestId']
            contest = self.contests[contest_id]
            yield Competition(
                contest=contest,
                old_rating=c['oldRating'],
                new_rating=c['newRating'],
            )

    def parse(self) -> Iterator[Res[Competition]]:
        for path in inputs():
            if 'allcontests' in path.name:
                # these contain information about all CF contests along with useful metadata
                for contest in self._parse_allcontests(path):
                    # TODO some method to assert on mismatch if it exists? not sure
                    self.contests[contest.contest_id] = contest
            elif 'codeforces' in path.name:
                # these contain only contests the user participated in
                yield from self._parse_competitions(path)
            else:
                raise RuntimeError(f"shouldn't happen: {path.name}")


def data() -> Iterator[Res[Competition]]:
    return Parser(inputs=inputs()).parse()
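A small usage sketch for the new module, not part of the commit itself: computing per-contest rating deltas from data():

from my.codeforces import data

for c in data():
    if isinstance(c, Exception):  # Res values can carry errors
        continue
    print(c.when, c.contest.name, c.new_rating - c.old_rating)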
@@ -1,114 +0,0 @@
#!/usr/bin/env python3
from my.config import codeforces as config

from datetime import datetime
from typing import NamedTuple
from pathlib import Path
import json
from typing import Dict, Iterator, Any

from ..common import cproperty, get_files
from ..error import Res, unwrap
from ..core.konsume import zoom, ignore, wrap

from kython import fget
# TODO remove
from kython.kdatetime import as_utc


Cid = int

class Contest(NamedTuple):
    cid: Cid
    when: datetime

    @classmethod
    def make(cls, j) -> 'Contest':
        return cls(
            cid=j['id'],
            when=as_utc(j['startTimeSeconds']),
        )

Cmap = Dict[Cid, Contest]


def get_contests() -> Cmap:
    last = max(get_files(config.export_path, 'allcontests*.json'))
    j = json.loads(last.read_text())
    d = {}
    for c in j['result']:
        cc = Contest.make(c)
        d[cc.cid] = cc
    return d


class Competition(NamedTuple):
    contest_id: Cid
    contest: str
    cmap: Cmap

    @cproperty
    def uid(self) -> Cid:
        return self.contest_id

    def __hash__(self):
        return hash(self.contest_id)

    @cproperty
    def when(self) -> datetime:
        return self.cmap[self.uid].when

    @cproperty
    def summary(self) -> str:
        return f'participated in {self.contest}'  # TODO

    @classmethod
    def make(cls, cmap, json) -> Iterator[Res['Competition']]:
        # TODO try here??
        contest_id = json['contestId'].zoom().value
        contest = json['contestName'].zoom().value
        yield cls(
            contest_id=contest_id,
            contest=contest,
            cmap=cmap,
        )
        # TODO ytry???
        ignore(json, 'rank', 'oldRating', 'newRating')


def iter_data() -> Iterator[Res[Competition]]:
    cmap = get_contests()
    last = max(get_files(config.export_path, 'codeforces*.json'))

    with wrap(json.loads(last.read_text())) as j:
        j['status'].ignore()
        res = j['result'].zoom()

        for c in list(res):  # TODO maybe we want 'iter' method??
            ignore(c, 'handle', 'ratingUpdateTimeSeconds')
            yield from Competition.make(cmap=cmap, json=c)
            c.consume()
            # TODO maybe if they are all empty, no need to consume??


def get_data():
    return list(sorted(iter_data(), key=fget(Competition.when)))


def test():
    assert len(get_data()) > 10


def main():
    for d in iter_data():
        try:
            d = unwrap(d)
        except Exception as e:
            print(f'ERROR! {d}')
        else:
            print(f'{d.when}: {d.summary}')


if __name__ == '__main__':
    main()
@@ -2,61 +2,87 @@
 Git commits data for repositories on your filesystem
 """

-from pathlib import Path
+from __future__ import annotations
+
+REQUIRES = [
+    'gitpython',
+]
+
+import shutil
+from collections.abc import Iterator, Sequence
+from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import List, NamedTuple, Optional, Dict, Any, Iterator, Set
+from pathlib import Path
+from typing import Optional, cast

-from ..common import PathIsh, LazyLogger, mcachew
-from my.config import commits as config
+from my.core import LazyLogger, PathIsh, make_config
+from my.core.cachew import cache_dir, mcachew
+from my.core.warnings import high

-# pip3 install gitpython
-import git # type: ignore
-from git.repo.fun import is_git_dir, find_worktree_git_dir # type: ignore
+from my.config import commits as user_config  # isort: skip


-log = LazyLogger('my.commits', level='info')
+@dataclass
+class commits_cfg(user_config):
+    roots: Sequence[PathIsh] = field(default_factory=list)
+    emails: Sequence[str] | None = None
+    names: Sequence[str] | None = None


-_things = {
-    *config.emails,
-    *config.names,
-}
+# experiment to make it lazy?
+# would be nice to have a nicer syntax for it... maybe make_config could return a 'lazy' object
+def config() -> commits_cfg:
+    res = make_config(commits_cfg)
+    if res.emails is None and res.names is None:
+        # todo error policy? throw/warn/ignore
+        high("Set either 'emails' or 'names', otherwise you'll get no commits")
+    return res
+
+
+##########################
+
+import git
+from git.repo.fun import is_git_dir
+
+log = LazyLogger(__name__, level='info')


-def by_me(c) -> bool:
+def by_me(c: git.objects.commit.Commit) -> bool:
     actor = c.author
-    if actor.email in config.emails:
+    if actor.email in (config().emails or ()):
         return True
-    if actor.name in config.names:
+    if actor.name in (config().names or ()):
         return True
-    aa = f"{actor.email} {actor.name}"
-    for thing in _things:
-        if thing in aa:
-            # TODO this is probably useless
-            raise RuntimeError("WARNING!!!", actor, c, c.repo)
     return False


-class Commit(NamedTuple):
-    commited_dt: datetime
+@dataclass
+class Commit:
+    committed_dt: datetime
     authored_dt: datetime
     message: str
-    repo: str # TODO put canonical name here straightaway??
+    repo: str  # TODO put canonical name here straight away??
     sha: str
-    ref: Optional[str]=None
+    ref: Optional[str] = None
     # TODO filter so they are authored by me

     @property
     def dt(self) -> datetime:
-        return self.commited_dt
+        return self.committed_dt
+
+    # for backwards compatibility, was misspelled previously
+    @property
+    def commited_dt(self) -> datetime:
+        high("DEPRECATED! Please replace 'commited_dt' with 'committed_dt' (two 't's instead of one)")
+        return self.committed_dt


 # TODO not sure, maybe a better idea to move it to timeline?
-def fix_datetime(dt) -> datetime:
+def fix_datetime(dt: datetime) -> datetime:
     # git module got it's own tzinfo object.. and it's pretty weird
     tz = dt.tzinfo
-    assert tz._name == 'fixed'
-    offset = tz._offset
+    assert tz is not None, dt
+    assert getattr(tz, '_name') == 'fixed'
+    offset = getattr(tz, '_offset')
     ntz = timezone(offset)
     return dt.replace(tzinfo=ntz)
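Note on the backwards-compatibility shim above: existing callers of the misspelled `commited_dt` keep working but get a warning. A quick sketch of the behaviour, using the `Commit` dataclass from this diff (the field values below are made up for illustration):

from datetime import datetime, timezone

c = Commit(
    committed_dt=datetime(2020, 1, 1, tzinfo=timezone.utc),
    authored_dt=datetime(2020, 1, 1, tzinfo=timezone.utc),
    message='initial commit',
    repo='/path/to/repo',
    sha='0000000000000000000000000000000000000000',
)
# old spelling still resolves, but emits a deprecation warning via my.core.warnings.high
assert c.commited_dt == c.committed_dt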
@@ -69,7 +95,7 @@ def _git_root(git_dir: PathIsh) -> Path:
         return gd # must be bare


-def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Commit]:
+def _repo_commits_aux(gr: git.Repo, rev: str, emitted: set[str]) -> Iterator[Commit]:
     # without path might not handle pull heads properly
     for c in gr.iter_commits(rev=rev):
         if not by_me(c):
@@ -79,12 +105,15 @@ def _repo_commits_aux(gr: git.Repo, rev: str, emitted: set[str]) -> Iterator[Commit]:
             continue
         emitted.add(sha)

-        repo = str(_git_root(gr.git_dir))
+        # todo figure out how to handle Union[str, PathLike[Any]].. should it be part of PathIsh?
+        repo = str(_git_root(gr.git_dir))  # type: ignore[arg-type]

         yield Commit(
-            commited_dt=fix_datetime(c.committed_datetime),
+            committed_dt=fix_datetime(c.committed_datetime),
             authored_dt=fix_datetime(c.authored_datetime),
-            message=c.message.strip(),
+            # hmm no idea why is it typed with Union[str, bytes]??
+            # https://github.com/gitpython-developers/GitPython/blob/1746b971387eccfc6fb4e34d3c334079bbb14b2e/git/objects/commit.py#L214
+            message=cast(str, c.message).strip(),
             repo=repo,
             sha=sha,
             ref=rev,
@@ -93,7 +122,7 @@ def _repo_commits_aux(gr: git.Repo, rev: str, emitted: set[str]) -> Iterator[Commit]:

 def repo_commits(repo: PathIsh):
     gr = git.Repo(str(repo))
-    emitted: Set[str] = set()
+    emitted: set[str] = set()
     for r in gr.references:
         yield from _repo_commits_aux(gr=gr, rev=r.path, emitted=emitted)

@@ -112,70 +141,81 @@ def canonical_name(repo: Path) -> str:
     # pass # TODO


-# TODO could reuse in clustergit?..
-def git_repos_in(roots: List[Path]) -> List[Path]:
+def _fd_path() -> str:
+    # todo move it to core
+    fd_path: str | None = shutil.which("fdfind") or shutil.which("fd-find") or shutil.which("fd")
+    if fd_path is None:
+        high("my.coding.commits requires 'fd' to be installed, See https://github.com/sharkdp/fd#installation")
+    assert fd_path is not None
+    return fd_path
+
+
+def git_repos_in(roots: list[Path]) -> list[Path]:
     from subprocess import check_output
     outputs = check_output([
-        'fdfind',
+        _fd_path(),
         # '--follow', # right, not so sure about follow... make configurable?
         '--hidden',
+        '--no-ignore',  # otherwise doesn't go inside .git directory (from fd v9)
         '--full-path',
         '--type', 'f',
         '/HEAD', # judging by is_git_dir, it should always be here..
         *roots,
     ]).decode('utf8').splitlines()
-    candidates = set(Path(o).resolve().absolute().parent for o in outputs)
+
+    candidates = {Path(o).resolve().absolute().parent for o in outputs}

     # exclude stuff within .git dirs (can happen for submodules?)
     candidates = {c for c in candidates if '.git' not in c.parts[:-1]}

     candidates = {c for c in candidates if is_git_dir(c)}

-    repos = list(sorted(map(_git_root, candidates)))
+    repos = sorted(map(_git_root, candidates))
     return repos


-def repos():
-    return git_repos_in(config.roots)
+def repos() -> list[Path]:
+    return git_repos_in(list(map(Path, config().roots)))


-def _hashf(_repos: List[Path]):
-    # TODO maybe use smth from git library? ugh..
-    res = []
-    for r in _repos:
-        # TODO just use anything except index? ugh.
-        for pp in {
-            '.git/FETCH_HEAD',
-            '.git/HEAD',
-            'FETCH_HEAD', # bare
-            'HEAD', # bare
-        }:
-            ff = r / pp
-            if ff.exists():
-                updated = ff.stat().st_mtime
-                break
-        else:
-            raise RuntimeError(r)
-        res.append((r, updated))
-    return res
+# returns modification time for an index to use as hash function
+def _repo_depends_on(_repo: Path) -> int:
+    for pp in [
+        ".git/FETCH_HEAD",
+        ".git/HEAD",
+        "FETCH_HEAD",  # bare
+        "HEAD",  # bare
+    ]:
+        ff = _repo / pp
+        if ff.exists():
+            return int(ff.stat().st_mtime)
+    raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}")
+
+
+def _commits(_repos: list[Path]) -> Iterator[Commit]:
+    for r in _repos:
+        yield from _cached_commits(r)
+
+
+def _cached_commits_path(p: Path) -> str:
+    p = cache_dir() / 'my.coding.commits:_cached_commits' / str(p.absolute()).strip("/")
+    p.mkdir(parents=True, exist_ok=True)
+    return str(p)

-# TODO per-repo cache?
-# TODO set default cache path?
-# TODO got similar issue as in photos with a helper method.. figure it out
-@mcachew(hashf=_hashf, logger=log)
-def _commits(_repos) -> Iterator[Commit]:
-    for r in _repos:
-        log.info('processing %s', r)
-        yield from repo_commits(r)
+
+# per-repo commits, to use cachew
+@mcachew(
+    depends_on=_repo_depends_on,
+    logger=log,
+    cache_path=_cached_commits_path,
+)
+def _cached_commits(repo: Path) -> Iterator[Commit]:
+    log.debug('processing %s', repo)
+    yield from repo_commits(repo)


 def commits() -> Iterator[Commit]:
     return _commits(repos())


-def print_all():
-    for c in commits():
-        print(c)
-

 # TODO enforce read only? although it doesn't touch index
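With the new `commits_cfg` dataclass above, a minimal user config for this module might look like the following sketch (all values are placeholders):

# in your personal my/config.py
class commits:
    roots = ['~/code']             # directories to scan for git repositories
    emails = ['you@example.com']   # a commit counts as yours if the author email matches...
    names = ['Your Name']          # ...or if the author name matches

After that, `my.coding.commits.commits()` yields deduplicated `Commit` objects across all discovered repositories, cached per repo via cachew.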
@@ -1,9 +1,12 @@
-import warnings
+from typing import TYPE_CHECKING

-warnings.warn('my.coding.github is deprecated! Please use my.github.all instead!')
+from my.core import warnings
+
+warnings.high('my.coding.github is deprecated! Please use my.github.all instead!')
 # todo why aren't DeprecationWarning shown by default??

-from ..github.all import events, get_events
+if not TYPE_CHECKING:
+    from ..github.all import events, get_events  # noqa: F401

 # todo deprecate properly
 iter_events = events
@@ -1,102 +0,0 @@ (file deleted)
#!/usr/bin/env python3
from my.config import topcoder as config

from datetime import datetime
from typing import NamedTuple
from pathlib import Path
import json
from typing import Dict, Iterator, Any

from ..common import cproperty, get_files
from ..error import Res, unwrap

# TODO get rid of fget?
from kython import fget
from ..core.konsume import zoom, wrap, ignore


# TODO json type??
def _get_latest() -> Dict:
    pp = max(get_files(config.export_path, glob='*.json'))
    return json.loads(pp.read_text())


class Competition(NamedTuple):
    contest_id: str
    contest: str
    percentile: float
    dates: str

    @cproperty
    def uid(self) -> str:
        return self.contest_id

    def __hash__(self):
        return hash(self.contest_id)

    @cproperty
    def when(self) -> datetime:
        return datetime.strptime(self.dates, '%Y-%m-%dT%H:%M:%S.%fZ')

    @cproperty
    def summary(self) -> str:
        return f'participated in {self.contest}: {self.percentile:.0f}'

    @classmethod
    def make(cls, json) -> Iterator[Res['Competition']]:
        ignore(json, 'rating', 'placement')
        cid = json['challengeId'].zoom().value
        cname = json['challengeName'].zoom().value
        percentile = json['percentile'].zoom().value
        dates = json['date'].zoom().value
        yield cls(
            contest_id=cid,
            contest=cname,
            percentile=percentile,
            dates=dates,
        )


def iter_data() -> Iterator[Res[Competition]]:
    with wrap(_get_latest()) as j:
        ignore(j, 'id', 'version')

        res = j['result'].zoom()
        ignore(res, 'success', 'status', 'metadata')

        cont = res['content'].zoom()
        ignore(cont, 'handle', 'handleLower', 'userId', 'createdAt', 'updatedAt', 'createdBy', 'updatedBy')

        cont['DEVELOP'].ignore() # TODO handle it??
        ds = cont['DATA_SCIENCE'].zoom()

        mar, srm = zoom(ds, 'MARATHON_MATCH', 'SRM')

        mar = mar['history'].zoom()
        srm = srm['history'].zoom()
        # TODO right, I guess I could rely on pylint for unused variables??

        for c in mar + srm:
            yield from Competition.make(json=c)
            c.consume()


def get_data():
    return list(sorted(iter_data(), key=fget(Competition.when)))


def test():
    assert len(get_data()) > 10


def main():
    for d in iter_data():
        try:
            d = unwrap(d)
        except Exception as e:
            print(f'ERROR! {d}')
        else:
            print(d.summary)


if __name__ == '__main__':
    main()
@@ -1,6 +1,6 @@
 from .core.warnings import high

 high("DEPRECATED! Please use my.core.common instead.")

 from .core import __NOT_HPI_MODULE__
 from .core.common import *
244  my/config.py

@@ -9,29 +9,46 @@ This file is used for:
 - mypy: this file provides some type annotations
 - for loading the actual user config
 '''
-#### vvvv you won't need this VVV in your personal config
-from my.core import init
+
+from __future__ import annotations
+
+#### NOTE: you won't need this line VVVV in your personal config
+from my.core import init  # noqa: F401  # isort: skip
 ###


-from my.core import Paths, PathIsh
+from datetime import tzinfo
+from pathlib import Path
+
+from my.core import PathIsh, Paths


 class hypothesis:
     # expects outputs from https://github.com/karlicoss/hypexport
     # (it's just the standard Hypothes.is export format)
-    export_path: Paths = '/path/to/hypothesis/data'
+    export_path: Paths = r'/path/to/hypothesis/data'

 class instapaper:
     export_path: Paths = ''

+class smscalls:
+    export_path: Paths = ''
+
 class pocket:
     export_path: Paths = ''

 class github:
     export_path: Paths = ''
+    gdpr_dir: Paths = ''

 class reddit:
-    export_path: Paths = ''
+    class rexport:
+        export_path: Paths = ''
+    class pushshift:
+        export_path: Paths = ''
+    class gdpr:
+        export_path: Paths = ''

 class endomondo:
     export_path: Paths = ''
@@ -42,21 +59,228 @@ class exercise:
 class bluemaestro:
     export_path: Paths = ''

+class stackexchange:
+    export_path: Paths = ''
+
+class goodreads:
+    export_path: Paths = ''
+
+class pinboard:
+    export_dir: Paths = ''
+
 class google:
+    class maps:
+        class android:
+            export_path: Paths = ''
+
     takeout_path: Paths = ''


-from typing import Sequence, Union, Tuple
-from datetime import datetime, date
+from collections.abc import Sequence
+from datetime import date, datetime, timedelta
+from typing import Union
+
 DateIsh = Union[datetime, date, str]
-LatLon = Tuple[float, float]
+LatLon = tuple[float, float]
 class location:
     # todo ugh, need to think about it... mypy wants the type here to be general, otherwise it can't deduce
     # and we can't import the types from the module itself, otherwise would be circular. common module?
-    home: Union[LatLon, Sequence[Tuple[DateIsh, LatLon]]] = (1.0, -1.0)
+    home: LatLon | Sequence[tuple[DateIsh, LatLon]] = (1.0, -1.0)
+    home_accuracy = 30_000.0
+
+    class via_ip:
+        accuracy: float
+        for_duration: timedelta
+
+    class gpslogger:
+        export_path: Paths = ''
+        accuracy: float
+
+    class google_takeout_semantic:
+        # a value between 0 and 100, 100 being the most confident
+        # set to 0 to include all locations
+        # https://locationhistoryformat.com/reference/semantic/#/$defs/placeVisit/properties/locationConfidence
+        require_confidence: float = 40
+        # default accuracy for semantic locations
+        accuracy: float = 100
+
+
+from typing import Literal
+

 class time:
     class tz:
-        pass
+        policy: Literal['keep', 'convert', 'throw']
+
+        class via_location:
+            fast: bool
+            sort_locations: bool
+            require_accuracy: float
+
+
+class orgmode:
+    paths: Paths
+
+
+class arbtt:
+    logfiles: Paths
+
+
+class commits:
+    emails: Sequence[str] | None
+    names: Sequence[str] | None
+    roots: Sequence[PathIsh]
+
+
+class pdfs:
+    paths: Paths
+
+
+class zulip:
+    class organization:
+        export_path: Paths
+
+
+class bumble:
+    class android:
+        export_path: Paths
+
+
+class tinder:
+    class android:
+        export_path: Paths
+
+
+class instagram:
+    class android:
+        export_path: Paths
+        username: str | None
+        full_name: str | None
+    class gdpr:
+        export_path: Paths
+
+
+class hackernews:
+    class dogsheep:
+        export_path: Paths
+    class materialistic:
+        export_path: Paths
+
+
+class fbmessenger:
+    class fbmessengerexport:
+        export_db: PathIsh
+        facebook_id: str | None
+    class android:
+        export_path: Paths
+
+
+class twitter_archive:
+    export_path: Paths
+
+
+class twitter:
+    class talon:
+        export_path: Paths
+    class android:
+        export_path: Paths
+
+
+class twint:
+    export_path: Paths
+
+
+class browser:
+    class export:
+        export_path: Paths = ''
+    class active_browser:
+        export_path: Paths = ''
+
+
+class telegram:
+    class telegram_backup:
+        export_path: PathIsh = ''
+
+
+class demo:
+    data_path: Paths
+    username: str
+    timezone: tzinfo
+
+
+class simple:
+    count: int
+
+
+class vk_messages_backup:
+    storage_path: Path
+    user_id: int
+
+
+class kobo:
+    export_path: Paths
+
+
+class feedly:
+    export_path: Paths
+
+
+class feedbin:
+    export_path: Paths
+
+
+class taplog:
+    export_path: Paths
+
+
+class lastfm:
+    export_path: Paths
+
+
+class rescuetime:
+    export_path: Paths
+
+
+class runnerup:
+    export_path: Paths
+
+
+class emfit:
+    export_path: Path
+    timezone: tzinfo
+    excluded_sids: list[str]
+
+
+class foursquare:
+    export_path: Paths
+
+
+class rtm:
+    export_path: Paths
+
+
+class imdb:
+    export_path: Paths
+
+
+class roamresearch:
+    export_path: Paths
+    username: str
+
+
+class whatsapp:
+    class android:
+        export_path: Paths
+        my_user_id: str | None
+
+
+class harmonic:
+    export_path: Paths
+
+
+class monzo:
+    class monzoexport:
+        export_path: Paths
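Note that the `reddit` section changed shape from a flat `export_path` to nested per-source classes; a personal config has to mirror the nesting. A minimal sketch (the path is a placeholder):

# in your personal my/config.py
class reddit:
    class rexport:
        export_path = '~/data/reddit/rexport'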
@@ -1,11 +1,61 @@
 # this file only keeps the most common & critical types/utility functions
-from .common import PathIsh, Paths, Json
-from .common import get_files
-from .common import LazyLogger
-from .common import warn_if_empty
-from .common import stat, Stats
+from typing import TYPE_CHECKING

 from .cfg import make_config
+from .common import PathIsh, Paths, get_files
+from .compat import assert_never
+from .error import Res, notnone, unwrap
+from .logging import (
+    make_logger,
+)
+from .stats import Stats, stat
+from .types import (
+    Json,
+    datetime_aware,
+    datetime_naive,
+)
 from .util import __NOT_HPI_MODULE__
+from .utils.itertools import warn_if_empty

-from .error import Res, unwrap
+LazyLogger = make_logger  # TODO deprecate this in favor of make_logger
+
+
+if not TYPE_CHECKING:
+    # we used to keep these here for brevity, but feels like it only adds confusion,
+    # e.g. suggest that we perhaps somehow modify builtin behaviour or whatever
+    # so best to prefer explicit behaviour
+    from dataclasses import dataclass
+    from pathlib import Path
+
+
+__all__ = [
+    '__NOT_HPI_MODULE__',
+    'Json',
+    'LazyLogger',  # legacy import
+    'Path',
+    'PathIsh',
+    'Paths',
+    'Res',
+    'Stats',
+    'assert_never',  # TODO maybe deprecate from use in my.core? will be in stdlib soon
+    'dataclass',
+    'datetime_aware',
+    'datetime_naive',
+    'get_files',
+    'make_config',
+    'make_logger',
+    'notnone',
+    'stat',
+    'unwrap',
+    'warn_if_empty',
+]
+
+
+## experimental for now
+# you could put _init_hook.py next to your private my/config
+# that way you can configure logging/warnings/env variables on every HPI import
+try:
+    import my._init_hook  # type: ignore[import-not-found]  # noqa: F401
+except:
+    pass
+##
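Since `my.core` now declares an explicit `__all__`, downstream modules can rely on these re-exports. A typical module header under the new layout would be something like this sketch:

from my.core import Paths, Stats, get_files, make_logger, stat

logger = make_logger(__name__)  # preferred over the legacy LazyLogger alias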
(File diff suppressed because it is too large.)

35  my/core/_cpu_pool.py  (new file)
@@ -0,0 +1,35 @@
"""
EXPERIMENTAL! use with caution
Manages 'global' ProcessPoolExecutor which is 'managed' by HPI itself, and
can be passed down to DALs to speed up data processing.

The reason to have it managed by HPI is because we don't want DALs instantiate pools
themselves -- they can't cooperate and it would be hard/infeasible to control
how many cores we want to dedicate to the DAL.

Enabled by the env variable, specifying how many cores to dedicate
e.g. "HPI_CPU_POOL=4 hpi query ..."
"""

from __future__ import annotations

import os
from concurrent.futures import ProcessPoolExecutor
from typing import cast

_NOT_SET = cast(ProcessPoolExecutor, object())
_INSTANCE: ProcessPoolExecutor | None = _NOT_SET


def get_cpu_pool() -> ProcessPoolExecutor | None:
    global _INSTANCE
    if _INSTANCE is _NOT_SET:
        use_cpu_pool = os.environ.get('HPI_CPU_POOL')
        if use_cpu_pool is None or int(use_cpu_pool) == 0:
            _INSTANCE = None
        else:
            # NOTE: this won't be cleaned up properly, but I guess it's fine?
            # since this it's basically a singleton for the whole process
            # , and will be destroyed when python exists
            _INSTANCE = ProcessPoolExecutor(max_workers=int(use_cpu_pool))
    return _INSTANCE
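A sketch of how a caller might consume the pool; the fallback branch matters, since `get_cpu_pool()` returns None unless `HPI_CPU_POOL` is set to a non-zero value (`do_work` below is a hypothetical stand-in for the real DAL function):

from my.core._cpu_pool import get_cpu_pool

def process(items):
    pool = get_cpu_pool()
    if pool is None:
        # no pool configured -- process sequentially
        return [do_work(i) for i in items]  # do_work: hypothetical per-item function
    # pool configured -- fan the work out across processes
    return list(pool.map(do_work, items))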
12  my/core/_deprecated/dataset.py  (new file)

@@ -0,0 +1,12 @@
from ..common import PathIsh
from ..sqlite import sqlite_connect_immutable


def connect_readonly(db: PathIsh):
    import dataset  # type: ignore

    # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122
    # todo not sure if mode=ro has any benefit, but it doesn't work on read-only filesystems
    # maybe it should autodetect readonly filesystems and apply this? not sure
    creator = lambda: sqlite_connect_immutable(db)
    return dataset.connect('sqlite:///', engine_kwargs={'creator': creator})
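Usage stays the same as with a plain `dataset.connect`, just read-only. A minimal sketch (the path and table name are hypothetical):

from my.core._deprecated.dataset import connect_readonly

db = connect_readonly('/path/to/messages.db')
for row in db['messages'].all():  # 'messages' is a hypothetical table name
    print(row)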
261  my/core/_deprecated/kompress.py  (new file)

@@ -0,0 +1,261 @@
"""
Various helpers for compression
"""

# fmt: off
from __future__ import annotations

import io
import pathlib
from collections.abc import Iterator, Sequence
from datetime import datetime
from functools import total_ordering
from pathlib import Path
from typing import IO, Union

PathIsh = Union[Path, str]


class Ext:
    xz = '.xz'
    zip = '.zip'
    lz4 = '.lz4'
    zstd = '.zstd'
    zst = '.zst'
    targz = '.tar.gz'


def is_compressed(p: Path) -> bool:
    # todo kinda lame way for now.. use mime ideally?
    # should cooperate with kompress.kopen?
    return any(p.name.endswith(ext) for ext in [Ext.xz, Ext.zip, Ext.lz4, Ext.zstd, Ext.zst, Ext.targz])


def _zstd_open(path: Path, *args, **kwargs) -> IO:
    import zstandard as zstd  # type: ignore
    fh = path.open('rb')
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fh)

    mode = kwargs.get('mode', 'rt')
    if mode == 'rb':
        return reader
    else:
        # must be text mode
        kwargs.pop('mode')  # TextIOWrapper doesn't like it
        return io.TextIOWrapper(reader, **kwargs)  # meh


# TODO use the 'dependent type' trick for return type?
def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO:
    # just in case, but I think this shouldn't be necessary anymore
    # since when we call .read_text, encoding is passed already
    if mode in {'r', 'rt'}:
        encoding = kwargs.get('encoding', 'utf8')
    else:
        encoding = None
    kwargs['encoding'] = encoding

    pp = Path(path)
    name = pp.name
    if name.endswith(Ext.xz):
        import lzma

        # ugh. for lzma, 'r' means 'rb'
        # https://github.com/python/cpython/blob/d01cf5072be5511595b6d0c35ace6c1b07716f8d/Lib/lzma.py#L97
        # whereas for regular open, 'r' means 'rt'
        # https://docs.python.org/3/library/functions.html#open
        if mode == 'r':
            mode = 'rt'
        kwargs['mode'] = mode
        return lzma.open(pp, *args, **kwargs)
    elif name.endswith(Ext.zip):
        # eh. this behaviour is a bit dodgy...
        from zipfile import ZipFile
        zfile = ZipFile(pp)

        [subpath] = args  # meh?

        ## oh god... https://stackoverflow.com/a/5639960/706389
        ifile = zfile.open(subpath, mode='r')
        ifile.readable = lambda: True   # type: ignore
        ifile.writable = lambda: False  # type: ignore
        ifile.seekable = lambda: False  # type: ignore
        ifile.read1 = ifile.read        # type: ignore
        # TODO pass all kwargs here??
        # todo 'expected "BinaryIO"'??
        return io.TextIOWrapper(ifile, encoding=encoding)
    elif name.endswith(Ext.lz4):
        import lz4.frame  # type: ignore
        return lz4.frame.open(str(pp), mode, *args, **kwargs)
    elif name.endswith(Ext.zstd) or name.endswith(Ext.zst):  # noqa: PIE810
        kwargs['mode'] = mode
        return _zstd_open(pp, *args, **kwargs)
    elif name.endswith(Ext.targz):
        import tarfile
        # FIXME pass mode?
        tf = tarfile.open(pp)
        # TODO pass encoding?
        x = tf.extractfile(*args); assert x is not None
        return x
    else:
        return pp.open(mode, *args, **kwargs)


import os
import typing

if typing.TYPE_CHECKING:
    # otherwise mypy can't figure out that BasePath is a type alias..
    BasePath = pathlib.Path
else:
    BasePath = pathlib.WindowsPath if os.name == 'nt' else pathlib.PosixPath


class CPath(BasePath):
    """
    Hacky way to support compressed files.
    If you can think of a better way to do this, please let me know! https://github.com/karlicoss/HPI/issues/20

    Ugh. So, can't override Path because of some _flavour thing.
    Path only has _accessor and _closed slots, so can't directly set .open method
    _accessor.open has to return file descriptor, doesn't work for compressed stuff.
    """
    def open(self, *args, **kwargs):  # noqa: ARG002
        kopen_kwargs = {}
        mode = kwargs.get('mode')
        if mode is not None:
            kopen_kwargs['mode'] = mode
        encoding = kwargs.get('encoding')
        if encoding is not None:
            kopen_kwargs['encoding'] = encoding
        # TODO assert read only?
        return kopen(str(self), **kopen_kwargs)


open = kopen  # TODO deprecate


# meh
# TODO ideally switch to ZipPath or smth similar?
# nothing else supports subpath properly anyway
def kexists(path: PathIsh, subpath: str) -> bool:
    try:
        kopen(path, subpath)
    except Exception:
        return False
    else:
        return True


import zipfile

# meh... zipfile.Path is not available on 3.7
zipfile_Path = zipfile.Path


@total_ordering
class ZipPath(zipfile_Path):
    # NOTE: is_dir/is_file might not behave as expected, the base class checks it only based on the slash in path

    # seems that root/at are not exposed in the docs, so might be an implementation detail
    root: zipfile.ZipFile  # type: ignore[assignment]
    at: str

    @property
    def filepath(self) -> Path:
        res = self.root.filename
        assert res is not None  # make mypy happy
        return Path(res)

    @property
    def subpath(self) -> Path:
        return Path(self.at)

    def absolute(self) -> ZipPath:
        return ZipPath(self.filepath.absolute(), self.at)

    def expanduser(self) -> ZipPath:
        return ZipPath(self.filepath.expanduser(), self.at)

    def exists(self) -> bool:
        if self.at == '':
            # special case, the base class returns False in this case for some reason
            return self.filepath.exists()
        return super().exists() or self._as_dir().exists()

    def _as_dir(self) -> zipfile_Path:
        # note: seems that zip always uses forward slash, regardless OS?
        return zipfile_Path(self.root, self.at + '/')

    def rglob(self, glob: str) -> Iterator[ZipPath]:
        # note: not 100% sure about the correctness, but seem fine?
        # Path.match() matches from the right, so need to
        rpaths = [p for p in self.root.namelist() if p.startswith(self.at)]
        rpaths = [p for p in rpaths if Path(p).match(glob)]
        return (ZipPath(self.root, p) for p in rpaths)

    def relative_to(self, other: ZipPath) -> Path:  # type: ignore[override, unused-ignore]
        assert self.filepath == other.filepath, (self.filepath, other.filepath)
        return self.subpath.relative_to(other.subpath)

    @property
    def parts(self) -> Sequence[str]:
        # messy, but might be ok..
        return self.filepath.parts + self.subpath.parts

    def __truediv__(self, key) -> ZipPath:
        # need to implement it so the return type is not zipfile.Path
        tmp = zipfile_Path(self.root) / self.at / key
        return ZipPath(self.root, tmp.at)

    def iterdir(self) -> Iterator[ZipPath]:
        for s in self._as_dir().iterdir():
            yield ZipPath(s.root, s.at)

    @property
    def stem(self) -> str:
        return self.subpath.stem

    @property  # type: ignore[misc]
    def __class__(self):
        return Path

    def __eq__(self, other) -> bool:
        # hmm, super class doesn't seem to treat as equals unless they are the same object
        if not isinstance(other, ZipPath):
            return False
        return (self.filepath, self.subpath) == (other.filepath, other.subpath)

    def __lt__(self, other) -> bool:
        if not isinstance(other, ZipPath):
            return False
        return (self.filepath, self.subpath) < (other.filepath, other.subpath)

    def __hash__(self) -> int:
        return hash((self.filepath, self.subpath))

    def stat(self) -> os.stat_result:
        # NOTE: zip datetimes have no notion of time zone, usually they just keep local time?
        # see https://en.wikipedia.org/wiki/ZIP_(file_format)#Structure
        dt = datetime(*self.root.getinfo(self.at).date_time)
        ts = int(dt.timestamp())
        params = dict(  # noqa: C408
            st_mode=0,
            st_ino=0,
            st_dev=0,
            st_nlink=1,
            st_uid=1000,
            st_gid=1000,
            st_size=0,  # todo compute it properly?
            st_atime=ts,
            st_mtime=ts,
            st_ctime=ts,
        )
        return os.stat_result(tuple(params.values()))

    @property
    def suffix(self) -> str:
        return Path(self.parts[-1]).suffix

# fmt: on
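A sketch of the two main entry points (the paths are hypothetical): `kopen` transparently decompresses based on the file extension, and `ZipPath` lets you address a member of a zip archive like a regular `Path`:

from my.core._deprecated.kompress import ZipPath, kexists, kopen

with kopen('/path/to/export.json.xz') as f:  # decompressed transparently
    data = f.read()

zp = ZipPath('/path/to/takeout.zip', 'Takeout/archive_browser.html')
print(zp.exists(), zp.suffix)

kexists('/path/to/takeout.zip', 'Takeout/archive_browser.html')  # True if that member exists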
@@ -1,43 +1,163 @@
+from __future__ import annotations
+
+from .internal import assert_subpackage
+
+assert_subpackage(__name__)
+
+import logging
+import sys
+from collections.abc import Iterator
 from contextlib import contextmanager
 from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
+
+import appdirs  # type: ignore[import-untyped]
+
+from . import warnings
+
+PathIsh = Union[str, Path]  # avoid circular import from .common


-def disable_cachew():
+def disable_cachew() -> None:
     try:
-        import cachew
+        import cachew  # noqa: F401  # unused, it's fine
     except ImportError:
         # nothing to disable
         return

     from cachew import settings

     settings.ENABLE = False


 @contextmanager
-def disabled_cachew():
+def disabled_cachew() -> Iterator[None]:
     try:
-        import cachew
+        import cachew  # noqa: F401  # unused, it's fine
     except ImportError:
         # nothing to disable
         yield
         return
     from cachew.extra import disabled_cachew

     with disabled_cachew():
         yield


-def cache_dir() -> Path:
-    '''
-    Base directory for cachew.
-    To override, add to your config file:
-    class config:
-        cache_dir = '/your/custom/cache/path'
-    '''
-    from .core_config import config
-    cdir = config.cache_dir
-    if cdir is None:
-        # TODO handle this in core_config.py
-        # TODO fallback to default cachew dir instead? or appdirs cache
-        return Path('/var/tmp/cachew')
-    else:
-        return Path(cdir)
+def _appdirs_cache_dir() -> Path:
+    cd = Path(appdirs.user_cache_dir('my'))
+    cd.mkdir(exist_ok=True, parents=True)
+    return cd
+
+
+_CACHE_DIR_NONE_HACK = Path('/tmp/hpi/cachew_none_hack')
+
+
+def cache_dir(suffix: PathIsh | None = None) -> Path:
+    from . import core_config as CC
+
+    cdir_ = CC.config.get_cache_dir()
+
+    sp: Path | None = None
+    if suffix is not None:
+        sp = Path(suffix)
+        # guess if you do need absolute, better path it directly instead of as suffix?
+        assert not sp.is_absolute(), sp
+
+    # ok, so ideally we could just return cdir_ / sp
+    # however, this function was at first used without the suffix, e.g. cache_dir() / 'some_dir'
+    # but now cache_dir setting can also be None which means 'disable cache'
+    # changing return type to Optional means that it will break for existing users even if the cache isn't used
+    # it's kinda wrong.. so we use dummy path (_CACHE_DIR_NONE_HACK), and then strip it away in core.common.mcachew
+    # this logic is tested via test_cachew_dir_none
+
+    if cdir_ is None:
+        cdir = _CACHE_DIR_NONE_HACK
+    else:
+        cdir = cdir_
+
+    return cdir if sp is None else cdir / sp
+
+
+"""See core.cachew.cache_dir for the explanation"""
+
+
+_cache_path_dflt = cast(str, object())
+
+
+# TODO I don't really like 'mcachew', just 'cache' would be better... maybe?
+# todo ugh. I think it needs @doublewrap, otherwise @mcachew without args doesn't work
+# but it's a bit problematic.. doublewrap works by defecting if the first arg is callable
+# but here cache_path can also be a callable (for lazy/dynamic path)... so unclear how to detect this
+def _mcachew_impl(cache_path=_cache_path_dflt, **kwargs):
+    """
+    Stands for 'Maybe cachew'.
+    Defensive wrapper around @cachew to make it an optional dependency.
+    """
+    if cache_path is _cache_path_dflt:
+        # wasn't specified... so we need to use cache_dir
+        cache_path = cache_dir()
+
+    if isinstance(cache_path, (str, Path)):
+        try:
+            # check that it starts with 'hack' path
+            Path(cache_path).relative_to(_CACHE_DIR_NONE_HACK)
+        except:  # noqa: E722 bare except
+            pass  # no action needed, doesn't start with 'hack' string
+        else:
+            # todo show warning? tbh unclear how to detect when user stopped using 'old' way and using suffix instead?
+            # if it does, means that user wanted to disable cache
+            cache_path = None
+    try:
+        import cachew
+    except ModuleNotFoundError:
+        warnings.high('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew')
+        return lambda orig_func: orig_func
+    else:
+        kwargs['cache_path'] = cache_path
+        return cachew.cachew(**kwargs)
+
+
+if TYPE_CHECKING:
+    R = TypeVar('R')
+    if sys.version_info[:2] >= (3, 10):
+        from typing import ParamSpec
+    else:
+        from typing_extensions import ParamSpec
+    P = ParamSpec('P')
+    CC = Callable[P, R]  # need to give it a name, if inlined into bound=, mypy runs in a bug
+    PathProvider = Union[PathIsh, Callable[P, PathIsh]]
+    # NOTE: in cachew, HashFunction type returns str
+    # however in practice, cachew always calls str for its result
+    # so perhaps better to switch it to Any in cachew as well
+    HashFunction = Callable[P, Any]
+
+    F = TypeVar('F', bound=Callable)
+
+    # we need two versions due to @doublewrap
+    # this is when we just annotate as @cachew without any args
+    @overload  # type: ignore[no-overload-impl]
+    def mcachew(fun: F) -> F: ...
+
+    @overload
+    def mcachew(
+        cache_path: PathProvider | None = ...,
+        *,
+        force_file: bool = ...,
+        cls: type | None = ...,
+        depends_on: HashFunction = ...,
+        logger: logging.Logger | None = ...,
+        chunk_by: int = ...,
+        synthetic_key: str | None = ...,
+    ) -> Callable[[F], F]: ...
+
+else:
+    mcachew = _mcachew_impl
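The net effect for module authors: `@mcachew` keeps working with or without arguments, and `cache_dir(suffix)` gives a per-module cache location while transparently honouring a disabled cache. A minimal sketch (the module name is a placeholder; cachew generally wants a type-annotated iterator of dataclasses or NamedTuples so it can serialize the results):

from dataclasses import dataclass
from typing import Iterator

from my.core.cachew import cache_dir, mcachew

@dataclass
class Item:
    value: int

@mcachew(cache_path=cache_dir('my_module'))  # cached under <cache root>/my_module
def items() -> Iterator[Item]:
    yield Item(1)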
112  my/core/cfg.py

@@ -1,30 +1,44 @@
-from typing import TypeVar, Type, Callable, Dict, Any
+from __future__ import annotations

-Attrs = Dict[str, Any]
+import importlib
+import re
+import sys
+from collections.abc import Iterator
+from contextlib import ExitStack, contextmanager
+from typing import Any, Callable, TypeVar
+
+Attrs = dict[str, Any]

 C = TypeVar('C')


 # todo not sure about it, could be overthinking...
 # but short enough to change later
 # TODO document why it's necessary?
-def make_config(cls: Type[C], migration: Callable[[Attrs], Attrs]=lambda x: x) -> C:
-    props = dict(vars(cls.__base__))
-    props = migration(props)
+def make_config(cls: type[C], migration: Callable[[Attrs], Attrs] = lambda x: x) -> C:
+    user_config = cls.__base__
+    old_props = {
+        # NOTE: deliberately use gettatr to 'force' class properties here
+        k: getattr(user_config, k)
+        for k in vars(user_config)
+    }
+    new_props = migration(old_props)
     from dataclasses import fields
+
     params = {
         k: v
-        for k, v in props.items()
+        for k, v in new_props.items()
-        if k in {f.name for f in fields(cls)}
+        if k in {f.name for f in fields(cls)}  # type: ignore[arg-type]  # see https://github.com/python/typing_extensions/issues/115
     }
-    return cls(**params) # type: ignore[call-arg]
+    # todo maybe return type here?
+    return cls(**params)


 F = TypeVar('F')
-from contextlib import contextmanager
-import inspect
-from typing import Iterator
+
 @contextmanager
-def override_config(config: F) -> Iterator[F]:
+def _override_config(config: F) -> Iterator[F]:
     '''
     Temporary override for config's parameters, useful for testing/fake data/etc.
     '''
@@ -35,3 +49,77 @@ def _override_config(config: F) -> Iterator[F]:
     # ugh. __dict__ of type objects isn't writable..
     for k, v in orig_properties.items():
         setattr(config, k, v)
+    added = {k for k in set(vars(config).keys()).difference(set(orig_properties.keys())) if not k.startswith('__')}
+    for k in added:
+        delattr(config, k)
+
+
+ModuleRegex = str
+
+
+@contextmanager
+def _reload_modules(modules: ModuleRegex) -> Iterator[None]:
+    # need to use list here, otherwise reordering with set might mess things up
+    def loaded_modules() -> list[str]:
+        return [name for name in sys.modules if re.fullmatch(modules, name)]
+
+    modules_before = loaded_modules()
+
+    # uhh... seems that reversed might make more sense -- not 100% sure why, but this works for tests/reddit.py
+    for m in reversed(modules_before):
+        # ugh... seems that reload works whereas pop doesn't work in some cases (e.g. on tests/reddit.py)
+        # sys.modules.pop(m, None)
+        importlib.reload(sys.modules[m])
+
+    try:
+        yield
+    finally:
+        modules_after = loaded_modules()
+        modules_before_set = set(modules_before)
+        for m in modules_after:
+            if m in modules_before_set:
+                # was previously loaded, so need to reload to pick up old config
+                importlib.reload(sys.modules[m])
+            else:
+                # wasn't previously loaded, so need to unload it
+                # otherwise it might fail due to missing config etc
+                sys.modules.pop(m, None)
+
+
+@contextmanager
+def tmp_config(*, modules: ModuleRegex | None = None, config=None):
+    if modules is None:
+        assert config is None
+    if modules is not None:
+        assert config is not None
+
+    import my.config
+
+    with ExitStack() as module_reload_stack, _override_config(my.config) as new_config:
+        if config is not None:
+            overrides = {k: v for k, v in vars(config).items() if not k.startswith('__')}
+            for k, v in overrides.items():
+                setattr(new_config, k, v)
+
+        if modules is not None:
+            module_reload_stack.enter_context(_reload_modules(modules))
+        yield new_config
+
+
+def test_tmp_config() -> None:
+    class extra:
+        data_path = '/path/to/data'
+
+    with tmp_config() as c:
+        assert c.google != 'whatever'
+        assert not hasattr(c, 'extra')
+        c.extra = extra
+        c.google = 'whatever'
+    # todo hmm. not sure what should do about new properties??
+    assert not hasattr(c, 'extra')
+    assert c.google != 'whatever'
+
+
+###
+# todo properly deprecate, this isn't really meant for public use
+override_config = _override_config
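`tmp_config` is the supported way to patch `my.config` in tests now; attributes set inside the block are rolled back on exit, mirroring `test_tmp_config` above. A sketch (the test identity is a placeholder):

from my.core.cfg import tmp_config

class fake_commits:
    emails = ['test@example.com']  # placeholder test identity

with tmp_config() as cfg:
    cfg.commits = fake_commits
    ...  # code under test sees the patched my.config here
# the original my.config is restored after the block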
@@ -1,273 +1,142 @@
+from __future__ import annotations
+
+import os
+from collections.abc import Iterable, Sequence
 from glob import glob as do_glob
 from pathlib import Path
-from datetime import datetime
-import functools
-import types
-from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple, TYPE_CHECKING
-import warnings
-from . import warnings as core_warnings
+from typing import (
+    TYPE_CHECKING,
+    Callable,
+    Generic,
+    TypeVar,
+    Union,
+)
+
+from . import compat, warnings

 # some helper functions
+# TODO start deprecating this? soon we'd be able to use Path | str syntax which is shorter and more explicit
 PathIsh = Union[Path, str]

-# TODO only used in tests? not sure if useful at all.
-# TODO port annotations to kython?..
-def import_file(p: PathIsh, name: Optional[str]=None) -> types.ModuleType:
-    p = Path(p)
-    if name is None:
-        name = p.stem
-    import importlib.util
-    spec = importlib.util.spec_from_file_location(name, p)
-    foo = importlib.util.module_from_spec(spec)
-    loader = spec.loader; assert loader is not None
-    loader.exec_module(foo) # type: ignore[attr-defined]
-    return foo
-
-
-def import_from(path: PathIsh, name: str) -> types.ModuleType:
-    path = str(path)
-    import sys
-    try:
-        sys.path.append(path)
-        import importlib
-        return importlib.import_module(name)
-    finally:
-        sys.path.remove(path)
-
-
-def import_dir(path: PathIsh, extra: str='') -> types.ModuleType:
-    p = Path(path)
-    if p.parts[0] == '~':
-        p = p.expanduser() # TODO eh. not sure about this..
-    return import_from(p.parent, p.name + extra)
-
-
-T = TypeVar('T')
-K = TypeVar('K')
-V = TypeVar('V')
-
-def the(l: Iterable[T]) -> T:
-    it = iter(l)
-    try:
-        first = next(it)
-    except StopIteration as ee:
-        raise RuntimeError('Empty iterator?')
-    assert all(e == first for e in it)
-    return first
-
-
-# TODO more_itertools.bucket?
-def group_by_key(l: Iterable[T], key: Callable[[T], K]) -> Dict[K, List[T]]:
-    res: Dict[K, List[T]] = {}
-    for i in l:
-        kk = key(i)
-        lst = res.get(kk, [])
-        lst.append(i)
-        res[kk] = lst
-    return res
-
-
-def _identity(v: T) -> V:
-    return cast(V, v)
-
-def make_dict(l: Iterable[T], key: Callable[[T], K], value: Callable[[T], V]=_identity) -> Dict[K, V]:
-    res: Dict[K, V] = {}
-    for i in l:
-        k = key(i)
-        v = value(i)
-        pv = res.get(k, None) # type: ignore
-        if pv is not None:
-            raise RuntimeError(f"Duplicate key: {k}. Previous value: {pv}, new value: {v}")
-        res[k] = v
-    return res
-
-
-Cl = TypeVar('Cl')
-R = TypeVar('R')
-
-def cproperty(f: Callable[[Cl], R]) -> R:
-    return property(functools.lru_cache(maxsize=1)(f)) # type: ignore
-
-
-# https://stackoverflow.com/a/12377059/706389
-def listify(fn=None, wrapper=list):
-    """
-    Wraps a function's return value in wrapper (e.g. list)
-    Useful when an algorithm can be expressed more cleanly as a generator
-    """
-    def listify_return(fn):
-        @functools.wraps(fn)
-        def listify_helper(*args, **kw):
-            return wrapper(fn(*args, **kw))
-        return listify_helper
-    if fn is None:
-        return listify_return
-    return listify_return(fn)
-
-
-# todo use in bluemaestro
-# def dictify(fn=None, key=None, value=None):
-#     def md(it):
-#         return make_dict(it, key=key, value=value)
-#     return listify(fn=fn, wrapper=md)
-
-
-from .logging import setup_logger, LazyLogger
-

 Paths = Union[Sequence[PathIsh], PathIsh]


-def _is_compressed(p: Path) -> bool:
-    # todo kinda lame way for now.. use mime ideally?
-    # should cooperate with kompress.kopen?
-    return p.suffix in {'.xz', '.lz4', '.zstd'}
-
-
 DEFAULT_GLOB = '*'


 def get_files(
     pp: Paths,
-    glob: str=DEFAULT_GLOB,
-    sort: bool=True,
-    guess_compression: bool=True,
-) -> Tuple[Path, ...]:
+    glob: str = DEFAULT_GLOB,
+    *,
+    sort: bool = True,
+    guess_compression: bool = True,
+) -> tuple[Path, ...]:
     """
     Helper function to avoid boilerplate.

     Tuple as return type is a bit friendlier for hashing/caching, so hopefully makes sense
     """
     # TODO FIXME mm, some wrapper to assert iterator isn't empty?
-    sources: List[Path]
+    sources: list[Path]
     if isinstance(pp, Path):
         sources = [pp]
     elif isinstance(pp, str):
         if pp == '':
             # special case -- makes sense for optional data sources, etc
             return () # early return to prevent warnings etc
         sources = [Path(pp)]
     else:
-        sources = [Path(p) for p in pp]
+        sources = [p if isinstance(p, Path) else Path(p) for p in pp]

     def caller() -> str:
         import traceback

         # TODO ugh. very flaky... -3 because [<this function>, get_files(), <actual caller>]
         return traceback.extract_stack()[-3].filename

-    paths: List[Path] = []
+    paths: list[Path] = []
     for src in sources:
         if src.parts[0] == '~':
             src = src.expanduser()
-        if src.is_dir():
-            gp: Iterable[Path] = src.glob(glob) # todo not sure if should be recursive?
+        # note: glob handled first, because e.g. on Windows asterisk makes is_dir unhappy
+        gs = str(src)
+        if '*' in gs:
+            if glob != DEFAULT_GLOB:
+                warnings.medium(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!")
+            paths.extend(map(Path, do_glob(gs)))  # noqa: PTH207
+        elif os.path.isdir(str(src)):  # noqa: PTH112
+            # NOTE: we're using os.path here on purpose instead of src.is_dir
+            # the reason is is_dir for archives might return True and then
+            # this clause would try globbing insize the archives
+            # this is generally undesirable (since modules handle archives themselves)
+
+            # todo not sure if should be recursive?
+            # note: glob='**/*.ext' works without any changes.. so perhaps it's ok as it is
+            gp: Iterable[Path] = src.glob(glob)
             paths.extend(gp)
         else:
-            ss = str(src)
-            if '*' in ss:
-                if glob != DEFAULT_GLOB:
-                    warnings.warn(f"{caller()}: treating {ss} as glob path. Explicit glob={glob} argument is ignored!")
-                paths.extend(map(Path, do_glob(ss)))
-            else:
-                if not src.is_file():
-                    # todo not sure, might be race condition?
-                    raise RuntimeError(f"Expected '{src}' to exist")
-                # todo assert matches glob??
-                paths.append(src)
+            assert src.exists(), src
+            # todo assert matches glob??
+            paths.append(src)

     if sort:
-        paths = list(sorted(paths))
+        paths = sorted(paths)

     if len(paths) == 0:
         # todo make it conditionally defensive based on some global settings
-        core_warnings.high(f'''
+        warnings.high(f'''
 {caller()}: no paths were matched against {pp}. This might result in missing data. Likely, the directory you passed is empty.
 '''.strip())
         # traceback is useful to figure out what config caused it?
         import traceback

         traceback.print_stack()

     if guess_compression:
-        from .kompress import CPath
-        paths = [CPath(p) if _is_compressed(p) else p for p in paths]
+        from .kompress import CPath, ZipPath, is_compressed
+
+        # NOTE: wrap is just for backwards compat with vendorized kompress
+        # with kompress library, only is_compressed check and Cpath should be enough
+        def wrap(p: Path) -> Path:
+            if isinstance(p, ZipPath):
+                return p
+            if p.suffix == '.zip':
+                return ZipPath(p)  # type: ignore[return-value]
+            if is_compressed(p):
+                return CPath(p)
+            return p
+
+        paths = [wrap(p) for p in paths]
     return tuple(paths)
|
|
||||||
|
|
||||||
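For orientation, a minimal sketch of calling the reworked get_files above (the path is illustrative; sort and guess_compression are keyword-only after this change):

from my.core import get_files

# a '*' makes the string a glob; directories are globbed with glob='*';
# plain files must now exist up front (assert src.exists())
paths = get_files('~/data/export/*.json', sort=True, guess_compression=True)
for p in paths:
    print(p)  # compressed inputs arrive wrapped in CPath/ZipPath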
-# TODO annotate it, perhaps use 'dependent' type (for @doublewrap stuff)
-if TYPE_CHECKING:
-    from typing import Callable, TypeVar
-    from typing_extensions import Protocol
-    # TODO reuse types from cachew? although not sure if we want hard dependency on it in typecheck time..
-    # I guess, later just define pass through once this is fixed: https://github.com/python/typing/issues/270
-    # ok, that's actually a super nice 'pattern'
-    F = TypeVar('F')
-
-    class McachewType(Protocol):
-        def __call__(
-            self,
-            cache_path: Any=None,
-            *,
-            hashf: Any=None,  # todo deprecate
-            depends_on: Any=None,
-            force_file: bool=False,
-            chunk_by: int=0,
-            logger: Any=None,
-        ) -> Callable[[F], F]:
-            ...
-
-    mcachew: McachewType
-
-# TODO set default cache dir here instead?
-# todo ugh. I think it needs doublewrap, otherwise @mcachew without args doesn't work
-def mcachew(*args, **kwargs):  # type: ignore[no-redef]
-    """
-    Stands for 'Maybe cachew'.
-    Defensive wrapper around @cachew to make it an optional dependency.
-    """
-    try:
-        import cachew
-    except ModuleNotFoundError:
-        warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew')
-        return lambda orig_func: orig_func
-    else:
-        return cachew.cachew(*args, **kwargs)
-
-
-@functools.lru_cache(1)
-def _magic():
-    import magic  # type: ignore
-    return magic.Magic(mime=True)
-
-
-# TODO could reuse in pdf module?
-import mimetypes  # todo do I need init()?
-# todo wtf? fastermime thinks its mime is application/json even if the extension is xz??
-# whereas magic detects correctly: application/x-zstd and application/x-xz
-def fastermime(path: PathIsh) -> str:
-    paths = str(path)
-    # mimetypes is faster
-    (mime, _) = mimetypes.guess_type(paths)
-    if mime is not None:
-        return mime
-    # magic is slower but returns more stuff
-    # TODO Result type?; it's kinda racey, but perhaps better to let the caller decide?
-    return _magic().from_file(paths)
-
-
-Json = Dict[str, Any]
-
-
-from typing import TypeVar, Callable, Generic
-
-_C = TypeVar('_C')
 _R = TypeVar('_R')


 # https://stackoverflow.com/a/5192374/706389
+# NOTE: it was added to stdlib in 3.9 and then deprecated in 3.11
+# seems that the suggested solution is to use custom decorator?
 class classproperty(Generic[_R]):
-    def __init__(self, f: Callable[[_C], _R]) -> None:
+    def __init__(self, f: Callable[..., _R]) -> None:
         self.f = f

-    def __get__(self, obj: None, cls: _C) -> _R:
+    def __get__(self, obj, cls) -> _R:
         return self.f(cls)


+def test_classproperty() -> None:
+    from .compat import assert_type
+
+    class C:
+        @classproperty
+        def prop(cls) -> str:
+            return 'hello'
+
+    res = C.prop
+    assert_type(res, str)
+    assert res == 'hello'
+
+
 # hmm, this doesn't really work with mypy well..
 # https://github.com/python/mypy/issues/6244
 # class staticproperty(Generic[_R]):
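The removed mcachew shim (it now lives in my.core.cachew and is re-imported in the legacy block further down) is a compact example of the optional-dependency decorator pattern; a standalone sketch with an illustrative name:

def maybe_cached(*args, **kwargs):
    # decorate with cachew when it's installed, otherwise leave the function untouched
    try:
        import cachew
    except ModuleNotFoundError:
        return lambda orig_func: orig_func
    return cachew.cachew(*args, **kwargs)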
@@ -277,186 +146,117 @@ class classproperty(Generic[_R]):
 #     def __get__(self) -> _R:
 #         return self.f()

-# for now just serves documentation purposes... but one day might make it statically verifiable where possible?
-# TODO e.g. maybe use opaque mypy alias?
-tzdatetime = datetime
-
-
-fromisoformat: Callable[[str], datetime]
-import sys
-if sys.version_info[:2] >= (3, 7):
-    # prevent mypy on py3.6 from complaining...
-    fromisoformat_real = datetime.fromisoformat
-    fromisoformat = fromisoformat_real
-else:
-    from .py37 import fromisoformat
-
-
-if sys.version_info[:2] >= (3, 8):
-    from typing import Literal
-else:
-    if TYPE_CHECKING:
-        from typing_extensions import Literal
-    else:
-        # erm.. I guess as long as it's not crashing, whatever...
-        Literal = Union
-
-
-# TODO doctests?
-def isoparse(s: str) -> tzdatetime:
-    """
-    Parses timestamps formatted like 2020-05-01T10:32:02.925961Z
-    """
-    # TODO could use dateutil? but it's quite slow as far as I remember..
-    # TODO support non-utc.. somehow?
-    assert s.endswith('Z'), s
-    s = s[:-1] + '+00:00'
-    return fromisoformat(s)
-
-
 import re

 # https://stackoverflow.com/a/295466/706389
 def get_valid_filename(s: str) -> str:
     s = str(s).strip().replace(' ', '_')
     return re.sub(r'(?u)[^-\w.]', '', s)


-from typing import Generic, Sized, Callable
-
-
-# X = TypeVar('X')
-def _warn_iterator(it, f: Any=None):
-    emitted = False
-    for i in it:
-        yield i
-        emitted = True
-    if not emitted:
-        warnings.warn(f"Function {f} didn't emit any data, make sure your config paths are correct")
-
-
-# TODO ugh, so I want to express something like:
-# X = TypeVar('X')
-# C = TypeVar('C', bound=Iterable[X])
-# _warn_iterable(it: C) -> C
-# but apparently I can't??? ugh.
-# https://github.com/python/typing/issues/548
-# I guess for now overloads are fine...
-
-from typing import overload
-X = TypeVar('X')
-@overload
-def _warn_iterable(it: List[X]    , f: Any=None) -> List[X]    : ...
-@overload
-def _warn_iterable(it: Iterable[X], f: Any=None) -> Iterable[X]: ...
-def _warn_iterable(it, f=None):
-    if isinstance(it, Sized):
-        sz = len(it)
-        if sz == 0:
-            warnings.warn(f"Function {f} returned empty container, make sure your config paths are correct")
-        return it
-    else:
-        return _warn_iterator(it, f=f)
-
-
-# ok, this seems to work...
-# https://github.com/python/mypy/issues/1927#issue-167100413
-FL = TypeVar('FL', bound=Callable[..., List])
-FI = TypeVar('FI', bound=Callable[..., Iterable])
-
-@overload
-def warn_if_empty(f: FL) -> FL: ...
-@overload
-def warn_if_empty(f: FI) -> FI: ...
-
-def warn_if_empty(f):
-    from functools import wraps
-
-    @wraps(f)
-    def wrapped(*args, **kwargs):
-        res = f(*args, **kwargs)
-        return _warn_iterable(res, f=f)
-    return wrapped  # type: ignore
-
-
-# hacky hook to speed up for 'hpi doctor'
-# todo think about something better
-QUICK_STATS = False
-
-
-C = TypeVar('C')
-Stats = Dict[str, Any]
-# todo not sure about return type...
-def stat(func: Callable[[], Iterable[C]]) -> Stats:
-    fr = func()
-    tname = type(fr).__name__
-    if tname == 'DataFrame':
-        # dynamic, because pandas is an optional dependency..
-        df = cast(Any, fr)  # todo ugh, not sure how to annotate properly
-        res = dict(
-            dtypes=df.dtypes.to_dict(),
-            rows=len(df),
-        )
-    else:
-        res = _stat_iterable(fr)
-    return {
-        func.__name__: res,
-    }
-
-
-def _stat_iterable(it: Iterable[C]) -> Any:
-    from more_itertools import ilen, take, first
-
-    # todo not sure if there is something in more_itertools to compute this?
-    total = 0
-    errors = 0
-    last = None
-
-    def funcit():
-        nonlocal errors, last, total
-        for x in it:
-            total += 1
-            if isinstance(x, Exception):
-                errors += 1
-            else:
-                last = x
-            yield x
-
-    eit = funcit()
-    count: Any
-    if QUICK_STATS:
-        initial = take(100, eit)
-        count = len(initial)
-        if first(eit, None) is not None:  # todo can actually be none...
-            # haven't exhausted
-            count = f'{count}+'
-    else:
-        count = ilen(eit)
-
-    res = {
-        'count': count,
-    }
-
-    if total == 0:
-        # not sure but I guess a good balance? wouldn't want to throw early here?
-        res['warning'] = 'THE ITERABLE RETURNED NO DATA'
-
-    if errors > 0:
-        res['errors'] = errors
-
-    if last is not None:
-        dt = guess_datetime(last)
-        if dt is not None:
-            res['last'] = dt
-    return res
-
-
-# experimental, not sure about it..
-def guess_datetime(x: Any) -> Optional[datetime]:
-    # todo support dataclasses
-    asdict = getattr(x, '_asdict', None)
-    if asdict is None:
-        return None
-    # todo check if there are multiple?
-    for k, v in asdict().items():
-        if isinstance(v, datetime):
-            return v
-    return None
+# TODO deprecate and suggest to use one from my.core directly? not sure
+from .utils.itertools import unique_everseen  # noqa: F401
+
+### legacy imports, keeping them here for backwards compatibility
+## hiding behind TYPE_CHECKING so it works in runtime
+## in principle, warnings.deprecated decorator should cooperate with mypy, but doesn't look like it works atm?
+## perhaps it doesn't work when it's used from typing_extensions
+if not TYPE_CHECKING:
+    from .compat import deprecated
+
+    @deprecated('use my.core.compat.assert_never instead')
+    def assert_never(*args, **kwargs):
+        return compat.assert_never(*args, **kwargs)
+
+    @deprecated('use my.core.compat.fromisoformat instead')
+    def isoparse(*args, **kwargs):
+        return compat.fromisoformat(*args, **kwargs)
+
+    @deprecated('use more_itertools.one instead')
+    def the(*args, **kwargs):
+        import more_itertools
+
+        return more_itertools.one(*args, **kwargs)
+
+    @deprecated('use functools.cached_property instead')
+    def cproperty(*args, **kwargs):
+        import functools
+
+        return functools.cached_property(*args, **kwargs)
+
+    @deprecated('use more_itertools.bucket instead')
+    def group_by_key(l, key):
+        res = {}
+        for i in l:
+            kk = key(i)
+            lst = res.get(kk, [])
+            lst.append(i)
+            res[kk] = lst
+        return res
+
+    @deprecated('use my.core.utils.itertools.make_dict instead')
+    def make_dict(*args, **kwargs):
+        from .utils import itertools as UI
+
+        return UI.make_dict(*args, **kwargs)
+
+    @deprecated('use my.core.utils.itertools.listify instead')
+    def listify(*args, **kwargs):
+        from .utils import itertools as UI
+
+        return UI.listify(*args, **kwargs)
+
+    @deprecated('use my.core.warn_if_empty instead')
+    def warn_if_empty(*args, **kwargs):
+        from .utils import itertools as UI
+
+        return UI.warn_if_empty(*args, **kwargs)
+
+    @deprecated('use my.core.stat instead')
+    def stat(*args, **kwargs):
+        from . import stats
+
+        return stats.stat(*args, **kwargs)
+
+    @deprecated('use my.core.make_logger instead')
+    def LazyLogger(*args, **kwargs):
+        from . import logging
+
+        return logging.LazyLogger(*args, **kwargs)
+
+    @deprecated('use my.core.types.asdict instead')
+    def asdict(*args, **kwargs):
+        from . import types
+
+        return types.asdict(*args, **kwargs)
+
+    # todo wrap these in deprecated decorator as well?
+    # TODO hmm how to deprecate these in runtime?
+    # tricky cause they are actually classes/types
+    from typing import Literal  # noqa: F401
+
+    from .cachew import mcachew  # noqa: F401
+
+    # this is kinda internal, should just use my.core.logging.setup_logger if necessary
+    from .logging import setup_logger
+    from .stats import Stats
+    from .types import (
+        Json,
+        datetime_aware,
+        datetime_naive,
+    )
+
+    tzdatetime = datetime_aware
+else:
+    from .compat import Never
+
+    # make these invalid during type check while working in runtime
+    Stats = Never
+    tzdatetime = Never
+    Json = Never
+    datetime_naive = Never
+    datetime_aware = Never
+###
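The legacy block above hinges on a runtime/type-check split: deprecated names stay importable at runtime, while mypy sees them as Never and flags any remaining usage. A minimal sketch of the same pattern (OldJson is an illustrative name):

from typing import TYPE_CHECKING

if not TYPE_CHECKING:
    OldJson = dict  # existing user code keeps working at runtime
else:
    from typing_extensions import Never

    OldJson = Never  # any use of the legacy alias becomes a type error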
my/core/compat.py
@@ -1,49 +1,139 @@
 '''
-Some backwards compatibility stuff/deprecation helpers
+Contains backwards compatibility helpers for different python versions.
+If something is relevant to HPI itself, please put it in .hpi_compat instead
 '''
-from types import ModuleType
-
-from . import warnings
-from .common import LazyLogger
-
-
-logger = LazyLogger('my.core.compat')
-
-
-def pre_pip_dal_handler(
-    name: str,
-    e: ModuleNotFoundError,
-    cfg,
-    requires=[],
-) -> ModuleType:
-    '''
-    https://github.com/karlicoss/HPI/issues/79
-    '''
-    if e.name != name:
-        # the module itself was imported, so the problem is with some dependencies
-        raise e
-    try:
-        dal = _get_dal(cfg, name)
-        warnings.high(f'''
-Specifying modules' dependencies in the config or in my/config/repos is deprecated!
-Please install {' '.join(requires)} as PIP packages (see the corresponding README instructions).
-'''.strip(), stacklevel=2)
-    except ModuleNotFoundError as ee:
-        dal = None
-
-    if dal is None:
-        # probably means there was nothing in the old config in the first place
-        # so we should raise the original exception
-        raise e
-    return dal
-
-
-def _get_dal(cfg, module_name: str):
-    mpath = getattr(cfg, module_name, None)
-    if mpath is not None:
-        from .common import import_dir
-        return import_dir(mpath, '.dal')
-    else:
-        from importlib import import_module
-        return import_module(f'my.config.repos.{module_name}.dal')
+from __future__ import annotations
+
+import sys
+from typing import TYPE_CHECKING
+
+if sys.version_info[:2] >= (3, 13):
+    from warnings import deprecated
+else:
+    from typing_extensions import deprecated
+
+
+# keeping just for backwards compatibility, used to have compat implementation for 3.6
+if not TYPE_CHECKING:
+    import sqlite3
+
+    @deprecated('use .backup method on sqlite3.Connection directly instead')
+    def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwargs) -> None:
+        # TODO warn here?
+        source.backup(dest, **kwargs)
+
+    # keeping for runtime backwards compatibility (added in 3.9)
+    @deprecated('use .removeprefix method on string directly instead')
+    def removeprefix(text: str, prefix: str) -> str:
+        return text.removeprefix(prefix)
+
+    @deprecated('use .removesuffix method on string directly instead')
+    def removesuffix(text: str, suffix: str) -> str:
+        return text.removesuffix(suffix)
+
+    ##
+
+    ## used to have compat function before 3.8 for these, keeping for runtime back compatibility
+    from functools import cached_property
+    from typing import Literal, Protocol, TypedDict
+    ##
+
+
+if sys.version_info[:2] >= (3, 10):
+    from typing import ParamSpec
+else:
+    from typing_extensions import ParamSpec
+
+
+# bisect_left doesn't have a 'key' parameter (which we use)
+# till python3.10
+if sys.version_info[:2] <= (3, 9):
+    from typing import Any, Callable, List, Optional, TypeVar  # noqa: UP035
+
+    X = TypeVar('X')
+
+    # copied from python src
+    # fmt: off
+    def bisect_left(a: list[Any], x: Any, lo: int=0, hi: int | None=None, *, key: Callable[..., Any] | None=None) -> int:
+        if lo < 0:
+            raise ValueError('lo must be non-negative')
+        if hi is None:
+            hi = len(a)
+        # Note, the comparison uses "<" to match the
+        # __lt__() logic in list.sort() and in heapq.
+        if key is None:
+            while lo < hi:
+                mid = (lo + hi) // 2
+                if a[mid] < x:
+                    lo = mid + 1
+                else:
+                    hi = mid
+        else:
+            while lo < hi:
+                mid = (lo + hi) // 2
+                if key(a[mid]) < x:
+                    lo = mid + 1
+                else:
+                    hi = mid
+        return lo
+    # fmt: on
+else:
+    from bisect import bisect_left
+
+
+from datetime import datetime
+
+if sys.version_info[:2] >= (3, 11):
+    fromisoformat = datetime.fromisoformat
+else:
+    # fromisoformat didn't support Z as "utc" before 3.11
+    # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromisoformat
+
+    def fromisoformat(date_string: str) -> datetime:
+        if date_string.endswith('Z'):
+            date_string = date_string[:-1] + '+00:00'
+        return datetime.fromisoformat(date_string)
+
+
+def test_fromisoformat() -> None:
+    from datetime import timezone
+
+    # fmt: off
+    # feedbin has this format
+    assert fromisoformat('2020-05-01T10:32:02.925961Z') == datetime(
+        2020, 5, 1, 10, 32, 2, 925961, timezone.utc,
+    )
+
+    # polar has this format
+    assert fromisoformat('2018-11-28T22:04:01.304Z') == datetime(
+        2018, 11, 28, 22, 4, 1, 304000, timezone.utc,
+    )
+
+    # stackexchange, runnerup has this format
+    assert fromisoformat('2020-11-30T00:53:12Z') == datetime(
+        2020, 11, 30, 0, 53, 12, 0, timezone.utc,
+    )
+    # fmt: on
+
+    # arbtt has this format (sometimes less/more than 6 digits in milliseconds)
+    # TODO doesn't work atm, not sure if really should be supported...
+    # maybe should have flags for weird formats?
+    # assert isoparse('2017-07-18T18:59:38.21731Z') == datetime(
+    #     2017, 7, 18, 18, 59, 38, 217310, timezone.utc,
+    # )
+
+
+if sys.version_info[:2] >= (3, 10):
+    from types import NoneType
+    from typing import TypeAlias
+else:
+    NoneType = type(None)
+    from typing_extensions import TypeAlias
+
+
+if sys.version_info[:2] >= (3, 11):
+    from typing import Never, assert_never, assert_type
+else:
+    from typing_extensions import Never, assert_never, assert_type
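A usage note on the bisect_left backport above: the point of vendoring it is the key= parameter, which stdlib bisect only gained in Python 3.10. Illustrative data:

pairs = [('a', 1), ('b', 3), ('c', 7)]  # sorted by the second field
idx = bisect_left(pairs, 5, key=lambda p: p[1])
assert idx == 2  # 5 would be inserted before ('c', 7)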
my/core/core_config.py
@@ -1,54 +1,108 @@
 '''
 Bindings for the 'core' HPI configuration
 '''
-import re
-from typing import Sequence, Optional
-
-from .common import PathIsh
+from __future__ import annotations
+
+import re
+from collections.abc import Sequence
+from dataclasses import dataclass
+from pathlib import Path
+
 from . import warnings

 try:
     from my.config import core as user_config  # type: ignore[attr-defined]
 except Exception as e:
     try:
-        from my.config import common as user_config  # type: ignore[attr-defined, assignment, misc]
+        from my.config import common as user_config  # type: ignore[attr-defined]
+
         warnings.high("'common' config section is deprecated. Please rename it to 'core'.")
     except Exception as e2:
         # make it defensive, because it's pretty commonly used and would be annoying if it breaks hpi doctor etc.
         # this way it'll at least use the defaults
         # todo actually not sure if needs a warning? Perhaps it's okay without it, because the defaults are reasonable enough
-        user_config = object  # type: ignore[assignment, misc]
+        user_config = object
+
+
+_HPI_CACHE_DIR_DEFAULT = ''


-from dataclasses import dataclass
 @dataclass
 class Config(user_config):
-    # TODO if attr is set _and_ it's none, disable cache?
-    # todo or empty string?
-    # I guess flip the switch at some point when I'm confident in cachew
-    cache_dir: Optional[PathIsh] = None  # FIXME use appdirs cache dir or something
-
-    # list of regexes/globs
-    # None means 'rely on disabled_modules'
-    enabled_modules : Optional[Sequence[str]] = None
-
-    # list of regexes/globs
-    # None means 'rely on enabled_modules'
-    disabled_modules: Optional[Sequence[str]] = None
+    '''
+    Config for the HPI itself.
+    To override, add to your config file something like
+
+    class config:
+        cache_dir = '/your/custom/cache/path'
+    '''
+
+    cache_dir: Path | str | None = _HPI_CACHE_DIR_DEFAULT
+    '''
+    Base directory for cachew.
+    - if None             , means cache is disabled
+    - if '' (empty string), use user cache dir (see https://github.com/ActiveState/appdirs for more info). This is the default.
+    - otherwise           , use the specified directory as base cache directory
+
+    NOTE: you shouldn't use this attribute in HPI modules directly, use Config.get_cache_dir()/cachew.cache_dir() instead
+    '''
+
+    tmp_dir: Path | str | None = None
+    '''
+    Path to a temporary directory.
+    This can be used temporarily while extracting zipfiles etc...
+    - if None      , uses default determined by tempfile.gettempdir + 'HPI'
+    - otherwise    , use the specified directory as the base temporary directory
+    '''
+
+    enabled_modules: Sequence[str] | None = None
+    '''
+    list of regexes/globs
+    - None means 'rely on disabled_modules'
+    '''
+
+    disabled_modules: Sequence[str] | None = None
+    '''
+    list of regexes/globs
+    - None means 'rely on enabled_modules'
+    '''
+
+    def get_cache_dir(self) -> Path | None:
+        cdir = self.cache_dir
+        if cdir is None:
+            return None
+        if cdir == _HPI_CACHE_DIR_DEFAULT:
+            from .cachew import _appdirs_cache_dir
+
+            return _appdirs_cache_dir()
+        else:
+            return Path(cdir).expanduser()
+
+    def get_tmp_dir(self) -> Path:
+        tdir: Path | str | None = self.tmp_dir
+        tpath: Path
+        # use tempfile if unset
+        if tdir is None:
+            import tempfile
+
+            tpath = Path(tempfile.gettempdir()) / 'HPI'
+        else:
+            tpath = Path(tdir)
+        tpath = tpath.expanduser()
+        tpath.mkdir(parents=True, exist_ok=True)
+        return tpath

-    def _is_module_active(self, module: str) -> Optional[bool]:
+    def _is_module_active(self, module: str) -> bool | None:
         # None means the config doesn't specify anything
         # todo might be nice to return the 'reason' too? e.g. which option has matched
-        def matches(specs: Sequence[str]) -> Optional[str]:
+        def matches(specs: Sequence[str]) -> str | None:
             for spec in specs:
-                # not sure because . (packages separate) matches anything, but I guess unlikely to clash
+                # not sure because . (package separator) matches anything, but I guess unlikely to clash
                 if re.match(spec, module):
                     return spec
             return None

-        enabled  = self.enabled_modules
-        disabled = self.disabled_modules
         on  = matches(self.enabled_modules  or [])
         off = matches(self.disabled_modules or [])

@@ -58,28 +112,33 @@ class Config(user_config):
                 return None
             else:
                 return False
         else:  # not None
             if off is None:
                 return True
             else:  # not None
                 # fallback onto the 'enable everything', then the user will notice
                 warnings.medium(f"[module]: conflicting regexes '{on}' and '{off}' are set in the config. Please only use one of them.")
                 return True


 from .cfg import make_config
+
 config = make_config(Config)


 ### tests start
+from collections.abc import Iterator
 from contextlib import contextmanager as ctx


 @ctx
-def _reset_config():
+def _reset_config() -> Iterator[Config]:
     # todo maybe have this decorator for the whole of my.config?
-    from .cfg import override_config
-    with override_config(config) as cc:
+    from .cfg import _override_config
+    with _override_config(config) as cc:
         cc.enabled_modules  = None
         cc.disabled_modules = None
+        cc.cache_dir = None
         yield cc


@@ -96,18 +155,19 @@ def test_active_modules() -> None:
     with reset() as cc:
         cc.enabled_modules = ['my.whatever']
         cc.disabled_modules = ['my.body.*']
         assert cc._is_module_active('my.whatever'   ) is True
         assert cc._is_module_active('my.core'       ) is None
-        assert not cc._is_module_active('my.body.exercise') is True
+        assert cc._is_module_active('my.body.exercise') is False

     with reset() as cc:
         # if both are set, enable all
         cc.disabled_modules = ['my.body.*']
         cc.enabled_modules = ['my.body.exercise']
         assert cc._is_module_active('my.whatever'   ) is None
         assert cc._is_module_active('my.core'       ) is None
         with pytest.warns(UserWarning, match=r"conflicting regexes") as record_warnings:
             assert cc._is_module_active("my.body.exercise") is True
         assert len(record_warnings) == 1


 ### tests end
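For reference, a hypothetical user config exercising the new options (paths and module patterns are illustrative):

class core:
    cache_dir = '~/.cache/my'    # base directory for cachew; None disables caching
    tmp_dir = '/tmp/hpi'         # scratch space, e.g. for extracting zip files
    enabled_modules = ['my.github.*']
    disabled_modules = ['my.body.*']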
my/core/dataset.py
@@ -1,12 +1,5 @@
-from pathlib import Path
-
-# TODO wonder if also need to open without WAL.. test this on read-only directory/db file
-def connect_readonly(db: Path):
-    import dataset  # type: ignore
-    # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122
-    # todo not sure if mode=ro has any benefit, but it doesn't work on read-only filesystems
-    # maybe it should autodetect readonly filesystems and apply this? not sure
-    import sqlite3
-    # https://www.sqlite.org/draft/uri.html#uriimmutable
-    creator = lambda: sqlite3.connect(f'file:{db}?immutable=1', uri=True)
-    return dataset.connect('sqlite:///', engine_kwargs={'creator': creator})
+from . import warnings
+
+warnings.high(f"{__name__} is deprecated, please use dataset directly if you need or switch to my.core.sqlite")
+
+from ._deprecated.dataset import *
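If the old behaviour is still needed without the dataset dependency, it's easy to reproduce; a sketch based on the deleted code (immutable=1 is the bit that keeps it working on read-only filesystems):

import sqlite3

def connect_readonly(db):
    # https://www.sqlite.org/uri.html -- immutable=1 promises sqlite the file won't change
    return sqlite3.connect(f'file:{db}?immutable=1', uri=True)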
my/core/denylist.py (new file)
@@ -0,0 +1,179 @@
"""
A helper module for defining denylists for sources programmatically
(in layman's terms, this lets you remove some output from a module you don't want)

For docs, see doc/DENYLIST.md
"""

from __future__ import annotations

import functools
import json
import sys
from collections import defaultdict
from collections.abc import Iterator, Mapping
from pathlib import Path
from typing import Any, TypeVar

import click
from more_itertools import seekable

from .serialize import dumps
from .warnings import medium

T = TypeVar("T")

DenyMap = Mapping[str, set[Any]]


def _default_key_func(obj: T) -> str:
    return str(obj)


class DenyList:
    def __init__(self, denylist_file: Path | str) -> None:
        self.file = Path(denylist_file).expanduser().absolute()
        self._deny_raw_list: list[dict[str, Any]] = []
        self._deny_map: DenyMap = defaultdict(set)

        # deny cli, user can override these
        self.fzf_path = None
        self._fzf_options = ()
        self._deny_cli_key_func = None

    def _load(self) -> None:
        if not self.file.exists():
            medium(f"denylist file {self.file} does not exist")
            return

        deny_map: DenyMap = defaultdict(set)
        data: list[dict[str, Any]] = json.loads(self.file.read_text())
        self._deny_raw_list = data

        for ignore in data:
            for k, v in ignore.items():
                deny_map[k].add(v)

        self._deny_map = deny_map

    def load(self) -> DenyMap:
        self._load()
        return self._deny_map

    def write(self) -> None:
        if not self._deny_raw_list:
            medium("no denylist data to write")
            return
        self.file.write_text(json.dumps(self._deny_raw_list))

    @classmethod
    def _is_json_primitive(cls, val: Any) -> bool:
        return isinstance(val, (str, int, float, bool, type(None)))

    @classmethod
    def _stringify_value(cls, val: Any) -> Any:
        # if it's a primitive, just return it
        if cls._is_json_primitive(val):
            return val
        # otherwise, stringify-and-back so we can compare to
        # json data loaded from the denylist file
        return json.loads(dumps(val))

    @classmethod
    def _allow(cls, obj: T, deny_map: DenyMap) -> bool:
        for deny_key, deny_set in deny_map.items():
            # this should be done separately and not as part of the getattr
            # because 'null'/None could actually be a value in the denylist,
            # and the user may define behavior to filter that out
            if not hasattr(obj, deny_key):
                return False
            val = cls._stringify_value(getattr(obj, deny_key))
            # the attribute value matches an entry in the denylist
            if val in deny_set:
                return False
        # if we tried all the denylist keys and didn't return False,
        # then this object is allowed
        return True

    def filter(
        self,
        itr: Iterator[T],
        *,
        invert: bool = False,
    ) -> Iterator[T]:
        denyf = functools.partial(self._allow, deny_map=self.load())
        if invert:
            return filter(lambda x: not denyf(x), itr)
        return filter(denyf, itr)

    def deny(self, key: str, value: Any, *, write: bool = False) -> None:
        '''
        add a key/value pair to the denylist
        '''
        if not self._deny_raw_list:
            self._load()
        self._deny_raw({key: self._stringify_value(value)}, write=write)

    def _deny_raw(self, data: dict[str, Any], *, write: bool = False) -> None:
        self._deny_raw_list.append(data)
        if write:
            self.write()

    def _prompt_keys(self, item: T) -> str:
        import pprint

        click.echo(pprint.pformat(item))
        # TODO: extract keys from item by checking if its dataclass/NT etc.?
        resp = click.prompt("Key to deny on").strip()
        if not hasattr(item, resp):
            click.echo(f"Could not find key '{resp}' on item", err=True)
            return self._prompt_keys(item)
        return resp

    def _deny_cli_remember(
        self,
        items: Iterator[T],
        mem: dict[str, T],
    ) -> Iterator[str]:
        keyf = self._deny_cli_key_func or _default_key_func
        # i.e., convert each item to a string, and map str -> item
        for item in items:
            key = keyf(item)
            mem[key] = item
            yield key

    def deny_cli(self, itr: Iterator[T]) -> None:
        try:
            from pyfzf import FzfPrompt
        except ImportError:
            click.echo("pyfzf is required to use the denylist cli, run 'python3 -m pip install pyfzf_iter'", err=True)
            sys.exit(1)

        # wrap in seekable so we can use it multiple times
        # progressively caches the items as we iterate over them
        sit = seekable(itr)

        prompt_continue = True

        while prompt_continue:
            # reset the iterator
            sit.seek(0)
            # so we can map the selected string from fzf back to the original objects
            memory_map: dict[str, T] = {}
            picker = FzfPrompt(executable_path=self.fzf_path, default_options="--no-multi")
            picked_l = picker.prompt(
                self._deny_cli_remember(itr, memory_map),
                "--read0",
                *self._fzf_options,
                delimiter="\0",
            )
            assert isinstance(picked_l, list)
            if picked_l:
                picked: T = memory_map[picked_l[0]]
                key = self._prompt_keys(picked)
                self.deny(key, getattr(picked, key), write=True)
                click.echo(f"Added {self._deny_raw_list[-1]} to denylist", err=True)
            else:
                click.echo("No item selected", err=True)

            prompt_continue = click.confirm("Continue?")
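A hypothetical way a module would wire the denylist in (raw_ips and the file path are illustrative):

from my.core.denylist import DenyList

deny = DenyList('~/.config/my/ip_denylist.json')

def ips():
    # drops any item whose attribute matches an entry in the denylist file
    yield from deny.filter(raw_ips())

# interactively pick items to deny (requires pyfzf):
# deny.deny_cli(raw_ips())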
my/core/discovery_pure.py (new file)
@@ -0,0 +1,267 @@
'''
The idea of this module is to avoid imports of external HPI modules and code evaluation via ast module etc.

This potentially allows it to be:

- robust: can discover modules that can't be imported, generally makes it foolproof
- faster: importing is slow and with tens of modules can be noticeable
- secure: can be executed in a sandbox & used during setup

It should be free of external modules, importlib, exec, etc. etc.
'''

from __future__ import annotations

REQUIRES = 'REQUIRES'
NOT_HPI_MODULE_VAR = '__NOT_HPI_MODULE__'

###

import ast
import logging
import os
import re
from collections.abc import Iterable, Sequence
from pathlib import Path
from typing import Any, NamedTuple, Optional, cast

'''
None means that requirements weren't defined (different from empty requirements)
'''
Requires = Optional[Sequence[str]]


class HPIModule(NamedTuple):
    name: str
    skip_reason: str | None
    doc: str | None = None
    file: Path | None = None
    requires: Requires = None
    legacy: str | None = None  # contains reason/deprecation warning


def ignored(m: str) -> bool:
    excluded = [
        # legacy stuff left for backwards compatibility
        'core.*',
        'config.*',
    ]
    exs = '|'.join(excluded)
    return re.match(f'^my.({exs})$', m) is not None


def has_stats(src: Path) -> bool:
    # todo make sure consistent with get_stats?
    return _has_stats(src.read_text())


def _has_stats(code: str) -> bool:
    a: ast.Module = ast.parse(code)
    for x in a.body:
        try:  # maybe assign
            [tg] = cast(Any, x).targets
            if tg.id == 'stats':
                return True
        except:
            pass
        try:  # maybe def?
            name = cast(Any, x).name
            if name == 'stats':
                return True
        except:
            pass
    return False


def _is_not_module_src(src: Path) -> bool:
    a: ast.Module = ast.parse(src.read_text())
    return _is_not_module_ast(a)


def _is_not_module_ast(a: ast.Module) -> bool:
    marker = NOT_HPI_MODULE_VAR
    return any(
        getattr(node, 'name', None) == marker  # direct definition
        or any(getattr(n, 'name', None) == marker for n in getattr(node, 'names', []))  # import from
        for node in a.body
    )


def _is_legacy_module(a: ast.Module) -> bool:
    marker = 'handle_legacy_import'
    return any(
        getattr(node, 'name', None) == marker  # direct definition
        or any(getattr(n, 'name', None) == marker for n in getattr(node, 'names', []))  # import from
        for node in a.body
    )


# todo should be defensive? not sure
def _extract_requirements(a: ast.Module) -> Requires:
    # find the assignment..
    for x in a.body:
        if not isinstance(x, ast.Assign):
            continue
        tg = x.targets
        if len(tg) != 1:
            continue
        t = tg[0]
        # could be Subscript.. so best to keep dynamic
        id_ = getattr(t, 'id', None)
        if id_ != REQUIRES:
            continue
        vals = x.value
        # could be List/Tuple/Set?
        elts = getattr(vals, 'elts', None)
        if elts is None:
            continue
        deps = []
        for c in elts:
            if isinstance(c, ast.Constant):
                deps.append(c.value)
            elif isinstance(c, ast.Str):
                deps.append(c.s)
            else:
                raise RuntimeError(f"Expecting string constants only in {REQUIRES} declaration")
        return tuple(deps)
    return None


# todo should probably be more defensive..
def all_modules() -> Iterable[HPIModule]:
    """
    Return all importable modules under all items in the 'my' namespace package

    Note: This returns all modules under all roots - if you have
    several overlays (multiple items in my.__path__ and you've overridden
    modules), this can return multiple HPIModule objects with the same
    name. It should respect import order, as we're traversing
    in my.__path__ order, so module_by_name should still work
    and return the correctly resolved module, but all_modules
    can have duplicates
    """
    for my_root in _iter_my_roots():
        yield from _modules_under_root(my_root)


def _iter_my_roots() -> Iterable[Path]:
    import my  # doesn't import any code, because of namespace package

    paths: list[str] = list(my.__path__)
    if len(paths) == 0:
        # should probably never happen?, if this code is running, it was imported
        # because something was added to __path__ to match this name
        raise RuntimeError("my.__path__ was empty, try re-installing HPI?")
    else:
        yield from map(Path, paths)


def _modules_under_root(my_root: Path) -> Iterable[HPIModule]:
    """
    Experimental version, which isn't importing the modules, making it more robust and safe.
    """
    for f in sorted(my_root.rglob('*.py')):
        if f.is_symlink():
            continue  # meh
        mp = f.relative_to(my_root.parent)
        if mp.name == '__init__.py':
            mp = mp.parent
        m = str(mp.with_suffix('')).replace(os.sep, '.')
        if ignored(m):
            continue
        a: ast.Module = ast.parse(f.read_text())

        # legacy modules are 'forced' to be modules so 'hpi module install' still works for older modules
        # a bit messy, will think how to fix it properly later
        legacy_module = _is_legacy_module(a)
        if _is_not_module_ast(a) and not legacy_module:
            continue
        doc = ast.get_docstring(a, clean=False)

        requires: Requires = None
        try:
            requires = _extract_requirements(a)
        except Exception as e:
            logging.exception(e)

        legacy = f'{m} is DEPRECATED. Please refer to the module documentation.' if legacy_module else None

        yield HPIModule(
            name=m,
            skip_reason=None,
            doc=doc,
            file=f.relative_to(my_root.parent),
            requires=requires,
            legacy=legacy,
        )


def module_by_name(name: str) -> HPIModule:
    for m in all_modules():
        if m.name == name:
            return m
    raise RuntimeError(f'No such module: {name}')


### tests


def test() -> None:
    # TODO this should be a 'sanity check' or something
    assert len(list(all_modules())) > 10  # kinda arbitrary


def test_demo() -> None:
    demo = module_by_name('my.demo')
    assert demo.doc is not None
    assert demo.file == Path('my', 'demo.py')
    assert demo.requires is None


def test_excluded() -> None:
    for m in all_modules():
        assert 'my.core.' not in m.name


def test_requires() -> None:
    photos = module_by_name('my.photos.main')
    r = photos.requires
    assert r is not None
    assert len(r) == 2  # fragile, but ok for now


def test_legacy_modules() -> None:
    # shouldn't crash
    module_by_name('my.reddit')
    module_by_name('my.fbmessenger')


def test_pure() -> None:
    """
    We want to keep this module clean of other HPI imports
    """
    # this uses string concatenation here to prevent
    # these tests from testing against themselves
    src = Path(__file__).read_text()
    # 'import my' is allowed, but
    # don't allow any other HPI modules
    assert re.findall('import ' + r'my\.\S+', src, re.MULTILINE) == []
    assert 'from ' + 'my' not in src
my/core/error.py
@@ -3,19 +3,34 @@ Various error handling helpers
 See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail
 """

-from itertools import tee
-from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any
+from __future__ import annotations
+
+import traceback
+from collections.abc import Iterable, Iterator
+from datetime import datetime
+from itertools import tee
+from typing import (
+    Any,
+    Callable,
+    Literal,
+    TypeVar,
+    Union,
+    cast,
+)
+
+from .types import Json

 T = TypeVar('T')
 E = TypeVar('E', bound=Exception)  # TODO make covariant?

 ResT = Union[T, E]

 Res = ResT[T, Exception]

+ErrorPolicy = Literal["yield", "raise", "drop"]
+

-def notnone(x: Optional[T]) -> T:
+def notnone(x: T | None) -> T:
     assert x is not None
     return x

@@ -23,8 +38,41 @@ def notnone(x: Optional[T]) -> T:
 def unwrap(res: Res[T]) -> T:
     if isinstance(res, Exception):
         raise res
-    else:
-        return res
+    return res
+
+
+def drop_exceptions(itr: Iterator[Res[T]]) -> Iterator[T]:
+    """Return non-errors from the iterable"""
+    for o in itr:
+        if isinstance(o, Exception):
+            continue
+        yield o
+
+
+def raise_exceptions(itr: Iterable[Res[T]]) -> Iterator[T]:
+    """Raise errors from the iterable, stops the select function"""
+    for o in itr:
+        if isinstance(o, Exception):
+            raise o
+        yield o
+
+
+def warn_exceptions(itr: Iterable[Res[T]], warn_func: Callable[[Exception], None] | None = None) -> Iterator[T]:
+    # if not provided, use the 'warnings' module
+    if warn_func is None:
+        from my.core.warnings import medium
+
+        def _warn_func(e: Exception) -> None:
+            # TODO: print traceback? but user could always --raise-exceptions as well
+            medium(str(e))
+
+        warn_func = _warn_func
+
+    for o in itr:
+        if isinstance(o, Exception):
+            warn_func(o)
+            continue
+        yield o


 def echain(ex: E, cause: Exception) -> E:
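A sketch of the three new iterator helpers in use (events is an illustrative generator yielding Res values):

good   = list(drop_exceptions(events()))   # silently skip errors
logged = list(warn_exceptions(events()))   # warn per error, keep going
strict = list(raise_exceptions(events()))  # raise on the first error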
@ -32,7 +80,7 @@ def echain(ex: E, cause: Exception) -> E:
|
||||||
return ex
|
return ex
|
||||||
|
|
||||||
|
|
||||||
def split_errors(l: Iterable[ResT[T, E]], ET: Type[E]) -> Tuple[Iterable[T], Iterable[E]]:
|
def split_errors(l: Iterable[ResT[T, E]], ET: type[E]) -> tuple[Iterable[T], Iterable[E]]:
|
||||||
# TODO would be nice to have ET=Exception default? but it causes some mypy complaints?
|
# TODO would be nice to have ET=Exception default? but it causes some mypy complaints?
|
||||||
vit, eit = tee(l)
|
vit, eit = tee(l)
|
||||||
# TODO ugh, not sure if I can reconcile type checking and runtime and convince mypy that ET and E are the same type?
|
# TODO ugh, not sure if I can reconcile type checking and runtime and convince mypy that ET and E are the same type?
|
||||||
|
@ -50,7 +98,9 @@ def split_errors(l: Iterable[ResT[T, E]], ET: Type[E]) -> Tuple[Iterable[T], Ite
|
||||||
|
|
||||||
|
|
||||||
K = TypeVar('K')
|
K = TypeVar('K')
|
||||||
def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> List[Res[T]]:
|
|
||||||
|
|
||||||
|
def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> list[Res[T]]:
|
||||||
"""
|
"""
|
||||||
Sort a sequence potentially interleaved with errors/entries on which the key can't be computed.
|
Sort a sequence potentially interleaved with errors/entries on which the key can't be computed.
|
||||||
The general idea is: the error sticks to the non-error entry that follows it
|
The general idea is: the error sticks to the non-error entry that follows it
|
||||||
|
@ -58,20 +108,20 @@ def sort_res_by(items: Iterable[Res[T]], key: Callable[[Any], K]) -> List[Res[T]
|
||||||
group = []
|
group = []
|
||||||
groups = []
|
groups = []
|
||||||
for i in items:
|
for i in items:
|
||||||
k: Optional[K]
|
k: K | None
|
||||||
try:
|
try:
|
||||||
k = key(i)
|
k = key(i)
|
||||||
except Exception as e:
|
except Exception: # error white computing key? dunno, might be nice to handle...
|
||||||
k = None
|
k = None
|
||||||
group.append(i)
|
group.append(i)
|
||||||
if k is not None:
|
if k is not None:
|
||||||
groups.append((k, group))
|
groups.append((k, group))
|
||||||
group = []
|
group = []
|
||||||
|
|
||||||
results: List[Res[T]] = []
|
results: list[Res[T]] = []
|
||||||
for v, grp in sorted(groups, key=lambda p: p[0]): # type: ignore[return-value, arg-type] # TODO SupportsLessThan??
|
for _v, grp in sorted(groups, key=lambda p: p[0]): # type: ignore[return-value, arg-type] # TODO SupportsLessThan??
|
||||||
results.extend(grp)
|
results.extend(grp)
|
||||||
results.extend(group) # handle last group (it will always be errors only)
|
results.extend(group) # handle last group (it will always be errors only)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
@ -91,7 +141,7 @@ def test_sort_res_by() -> None:
|
||||||
1,
|
1,
|
||||||
Exc('last'),
|
Exc('last'),
|
||||||
]
|
]
|
||||||
results = sort_res_by(ress, lambda x: int(x)) # type: ignore
|
results = sort_res_by(ress, lambda x: int(x))
|
||||||
assert results == [
|
assert results == [
|
||||||
1,
|
1,
|
||||||
'bad',
|
'bad',
|
||||||
|
@ -103,32 +153,32 @@ def test_sort_res_by() -> None:
|
||||||
Exc('last'),
|
Exc('last'),
|
||||||
]
|
]
|
||||||
|
|
||||||
results2 = sort_res_by(ress + [0], lambda x: int(x)) # type: ignore
|
results2 = sort_res_by([*ress, 0], lambda x: int(x))
|
||||||
assert results2 == [Exc('last'), 0] + results[:-1]
|
assert results2 == [Exc('last'), 0] + results[:-1]
|
||||||
|
|
||||||
assert sort_res_by(['caba', 'a', 'aba', 'daba'], key=lambda x: len(x)) == ['a', 'aba', 'caba', 'daba']
|
assert sort_res_by(['caba', 'a', 'aba', 'daba'], key=lambda x: len(x)) == ['a', 'aba', 'caba', 'daba']
|
||||||
assert sort_res_by([], key=lambda x: x) == [] # type: ignore
|
assert sort_res_by([], key=lambda x: x) == []
|


 # helpers to associate timestamps with the errors (so something meaningful could be displayed on the plots, for example)
 # todo document it under 'patterns' somewhere...

 # todo proper typevar?
-from datetime import datetime
-def set_error_datetime(e: Exception, dt: Optional[datetime]) -> None:
+def set_error_datetime(e: Exception, dt: datetime | None) -> None:
     if dt is None:
         return
-    e.args = e.args + (dt,)
+    e.args = (*e.args, dt)
     # todo not sure if should return new exception?

-def attach_dt(e: Exception, *, dt: Optional[datetime]) -> Exception:
+
+def attach_dt(e: Exception, *, dt: datetime | None) -> Exception:
     set_error_datetime(e, dt)
     return e


 # todo it might be problematic because might mess with timezones (when it's converted to string, it's converted to a shift)
-def extract_error_datetime(e: Exception) -> Optional[datetime]:
-    from .common import fromisoformat
+def extract_error_datetime(e: Exception) -> datetime | None:
     import re

     for x in reversed(e.args):
         if isinstance(x, datetime):
             return x
@@ -139,14 +189,84 @@ def extract_error_datetime(e: Exception) -> datetime | None:
             continue
         ss = m.group(0)
         # todo not sure if should be defensive??
-        return fromisoformat(ss)
+        return datetime.fromisoformat(ss)
     return None


+def error_to_json(e: Exception) -> Json:
+    estr = ''.join(traceback.format_exception(Exception, e, e.__traceback__))
+    return {'error': estr}
+
+
+MODULE_SETUP_URL = 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#private-configuration-myconfig'
+
+
+def warn_my_config_import_error(
+    err: ImportError | AttributeError,
+    *,
+    help_url: str | None = None,
+    module_name: str | None = None,
+) -> bool:
+    """
+    If the user tried to import something from my.config but it failed,
+    possibly due to missing the config block in my.config?
+
+    Returns True if it matched a possible config error
+    """
+    import re
+
+    import click
+
+    if help_url is None:
+        help_url = MODULE_SETUP_URL
+    if type(err) is ImportError:
+        if err.name != 'my.config':
+            return False
+        # parse name that user attempted to import
+        em = re.match(r"cannot import name '(\w+)' from 'my.config'", str(err))
+        if em is not None:
+            section_name = em.group(1)
+            click.secho(f"""\
+You may be missing the '{section_name}' section from your config.
+See {help_url}\
+""", fg='yellow', err=True)
+            return True
+    elif type(err) is AttributeError:
+        # test if user had a nested config block missing
+        # https://github.com/karlicoss/HPI/issues/223
+        if hasattr(err, 'obj') and hasattr(err, "name"):
+            config_obj = cast(object, getattr(err, 'obj'))  # the object that caused the attribute error
+            # e.g. active_browser for my.browser
+            nested_block_name = err.name
+            errmsg = f"""You're likely missing the nested config block for '{getattr(config_obj, '__name__', str(config_obj))}.{nested_block_name}'.
+See {help_url} or check the corresponding module.py file for an example\
+"""
+            if config_obj.__module__ == 'my.config':
+                click.secho(errmsg, fg='yellow', err=True)
+                return True
+            if module_name is not None and nested_block_name == module_name.split('.')[-1]:
+                # this tries to cover cases like these
+                # user config:
+                # class location:
+                #     class via_ip:
+                #         accuracy = 10_000
+                # then when we import it, we do something like
+                # from my.config import location
+                # user_config = location.via_ip
+                # so if location is present, but via_ip is not, we get
+                # AttributeError: type object 'location' has no attribute 'via_ip'
+                click.secho(errmsg, fg='yellow', err=True)
+                return True
+    else:
+        click.echo(f"Unexpected error... {err}", err=True)
+    return False
+
+
 def test_datetime_errors() -> None:
-    import pytz
+    import pytz  # noqa: I001

     dt_notz = datetime.now()
     dt_tz = datetime.now(tz=pytz.timezone('Europe/Amsterdam'))
     for dt in [dt_tz, dt_notz]:
         e1 = RuntimeError('whatever')
         assert extract_error_datetime(e1) is None
@@ -156,7 +276,6 @@ def test_datetime_errors() -> None:
         e2 = RuntimeError(f'something something {dt} something else')
         assert extract_error_datetime(e2) == dt

-
         e3 = RuntimeError(str(['one', '2019-11-27T08:56:00', 'three']))
         assert extract_error_datetime(e3) is not None
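Note: warn_my_config_import_error above is meant to be called from a module's config-import error path. A minimal sketch of the intended call site (the module and section names here are illustrative, not from this diff):

# hypothetical call site inside an HPI module, e.g. my/browser/export.py
try:
    from my.config import browser as user_config  # 'browser' section name is an example
except (ImportError, AttributeError) as e:
    from my.core.error import warn_my_config_import_error
    if not warn_my_config_import_error(e, module_name='my.browser.export'):
        raise  # not a config problem -- don't swallow it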

my/core/experimental.py (new file, 66 lines)
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import sys
+import types
+from typing import Any
+
+
+# The idea behind this one is to support accessing "overlaid/shadowed" modules from namespace packages
+# See usage examples here:
+# - https://github.com/karlicoss/hpi-personal-overlay/blob/master/src/my/util/hpi_heartbeat.py
+# - https://github.com/karlicoss/hpi-personal-overlay/blob/master/src/my/twitter/all.py
+# Suppose you want to use my.twitter.talon, which isn't in the default all.py
+# You could just copy all.py to your personal overlay, but that would mean duplicating
+# all the code and possible upstream changes.
+# Alternatively, you could import the "original" my.twitter.all module from "overlay" my.twitter.all
+# _ORIG = import_original_module(__name__, __file__)
+# this would magically take care of package import path etc,
+# and should import the "original" my.twitter.all as _ORIG
+# After that you can call its methods, extend etc.
+def import_original_module(
+    module_name: str,
+    file: str,
+    *,
+    star: bool = False,
+    globals: dict[str, Any] | None = None,
+) -> types.ModuleType:
+    module_to_restore = sys.modules[module_name]
+
+    # NOTE: we really wanna hack the actual package of the module
+    # rather than just top level my.
+    # since that would be a bit less disruptive
+    module_pkg = module_to_restore.__package__
+    assert module_pkg is not None
+    parent = sys.modules[module_pkg]
+
+    my_path = parent.__path__._path  # type: ignore[attr-defined]
+    my_path_orig = list(my_path)
+
+    def fixup_path() -> None:
+        for i, p in enumerate(my_path_orig):
+            starts = file.startswith(p)
+            if i == 0:
+                # not sure about this.. but I guess it'll always be 0th element?
+                assert starts, (my_path_orig, file)
+            if starts:
+                my_path.remove(p)
+        # should remove exactly one item
+        assert len(my_path) + 1 == len(my_path_orig), (my_path_orig, file)
+
+    try:
+        fixup_path()
+        try:
+            del sys.modules[module_name]
+            # NOTE: we're using __import__ instead of importlib.import_module
+            # since it's closer to the actual normal import (e.g. imports subpackages etc properly)
+            # fromlist=[None] forces it to return rightmost child
+            # (otherwise would just return 'my' package)
+            res = __import__(module_name, fromlist=[None])  # type: ignore[list-item]
+            if star:
+                assert globals is not None
+                globals.update({k: v for k, v in vars(res).items() if not k.startswith('_')})
+            return res
+        finally:
+            sys.modules[module_name] = module_to_restore
+    finally:
+        my_path[:] = my_path_orig
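To make the workflow from the comments above concrete, a sketch of what a personal overlay's my/twitter/all.py might look like (the talon import is the example mentioned in the comments; the overlay layout itself is an assumption):

# hypothetical overlay file: my/twitter/all.py in a personal overlay package
from my.core.experimental import import_original_module

# pulls in the 'original' my.twitter.all; star=True re-exports its public names here
_ORIG = import_original_module(__name__, __file__, star=True, globals=globals())

# now extend it, e.g. with a source the upstream all.py doesn't include
# from . import talon  # illustrative extra source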

my/core/freezer.py (new file, 82 lines)
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+from .internal import assert_subpackage
+
+assert_subpackage(__name__)
+
+import dataclasses
+import inspect
+from typing import Any, Generic, TypeVar
+
+D = TypeVar('D')
+
+
+def _freeze_dataclass(Orig: type[D]):
+    ofields = [(f.name, f.type, f) for f in dataclasses.fields(Orig)]  # type: ignore[arg-type]  # see https://github.com/python/typing_extensions/issues/115
+
+    # extract properties along with their types
+    props = list(inspect.getmembers(Orig, lambda o: isinstance(o, property)))
+    pfields = [(name, inspect.signature(getattr(prop, 'fget')).return_annotation) for name, prop in props]
+    # FIXME not sure about name?
+    # NOTE: sadly passing bases=[Orig] won't work, python won't let us override properties with fields
+    RRR = dataclasses.make_dataclass('RRR', fields=[*ofields, *pfields])
+    # todo maybe even declare as slots?
+    return props, RRR
+
+
+class Freezer(Generic[D]):
+    '''
+    Some magic which converts dataclass properties into fields.
+    It could be useful for better serialization, for performance, for using type as a schema.
+    For now only supports dataclasses.
+    '''
+
+    def __init__(self, Orig: type[D]) -> None:
+        self.Orig = Orig
+        self.props, self.Frozen = _freeze_dataclass(Orig)
+
+    def freeze(self, value: D) -> D:
+        pvalues = {name: getattr(value, name) for name, _ in self.props}
+        return self.Frozen(**dataclasses.asdict(value), **pvalues)  # type: ignore[call-overload]  # see https://github.com/python/typing_extensions/issues/115
+
+
+### tests
+
+
+# this needs to be defined here to prevent a mypy bug
+# see https://github.com/python/mypy/issues/7281
+@dataclasses.dataclass
+class _A:
+    x: Any
+
+    # TODO what about error handling?
+    @property
+    def typed(self) -> int:
+        return self.x['an_int']
+
+    @property
+    def untyped(self):
+        return self.x['an_any']
+
+
+def test_freezer() -> None:
+    val = _A(x={
+        'an_int': 123,
+        'an_any': [1, 2, 3],
+    })
+    af = Freezer(_A)
+    fval = af.freeze(val)
+
+    fd = vars(fval)
+    assert fd['typed'] == 123
+    assert fd['untyped'] == [1, 2, 3]
+
+
+###
+
+# TODO shit. what to do with exceptions?
+# e.g. good testcase is date parsing issue. should def yield Exception in this case
+# fundamentally it should just be Exception aware, dunno
+#
+# TODO not entirely sure if best to use Frozen as the schema, or actually convert objects..
+# guess need to experiment and see
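A rough illustration of the 'better serialization' point from the Freezer docstring, reusing the _A dataclass above: once properties become real fields, dataclasses.asdict picks them up and the result is plain JSON-serializable data (assuming the field values themselves are serializable):

import dataclasses
import json

val = _A(x={'an_int': 123, 'an_any': [1, 2, 3]})
frozen = Freezer(_A).freeze(val)
# 'typed' and 'untyped' are now fields, so asdict includes them
print(json.dumps(dataclasses.asdict(frozen)))
# {"x": {"an_int": 123, "an_any": [1, 2, 3]}, "typed": 123, "untyped": [1, 2, 3]}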

my/core/hpi_compat.py (new file, 260 lines)
@@ -0,0 +1,260 @@
+"""
+Contains various backwards compatibility/deprecation helpers relevant to HPI itself.
+(as opposed to .compat module which implements compatibility between python versions)
+"""
+
+from __future__ import annotations
+
+import inspect
+import os
+import re
+from collections.abc import Iterator, Sequence
+from types import ModuleType
+from typing import TypeVar
+
+from . import warnings
+
+
+def handle_legacy_import(
+    parent_module_name: str,
+    legacy_submodule_name: str,
+    parent_module_path: list[str],
+) -> bool:
+    ###
+    # this is to trick mypy into treating this as a proper namespace package
+    # should only be used for backwards compatibility on packages that are converted into namespace & all.py pattern
+    # - https://www.python.org/dev/peps/pep-0382/#namespace-packages-today
+    # - https://github.com/karlicoss/hpi_namespace_experiment
+    # - discussion here https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/extending.20HPI/near/269946944
+    from pkgutil import extend_path
+
+    parent_module_path[:] = extend_path(parent_module_path, parent_module_name)
+    # 'this' source tree ends up first in the pythonpath when we extend_path()
+    # so we need to move 'this' source tree towards the end to make sure we prioritize overlays
+    parent_module_path[:] = parent_module_path[1:] + parent_module_path[:1]
+    ###
+
+    # allow stuff like 'import my.module.submodule' and such
+    imported_as_parent = False
+
+    # allow stuff like 'from my.module import submodule'
+    importing_submodule = False
+
+    # some hacky traceback to inspect the current stack
+    # to see if the user is using the old style of importing
+    for f in inspect.stack():
+        # seems that when a submodule is imported, at some point it'll call some internal import machinery
+        # with 'parent' set to the parent module
+        # if parent module is imported first (i.e. in case of deprecated usage), it won't be the case
+        args = inspect.getargvalues(f.frame)
+        if args.locals.get('parent') == parent_module_name:
+            imported_as_parent = True
+
+        # this we can only detect from the code I guess
+        line = '\n'.join(f.code_context or [])
+        if re.match(rf'from\s+{parent_module_name}\s+import\s+{legacy_submodule_name}', line):
+            importing_submodule = True
+
+    # click sets '_HPI_COMPLETE' env var when it's doing autocompletion
+    # otherwise, the warning will be printed every time you try to tab complete
+    autocompleting_module_cli = "_HPI_COMPLETE" in os.environ
+
+    is_legacy_import = not (imported_as_parent or importing_submodule)
+    if is_legacy_import and not autocompleting_module_cli:
+        warnings.high(
+            f'''\
+importing {parent_module_name} is DEPRECATED! \
+Instead, import from {parent_module_name}.{legacy_submodule_name} or {parent_module_name}.all \
+See https://github.com/karlicoss/HPI/blob/master/doc/MODULE_DESIGN.org#allpy for more info.
+'''
+        )
+    return is_legacy_import
+
+
+def pre_pip_dal_handler(
+    name: str,
+    e: ModuleNotFoundError,
+    cfg,
+    requires: Sequence[str] = (),
+) -> ModuleType:
+    '''
+    https://github.com/karlicoss/HPI/issues/79
+    '''
+    if e.name != name:
+        # the module itself was imported, so the problem is with some dependencies
+        raise e
+    try:
+        dal = _get_dal(cfg, name)
+        warnings.high(
+            f'''
+Specifying modules' dependencies in the config or in my/config/repos is deprecated!
+Please install {' '.join(requires)} as PIP packages (see the corresponding README instructions).
+'''.strip(),
+            stacklevel=2,
+        )
+    except ModuleNotFoundError:
+        dal = None
+
+    if dal is None:
+        # probably means there was nothing in the old config in the first place
+        # so we should raise the original exception
+        raise e
+    return dal
+
+
+def _get_dal(cfg, module_name: str):
+    mpath = getattr(cfg, module_name, None)
+    if mpath is not None:
+        from .utils.imports import import_dir
+
+        return import_dir(mpath, '.dal')
+    else:
+        from importlib import import_module
+
+        return import_module(f'my.config.repos.{module_name}.dal')
+
+
+V = TypeVar('V')
+
+
+# named to be kinda consistent with more_itertools, e.g. more_itertools.always_iterable
+class always_supports_sequence(Iterator[V]):
+    """
+    Helper to make migration from Sequence/List to Iterable/Iterator type backwards compatible in runtime
+    """
+
+    def __init__(self, it: Iterator[V]) -> None:
+        self._it = it
+        self._list: list[V] | None = None
+        self._lit: Iterator[V] | None = None
+
+    def __iter__(self) -> Iterator[V]:  # noqa: PYI034
+        if self._list is not None:
+            self._lit = iter(self._list)
+        return self
+
+    def __next__(self) -> V:
+        if self._list is not None:
+            assert self._lit is not None
+            delegate = self._lit
+        else:
+            delegate = self._it
+        return next(delegate)
+
+    def __getattr__(self, name):
+        return getattr(self._it, name)
+
+    @property
+    def _aslist(self) -> list[V]:
+        if self._list is None:
+            qualname = getattr(self._it, '__qualname__', '<no qualname>')  # defensive just in case
+            warnings.medium(f'Using {qualname} as list is deprecated. Migrate to iterative processing or call list() explicitly.')
+            self._list = list(self._it)
+
+            # this is necessary for list constructor to work correctly
+            # since it's __iter__ first, then tries to compute length and then starts iterating...
+            self._lit = iter(self._list)
+        return self._list
+
+    def __len__(self) -> int:
+        return len(self._aslist)
+
+    def __getitem__(self, i: int) -> V:
+        return self._aslist[i]
+
+
+def test_always_supports_sequence_list_constructor() -> None:
+    exhausted = 0
+
+    def it() -> Iterator[str]:
+        nonlocal exhausted
+        yield from ['a', 'b', 'c']
+        exhausted += 1
+
+    sit = always_supports_sequence(it())
+
+    # list constructor is a bit special... it's trying to compute length if it's available to optimize memory allocation
+    # so, what's happening in this case is
+    # - sit.__iter__ is called
+    # - sit.__len__ is called
+    # - sit.__next__ is called
+    res = list(sit)
+    assert res == ['a', 'b', 'c']
+    assert exhausted == 1
+
+    res = list(sit)
+    assert res == ['a', 'b', 'c']
+    assert exhausted == 1  # this will iterate over 'cached' list now, so original generator is only exhausted once
+
+
+def test_always_supports_sequence_indexing() -> None:
+    exhausted = 0
+
+    def it() -> Iterator[str]:
+        nonlocal exhausted
+        yield from ['a', 'b', 'c']
+        exhausted += 1
+
+    sit = always_supports_sequence(it())
+
+    assert len(sit) == 3
+    assert exhausted == 1
+
+    assert sit[2] == 'c'
+    assert sit[1] == 'b'
+    assert sit[0] == 'a'
+    assert exhausted == 1
+
+    # a few tests to make sure list-like operations are working..
+    assert list(sit) == ['a', 'b', 'c']
+    assert [x for x in sit] == ['a', 'b', 'c']  # noqa: C416
+    assert list(sit) == ['a', 'b', 'c']
+    assert [x for x in sit] == ['a', 'b', 'c']  # noqa: C416
+    assert exhausted == 1
+
+
+def test_always_supports_sequence_next() -> None:
+    exhausted = 0
+
+    def it() -> Iterator[str]:
+        nonlocal exhausted
+        yield from ['a', 'b', 'c']
+        exhausted += 1
+
+    sit = always_supports_sequence(it())
+
+    x = next(sit)
+    assert x == 'a'
+    assert exhausted == 0
+
+    x = next(sit)
+    assert x == 'b'
+    assert exhausted == 0
+
+
+def test_always_supports_sequence_iter() -> None:
+    exhausted = 0
+
+    def it() -> Iterator[str]:
+        nonlocal exhausted
+        yield from ['a', 'b', 'c']
+        exhausted += 1
+
+    sit = always_supports_sequence(it())
+
+    for x in sit:
+        assert x == 'a'
+        break
+
+    x = next(sit)
+    assert x == 'b'
+
+    assert exhausted == 0
+
+    x = next(sit)
+    assert x == 'c'
+    assert exhausted == 0
+
+    for _ in sit:
+        raise RuntimeError  # shouldn't trigger, just exhaust the iterator
+    assert exhausted == 1
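For context: handle_legacy_import is intended to be called from a package __init__.py while that package is being converted to the namespace package + all.py pattern. A sketch of such a shim (the reddit/rexport names are just an example of this kind of migration):

# hypothetical shim: my/reddit/__init__.py during migration to the all.py pattern
from my.core.hpi_compat import handle_legacy_import

is_legacy_import = handle_legacy_import(
    parent_module_name=__name__,       # 'my.reddit'
    legacy_submodule_name='rexport',   # illustrative legacy submodule
    parent_module_path=__path__,
)
if is_legacy_import:
    # backwards compatibility: 'import my.reddit' used to mean the rexport-based module
    from .rexport import *  # noqa: F403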

my/core/influxdb.py (new file, 159 lines)
@@ -0,0 +1,159 @@
+'''
+TODO doesn't really belong to 'core' morally, but can think of moving out later
+'''
+
+from __future__ import annotations
+
+from .internal import assert_subpackage
+
+assert_subpackage(__name__)
+
+from collections.abc import Iterable
+from typing import Any
+
+import click
+
+from .logging import make_logger
+from .types import Json, asdict
+
+logger = make_logger(__name__)
+
+
+class config:
+    db = 'db'
+
+
+RESET_DEFAULT = False
+
+
+def fill(it: Iterable[Any], *, measurement: str, reset: bool = RESET_DEFAULT, dt_col: str = 'dt') -> None:
+    # todo infer dt column automatically, reuse in stat?
+    # it doesn't like dots, ends up some syntax error?
+    measurement = measurement.replace('.', '_')
+    # todo autoinfer measurement?
+
+    db = config.db
+
+    from influxdb import InfluxDBClient  # type: ignore
+
+    client = InfluxDBClient()
+    # todo maybe create if not exists?
+    # client.create_database(db)
+
+    # todo should it be an env variable?
+    if reset:
+        logger.warning('deleting measurements: %s:%s', db, measurement)
+        client.delete_series(database=db, measurement=measurement)
+
+    # TODO need to take schema here...
+    cache: dict[str, bool] = {}
+
+    def good(f, v) -> bool:
+        c = cache.get(f)
+        if c is not None:
+            return c
+        t = type(v)
+        r = t in {str, int}
+        cache[f] = r
+        if not r:
+            logger.warning('%s: filtering out %s=%s because of type %s', measurement, f, v, t)
+        return r
+
+    def filter_dict(d: Json) -> Json:
+        return {f: v for f, v in d.items() if good(f, v)}
+
+    def dit() -> Iterable[Json]:
+        for i in it:
+            d = asdict(i)
+            tags: Json | None = None
+            tags_ = d.get('tags')  # meh... handle in a more robust manner
+            if tags_ is not None and isinstance(tags_, dict):  # FIXME meh.
+                del d['tags']
+                tags = tags_
+
+            # TODO what to do with exceptions??
+            # todo handle errors.. not sure how? maybe add tag for 'error' and fill with empty data?
+            dt = d[dt_col].isoformat()
+            del d[dt_col]
+
+            fields = filter_dict(d)
+
+            yield {
+                'measurement': measurement,
+                # TODO maybe good idea to tag with database file/name? to inspect inconsistencies etc..
+                # hmm, so tags are autoindexed and might be faster?
+                # not sure what's the big difference though
+                # "fields are data and tags are metadata"
+                'tags': tags,
+                'time': dt,
+                'fields': fields,
+            }
+
+    from more_itertools import chunked
+
+    # "The optimal batch size is 5000 lines of line protocol."
+    # some chunking is def necessary, otherwise it fails
+    inserted = 0
+    for chi in chunked(dit(), n=5000):
+        chl = list(chi)
+        inserted += len(chl)
+        logger.debug('writing next chunk %s', chl[-1])
+        client.write_points(chl, database=db)
+
+    logger.info('inserted %d points', inserted)
+    # todo "Specify timestamp precision when writing to InfluxDB."?
+
+
+def magic_fill(it, *, name: str | None = None, reset: bool = RESET_DEFAULT) -> None:
+    if name is None:
+        assert callable(it)  # generators have no name/module
+        name = f'{it.__module__}:{it.__name__}'
+    assert name is not None
+
+    if callable(it):
+        it = it()
+
+    from itertools import tee
+
+    from more_itertools import first, one
+
+    it, x = tee(it)
+    f = first(x, default=None)
+    if f is None:
+        logger.warning('%s has no data', name)
+        return
+
+    # TODO can we reuse pandas code or something?
+    #
+    from .pandas import _as_columns
+
+    schema = _as_columns(type(f))
+
+    from datetime import datetime
+
+    dtex = RuntimeError(f'expected single datetime field. schema: {schema}')
+    dtf = one((f for f, t in schema.items() if t == datetime), too_short=dtex, too_long=dtex)
+
+    fill(it, measurement=name, reset=reset, dt_col=dtf)
+
+
+@click.group()
+def main() -> None:
+    pass
+
+
+@main.command(name='populate', short_help='populate influxdb')
+@click.option('--reset', is_flag=True, help='Reset Influx measurements before inserting', show_default=True)
+@click.argument('FUNCTION_NAME', type=str, required=True)
+def populate(*, function_name: str, reset: bool) -> None:
+    from .__main__ import _locate_functions_or_prompt
+
+    [provider] = list(_locate_functions_or_prompt([function_name]))
+    # todo could have a non-interactive version which populates from all data sources for the provider?
+    magic_fill(provider, reset=reset)
+
+
+# todo later just add to hpi main?
+# not sure if want to couple
+if __name__ == '__main__':
+    main()
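Usage sketch for the new influxdb helper (the data source module below is hypothetical; magic_fill only needs a provider yielding dataclass-ish items with a single datetime field, and infers the measurement name and datetime column itself):

from my.core import influxdb

import my.zulip.organization  # hypothetical HPI data source

# writes all items as points into the configured influx database
influxdb.magic_fill(my.zulip.organization.messages)

Or, via the CLI wired up above, presumably something like: python3 -m my.core.influxdb populate my.zulip.organization.messages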
my/core/init.py
@@ -1,43 +1,32 @@
 '''
 A hook to insert user's config directory into Python's search path.
+Note that this file is imported only if we don't have custom user config (under my.config namespace) in PYTHONPATH

-- Ideally that would be in __init__.py (so it's executed without having to import explicitly)
+Ideally that would be in __init__.py (so it's executed without having to import explicitly)
 But, with namespace packages, we can't have __init__.py in the parent subpackage
 (see http://python-notes.curiousefficiency.org/en/latest/python_concepts/import_traps.html#the-init-py-trap)

-Please let me know if you are aware of a better way of dealing with this!
+Instead, this is imported in the stub config (in this repository), so if the stub config is used, it triggers import of the 'real' config.
+
+Please let me know if you are aware of a better way of dealing with this!
 '''

-from types import ModuleType
-
-# TODO not ideal to keep it here, but this should really be a leaf in the import tree
-def assign_module(parent: str, name: str, module: ModuleType) -> None:
-    import sys
-    import importlib
-    parent_module = importlib.import_module(parent)
-    sys.modules[parent + '.' + name] = module
-    if sys.version_info.minor == 6:
-        # ugh. not sure why it's necessary in py36...
-        # TODO that crap should be tested... I guess will get it for free when I run rest of tests in the matrix
-        setattr(parent_module, name, module)
-
-del ModuleType
-
-
 # separate function to prevent namespace pollution
 def setup_config() -> None:
     import sys
     import warnings
-    from typing import Optional
+    from pathlib import Path

     from .preinit import get_mycfg_dir

     mycfg_dir = get_mycfg_dir()

     if not mycfg_dir.exists():
         warnings.warn(f"""
 'my.config' package isn't found! (expected at '{mycfg_dir}'). This is likely to result in issues.
 See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info.
-""".strip())
+""".strip(), stacklevel=1)
         return

     mpath = str(mycfg_dir)
@@ -45,20 +34,39 @@ See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-mo
     # hopefully it doesn't cause any issues
     sys.path.insert(0, mpath)

-    # remove the stub and insert reimport the 'real' config
+    # remove the stub and reimport the 'real' config
+    # likely my.config will always be in sys.modules, but defensive just in case
     if 'my.config' in sys.modules:
-        # TODO FIXME make sure this method isn't called twice...
         del sys.modules['my.config']
+    # this should import from mpath now
     try:
-        # todo import_from instead?? dunno
         import my.config
     except ImportError as ex:
-        # just in case... who knows what crazy setup users have in mind.
-        # todo log?
+        # just in case... who knows what crazy setup users have
+        import logging
+
+        logging.exception(ex)
         warnings.warn(f"""
 Importing 'my.config' failed! (error: {ex}). This is likely to result in issues.
 See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info.
-""")
+""", stacklevel=1)
+    else:
+        # defensive just in case -- __file__ may not be present if there is some dynamic magic involved
+        used_config_file = getattr(my.config, '__file__', None)
+        if used_config_file is not None:
+            used_config_path = Path(used_config_file)
+            try:
+                # will crash if it's imported from other dir?
+                used_config_path.relative_to(mycfg_dir)
+            except ValueError:
+                # TODO maybe implement a strict mode where these warnings will be errors?
+                warnings.warn(
+                    f"""
+Expected my.config to be located at {mycfg_dir}, but instead its path is {used_config_path}.
+This will likely cause issues down the line -- double check {mycfg_dir} structure.
+See https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#setting-up-the-modules for more info.
+""", stacklevel=1
+                )


 setup_config()
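The updated docstring clarifies the trigger: this hook runs when the stub config shipped with the repository gets imported. Roughly, the stub might look like the following (the exact stub contents are an assumption, not part of this diff):

# hypothetical stub my/config.py bundled with the repo:
# importing my.core.init executes setup_config(), which drops this stub from
# sys.modules and re-imports the user's real my.config from their config dir
import my.core.init  # noqa: F401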

my/core/internal.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+"""
+Utils specific to hpi core, shouldn't really be used by HPI modules
+"""
+
+
+def assert_subpackage(name: str) -> None:
+    # can lead to some unexpected issues if you 'import cachew' which being in my/core directory.. so let's protect against it
+    # NOTE: if we use overlay, name can be smth like my.origg.my.core.cachew ...
+    assert name == '__main__' or 'my.core' in name, f'Expected module __name__ ({name}) to be __main__ or start with my.core'
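As the new files above (freezer.py, influxdb.py) demonstrate, the intended usage is a two-liner at the very top of a my.core submodule, before any heavyweight imports:

from .internal import assert_subpackage

assert_subpackage(__name__)  # fails fast if the module is imported under an unexpected name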
my/core/kompress.py
@@ -1,94 +1,17 @@
-"""
-Various helpers for compression
-"""
-import pathlib
-from pathlib import Path
-from typing import Union, IO
-import io
-
-PathIsh = Union[Path, str]
-
-
-def _zstd_open(path: Path, *args, **kwargs) -> IO[str]:
-    import zstandard as zstd  # type: ignore
-    fh = path.open('rb')
-    dctx = zstd.ZstdDecompressor()
-    reader = dctx.stream_reader(fh)
-    return io.TextIOWrapper(reader, **kwargs)  # meh
-
-
-# TODO returns protocol that we can call 'read' against?
-# TODO use the 'dependent type' trick?
-def kopen(path: PathIsh, *args, mode: str='rt', **kwargs) -> IO[str]:
-    # TODO handle mode in *rags?
-    encoding = kwargs.get('encoding', 'utf8')
-    kwargs['encoding'] = encoding
-
-    pp = Path(path)
-    suf = pp.suffix
-    if suf in {'.xz'}:
-        import lzma
-        r = lzma.open(pp, mode, *args, **kwargs)
-        # should only happen for binary mode?
-        # file:///usr/share/doc/python3/html/library/lzma.html?highlight=lzma#lzma.open
-        assert not isinstance(r, lzma.LZMAFile), r
-        return r
-    elif suf in {'.zip'}:
-        # eh. this behaviour is a bit dodgy...
-        from zipfile import ZipFile
-        zfile = ZipFile(pp)
-
-        [subpath] = args  # meh?
-
-        ## oh god... https://stackoverflow.com/a/5639960/706389
-        ifile = zfile.open(subpath, mode='r')
-        ifile.readable = lambda: True  # type: ignore
-        ifile.writable = lambda: False  # type: ignore
-        ifile.seekable = lambda: False  # type: ignore
-        ifile.read1 = ifile.read  # type: ignore
-        # TODO pass all kwargs here??
-        # todo 'expected "BinaryIO"'??
-        return io.TextIOWrapper(ifile, encoding=encoding)  # type: ignore[arg-type]
-    elif suf in {'.lz4'}:
-        import lz4.frame  # type: ignore
-        return lz4.frame.open(str(pp), mode, *args, **kwargs)
-    elif suf in {'.zstd'}:
-        return _zstd_open(pp, mode, *args, **kwargs)
-    else:
-        return pp.open(mode, *args, **kwargs)
-
-
-import typing
-import os
-
-if typing.TYPE_CHECKING:
-    # otherwise mypy can't figure out that BasePath is a type alias..
-    BasePath = pathlib.Path
-else:
-    BasePath = pathlib.WindowsPath if os.name == 'nt' else pathlib.PosixPath
-
-
-class CPath(BasePath):
-    """
-    Hacky way to support compressed files.
-    If you can think of a better way to do this, please let me know! https://github.com/karlicoss/HPI/issues/20
-
-    Ugh. So, can't override Path because of some _flavour thing.
-    Path only has _accessor and _closed slots, so can't directly set .open method
-    _accessor.open has to return file descriptor, doesn't work for compressed stuff.
-    """
-    def open(self, *args, **kwargs):
-        # TODO assert read only?
-        return kopen(str(self))
-
-
-open = kopen  # TODO deprecate
-
-
-# meh
-def kexists(path: PathIsh, subpath: str) -> bool:
-    try:
-        kopen(path, subpath)
-        return True
-    except Exception:
-        return False
+from .internal import assert_subpackage
+
+assert_subpackage(__name__)
+
+from . import warnings
+
+# do this later -- for now need to transition modules to avoid using kompress directly (e.g. ZipPath)
+# warnings.high('my.core.kompress is deprecated, please use "kompress" library directly. See https://github.com/karlicoss/kompress')
+
+try:
+    from kompress import *
+except ModuleNotFoundError as e:
+    if e.name == 'kompress':
+        warnings.high('Please install kompress (pip3 install kompress). Falling onto vendorized kompress for now.')
+        from ._deprecated.kompress import *  # type: ignore[assignment]
+    else:
+        raise e
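The public API is preserved through the star import regardless of whether it resolves to the kompress package or the vendorized fallback; e.g. (paths here are hypothetical):

from my.core.kompress import kexists, kopen

# kopen transparently decompresses based on the file suffix
with kopen('/data/takeout-20201101.json.xz') as fo:
    data = fo.read()

# for archives, check whether a member exists without extracting
kexists('/data/takeout.zip', 'Takeout/archive_browser.html')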
my/core/konsume.py
@@ -5,21 +5,25 @@ This can potentially allow both for safer defensive parsing, and let you know if
 TODO perhaps need to get some inspiration from linear logic to decide on a nice API...
 '''

+from __future__ import annotations
+
 from collections import OrderedDict
-from typing import Any, List
+from typing import Any


 def ignore(w, *keys):
     for k in keys:
         w[k].ignore()


 def zoom(w, *keys):
     return [w[k].zoom() for k in keys]


 # TODO need to support lists
 class Zoomable:
     def __init__(self, parent, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)  # type: ignore
+        super().__init__(*args, **kwargs)
         self.parent = parent

     # TODO not sure, maybe do it via del??
@@ -40,7 +44,7 @@ class Zoomable:
         assert self.parent is not None
         self.parent._remove(self)

-    def zoom(self) -> 'Zoomable':
+    def zoom(self) -> Zoomable:
         self.consume()
         return self
@@ -63,6 +67,7 @@ class Wdict(Zoomable, OrderedDict):

     def this_consumed(self):
         return len(self) == 0
+
     # TODO specify mypy type for the index special method?
@@ -77,6 +82,7 @@ class Wlist(Zoomable, list):
     def this_consumed(self):
         return len(self) == 0

+
 class Wvalue(Zoomable):
     def __init__(self, parent, value: Any) -> None:
         super().__init__(parent)
@@ -87,20 +93,20 @@ class Wvalue(Zoomable):
         return []

     def this_consumed(self):
         return True  # TODO not sure..

     def __repr__(self):
         return 'WValue{' + repr(self.value) + '}'

-from typing import Tuple
-def _wrap(j, parent=None) -> Tuple[Zoomable, List[Zoomable]]:
+
+def _wrap(j, parent=None) -> tuple[Zoomable, list[Zoomable]]:
     res: Zoomable
-    cc: List[Zoomable]
+    cc: list[Zoomable]
     if isinstance(j, dict):
         res = Wdict(parent)
         cc = [res]
         for k, v in j.items():
             vv, c = _wrap(v, parent=res)
             res[k] = vv
             cc.extend(c)
         return res, cc
@@ -118,21 +124,24 @@ def _wrap(j, parent=None) -> tuple[Zoomable, list[Zoomable]]:
     else:
         raise RuntimeError(f'Unexpected type: {type(j)} {j}')


+from collections.abc import Iterator
 from contextlib import contextmanager
-from typing import Iterator


 class UnconsumedError(Exception):
     pass


 # TODO think about error policy later...
 @contextmanager
-def wrap(j, throw=True) -> Iterator[Zoomable]:
+def wrap(j, *, throw=True) -> Iterator[Zoomable]:
     w, children = _wrap(j)

     yield w

     for c in children:
         if not c.this_consumed():  # TODO hmm. how does it figure out if it's consumed???
             if throw:
                 # TODO need to keep a full path or something...
                 raise UnconsumedError(f'''
@@ -142,9 +151,13 @@ Expected {c} to be fully consumed by the parser.
                 # TODO log?
                 pass

+
 from typing import cast
-def test_unconsumed():
-    import pytest  # type: ignore
+
+
+def test_unconsumed() -> None:
+    import pytest

     with pytest.raises(UnconsumedError):
         with wrap({'a': 1234}) as w:
             w = cast(Wdict, w)
@@ -155,7 +168,8 @@ def test_unconsumed():
             w = cast(Wdict, w)
             d = w['c']['d'].zoom()

-def test_consumed():
+
+def test_consumed() -> None:
     with wrap({'a': 1234}) as w:
         w = cast(Wdict, w)
         a = w['a'].zoom()
@@ -165,7 +179,8 @@ def test_consumed():
         c = w['c'].zoom()
         d = c['d'].zoom()

-def test_types():
+
+def test_types() -> None:
     # (string, number, object, array, boolean or null)
     with wrap({'string': 'string', 'number': 3.14, 'boolean': True, 'null': None, 'list': [1, 2, 3]}) as w:
         w = cast(Wdict, w)
@@ -173,23 +188,22 @@ def test_types():
     w['number'].consume()
     w['boolean'].zoom()
     w['null'].zoom()
     for x in list(w['list'].zoom()):  # TODO eh. how to avoid the extra list thing?
         x.consume()

-def test_consume_all():
+
+def test_consume_all() -> None:
     with wrap({'aaa': {'bbb': {'hi': 123}}}) as w:
         w = cast(Wdict, w)
         aaa = w['aaa'].zoom()
         aaa['bbb'].consume_all()


-def test_consume_few():
+def test_consume_few() -> None:
     import pytest

     pytest.skip('Will think about it later..')
-    with wrap({
-        'important': 123,
-        'unimportant': 'whatever'
-    }) as w:
+    with wrap({'important': 123, 'unimportant': 'whatever'}) as w:
         w = cast(Wdict, w)
         w['important'].zoom()
         w.consume_all()
@@ -197,7 +211,8 @@ def test_consume_few():


 def test_zoom() -> None:
-    import pytest  # type: ignore
+    import pytest

     with wrap({'aaa': 'whatever'}) as w:
         w = cast(Wdict, w)
         with pytest.raises(KeyError):
@@ -206,3 +221,34 @@ def test_zoom() -> None:


 # TODO type check this...
+
+# TODO feels like the whole thing kind of unnecessarily complex
+# - cons:
+#   - in most cases this is not even needed? who cares if we miss a few attributes?
+#   - pro: on the other hand it could be interesting to know about new attributes in data,
+#     and without this kind of processing we wouldn't even know
+# alternatives
+# - manually process data
+#   e.g. use asserts, dict.pop and dict.values() methods to unpack things
+#   - pros:
+#     - very simple, since uses built in syntax
+#     - very performant, as fast as it gets
+#     - very flexible, easy to adjust behaviour
+#   - cons:
+#     - can forget to assert about extra entities etc, so error prone
+#     - if we do something like =assert j.pop('status') == 200, j=, by the time assert happens we already popped item -- makes error handling harder
+#     - a bit verbose.. so probably requires some helper functions though (could be much leaner than current konsume though)
+#     - if we assert, then terminates parsing too early, if we're defensive then inflates the code a lot with if statements
+#     - TODO perhaps combine warnings somehow or at least only emit once per module?
+#     - hmm actually tbh if we carefully go through everything and don't make copies, then only requires one assert at the very end?
+#   - TODO this is kinda useful? https://discuss.python.org/t/syntax-for-dictionnary-unpacking-to-variables/18718
+#     operator.itemgetter?
+#   - TODO can use match operator in python for this? quite nice actually! and allows for dynamic behaviour
+#     only from 3.10 tho, and gonna be tricky to do dynamic defensive behaviour with this
+#   - TODO in a sense, blenser already would hint if some meaningful fields aren't being processed? only if they are changing though
+# - define a "schema" for data, then just recursively match data against the schema?
+#   possibly pydantic already does something like that? not sure about performance though
+#   pros:
+#   - much simpler to extend and understand what's going on
+#   cons:
+#   - more rigid, so it becomes tricky to do dynamic stuff (e.g. if schema actually changes)
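Putting wrap/zoom/ignore together, a defensive-parsing sketch over a made-up payload; the context manager raises UnconsumedError at exit if any field was silently skipped:

from typing import cast

payload = {'id': 123, 'title': 'hello', 'status': 200}  # made-up API response
with wrap(payload) as w:
    w = cast(Wdict, w)
    ident = w['id'].zoom().value
    title = w['title'].zoom().value
    w['status'].ignore()  # acknowledged but unused; without this line -> UnconsumedError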
@ -1,43 +1,61 @@
|
||||||
#!/usr/bin/env python3
|
from __future__ import annotations
|
||||||
'''
|
|
||||||
Default logger is a bit, see 'test'/run this file for a demo
|
import logging
|
||||||
'''
|
import os
|
||||||
|
import sys
|
||||||
|
import warnings
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import TYPE_CHECKING, Union
|
||||||
|
|
||||||
|
|
||||||
def test() -> None:
|
def test() -> None:
|
||||||
import logging
|
from typing import Callable
|
||||||
import sys
|
|
||||||
M = lambda s: print(s, file=sys.stderr)
|
|
||||||
|
|
||||||
M(" Logging module's deafults are not great...'")
|
M: Callable[[str], None] = lambda s: print(s, file=sys.stderr)
|
||||||
l = logging.getLogger('test_logger')
|
|
||||||
|
## prepare exception for later
|
||||||
|
try:
|
||||||
|
None.whatever # type: ignore[attr-defined] # noqa: B018
|
||||||
|
except Exception as e:
|
||||||
|
ex = e
|
||||||
|
##
|
||||||
|
|
||||||
|
M(" Logging module's defaults are not great:")
|
||||||
|
l = logging.getLogger('default_logger')
|
||||||
l.error("For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level")
|
l.error("For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level")
|
||||||
|
|
||||||
M(" The reason is that you need to remember to call basicConfig() first")
|
M("\n The reason is that you need to remember to call basicConfig() first. Let's do it now:")
|
||||||
logging.basicConfig()
|
logging.basicConfig()
|
||||||
l.error("OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number")
|
l.error("OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number")
|
||||||
|
|
||||||
M("")
|
M("\n Also exception logging is kinda lame, doesn't print traceback by default unless you remember to pass exc_info:")
|
||||||
M(" With LazyLogger you get a reasonable logging format, colours and other neat things")
|
l.exception(ex) # type: ignore[possibly-undefined]
|
||||||
|
|
||||||
ll = LazyLogger('test') # No need for basicConfig!
|
M("\n\n With make_logger you get a reasonable logging format, colours (via colorlog library) and other neat things:")
|
||||||
|
|
||||||
|
ll = make_logger('test') # No need for basicConfig!
|
||||||
ll.info("default level is INFO")
|
ll.info("default level is INFO")
|
||||||
ll.debug(".. so this shouldn't be displayed")
|
ll.debug("... so this shouldn't be displayed")
|
||||||
ll.warning("warnings are easy to spot!")
|
ll.warning("warnings are easy to spot!")
|
||||||
ll.exception(RuntimeError("exceptions as well"))
|
|
||||||
|
M("\n Exceptions print traceback by default now:")
|
||||||
|
ll.exception(ex)
|
||||||
|
|
||||||
|
M("\n You can (and should) use it via regular logging.getLogger after that, e.g. let's set logging level to DEBUG now")
|
||||||
|
logging.getLogger('test').setLevel(logging.DEBUG)
|
||||||
|
ll.debug("... now debug messages are also displayed")
|
||||||
|
|
||||||
|
|
||||||
import logging
|
DEFAULT_LEVEL = 'INFO'
|
||||||
from typing import Union, Optional
|
FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)-4d]{end} %(message)s'
|
||||||
import os
|
FORMAT_NOCOLOR = FORMAT.format(start='', end='')
|
||||||
|
|
||||||
|
|
||||||
Level = int
|
Level = int
|
||||||
LevelIsh = Optional[Union[Level, str]]
|
LevelIsh = Union[Level, str, None]
|
||||||
|
|
||||||
|
|
||||||
def mklevel(level: LevelIsh) -> Level:
|
def mklevel(level: LevelIsh) -> Level:
|
||||||
glevel = os.environ.get('HPI_LOGS', None)
|
|
||||||
if glevel is not None:
|
|
||||||
level = glevel
|
|
||||||
if level is None:
|
if level is None:
|
||||||
return logging.NOTSET
|
return logging.NOTSET
|
||||||
if isinstance(level, int):
|
if isinstance(level, int):
|
||||||
|
@ -45,47 +63,204 @@ def mklevel(level: LevelIsh) -> Level:
|
||||||
return getattr(logging, level.upper())
|
return getattr(logging, level.upper())
|
||||||
|
|
||||||
|
|
||||||
FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)d]{end} %(message)s'
|
def get_collapse_level() -> Level | None:
|
||||||
FORMAT_COLOR = FORMAT.format(start='%(color)s', end='%(end_color)s')
|
# TODO not sure if should be specific to logger name?
|
||||||
FORMAT_NOCOLOR = FORMAT.format(start='', end='')
|
cl = os.environ.get('LOGGING_COLLAPSE', None)
|
||||||
DATEFMT = '%Y-%m-%d %H:%M:%S'
|
if cl is not None:
|
||||||
|
return mklevel(cl)
|
||||||
|
# legacy name, maybe deprecate?
|
||||||
|
cl = os.environ.get('COLLAPSE_DEBUG_LOGS', None)
|
||||||
|
if cl is not None:
|
||||||
|
return logging.DEBUG
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def setup_logger(logger: logging.Logger, level: LevelIsh) -> None:
|
def get_env_level(name: str) -> Level | None:
|
||||||
lvl = mklevel(level)
|
PREFIX = 'LOGGING_LEVEL_' # e.g. LOGGING_LEVEL_my_hypothesis=debug
|
||||||
try:
|
# shell doesn't allow using dots in var names without escaping, so also support underscore syntax
|
||||||
import logzero # type: ignore[import]
|
lvl = os.environ.get(PREFIX + name, None) or os.environ.get(PREFIX + name.replace('.', '_'), None)
|
||||||
except ModuleNotFoundError:
|
if lvl is not None:
|
||||||
import warnings
|
return mklevel(lvl)
|
||||||
warnings.warn("You might want to install 'logzero' for nice colored logs!")
|
# if LOGGING_LEVEL_HPI is set, use that. This should override anything the module may set as its default
|
||||||
logger.setLevel(lvl)
|
# this is also set when the user passes the --debug flag in the CLI
|
||||||
h = logging.StreamHandler()
|
#
|
||||||
h.setLevel(lvl)
|
# check after LOGGING_LEVEL_ prefix since that is more specific
|
||||||
h.setFormatter(logging.Formatter(fmt=FORMAT_NOCOLOR, datefmt=DATEFMT))
|
if 'LOGGING_LEVEL_HPI' in os.environ:
|
||||||
logger.addHandler(h)
|
return mklevel(os.environ['LOGGING_LEVEL_HPI'])
|
||||||
logger.propagate = False # ugh. otherwise it duplicates log messages? not sure about it..
|
# legacy name, for backwards compatibility
|
||||||
|
if 'HPI_LOGS' in os.environ:
|
||||||
|
from my.core.warnings import medium
|
||||||
|
|
||||||
|
medium('The HPI_LOGS environment variable is deprecated, use LOGGING_LEVEL_HPI instead')
|
||||||
|
|
||||||
|
return mklevel(os.environ['HPI_LOGS'])
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logger(logger: str | logging.Logger, *, level: LevelIsh = None) -> None:
    """
    Wrapper to simplify logging setup.
    """
    if isinstance(logger, str):
        logger = logging.getLogger(logger)

    if level is None:
        level = DEFAULT_LEVEL

    # env level always takes precedence
    env_level = get_env_level(logger.name)
    if env_level is not None:
        lvl = env_level
    else:
        lvl = mklevel(level)

    if logger.level == logging.NOTSET:
        # if it's already set, the user requested a different logging level, let's respect that
        logger.setLevel(lvl)

    _setup_handlers_and_formatters(name=logger.name)

Removed by this diff (the old lazily-initializing logger class):

    class LazyLogger(logging.Logger):
        def __new__(cls, name, level: LevelIsh = 'INFO'):
            logger = logging.getLogger(name)

            # this is called prior to all _log calls so makes sense to do it here?
            def isEnabledFor_lazyinit(*args, logger=logger, orig=logger.isEnabledFor, **kwargs):
                att = 'lazylogger_init_done'
                if not getattr(logger, att, False):  # init once, if necessary
                    setup_logger(logger, level=level)
                    setattr(logger, att, True)
                return orig(*args, **kwargs)

            logger.isEnabledFor = isEnabledFor_lazyinit  # type: ignore[assignment]
            return logger

# cached since this should only be done once per logger instance
@lru_cache(None)
def _setup_handlers_and_formatters(name: str) -> None:
    logger = logging.getLogger(name)

    logger.addFilter(AddExceptionTraceback())

    collapse_level = get_collapse_level()
    if collapse_level is None or not sys.stderr.isatty():
        handler = logging.StreamHandler()
    else:
        handler = CollapseLogsHandler(maxlevel=collapse_level)

    # default level for handler is NOTSET, which will make it process all messages
    # we rely on the logger to actually accept/reject log msgs
    logger.addHandler(handler)

    # this attribute is set to True by default, which causes log entries to be passed to root logger (e.g. if you call basicConfig beforehand)
    # even if log entry is handled by this logger ... not sure what's the point of this behaviour??
    logger.propagate = False

    try:
        # try colorlog first, so user gets nice colored logs
        import colorlog
    except ModuleNotFoundError:
        warnings.warn("You might want to 'pip install colorlog' for nice colored logs", stacklevel=1)
        formatter = logging.Formatter(FORMAT_NOCOLOR)
    else:
        # log_color/reset are specific to colorlog
        FORMAT_COLOR = FORMAT.format(start='%(log_color)s', end='%(reset)s')
        # colorlog should detect tty in principle, but doesn't handle everything for some reason
        # see https://github.com/borntyping/python-colorlog/issues/71
        if handler.stream.isatty():
            formatter = colorlog.ColoredFormatter(FORMAT_COLOR)
        else:
            formatter = logging.Formatter(FORMAT_NOCOLOR)

    handler.setFormatter(formatter)
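
A minimal usage sketch for the new entry point (not part of the diff):

    import logging

    logger = logging.getLogger('my.hypothesis')
    setup_logger(logger, level='debug')  # LevelIsh accepts plain strings, resolved via mklevel
    logger.debug('handlers and formatter are configured now')

    # passing the name directly works too, since setup_logger accepts str | logging.Logger
    setup_logger('my.github', level=logging.INFO)
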
# by default, logging.exception isn't logging traceback unless called inside of the exception handler
# which is a bit annoying since we have to pass exc_info explicitly
# also see https://stackoverflow.com/questions/75121925/why-doesnt-python-logging-exception-method-log-traceback-by-default
# todo also amend by post about defensive error handling?
class AddExceptionTraceback(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        if record.levelname == 'ERROR':
            exc = record.msg
            if isinstance(exc, BaseException):
                if record.exc_info is None or record.exc_info == (None, None, None):
                    exc_info = (type(exc), exc, exc.__traceback__)
                    record.exc_info = exc_info
        return True
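
What the filter buys you, sketched (not part of the diff): logging an exception object directly gets its traceback attached without an explicit exc_info:

    import logging

    log = logging.getLogger('demo')
    log.addFilter(AddExceptionTraceback())
    log.addHandler(logging.StreamHandler())

    try:
        1 / 0
    except ZeroDivisionError as e:
        err = e

    # outside the except block this would normally print just the message;
    # the filter fills record.exc_info from err.__traceback__, so the traceback is printed too
    log.error(err)
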
# todo also save full log in a file?
class CollapseLogsHandler(logging.StreamHandler):
    '''
    Collapses subsequent debug log lines and redraws on the same line.
    Hopefully this gives both a sense of progress and doesn't clutter the terminal as much?
    '''

    last: bool = False

    maxlevel: Level = logging.DEBUG  # everything with less or equal level will be collapsed

    def __init__(self, *args, maxlevel: Level, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.maxlevel = maxlevel

    def emit(self, record: logging.LogRecord) -> None:
        try:
            msg = self.format(record)
            cur = record.levelno <= self.maxlevel and '\n' not in msg
            if cur:
                if self.last:
                    self.stream.write('\033[K' + '\r')  # clear line + return carriage
            else:
                if self.last:
                    self.stream.write('\n')  # clean up after the last line
            self.last = cur
            columns, _ = os.get_terminal_size(0)
            # ugh. the columns thing is meh. dunno I guess ultimately need curses for that
            # TODO also would be cool to have a terminal post-processor? kinda like tail but aware of logging keywords (INFO/DEBUG/etc)
            self.stream.write(msg + ' ' * max(0, columns - len(msg)) + ('' if cur else '\n'))
            self.flush()
        except:
            self.handleError(record)
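
A sketch of the intended effect (not part of the diff): on a tty, records at or below maxlevel keep overwriting one status line, while anything above it gets its own line:

    import logging

    log = logging.getLogger('collapse-demo')
    log.setLevel(logging.DEBUG)
    log.addHandler(CollapseLogsHandler(maxlevel=logging.DEBUG))

    for i in range(1000):
        log.debug('processed item %d', i)  # redrawn in place
    log.info('done')  # INFO is above maxlevel, so it lands on a fresh line
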
def make_logger(name: str, *, level: LevelIsh = None) -> logging.Logger:
    logger = logging.getLogger(name)
    setup_logger(logger, level=level)
    return logger
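
This is the one-liner the rest of the codebase uses, e.g. at the top of a module (as my/core/pandas.py does below):

    logger = make_logger(__name__)
    logger.info('hello from %s', __name__)
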
# ughh. hacky way to have a single enlighten instance per interpreter, so it can be shared between modules
# not sure about this. I guess this should definitely be behind some flag
# OK, when stdout is not a tty, enlighten doesn't log anything, good
def get_enlighten():
    # TODO could add env variable to disable enlighten for a module?
    from unittest.mock import (
        Mock,  # Mock to return stub so clients don't have to think about it
    )

    # for now hidden behind the flag since it's a little experimental
    if os.environ.get('ENLIGHTEN_ENABLE', None) is None:
        return Mock()

    try:
        import enlighten  # type: ignore[import-untyped]
    except ModuleNotFoundError:
        warnings.warn("You might want to 'pip install enlighten' for a nice progress bar", stacklevel=1)

        return Mock()

    # dirty, but otherwise a bit unclear how to share enlighten manager between packages that call each other
    instance = getattr(enlighten, 'INSTANCE', None)
    if instance is not None:
        return instance
    instance = enlighten.get_manager()
    setattr(enlighten, 'INSTANCE', instance)
    return instance
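
Usage sketch (not part of the diff; counter/update is the standard enlighten API, and when ENLIGHTEN_ENABLE is unset these calls hit the Mock() stub and do nothing):

    import os
    os.environ['ENLIGHTEN_ENABLE'] = 'yes'  # any value enables it

    manager = get_enlighten()
    pbar = manager.counter(total=100, desc='processing', unit='items')
    for _ in range(100):
        pbar.update()
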
if __name__ == '__main__':
    test()


## legacy/deprecated methods for backwards compatibility
if not TYPE_CHECKING:
    from .compat import deprecated

    @deprecated('use make_logger instead')
    def LazyLogger(*args, **kwargs):
        return make_logger(*args, **kwargs)

    @deprecated('use make_logger instead')
    def logger(*args, **kwargs):
        return make_logger(*args, **kwargs)


##
37 my/core/mime.py Normal file
@@ -0,0 +1,37 @@
"""
|
||||||
|
Utils for mime/filetype handling
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from .internal import assert_subpackage
|
||||||
|
|
||||||
|
assert_subpackage(__name__)
|
||||||
|
|
||||||
|
import functools
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@functools.lru_cache(1)
|
||||||
|
def _magic():
|
||||||
|
import magic # type: ignore
|
||||||
|
|
||||||
|
# TODO also has uncompess=True? could be useful
|
||||||
|
return magic.Magic(mime=True)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO could reuse in pdf module?
|
||||||
|
import mimetypes # todo do I need init()?
|
||||||
|
|
||||||
|
|
||||||
|
# todo wtf? fastermime thinks it's mime is application/json even if the extension is xz??
|
||||||
|
# whereas magic detects correctly: application/x-zstd and application/x-xz
|
||||||
|
def fastermime(path: Path | str) -> str:
|
||||||
|
paths = str(path)
|
||||||
|
# mimetypes is faster, so try it first
|
||||||
|
(mime, _) = mimetypes.guess_type(paths)
|
||||||
|
if mime is not None:
|
||||||
|
return mime
|
||||||
|
# magic is slower but handles more types
|
||||||
|
# TODO Result type?; it's kinda racey, but perhaps better to let the caller decide?
|
||||||
|
return _magic().from_file(paths)
|
|
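
Illustration of the fallback (not part of the diff; the second call only works with python-magic installed):

    fastermime('notes.txt')          # extension known to stdlib mimetypes -> 'text/plain'
    fastermime('/tmp/no_extension')  # guess_type returns None -> falls back to libmagic content sniffing
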
@@ -1,50 +1,60 @@
"""
Various helpers for reading org-mode data
"""

from datetime import datetime


def parse_org_datetime(s: str) -> datetime:
    s = s.strip('[]')
    for fmt, _cls in [
        ("%Y-%m-%d %a %H:%M", datetime),
        ("%Y-%m-%d %H:%M"   , datetime),
        # todo not sure about these... fallback on 00:00?
        # ("%Y-%m-%d %a"    , date),
        # ("%Y-%m-%d"       , date),
    ]:
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue
    raise RuntimeError(f"Bad datetime string {s}")


# TODO I guess want to borrow inspiration from bs4? element type <-> tag; and similar logic for find_one, find_all

from collections.abc import Iterable
from typing import Callable, TypeVar

from orgparse import OrgNode

V = TypeVar('V')


def collect(n: OrgNode, cfun: Callable[[OrgNode], Iterable[V]]) -> Iterable[V]:
    yield from cfun(n)
    for c in n.children:
        yield from collect(c, cfun)


from more_itertools import one
from orgparse.extra import Table


def one_table(o: OrgNode) -> Table:
    return one(collect(o, lambda n: (x for x in n.body_rich if isinstance(x, Table))))


class TypedTable(Table):
    def __new__(cls, orig: Table) -> 'TypedTable':
        tt = super().__new__(TypedTable)
        tt.__dict__ = orig.__dict__
        blocks = list(orig.blocks)
        header = blocks[0]  # first block is schema
        if len(header) == 2:
            # TODO later interpret first line as types
            header = header[1:]
        setattr(tt, '_blocks', [header, *blocks[1:]])
        return tt

    @property
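
A sketch of how these helpers compose (not part of the diff; the org snippet is made up):

    from orgparse import loads

    node = loads('''
    * data
      | name | value |
      |------+-------|
      | x    | 1     |
    ''')

    table = one_table(node)  # exactly one Table anywhere in the tree, else more_itertools.one raises
    dt = parse_org_datetime('[2024-08-19 Mon 22:47]')
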
@@ -1,27 +1,54 @@
'''
Various pandas helpers and convenience functions
'''

from __future__ import annotations

# todo not sure if belongs to 'core'. It's certainly 'more' core than actual modules, but still not essential
# NOTE: this file is meant to be importable without Pandas installed
import dataclasses
from collections.abc import Iterable, Iterator
from datetime import datetime, timezone
from pprint import pformat
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    TypeVar,
)

from decorator import decorator

from . import warnings
from .error import Res, error_to_json, extract_error_datetime
from .logging import make_logger
from .types import Json, asdict

logger = make_logger(__name__)


if TYPE_CHECKING:
    import pandas as pd

    DataFrameT = pd.DataFrame
    SeriesT = pd.Series
    from pandas._typing import S1  # meh

    FuncT = TypeVar('FuncT', bound=Callable[..., DataFrameT])
    # huh interesting -- with from __future__ import annotations don't even need else clause here?
    # but still if other modules import these we do need some fake runtime types here..
else:
    from typing import Optional

    DataFrameT = Any
    SeriesT = Optional  # just some type with one argument
    S1 = Any


def _check_dateish(s: SeriesT[S1]) -> Iterable[str]:
    import pandas as pd  # noqa: F811 not actually a redefinition

    ctype = s.dtype
    if str(ctype).startswith('datetime64'):
        return
@@ -30,8 +57,8 @@ def check_dateish(s) -> Iterable[str]:
        return
    all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all()
    if not all_timestamps:
        return  # not sure why it would happen, but ok
    tzs = s.map(lambda x: x.tzinfo).drop_duplicates()  # type: ignore[union-attr, var-annotated, arg-type, return-value, unused-ignore]
    examples = s[tzs.index]
    # todo not so sure this warning is that useful... except for stuff without tz
    yield f'''
@@ -40,34 +67,216 @@ def check_dateish(s) -> Iterable[str]:
'''.strip()


def test_check_dateish() -> None:
    import pandas as pd

    from .compat import fromisoformat

    # empty series shouldn't warn
    assert list(_check_dateish(pd.Series([]))) == []

    # if no datetimes, shouldn't return any warnings
    assert list(_check_dateish(pd.Series([1, 2, 3]))) == []

    # all values are datetimes, shouldn't warn
    # fmt: off
    assert list(_check_dateish(pd.Series([
        fromisoformat('2024-08-19T01:02:03'),
        fromisoformat('2024-08-19T03:04:05'),
    ]))) == []
    # fmt: on

    # mixture of timezones -- should warn
    # fmt: off
    assert len(list(_check_dateish(pd.Series([
        fromisoformat('2024-08-19T01:02:03'),
        fromisoformat('2024-08-19T03:04:05Z'),
    ])))) == 1
    # fmt: on

    # TODO hmm. maybe this should actually warn?
    # fmt: off
    assert len(list(_check_dateish(pd.Series([
        'whatever',
        fromisoformat('2024-08-19T01:02:03'),
    ])))) == 0
    # fmt: on
# fmt: off
ErrorColPolicy = Literal[
    'add_if_missing',  # add error column if it's missing
    'warn'          ,  # warn, but do not modify
    'ignore'        ,  # no warnings
]
# fmt: on


def check_error_column(df: DataFrameT, *, policy: ErrorColPolicy) -> Iterable[str]:
    if 'error' in df:
        return
    if policy == 'ignore':
        return

    wmsg = '''
No 'error' column detected. You probably forgot to handle errors defensively, which means a single bad entry might bring the whole dataframe down.
'''.strip()
    if policy == 'add_if_missing':
        # todo maybe just add the warnings text as well?
        df['error'] = None
        wmsg += "\nAdding empty 'error' column (see 'error_col_policy' if you want to change this behaviour)"

    yield wmsg
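
The three policies, illustrated (not part of the diff):

    import pandas as pd

    df = pd.DataFrame({'x': [1, 2]})

    msgs = list(check_error_column(df, policy='add_if_missing'))
    assert 'error' in df.columns  # column added in-place...
    assert len(msgs) == 1         # ...but the warning is still yielded

    assert list(check_error_column(df, policy='ignore')) == []  # df already has the column now
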
# TODO ugh. typing this is a mess... perhaps should use .compat.ParamSpec?
@decorator
def check_dataframe(f: FuncT, error_col_policy: ErrorColPolicy = 'add_if_missing', *args, **kwargs) -> DataFrameT:
    df: DataFrameT = f(*args, **kwargs)
    tag = f'{f.__module__}:{f.__name__}'
    # makes sense to keep super defensive
    try:
        for col, data in df.reset_index().items():
            for w in _check_dateish(data):
                warnings.low(f"{tag}, column '{col}': {w}")
    except Exception as e:
        logger.exception(e)
    try:
        for w in check_error_column(df, policy=error_col_policy):
            warnings.low(f"{tag}, {w}")
    except Exception as e:
        logger.exception(e)
    return df

Removed by this diff (the old functools.wraps-based implementation):

    def check_dataframe(f: FuncT) -> FuncT:
        from functools import wraps

        @wraps(f)
        def wrapper(*args, **kwargs) -> DataFrameT:
            df = f(*args, **kwargs)
            # todo make super defensive?
            for col, data in df.reset_index().iteritems():
                for w in check_dateish(data):
                    warnings.low(f"{f.__module__}:{f.__name__}, column '{col}': {w}")
            return df
        # https://github.com/python/mypy/issues/1927
        return wrapper  # type: ignore[return-value]


# todo doctor: could have a suggestion to wrap dataframes with it?? discover by return type?
def error_to_row(e: Exception, *, dt_col: str = 'dt', tz: timezone | None = None) -> Json:
    edt = extract_error_datetime(e)
    if edt is not None and edt.tzinfo is None and tz is not None:
        edt = edt.replace(tzinfo=tz)
    err_dict: Json = error_to_json(e)
    err_dict[dt_col] = edt
    return err_dict

Removed by this diff (the old version, which formatted the traceback inline):

    import traceback
    from typing import Dict, Any

    from .error import extract_error_datetime

    def error_to_row(e: Exception, *, dt_col: str = 'dt', tz=None) -> Dict[str, Any]:
        edt = extract_error_datetime(e)
        if edt is not None and edt.tzinfo is None and tz is not None:
            edt = edt.replace(tzinfo=tz)
        estr = ''.join(traceback.format_exception(Exception, e, e.__traceback__))
        return {
            'error': estr,
            dt_col : edt,
        }
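
For example (sketch; the exact 'error' text comes from error_to_json, as exercised in test_as_dataframe below):

    row = error_to_row(RuntimeError('i failed'))
    # roughly {'error': 'RuntimeError: i failed\n', 'dt': None}
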
def _to_jsons(it: Iterable[Res[Any]]) -> Iterable[Json]:
    for r in it:
        if isinstance(r, Exception):
            yield error_to_row(r)
        else:
            yield asdict(r)
# mm. https://github.com/python/mypy/issues/8564
# no type for dataclass?
Schema = Any


def _as_columns(s: Schema) -> dict[str, type]:
    # todo would be nice to extract properties; add tests for this as well
    if dataclasses.is_dataclass(s):
        return {f.name: f.type for f in dataclasses.fields(s)}  # type: ignore[misc]  # ugh, why mypy thinks f.type can return str??
    # else must be NamedTuple??
    # todo assert my.core.common.is_namedtuple?
    return getattr(s, '_field_types')
# todo add proper types
@check_dataframe
def as_dataframe(it: Iterable[Res[Any]], schema: Schema | None = None) -> DataFrameT:
    # todo warn if schema isn't specified?
    # ok nice, pandas supports dataframe/NT natively
    # https://github.com/pandas-dev/pandas/pull/27999
    # but it dispatches dataclass based on the first entry...
    # https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
    # same for NamedTuple -- seems that it takes whatever schema the first NT has
    # so we need to convert each individually... sigh
    import pandas as pd  # noqa: F811 not actually a redefinition

    columns = None if schema is None else list(_as_columns(schema).keys())
    return pd.DataFrame(_to_jsons(it), columns=columns)
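
A typical call site (sketch; Item is a made-up dataclass): mixed successes and exceptions go in, one dataframe with an 'error' column comes out:

    import dataclasses

    @dataclasses.dataclass
    class Item:
        value: str

    def items():
        yield Item(value='ok')           # dataclass instances become regular rows
        yield RuntimeError('bad entry')  # exceptions become rows with 'error' (and 'dt') set

    df = as_dataframe(items(), schema=Item)  # schema pins the column order even for empty input
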
# ugh. in principle this could be inside the test
# might be due to use of from __future__ import annotations
# can quickly reproduce by running pytest tests/tz.py tests/core/test_pandas.py
# possibly will be resolved after fix in pytest?
# see https://github.com/pytest-dev/pytest/issues/7856
@dataclasses.dataclass
class _X:
    # FIXME try moving inside?
    x: int


def test_as_dataframe() -> None:
    import numpy as np
    import pandas as pd
    import pytest
    from pandas.testing import assert_frame_equal

    from .compat import fromisoformat

    it = ({'i': i, 's': f'str{i}'} for i in range(5))
    with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings:  # noqa: F841
        df: DataFrameT = as_dataframe(it)
        # todo test other error col policies

    # fmt: off
    assert_frame_equal(
        df,
        pd.DataFrame({
            'i'    : [0     , 1     , 2     , 3     , 4     ],
            's'    : ['str0', 'str1', 'str2', 'str3', 'str4'],
            # NOTE: error column is always added
            'error': [None  , None  , None  , None  , None  ],
        }),
    )
    # fmt: on
    assert_frame_equal(as_dataframe([]), pd.DataFrame(columns=['error']))

    df2: DataFrameT = as_dataframe([], schema=_X)
    assert_frame_equal(
        df2,
        # FIXME hmm. x column type should be an int?? and error should be string (or object??)
        pd.DataFrame(columns=['x', 'error']),
    )

    @dataclasses.dataclass
    class S:
        value: str

    def it2() -> Iterator[Res[S]]:
        yield S(value='test')
        yield RuntimeError('i failed')

    df = as_dataframe(it2())
    # fmt: off
    assert_frame_equal(
        df,
        pd.DataFrame(data={
            'value': ['test', np.nan                    ],
            'error': [np.nan, 'RuntimeError: i failed\n'],
            'dt'   : [np.nan, np.nan                    ],
        }).astype(dtype={'dt': 'float'}),  # FIXME should be datetime64 as below
    )
    # fmt: on

    def it3() -> Iterator[Res[S]]:
        yield S(value='aba')
        yield RuntimeError('whoops')
        yield S(value='cde')
        yield RuntimeError('exception with datetime', fromisoformat('2024-08-19T22:47:01Z'))

    df = as_dataframe(it3())

    # fmt: off
    assert_frame_equal(df, pd.DataFrame(data={
        'value': ['aba' , np.nan                  , 'cde' , np.nan                     ],
        'error': [np.nan, 'RuntimeError: whoops\n', np.nan, "RuntimeError: ('exception with datetime', datetime.datetime(2024, 8, 19, 22, 47, 1, tzinfo=datetime.timezone.utc))\n"],
        # note: dt column is added even if errors don't have an associated datetime
        'dt'   : [np.nan, np.nan                  , np.nan, '2024-08-19 22:47:01+00:00'],
    }).astype(dtype={'dt': 'datetime64[ns, UTC]'}))
    # fmt: on
Some files were not shown because too many files have changed in this diff