diff --git a/.gitignore b/.gitignore index 888867a..19c3380 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ auto-save-list tramp .\#* +*.gpx # Org-mode .org-id-locations diff --git a/README.org b/README.org index 4843a9f..c065a0c 100644 --- a/README.org +++ b/README.org @@ -531,7 +531,7 @@ If you like the shell or just want to quickly convert/grab some information from #+begin_src bash $ hpi query my.coding.commits.commits --stream # stream JSON objects as they're read --order-type datetime # find the 'datetime' attribute and order by that - --after '2020-01-01 00:00:00' --before '2020-12-31 23:59:59' # in 2020 + --after '2020-01-01' --before '2021-01-01' # in 2020 | jq '.committed_dt' -r # extract the datetime # mangle the output a bit to group by month and graph it | cut -d'-' -f-2 | sort | uniq -c | awk '{print $2,$1}' | sort -n | termgraph @@ -552,6 +552,8 @@ If you like the shell or just want to quickly convert/grab some information from 2020-12: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 383.00 #+end_src +See [[https://github.com/karlicoss/HPI/blob/master/doc/QUERY.md][query docs]] +for more examples ** Querying Roam Research database :PROPERTIES: diff --git a/doc/QUERY.md b/doc/QUERY.md new file mode 100644 index 0000000..b672dff --- /dev/null +++ b/doc/QUERY.md @@ -0,0 +1,304 @@ +`hpi query` is a command line tool for querying the output of any `hpi` function. + +``` +Usage: hpi query [OPTIONS] FUNCTION_NAME... + + This allows you to query the results from one or more functions in HPI + + By default this runs with '-o json', converting the results to JSON and + printing them to STDOUT + + You can specify '-o pprint' to just print the objects using their repr, or + '-o repl' to drop into a ipython shell with access to the results + + While filtering using --order-key datetime, the --after, --before and + --within flags parse the input to their datetime and timedelta equivalents. + datetimes can be epoch time, the string 'now', or an date formatted in the + ISO format. 
timedelta (durations) are parsed from a similar format to the + GNU 'sleep' command, e.g. 1w2d8h5m20s -> 1 week, 2 days, 8 hours, 5 minutes, + 20 seconds + + As an example, to query reddit comments I've made in the last month + + hpi query --order-type datetime --before now --within 4w my.reddit.all.comments + or... + hpi query --recent 4w my.reddit.all.comments + + Can also query within a range. To filter comments between 2016 and 2018: + hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments + +Options: + -o, --output [json|pprint|repl|gpx] + what to do with the result [default: json] + -s, --stream stream objects from the data source instead + of printing a list at the end + -k, --order-key TEXT order by an object attribute or dict key on + the individual objects returned by the HPI + function + -t, --order-type [datetime|date|int|float] + order by searching for some type on the + iterable + -a, --after TEXT while ordering, filter items for the key or + type larger than or equal to this + -b, --before TEXT while ordering, filter items for the key or + type smaller than this + -w, --within TEXT a range 'after' or 'before' to filter items + by. see above for further explanation + -r, --recent TEXT a shorthand for '--order-type datetime + --reverse --before now --within'. e.g. 
+ --recent 5d + --reverse / --no-reverse reverse the results returned from the + functions + -l, --limit INTEGER limit the number of items returned from the + (functions) + --drop-unsorted if the order of an item can't be determined + while ordering, drop those items from the + results + --wrap-unsorted if the order of an item can't be determined + while ordering, wrap them into an + 'Unsortable' object + --warn-exceptions if any errors are returned, print them as + errors on STDERR + --raise-exceptions if any errors are returned (as objects, not + raised) from the functions, raise them + --drop-exceptions ignore any errors returned as objects from + the functions + --help Show this message and exit. +``` + +This works with any function which returns an iterable, for example `my.coding.commits`, which searches for `git commit`s on your computer: + +```bash +hpi query my.coding.commits +``` + +When run with a module, this does some analysis of the functions in that module and tries to find ones that look like data sources. If it can't figure out which, it prompts you like: + +``` +Which function should be used from 'my.coding.commits'? + + 1. commits + 2. repos +``` + +You select the one you want by pressing `1` or `2` on your keyboard. Otherwise, you can provide a fully qualified path, like: + +``` +hpi query my.coding.commits.repos +``` + +The corresponding `repos` function this queries is defined in [`my/coding/commits.py`](../my/coding/commits.py) + +### Ordering/Filtering/Streaming + +By default, this just returns the items in the order they were returned by the function. To order or filter the results, specify `--order-key` or `--order-type`, for example to get the 10 most recent commits. `--order-type datetime` will try to automatically figure out which attribute to use. If it chooses the wrong one (since `Commit`s have both a `committed_dt` and `authored_dt`), you could tell it which to use.
For example, to scan my computer and find the most recent commit I made: + +``` +hpi query my.coding.commits.commits --order-key committed_dt --limit 1 --reverse --output pprint --stream +Commit(committed_dt=datetime.datetime(2023, 4, 14, 23, 9, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), + authored_dt=datetime.datetime(2023, 4, 14, 23, 4, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), + message='sources.smscalls: propogate errors if there are breaking ' + 'schema changes', + repo='/home/sean/Repos/promnesia-fork', + sha='22a434fca9a28df9b0915ccf16368df129d2c9ce', + ref='refs/heads/smscalls-handle-result') +``` + +To instead restrict the results to some range, you can use `--before` and `--within`. For example, to get all the commits I committed in the last day: + +``` +hpi query my.coding.commits.commits --order-type datetime --before now --within 1d +``` + +That prints a list of `Commit`s as JSON objects. You could also use `--output pprint` to pretty-print the objects or `--output repl` to drop into a REPL. + +To process the JSON, you can pipe it to [`jq`](https://github.com/stedolan/jq). I often use `jq length` to get the count of some output: + +``` +hpi query my.coding.commits.commits --order-type datetime --before now --within 1d | jq length +6 +``` + +Because grabbing data `--before now` is such a common use case, the `--recent` flag is a shorthand for `--order-type datetime --reverse --before now --within`. The same as above, to get the commits from the last day: + +``` +hpi query my.coding.commits.commits --recent 1d | jq length +6 +``` + +To select a range of commits, you can use `--after` and `--before`, passing ISO or epoch timestamps. Those can be full `datetimes` (`2021-01-01T00:05:30`) or just dates (`2021-01-01`).
For example, to get all the commits I made on January 1st, 2021: + +``` +hpi query my.coding.commits.commits --order-type datetime --after 2021-01-01 --before 2021-01-02 | jq length +1 +``` + +If you have [`dateparser`](https://github.com/scrapinghub/dateparser#how-to-use) installed, this supports dozens more natural language formats: + +``` +hpi query my.coding.commits.commits --order-type datetime --after 'last week' --before 'day before yesterday' | jq length +28 +``` + +If you're having issues ordering because there are exceptions in your results or not all data is sortable (may have `None` for some attributes), you can use `--drop-unsorted` to drop those items from the results, or `--drop-exceptions` to remove the exceptions. + +You can also stream the results, which is useful for functions that take a while to process or have a lot of data. For example, if you wanted to pick a sha hash from a particular repo, you could use `jq` to `select` and pick that attribute from the JSON: + +``` +hpi query my.coding.commits.commits --recent 30d --stream | jq 'select(.repo | contains("HPI"))' | jq '.sha' -r +4afa899c8b365b3c10e468f6279c02e316d3b650 +40de162fab741df594b4d9651348ee46ee021e9b +e1cb229913482074dc5523e57ef0acf6e9ec2bb2 +87c13defd131e39292b93dcea661d3191222dace +02c738594f2cae36ca4fab43cf9533fe6aa89396 +0b3a2a6ef3a9e4992771aaea0252fb28217b814a +84817ce72d208038b66f634d4ceb6e3a4c7ec5e9 +47992b8e046d27fc5141839179f06f925c159510 +425615614bd508e28ccceb56f43c692240e429ab +eed8f949460d768fb1f1c4801e9abab58a5f9021 +d26ad7d9ce6a4718f96346b994c3c1cd0d74380c +aec517e53c6ac022f2b4cc91261daab5651cebf0 +44b75a88fdfc7af132f61905232877031ce32fcb +b0ff6f29dd2846e97f8aa85a2ca73736b03254a8 +``` + +`jq`s `select` function acts on a stream of JSON objects, not a list, so it filters the output of `hpi query` as the objects are generated (the goal here is to conserve memory as items which aren't needed are filtered).
The alternative would be to print the entire JSON list at the end, like: + +`hpi query my.coding.commits.commits --recent 30d | jq '.[] | select(.repo | contains("Repos/HPI"))' | jq '.sha' -r`, using `jq '.[]'` to convert the JSON list into a stream of JSON objects. + +## Usage on non-HPI code + +The command can accept any qualified function name, so this could for example be used to check the output of [`promnesia`](https://github.com/karlicoss/promnesia) sources: + +``` +hpi query promnesia.sources.smscalls | jq length +371 +``` + +This can be used on any function that produces an `Iterator`/`Generator`-like output, as long as it can be called with no arguments. + +## GPX + +The `hpi query` command can also be used with the `--output gpx` flag to generate gpx files from a list of locations, like the ones defined in the `my.location` package. This could be used to extract some date range and create a `gpx` file which can then be visualized by a GUI application. + +This prints the contents of the `gpx` file to STDOUT, and prints warnings for any objects it could not convert to locations to STDERR, so pipe STDOUT to an output file, like `>out.gpx` + +``` +hpi query my.location.all --after '2021-07-01T00:00:00' --before '2021-07-05T00:00:00' --order-type datetime --output gpx >out.gpx +``` + +If you want to ignore any errors, you can use `--drop-exceptions`. + +To preview, you can use something like [`qgis`](https://qgis.org/en/site/) or for something easier and more lightweight, [`gpxsee`](https://github.com/tumic0/GPXSee): + +`gpxsee out.gpx`: + +chicago trip + +(Sidenote: this is [`@seanbreckenridge`](https://github.com/seanbreckenridge/)s locations, on a trip to Chicago) + +## Python reference + +The `hpi query` command is a CLI wrapper around the code in [`query.py`](../my/core/query.py) and [`query_range.py`](../my/core/query_range.py).
The `select` function is the core of this, and `select_range` lets you specify dates, timedelta, start-end ranges, and other CLI-specific code. + +`my.core.query.select`: + +``` + A function to query, order, sort and filter items from one or more sources + This supports iterables and lists of mixed types (including handling errors), + by allowing you to provide custom predicates (functions) which can sort + by a function, an attribute, dict key, or by the attributes values. + + Since this supports mixed types, there's always a possibility + of KeyErrors or AttributeErrors while trying to find some value to order by, + so this provides multiple mechanisms to deal with that + + 'where' lets you filter items before ordering, to remove possible errors + or filter the iterator by some condition + + There are multiple ways to instruct select on how to order items. The most + flexible is to provide an 'order_by' function, which takes an item in the + iterator, does any custom checks you may want and then returns the value to sort by + + 'order_key' is best used on items which have a similar structure, or have + the same attribute name for every item in the iterator. If you have a + iterator of objects whose datetime is accessed by the 'timestamp' attribute, + supplying order_key='timestamp' would sort by that (dictionary or attribute) key + + 'order_value' is the most confusing, but often the most useful. Instead of + testing against the keys of an item, this allows you to write a predicate + (function) to test against its values (dictionary, NamedTuple, dataclass, object). 
+ If you had an iterator of mixed types and wanted to sort by the datetime, + but the attribute to access the datetime is different on each type, you can + provide `order_value=lambda v: isinstance(v, datetime)`, and this will + try to find that value for each type in the iterator, to sort it by + the value which is received when the predicate is true + + 'order_value' is often used in the 'hpi query' interface, because of its brevity. + Just given the input function, this can typically sort it by timestamp with + no human intervention. It can sort of be thought as an educated guess, + but it can always be improved by providing a more complete guess function + + Note that 'order_value' is also the most computationally expensive, as it has + to copy the iterator in memory (using itertools.tee) to determine how to order it + in memory + + The 'drop_exceptions', 'raise_exceptions', 'warn_exceptions' let you ignore or raise + when the src contains exceptions. The 'warn_func' lets you provide a custom function + to call when an exception is encountered instead of using the 'warnings' module + + src: an iterable of mixed types, or a function to be called, + as the input to this function + + where: a predicate which filters the results before sorting + + order_by: a function which when given an item in the src, + returns the value to sort by. Similar to the 'key' value + typically passed directly to 'sorted' + + order_key: a string which represents a dict key or attribute name + to use as they key to sort by + + order_value: predicate which determines which attribute on an ADT-like item to sort by, + when given its value. 
lambda o: isinstance(o, datetime) is commonly passed to sort + by datetime, without knowing the attributes or interface for the items in the src + + default: while ordering, if the order for an object cannot be determined, + use this as the default value + + reverse: reverse the order of the resulting iterable + + limit: limit the results to this many items + + drop_unsorted: before ordering, drop any items from the iterable for which a + order could not be determined. False by default + + wrap_unsorted: before ordering, wrap any items into an 'Unsortable' object. Place + them at the front of the list. True by default + + drop_exceptions: ignore any exceptions from the src + + raise_exceptions: raise exceptions when received from the input src +``` + +`my.core.query_range.select_range`: + +``` + A specialized select function which offers generating functions + to filter/query ranges from an iterable + + order_key and order_value are used in the same way they are in select + + If you specify order_by_value_type, it tries to search for an attribute + on each object/type which has that type, ordering the iterable by that value + + unparsed_range is a tuple of length 3, specifying 'after', 'before', 'duration', + i.e. some start point to allow the computed value we're ordering by, some + end point and a duration (can use the RangeTuple NamedTuple to construct one) + + (this is typically parsed/created in my.core.__main__, from CLI flags + + If you specify a range, drop_unsorted is forced to be True +``` + +Those can be imported and accept any sort of iterator, `hpi query` just defaults to the output of functions here. 
As an example, see [`listens`](https://github.com/seanbreckenridge/HPI-personal/blob/master/scripts/listens) which just passes a generator (iterator) as the first argument to `query_range` diff --git a/my/core/__main__.py b/my/core/__main__.py index dce646a..620cb5f 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -485,6 +485,13 @@ def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True) yield data_providers[chosen_index] +def _warn_exceptions(exc: Exception) -> None: + from my.core.common import LazyLogger + logger = LazyLogger('CLI', level='warning') + + logger.exception(f'hpi query: {exc}') + + # handle the 'hpi query' call # can raise a QueryException, caught in the click command def query_hpi_functions( @@ -501,10 +508,12 @@ def query_hpi_functions( limit: Optional[int], drop_unsorted: bool, wrap_unsorted: bool, + warn_exceptions: bool, raise_exceptions: bool, drop_exceptions: bool, ) -> None: from .query_range import select_range, RangeTuple + import my.core.error as err # chain list of functions from user, in the order they wrote them on the CLI input_src = chain(*(f() for f in _locate_functions_or_prompt(qualified_names))) @@ -518,6 +527,8 @@ def query_hpi_functions( limit=limit, drop_unsorted=drop_unsorted, wrap_unsorted=wrap_unsorted, + warn_exceptions=warn_exceptions, + warn_func=_warn_exceptions, raise_exceptions=raise_exceptions, drop_exceptions=drop_exceptions) @@ -545,10 +556,21 @@ def query_hpi_functions( elif output == 'gpx': from my.location.common import locations_to_gpx + # if user didn't specify to ignore exceptions, warn if locations_to_gpx + # cannot process the output of the command.
This can be silenced by + # passing --drop-exceptions + if not raise_exceptions and not drop_exceptions: + warn_exceptions = True + # can ignore the mypy warning here, locations_to_gpx yields any errors # if you didnt pass it something that matches the LocationProtocol for exc in locations_to_gpx(res, sys.stdout): # type: ignore[arg-type] - click.echo(str(exc), err=True) + if warn_exceptions: + _warn_exceptions(exc) + elif raise_exceptions: + raise exc + elif drop_exceptions: + pass sys.stdout.flush() else: res = list(res) # type: ignore[assignment] @@ -742,6 +764,10 @@ def module_install_cmd(user: bool, parallel: bool, modules: Sequence[str]) -> No default=False, is_flag=True, help="if the order of an item can't be determined while ordering, wrap them into an 'Unsortable' object") +@click.option('--warn-exceptions', + default=False, + is_flag=True, + help="if any errors are returned, print them as errors on STDERR") @click.option('--raise-exceptions', default=False, is_flag=True, @@ -765,6 +791,7 @@ def query_cmd( limit: Optional[int], drop_unsorted: bool, wrap_unsorted: bool, + warn_exceptions: bool, raise_exceptions: bool, drop_exceptions: bool, ) -> None: @@ -792,7 +819,7 @@ def query_cmd( \b Can also query within a range. 
To filter comments between 2016 and 2018: - hpi query --order-type datetime --after '2016-01-01 00:00:00' --before '2019-01-01 00:00:00' my.reddit.all.comments + hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments ''' from datetime import datetime, date @@ -831,6 +858,7 @@ def query_cmd( limit=limit, drop_unsorted=drop_unsorted, wrap_unsorted=wrap_unsorted, + warn_exceptions=warn_exceptions, raise_exceptions=raise_exceptions, drop_exceptions=drop_exceptions) except QueryException as qe: diff --git a/my/core/error.py b/my/core/error.py index e6f76cd..09c1733 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -4,7 +4,7 @@ See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail """ from itertools import tee -from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast +from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast, Iterator from .compat import Literal @@ -29,6 +29,37 @@ def unwrap(res: Res[T]) -> T: else: return res +def drop_exceptions(itr: Iterator[Res[T]]) -> Iterator[T]: + """Return non-errors from the iterable""" + for o in itr: + if isinstance(o, Exception): + continue + yield o + + +def raise_exceptions(itr: Iterable[Res[T]]) -> Iterator[T]: + """Raise errors from the iterable, stops the select function""" + for o in itr: + if isinstance(o, Exception): + raise o + yield o + + +def warn_exceptions(itr: Iterable[Res[T]], warn_func: Optional[Callable[[Exception], None]] = None) -> Iterator[T]: + # if not provided, use the 'warnings' module + if warn_func is None: + from my.core.warnings import medium + def _warn_func(e: Exception) -> None: + # TODO: print traceback? 
but user could always --raise-exceptions as well + medium(str(e)) + warn_func = _warn_func + + for o in itr: + if isinstance(o, Exception): + warn_func(o) + continue + yield o + def echain(ex: E, cause: Exception) -> E: ex.__cause__ = cause diff --git a/my/core/query.py b/my/core/query.py index ed29649..8a497db 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -14,6 +14,7 @@ from typing import TypeVar, Tuple, Optional, Union, Callable, Iterable, Iterator import more_itertools +import my.core.error as err from .common import is_namedtuple from .error import Res, unwrap from .warnings import low @@ -205,20 +206,6 @@ pass 'drop_exceptions' to ignore exceptions""") return None # couldn't compute a OrderFunc for this class/instance -def _drop_exceptions(itr: Iterator[ET]) -> Iterator[T]: - """Return non-errors from the iterable""" - for o in itr: - if isinstance(o, Exception): - continue - yield o - - -def _raise_exceptions(itr: Iterable[ET]) -> Iterator[T]: - """Raise errors from the iterable, stops the select function""" - for o in itr: - if isinstance(o, Exception): - raise o - yield o # currently using the 'key set' as a proxy for 'this is the same type of thing' @@ -365,6 +352,8 @@ def select( limit: Optional[int] = None, drop_unsorted: bool = False, wrap_unsorted: bool = True, + warn_exceptions: bool = False, + warn_func: Optional[Callable[[Exception], None]] = None, drop_exceptions: bool = False, raise_exceptions: bool = False, ) -> Iterator[ET]: @@ -408,7 +397,9 @@ def select( to copy the iterator in memory (using itertools.tee) to determine how to order it in memory - The 'drop_exceptions' and 'raise_exceptions' let you ignore or raise when the src contains exceptions + The 'drop_exceptions', 'raise_exceptions', 'warn_exceptions' let you ignore or raise + when the src contains exceptions. 
The 'warn_func' lets you provide a custom function + to call when an exception is encountered instead of using the 'warnings' module src: an iterable of mixed types, or a function to be called, as the input to this function @@ -469,10 +460,13 @@ Will attempt to call iter() on the value""") # if both drop_exceptions and drop_exceptions are provided for some reason, # should raise exceptions before dropping them if raise_exceptions: - itr = _raise_exceptions(itr) + itr = err.raise_exceptions(itr) if drop_exceptions: - itr = _drop_exceptions(itr) + itr = err.drop_exceptions(itr) + + if warn_exceptions: + itr = err.warn_exceptions(itr, warn_func=warn_func) if where is not None: itr = filter(where, itr) diff --git a/my/core/query_range.py b/my/core/query_range.py index 179e4ea..33eb03c 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -73,13 +73,28 @@ def parse_datetime_float(date_str: str) -> float: return ds_float try: # isoformat - default format when you call str() on datetime + # this also parses dates like '2020-01-01' return datetime.fromisoformat(ds).timestamp() except ValueError: pass try: return isoparse(ds).timestamp() except (AssertionError, ValueError): - raise QueryException(f"Was not able to parse {ds} into a datetime") + pass + + try: + import dateparser # type: ignore[import] + except ImportError: + pass + else: + # dateparser is a bit more lenient than the above, lets you type + # all sorts of dates as inputs + # https://github.com/scrapinghub/dateparser#how-to-use + res: Optional[datetime] = dateparser.parse(ds, settings={"DATE_ORDER": "YMD"}) + if res is not None: + return res.timestamp() + + raise QueryException(f"Was not able to parse {ds} into a datetime") # probably DateLike input? 
but a user could specify an order_key @@ -267,6 +282,8 @@ def select_range( limit: Optional[int] = None, drop_unsorted: bool = False, wrap_unsorted: bool = False, + warn_exceptions: bool = False, + warn_func: Optional[Callable[[Exception], None]] = None, drop_exceptions: bool = False, raise_exceptions: bool = False, ) -> Iterator[ET]: @@ -293,9 +310,15 @@ def select_range( unparsed_range = None # some operations to do before ordering/filtering - if drop_exceptions or raise_exceptions or where is not None: + if drop_exceptions or raise_exceptions or where is not None or warn_exceptions: # doesn't wrap unsortable items, because we pass no order related kwargs - itr = select(itr, where=where, drop_exceptions=drop_exceptions, raise_exceptions=raise_exceptions) + itr = select( + itr, + where=where, + drop_exceptions=drop_exceptions, + raise_exceptions=raise_exceptions, + warn_exceptions=warn_exceptions, + warn_func=warn_func) order_by_chosen: Optional[OrderFunc] = None