diff --git a/.gitignore b/.gitignore index 888867a..19c3380 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ auto-save-list tramp .\#* +*.gpx # Org-mode .org-id-locations diff --git a/README.org b/README.org index 4843a9f..c065a0c 100644 --- a/README.org +++ b/README.org @@ -531,7 +531,7 @@ If you like the shell or just want to quickly convert/grab some information from #+begin_src bash $ hpi query my.coding.commits.commits --stream # stream JSON objects as they're read --order-type datetime # find the 'datetime' attribute and order by that - --after '2020-01-01 00:00:00' --before '2020-12-31 23:59:59' # in 2020 + --after '2020-01-01' --before '2021-01-01' # in 2020 | jq '.committed_dt' -r # extract the datetime # mangle the output a bit to group by month and graph it | cut -d'-' -f-2 | sort | uniq -c | awk '{print $2,$1}' | sort -n | termgraph @@ -552,6 +552,8 @@ If you like the shell or just want to quickly convert/grab some information from 2020-12: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 383.00 #+end_src +See [[https://github.com/karlicoss/HPI/blob/master/doc/QUERY.md][query docs]] +for more examples ** Querying Roam Research database :PROPERTIES: diff --git a/doc/QUERY.md b/doc/QUERY.md new file mode 100644 index 0000000..9890b58 --- /dev/null +++ b/doc/QUERY.md @@ -0,0 +1,301 @@ +`hpi query` is a command line tool for querying the output of any `hpi` function. + +``` +Usage: hpi query [OPTIONS] FUNCTION_NAME... + + This allows you to query the results from one or more functions in HPI + + By default this runs with '-o json', converting the results to JSON and + printing them to STDOUT + + You can specify '-o pprint' to just print the objects using their repr, or + '-o repl' to drop into a ipython shell with access to the results + + While filtering using --order-key datetime, the --after, --before and + --within flags parse the input to their datetime and timedelta equivalents. + datetimes can be epoch time, the string 'now', or an date formatted in the + ISO format. 
timedelta (durations) are parsed from a similar format to the + GNU 'sleep' command, e.g. 1w2d8h5m20s -> 1 week, 2 days, 8 hours, 5 minutes, + 20 seconds + + As an example, to query reddit comments I've made in the last month + + hpi query --order-type datetime --before now --within 4w my.reddit.all.comments + or... + hpi query --recent 4w my.reddit.all.comments + + Can also query within a range. To filter comments between 2016 and 2018: + hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments + +Options: + -o, --output [json|pprint|repl|gpx] + what to do with the result [default: json] + -s, --stream stream objects from the data source instead + of printing a list at the end + -k, --order-key TEXT order by an object attribute or dict key on + the individual objects returned by the HPI + function + -t, --order-type [datetime|date|int|float] + order by searching for some type on the + iterable + -a, --after TEXT while ordering, filter items for the key or + type larger than or equal to this + -b, --before TEXT while ordering, filter items for the key or + type smaller than this + -w, --within TEXT a range 'after' or 'before' to filter items + by. see above for further explanation + -r, --recent TEXT a shorthand for '--order-type datetime + --reverse --before now --within'. e.g. 
+ --recent 5d + --reverse / --no-reverse reverse the results returned from the + functions + -l, --limit INTEGER limit the number of items returned from the + (functions) + --drop-unsorted if the order of an item can't be determined + while ordering, drop those items from the + results + --wrap-unsorted if the order of an item can't be determined + while ordering, wrap them into an + 'Unsortable' object + --warn-exceptions if any errors are returned, print them as + errors on STDERR + --raise-exceptions if any errors are returned (as objects, not + raised) from the functions, raise them + --drop-exceptions ignore any errors returned as objects from + the functions +``` + +This works with any function which returns an iterable, for example `my.coding.commits`, which searches for `git commit`s on your computer: + +```bash +hpi query my.coding.commits +``` + +When run with a module, this does some analysis of the functions in that module and tries to find ones that look like data sources. If it can't figure out which, it prompts you like: + +``` +Which function should be used from 'my.coding.commits'? + + 1. commits + 2. repos +``` + +You select the one you want by clicking `1` or `2` on your keyboard. Otherwise, you can provide a fully qualified path, like: + +``` +hpi query my.coding.commits.repos +``` + +The corresponding `repos` function this queries is defined in [`my/coding/commits.py`](../my/coding/commits.py) + +### Ordering/Filtering/Streaming + +By default, this just returns the items in the order they were returned by the function. This allows you to filter by specifying a `--order-key`, or `--order-type`. For example, to get the 10 most recent commits. `--order-type datetime` will try to automatically figure out which attribute to use. If it chooses the wrong one (since `Commit`s have both a `committed_dt` and `authored_dt`), you could tell it which to use. 
For example, to scan my computer and find the most recent commit I made: + +``` +hpi query my.coding.commits.commits --order-key committed_dt --limit 1 --reverse --output pprint --stream +Commit(committed_dt=datetime.datetime(2023, 4, 14, 23, 9, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), + authored_dt=datetime.datetime(2023, 4, 14, 23, 4, 1, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))), + message='sources.smscalls: propogate errors if there are breaking ' + 'schema changes', + repo='/home/sean/Repos/promnesia-fork', + sha='22a434fca9a28df9b0915ccf16368df129d2c9ce', + ref='refs/heads/smscalls-handle-result') +``` + +To instead limit in some range, you can use `--before` and `--within` to filter by a range. For example, to get all the commits I made in the last day: + +``` +hpi query my.coding.commits.commits --order-type datetime --before now --within 1d +``` + +That prints a list of `Commit` as JSON objects. You could also use `--output pprint` to pretty-print the objects or `--output repl` to drop into a REPL. + +To process the JSON, you can pipe it to [`jq`](https://github.com/stedolan/jq). I often use `jq length` to get the count of some output: + +``` +hpi query my.coding.commits.commits --order-type datetime --before now --within 1d | jq length +6 +``` + +Because that is such a common use case, the `--recent` flag is a shorthand for `--order-type datetime --reverse --before now --within`. The same as above: + +``` +hpi query my.coding.commits.commits --recent 1d | jq length +6 +``` + +To select a range of commits, you can use `--after` and `--before`, passing ISO or epoch timestamps. Those can be full `datetimes` (`2021-01-01T00:05:30`) or just dates (`2021-01-01`). 
For example, to get all the commits I made on January 1st, 2021: + +``` +hpi query my.coding.commits.commits --order-type datetime --after 2021-01-01 --before 2021-01-02 | jq length +1 +``` + +If you have [`dateparser`](https://github.com/scrapinghub/dateparser#how-to-use) installed, this supports dozens more natural language formats: + +``` +hpi query my.coding.commits.commits --order-type datetime --after 'last week' --before 'day before yesterday' | jq length +28 +``` + +If you're having issues ordering because there are exceptions in your results or not all data is sortable (may have `None` for some attributes), you can use `--drop-unsorted` to drop those items from the results, or `--drop-exceptions` to remove the exceptions. + +You can also stream the results, which is useful for functions that take a while to process or have a lot of data. For example, if you wanted to pick a sha hash from a particular repo, you could combine `jq` to `select` and pick that attribute from the JSON: + +``` +hpi query my.coding.commits.commits --recent 30d --stream | jq 'select(.repo | contains("HPI"))' | jq '.sha' -r +4afa899c8b365b3c10e468f6279c02e316d3b650 +40de162fab741df594b4d9651348ee46ee021e9b +e1cb229913482074dc5523e57ef0acf6e9ec2bb2 +87c13defd131e39292b93dcea661d3191222dace +02c738594f2cae36ca4fab43cf9533fe6aa89396 +0b3a2a6ef3a9e4992771aaea0252fb28217b814a +84817ce72d208038b66f634d4ceb6e3a4c7ec5e9 +47992b8e046d27fc5141839179f06f925c159510 +425615614bd508e28ccceb56f43c692240e429ab +eed8f949460d768fb1f1c4801e9abab58a5f9021 +d26ad7d9ce6a4718f96346b994c3c1cd0d74380c +aec517e53c6ac022f2b4cc91261daab5651cebf0 +44b75a88fdfc7af132f61905232877031ce32fcb +b0ff6f29dd2846e97f8aa85a2ca73736b03254a8 +``` + +`select` acts on a stream of JSON objects, not a list, so it filters as the objects are generated. 
The alternative would be to print the entire JSON list at the end, like: + +`hpi query my.coding.commits.commits --recent 30d | jq '.[] | select(.repo | contains("Repos/HPI"))' | jq '.sha' -r`, using `jq '.[]'` to convert the JSON list into a stream of JSON objects. + +## Usage on non-HPI code + +The command can accept any qualified function name, so this could for example be used to check the output of [`promnesia`](https://github.com/karlicoss/promnesia) commands: + +``` +hpi query promnesia.sources.smscalls | jq length +371 +``` + +## GPX + +The `hpi query` command can also be used with the `--output gpx` flag to generate GPX files from a list of locations, like the ones defined in the `my.location` package. This could be used to extract some date range and create a `gpx` file which can then be visualized by a GUI application. + +This prints the contents of the `GPX` file to STDOUT, and prints warnings for any objects it could not convert to locations to STDERR, so pipe STDOUT to an output file, like `>out.gpx` + +``` +hpi query my.location.all --after '2021-07-01T00:00:00' --before '2021-07-05T00:00:00' --order-type datetime --output gpx >out.gpx +``` + +If you want to ignore any errors, you can use `--drop-exceptions`. + +To preview, you can use something like [`qgis`](https://qgis.org/en/site/) or for something easier and more lightweight, [`gpxsee`](https://github.com/tumic0/GPXSee): + +`gpxsee out.gpx`: + +TODO: add image here + +(Sidenote: this is [`@seanbreckenridge`](https://github.com/seanbreckenridge/)'s locations, on a trip to Chicago) + +## Python reference + +The `hpi query` command is a CLI wrapper around the code in [`query.py`](../my/core/query.py) and [`query_range.py`](../my/core/query_range.py). The `select` function is the core of this, and `select_range` lets you specify dates, timedelta, start-end ranges, and other CLI-specific code. 
+ +`select`: + +``` + A function to query, order, sort and filter items from one or more sources + This supports iterables and lists of mixed types (including handling errors), + by allowing you to provide custom predicates (functions) which can sort + by a function, an attribute, dict key, or by the attributes values. + + Since this supports mixed types, there's always a possibility + of KeyErrors or AttributeErrors while trying to find some value to order by, + so this provides multiple mechanisms to deal with that + + 'where' lets you filter items before ordering, to remove possible errors + or filter the iterator by some condition + + There are multiple ways to instruct select on how to order items. The most + flexible is to provide an 'order_by' function, which takes an item in the + iterator, does any custom checks you may want and then returns the value to sort by + + 'order_key' is best used on items which have a similar structure, or have + the same attribute name for every item in the iterator. If you have a + iterator of objects whose datetime is accessed by the 'timestamp' attribute, + supplying order_key='timestamp' would sort by that (dictionary or attribute) key + + 'order_value' is the most confusing, but often the most useful. Instead of + testing against the keys of an item, this allows you to write a predicate + (function) to test against its values (dictionary, NamedTuple, dataclass, object). + If you had an iterator of mixed types and wanted to sort by the datetime, + but the attribute to access the datetime is different on each type, you can + provide `order_value=lambda v: isinstance(v, datetime)`, and this will + try to find that value for each type in the iterator, to sort it by + the value which is received when the predicate is true + + 'order_value' is often used in the 'hpi query' interface, because of its brevity. + Just given the input function, this can typically sort it by timestamp with + no human intervention. 
It can sort of be thought as an educated guess, + but it can always be improved by providing a more complete guess function + + Note that 'order_value' is also the most computationally expensive, as it has + to copy the iterator in memory (using itertools.tee) to determine how to order it + in memory + + The 'drop_exceptions', 'raise_exceptions', 'warn_exceptions' let you ignore or raise + when the src contains exceptions. The 'warn_func' lets you provide a custom function + to call when an exception is encountered instead of using the 'warnings' module + + src: an iterable of mixed types, or a function to be called, + as the input to this function + + where: a predicate which filters the results before sorting + + order_by: a function which when given an item in the src, + returns the value to sort by. Similar to the 'key' value + typically passed directly to 'sorted' + + order_key: a string which represents a dict key or attribute name + to use as they key to sort by + + order_value: predicate which determines which attribute on an ADT-like item to sort by, + when given its value. lambda o: isinstance(o, datetime) is commonly passed to sort + by datetime, without knowing the attributes or interface for the items in the src + + default: while ordering, if the order for an object cannot be determined, + use this as the default value + + reverse: reverse the order of the resulting iterable + + limit: limit the results to this many items + + drop_unsorted: before ordering, drop any items from the iterable for which a + order could not be determined. False by default + + wrap_unsorted: before ordering, wrap any items into an 'Unsortable' object. Place + them at the front of the list. 
True by default + + drop_exceptions: ignore any exceptions from the src + + raise_exceptions: raise exceptions when received from the input src +``` + +`select_range`: + +``` + A specialized select function which offers generating functions + to filter/query ranges from an iterable + + order_key and order_value are used in the same way they are in select + + If you specify order_by_value_type, it tries to search for an attribute + on each object/type which has that type, ordering the iterable by that value + + unparsed_range is a tuple of length 3, specifying 'after', 'before', 'duration', + i.e. some start point to allow the computed value we're ordering by, some + end point and a duration (can use the RangeTuple NamedTuple to construct one) + + (this is typically parsed/created in my.core.__main__, from CLI flags) + + If you specify a range, drop_unsorted is forced to be True +``` + +Those can be imported and accept any sort of iterator; `hpi query` just defaults to the output of functions here. 
As an example, see [`listens`](https://github.com/seanbreckenridge/HPI-personal/blob/master/scripts/listens) which just passes an generator (iterator) as the first argument to `query_range` diff --git a/my/core/__main__.py b/my/core/__main__.py index dce646a..620cb5f 100644 --- a/my/core/__main__.py +++ b/my/core/__main__.py @@ -485,6 +485,13 @@ def _locate_functions_or_prompt(qualified_names: List[str], prompt: bool = True) yield data_providers[chosen_index] +def _warn_exceptions(exc: Exception) -> None: + from my.core.common import LazyLogger + logger = LazyLogger('CLI', level='warning') + + logger.exception(f'hpi query: {exc}') + + # handle the 'hpi query' call # can raise a QueryException, caught in the click command def query_hpi_functions( @@ -501,10 +508,12 @@ def query_hpi_functions( limit: Optional[int], drop_unsorted: bool, wrap_unsorted: bool, + warn_exceptions: bool, raise_exceptions: bool, drop_exceptions: bool, ) -> None: from .query_range import select_range, RangeTuple + import my.core.error as err # chain list of functions from user, in the order they wrote them on the CLI input_src = chain(*(f() for f in _locate_functions_or_prompt(qualified_names))) @@ -518,6 +527,8 @@ def query_hpi_functions( limit=limit, drop_unsorted=drop_unsorted, wrap_unsorted=wrap_unsorted, + warn_exceptions=warn_exceptions, + warn_func=_warn_exceptions, raise_exceptions=raise_exceptions, drop_exceptions=drop_exceptions) @@ -545,10 +556,21 @@ def query_hpi_functions( elif output == 'gpx': from my.location.common import locations_to_gpx + # if user didn't specify to ignore exceptions, warn if locations_to_gpx + # cannot process the output of the command. 
This can be silenced by + # passing --drop-exceptions + if not raise_exceptions and not drop_exceptions: + warn_exceptions = True + # can ignore the mypy warning here, locations_to_gpx yields any errors # if you didnt pass it something that matches the LocationProtocol for exc in locations_to_gpx(res, sys.stdout): # type: ignore[arg-type] - click.echo(str(exc), err=True) + if warn_exceptions: + _warn_exceptions(exc) + elif raise_exceptions: + raise exc + elif drop_exceptions: + pass sys.stdout.flush() else: res = list(res) # type: ignore[assignment] @@ -742,6 +764,10 @@ def module_install_cmd(user: bool, parallel: bool, modules: Sequence[str]) -> No default=False, is_flag=True, help="if the order of an item can't be determined while ordering, wrap them into an 'Unsortable' object") +@click.option('--warn-exceptions', + default=False, + is_flag=True, + help="if any errors are returned, print them as errors on STDERR") @click.option('--raise-exceptions', default=False, is_flag=True, @@ -765,6 +791,7 @@ def query_cmd( limit: Optional[int], drop_unsorted: bool, wrap_unsorted: bool, + warn_exceptions: bool, raise_exceptions: bool, drop_exceptions: bool, ) -> None: @@ -792,7 +819,7 @@ def query_cmd( \b Can also query within a range. 
To filter comments between 2016 and 2018: - hpi query --order-type datetime --after '2016-01-01 00:00:00' --before '2019-01-01 00:00:00' my.reddit.all.comments + hpi query --order-type datetime --after '2016-01-01' --before '2019-01-01' my.reddit.all.comments ''' from datetime import datetime, date @@ -831,6 +858,7 @@ def query_cmd( limit=limit, drop_unsorted=drop_unsorted, wrap_unsorted=wrap_unsorted, + warn_exceptions=warn_exceptions, raise_exceptions=raise_exceptions, drop_exceptions=drop_exceptions) except QueryException as qe: diff --git a/my/core/error.py b/my/core/error.py index e6f76cd..09c1733 100644 --- a/my/core/error.py +++ b/my/core/error.py @@ -4,7 +4,7 @@ See https://beepb00p.xyz/mypy-error-handling.html#kiss for more detail """ from itertools import tee -from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast +from typing import Union, TypeVar, Iterable, List, Tuple, Type, Optional, Callable, Any, cast, Iterator from .compat import Literal @@ -29,6 +29,37 @@ def unwrap(res: Res[T]) -> T: else: return res +def drop_exceptions(itr: Iterator[Res[T]]) -> Iterator[T]: + """Return non-errors from the iterable""" + for o in itr: + if isinstance(o, Exception): + continue + yield o + + +def raise_exceptions(itr: Iterable[Res[T]]) -> Iterator[T]: + """Raise errors from the iterable, stops the select function""" + for o in itr: + if isinstance(o, Exception): + raise o + yield o + + +def warn_exceptions(itr: Iterable[Res[T]], warn_func: Optional[Callable[[Exception], None]] = None) -> Iterator[T]: + # if not provided, use the 'warnings' module + if warn_func is None: + from my.core.warnings import medium + def _warn_func(e: Exception) -> None: + # TODO: print traceback? 
but user could always --raise-exceptions as well + medium(str(e)) + warn_func = _warn_func + + for o in itr: + if isinstance(o, Exception): + warn_func(o) + continue + yield o + def echain(ex: E, cause: Exception) -> E: ex.__cause__ = cause diff --git a/my/core/query.py b/my/core/query.py index ed29649..8a497db 100644 --- a/my/core/query.py +++ b/my/core/query.py @@ -14,6 +14,7 @@ from typing import TypeVar, Tuple, Optional, Union, Callable, Iterable, Iterator import more_itertools +import my.core.error as err from .common import is_namedtuple from .error import Res, unwrap from .warnings import low @@ -205,20 +206,6 @@ pass 'drop_exceptions' to ignore exceptions""") return None # couldn't compute a OrderFunc for this class/instance -def _drop_exceptions(itr: Iterator[ET]) -> Iterator[T]: - """Return non-errors from the iterable""" - for o in itr: - if isinstance(o, Exception): - continue - yield o - - -def _raise_exceptions(itr: Iterable[ET]) -> Iterator[T]: - """Raise errors from the iterable, stops the select function""" - for o in itr: - if isinstance(o, Exception): - raise o - yield o # currently using the 'key set' as a proxy for 'this is the same type of thing' @@ -365,6 +352,8 @@ def select( limit: Optional[int] = None, drop_unsorted: bool = False, wrap_unsorted: bool = True, + warn_exceptions: bool = False, + warn_func: Optional[Callable[[Exception], None]] = None, drop_exceptions: bool = False, raise_exceptions: bool = False, ) -> Iterator[ET]: @@ -408,7 +397,9 @@ def select( to copy the iterator in memory (using itertools.tee) to determine how to order it in memory - The 'drop_exceptions' and 'raise_exceptions' let you ignore or raise when the src contains exceptions + The 'drop_exceptions', 'raise_exceptions', 'warn_exceptions' let you ignore or raise + when the src contains exceptions. 
The 'warn_func' lets you provide a custom function + to call when an exception is encountered instead of using the 'warnings' module src: an iterable of mixed types, or a function to be called, as the input to this function @@ -469,10 +460,13 @@ Will attempt to call iter() on the value""") # if both drop_exceptions and drop_exceptions are provided for some reason, # should raise exceptions before dropping them if raise_exceptions: - itr = _raise_exceptions(itr) + itr = err.raise_exceptions(itr) if drop_exceptions: - itr = _drop_exceptions(itr) + itr = err.drop_exceptions(itr) + + if warn_exceptions: + itr = err.warn_exceptions(itr, warn_func=warn_func) if where is not None: itr = filter(where, itr) diff --git a/my/core/query_range.py b/my/core/query_range.py index 179e4ea..b7049a3 100644 --- a/my/core/query_range.py +++ b/my/core/query_range.py @@ -73,13 +73,28 @@ def parse_datetime_float(date_str: str) -> float: return ds_float try: # isoformat - default format when you call str() on datetime + # this also parses dates like '2020-01-01' return datetime.fromisoformat(ds).timestamp() except ValueError: pass try: return isoparse(ds).timestamp() except (AssertionError, ValueError): - raise QueryException(f"Was not able to parse {ds} into a datetime") + pass + + try: + import dateparser # type: ignore[import] + # dateparser is a bit more lenient than the above, lets you type + # all sorts of dates as inputs + # https://github.com/scrapinghub/dateparser#how-to-use + + res: Optional[datetime] = dateparser.parse(ds) + if res is not None: + return res.timestamp() + except ImportError: + pass + + raise QueryException(f"Was not able to parse {ds} into a datetime") # probably DateLike input? 
but a user could specify an order_key @@ -267,6 +282,8 @@ def select_range( limit: Optional[int] = None, drop_unsorted: bool = False, wrap_unsorted: bool = False, + warn_exceptions: bool = False, + warn_func: Optional[Callable[[Exception], None]] = None, drop_exceptions: bool = False, raise_exceptions: bool = False, ) -> Iterator[ET]: @@ -293,9 +310,15 @@ def select_range( unparsed_range = None # some operations to do before ordering/filtering - if drop_exceptions or raise_exceptions or where is not None: + if drop_exceptions or raise_exceptions or where is not None or warn_exceptions: # doesn't wrap unsortable items, because we pass no order related kwargs - itr = select(itr, where=where, drop_exceptions=drop_exceptions, raise_exceptions=raise_exceptions) + itr = select( + itr, + where=where, + drop_exceptions=drop_exceptions, + raise_exceptions=raise_exceptions, + warn_exceptions=warn_exceptions, + warn_func=warn_func) order_by_chosen: Optional[OrderFunc] = None