HPI/my/core/common.py

235 lines
6.8 KiB
Python

from glob import glob as do_glob
from pathlib import Path
import os
from typing import (
Callable,
Iterable,
List,
Sequence,
TYPE_CHECKING,
Tuple,
TypeVar,
Union,
)
import warnings
from . import warnings as core_warnings
from . import compat
from .compat import deprecated
# some helper functions
PathIsh = Union[Path, str]
from .logging import setup_logger, LazyLogger
Paths = Union[Sequence[PathIsh], PathIsh]
DEFAULT_GLOB = '*'
def get_files(
pp: Paths,
glob: str=DEFAULT_GLOB,
sort: bool=True,
guess_compression: bool=True,
) -> Tuple[Path, ...]:
"""
Helper function to avoid boilerplate.
Tuple as return type is a bit friendlier for hashing/caching, so hopefully makes sense
"""
# TODO FIXME mm, some wrapper to assert iterator isn't empty?
sources: List[Path]
if isinstance(pp, Path):
sources = [pp]
elif isinstance(pp, str):
if pp == '':
# special case -- makes sense for optional data sources, etc
return () # early return to prevent warnings etc
sources = [Path(pp)]
else:
sources = [p if isinstance(p, Path) else Path(p) for p in pp]
def caller() -> str:
import traceback
# TODO ugh. very flaky... -3 because [<this function>, get_files(), <actual caller>]
return traceback.extract_stack()[-3].filename
paths: List[Path] = []
for src in sources:
if src.parts[0] == '~':
src = src.expanduser()
# note: glob handled first, because e.g. on Windows asterisk makes is_dir unhappy
gs = str(src)
if '*' in gs:
if glob != DEFAULT_GLOB:
warnings.warn(f"{caller()}: treating {gs} as glob path. Explicit glob={glob} argument is ignored!")
paths.extend(map(Path, do_glob(gs)))
elif os.path.isdir(str(src)):
# NOTE: we're using os.path here on purpose instead of src.is_dir
# the reason is is_dir for archives might return True and then
# this clause would try globbing insize the archives
# this is generally undesirable (since modules handle archives themselves)
# todo not sure if should be recursive?
# note: glob='**/*.ext' works without any changes.. so perhaps it's ok as it is
gp: Iterable[Path] = src.glob(glob)
paths.extend(gp)
else:
assert src.exists(), src
# todo assert matches glob??
paths.append(src)
if sort:
paths = list(sorted(paths))
if len(paths) == 0:
# todo make it conditionally defensive based on some global settings
core_warnings.high(f'''
{caller()}: no paths were matched against {pp}. This might result in missing data. Likely, the directory you passed is empty.
'''.strip())
# traceback is useful to figure out what config caused it?
import traceback
traceback.print_stack()
if guess_compression:
from .kompress import CPath, is_compressed, ZipPath
# NOTE: wrap is just for backwards compat with vendorized kompress
# with kompress library, only is_compressed check and Cpath should be enough
def wrap(p: Path) -> Path:
if isinstance(p, ZipPath):
return p
if p.suffix == '.zip':
return ZipPath(p) # type: ignore[return-value]
if is_compressed(p):
return CPath(p)
return p
paths = [wrap(p) for p in paths]
return tuple(paths)
from typing import TypeVar, Callable, Generic
_R = TypeVar('_R')
# https://stackoverflow.com/a/5192374/706389
class classproperty(Generic[_R]):
def __init__(self, f: Callable[..., _R]) -> None:
self.f = f
def __get__(self, obj, cls) -> _R:
return self.f(cls)
# hmm, this doesn't really work with mypy well..
# https://github.com/python/mypy/issues/6244
# class staticproperty(Generic[_R]):
# def __init__(self, f: Callable[[], _R]) -> None:
# self.f = f
#
# def __get__(self) -> _R:
# return self.f()
import re
# https://stackoverflow.com/a/295466/706389
def get_valid_filename(s: str) -> str:
s = str(s).strip().replace(' ', '_')
return re.sub(r'(?u)[^-\w.]', '', s)
# TODO deprecate and suggest to use one from my.core directly? not sure
from .utils.itertools import unique_everseen
### legacy imports, keeping them here for backwards compatibility
## hiding behind TYPE_CHECKING so it works in runtime
## in principle, warnings.deprecated decorator should cooperate with mypy, but doesn't look like it works atm?
## perhaps it doesn't work when it's used from typing_extensions
if not TYPE_CHECKING:
@deprecated('use my.core.compat.assert_never instead')
def assert_never(*args, **kwargs):
return compat.assert_never(*args, **kwargs)
@deprecated('use my.core.compat.fromisoformat instead')
def isoparse(*args, **kwargs):
return compat.fromisoformat(*args, **kwargs)
@deprecated('use more_itertools.one instead')
def the(*args, **kwargs):
import more_itertools
return more_itertools.one(*args, **kwargs)
@deprecated('use functools.cached_property instead')
def cproperty(*args, **kwargs):
import functools
return functools.cached_property(*args, **kwargs)
@deprecated('use more_itertools.bucket instead')
def group_by_key(l, key):
res = {}
for i in l:
kk = key(i)
lst = res.get(kk, [])
lst.append(i)
res[kk] = lst
return res
@deprecated('use my.core.utils.itertools.make_dict instead')
def make_dict(*args, **kwargs):
from .utils import itertools as UI
return UI.make_dict(*args, **kwargs)
@deprecated('use my.core.utils.itertools.listify instead')
def listify(*args, **kwargs):
from .utils import itertools as UI
return UI.listify(*args, **kwargs)
@deprecated('use my.core.warn_if_empty instead')
def warn_if_empty(*args, **kwargs):
from .utils import itertools as UI
return UI.listify(*args, **kwargs)
@deprecated('use my.core.stat instead')
def stat(*args, **kwargs):
from . import stats
return stats.stat(*args, **kwargs)
# todo wrap these in deprecated decorator as well?
from .cachew import mcachew # noqa: F401
# TODO hmm how to deprecate these in runtime?
# tricky cause they are actually classes/types
from typing import Literal # noqa: F401
from .stats import Stats
from .types import (
Json,
datetime_naive,
datetime_aware,
)
tzdatetime = datetime_aware
else:
from .compat import Never
# make these invalid during type check while working in runtime
Stats = Never
tzdatetime = Never
Json = Never
datetime_naive = Never
datetime_aware = Never
###