# HPI/my/coding/commits.py
"""
Git commits data for repositories on your filesystem
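
Example configuration (illustrative only -- the attribute names match commits_cfg
below, the values are placeholders for your own setup):

    # in my.config
    class commits:
        roots  = ['~/code', '~/projects']
        emails = ['you@example.com']
        names  = ['Your Name']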
"""
REQUIRES = [
'gitpython',
]
import shutil
from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass, field
from typing import List, Optional, Iterator, Set, Sequence
from my.core import PathIsh, LazyLogger, make_config
from my.core.cachew import cache_dir
from my.core.common import mcachew
from my.core.warnings import high
from my.config import commits as user_config
@dataclass
class commits_cfg(user_config):
roots: Sequence[PathIsh] = field(default_factory=list)
emails: Optional[Sequence[str]] = None
names: Optional[Sequence[str]] = None
# experiment to make it lazy?
# would be nice to have a nicer syntax for it... maybe make_config could return a 'lazy' object
def config() -> commits_cfg:
res = make_config(commits_cfg)
if res.emails is None and res.names is None:
# todo error policy? throw/warn/ignore
high("Set either 'emails' or 'names', otherwise you'll get no commits")
return res
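# note: make_config (from my.core) essentially overlays the attributes of the user's
# my.config.commits class on top of the commits_cfg defaults declared above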
##########################
import git # type: ignore
from git.repo.fun import is_git_dir # type: ignore
log = LazyLogger(__name__, level='info')
def by_me(c: git.objects.commit.Commit) -> bool:
actor = c.author
if actor.email in (config().emails or ()):
return True
if actor.name in (config().names or ()):
return True
return False
@dataclass
class Commit:
committed_dt: datetime
authored_dt: datetime
message: str
repo: str # TODO put canonical name here straightaway??
sha: str
ref: Optional[str] = None
# TODO filter so they are authored by me
@property
def dt(self) -> datetime:
return self.committed_dt
# for backwards compatibility, was misspelled previously
@property
def commited_dt(self) -> datetime:
high("DEPRECATED! Please replace 'commited_dt' with 'committed_dt' (two 't's instead of one)")
return self.committed_dt
# TODO not sure, maybe a better idea to move it to timeline?
def fix_datetime(dt: datetime) -> datetime:
    # the git module has its own tzinfo object... and it's pretty weird
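    # converting to a plain datetime.timezone, presumably so the values
    # serialize/compare cleanly (e.g. when cached via cachew further below)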
tz = dt.tzinfo
assert tz is not None, dt
assert getattr(tz, '_name') == 'fixed'
offset = getattr(tz, '_offset')
ntz = timezone(offset)
return dt.replace(tzinfo=ntz)
def _git_root(git_dir: PathIsh) -> Path:
gd = Path(git_dir)
if gd.name == '.git':
return gd.parent
else:
return gd # must be bare
def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Commit]:
# without path might not handle pull heads properly
for c in gr.iter_commits(rev=rev):
if not by_me(c):
continue
sha = c.hexsha
if sha in emitted:
continue
emitted.add(sha)
repo = str(_git_root(gr.git_dir))
yield Commit(
committed_dt=fix_datetime(c.committed_datetime),
authored_dt=fix_datetime(c.authored_datetime),
message=c.message.strip(),
repo=repo,
sha=sha,
ref=rev,
)
def repo_commits(repo: PathIsh) -> Iterator[Commit]:
gr = git.Repo(str(repo))
emitted: Set[str] = set()
for r in gr.references:
yield from _repo_commits_aux(gr=gr, rev=r.path, emitted=emitted)
def canonical_name(repo: Path) -> str:
# TODO could determine origin?
if repo.match('github/repositories/*/repository'):
return repo.parent.name
else:
return repo.name
# if r.name == 'repository': # 'repository' thing from github..
# rname = r.parent.name
# else:
# rname = r.name
# if 'backups/github' in repo:
# pass # TODO
def _fd_path() -> str:
# todo move it to core
fd_path: Optional[str] = shutil.which("fdfind") or shutil.which("fd-find") or shutil.which("fd")
if fd_path is None:
high("my.coding.commits requires 'fd' to be installed, See https://github.com/sharkdp/fd#installation")
assert fd_path is not None
return fd_path
def git_repos_in(roots: List[Path]) -> List[Path]:
from subprocess import check_output
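    # roughly equivalent to running something like:
    #   fd --hidden --full-path --type f '/HEAD' <root>...
    # i.e. list all files named HEAD, which any git dir (bare or not) should contain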
outputs = check_output([
_fd_path(),
# '--follow', # right, not so sure about follow... make configurable?
'--hidden',
'--full-path',
'--type', 'f',
'/HEAD', # judging by is_git_dir, it should always be here..
*roots,
]).decode('utf8').splitlines()
candidates = set(Path(o).resolve().absolute().parent for o in outputs)
# exclude stuff within .git dirs (can happen for submodules?)
candidates = {c for c in candidates if '.git' not in c.parts[:-1]}
candidates = {c for c in candidates if is_git_dir(c)}
repos = list(sorted(map(_git_root, candidates)))
return repos
def repos() -> List[Path]:
return git_repos_in(list(map(Path, config().roots)))
# returns the modification time of the repo's HEAD/FETCH_HEAD file,
# used by mcachew below as a cheap 'has this repo changed?' dependency
def _repo_depends_on(_repo: Path) -> int:
    # check candidates in a fixed order so the result is deterministic
    for pp in (
        ".git/FETCH_HEAD",
        ".git/HEAD",
        "FETCH_HEAD",  # bare
        "HEAD",  # bare
    ):
        ff = _repo / pp
        if ff.exists():
            return int(ff.stat().st_mtime)
    raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}")
def _commits(_repos: List[Path]) -> Iterator[Commit]:
for r in _repos:
yield from _cached_commits(r)
def _cached_commits_path(p: Path) -> str:
p = cache_dir() / 'my.coding.commits:_cached_commits' / str(p.absolute()).strip("/")
p.mkdir(parents=True, exist_ok=True)
return str(p)
# per-repo commits, to use cachew
@mcachew(
depends_on=_repo_depends_on,
logger=log,
cache_path=_cached_commits_path,
)
def _cached_commits(repo: Path) -> Iterator[Commit]:
log.debug('processing %s', repo)
yield from repo_commits(repo)
def commits() -> Iterator[Commit]:
return _commits(repos())
# TODO enforce read only? although it doesn't touch index
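# example of poking at this module from a REPL (hypothetical usage, not part of the module):
#   from my.coding.commits import commits
#   for c in commits():
#       print(c.dt, c.repo, c.sha[:7], c.message.split('\n')[0])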