* cleanup coding.commits: remove the _things check (it's never activated for me and seems pointless); update the mechanism for finding the fdfind/fd path and send a core.warning if it fails; update the mechanism to cache repos to the new cachew API (remove hashf); cache repos on a per-repo basis
193 lines
5.2 KiB
Python
193 lines
5.2 KiB
Python
"""
Git commits data for repositories on your filesystem
"""
|
|
|
|
import shutil
|
|
import string
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, NamedTuple, Optional, Dict, Any, Iterator, Set
|
|
|
|
from ..core.common import PathIsh, LazyLogger, mcachew, Stats
|
|
from ..core.cachew import cache_dir
|
|
from ..core.warnings import high
|
|
|
|
# TODO: create user_config dataclass?
|
|
from my.config import commits as config
|
|
|
|
# pip3 install gitpython
|
|
import git # type: ignore
|
|
from git.repo.fun import is_git_dir, find_worktree_git_dir # type: ignore
|
|
|
|
|
|
# module-wide logger named 'my.commits'; used for debug output and handed to the cache decorator below
log = LazyLogger('my.commits', level='info')
|
|
|
|
|
|
def by_me(c) -> bool:
    """
    Tell whether commit *c* was authored by the configured user.

    The author's email is checked against ``config.emails`` first; only if
    that does not match is the author's name checked against ``config.names``.
    """
    author = c.author
    return author.email in config.emails or author.name in config.names
|
|
|
|
|
|
class Commit(NamedTuple):
    """A single git commit, flattened into plain values."""

    # timestamp at which the commit was committed (field name spelling kept as-is)
    commited_dt: datetime
    # timestamp at which the commit was authored
    authored_dt: datetime
    # commit message
    message: str
    # repository the commit was read from
    repo: str  # TODO put canonical name here straightaway??
    # commit hash
    sha: str
    # reference the commit was reached through, if recorded
    ref: Optional[str] = None
    # TODO filter so they are authored by me

    @property
    def dt(self) -> datetime:
        """Alias for the commit timestamp (``commited_dt``)."""
        return self.commited_dt
|
|
|
|
|
|
# TODO not sure, maybe a better idea to move it to timeline?
|
|
def fix_datetime(dt: datetime) -> datetime:
    """
    Return *dt* with its tzinfo replaced by a standard ``datetime.timezone``.

    The git module attaches its own tzinfo object to datetimes, and it's
    pretty weird; this swaps it for a stdlib fixed-offset timezone carrying
    the same UTC offset. The wall-clock fields of *dt* are left untouched.

    Raises:
        ValueError: if *dt* is naive (has no UTC offset to carry over).
    """
    # utcoffset() is the public tzinfo API, so this works for the git tzinfo
    # object as well as for any other timezone-aware datetime, without
    # reaching into private attributes of the tzinfo implementation.
    offset = dt.utcoffset()
    if offset is None:
        raise ValueError(f"expected a timezone-aware datetime, got {dt!r}")
    return dt.replace(tzinfo=timezone(offset))
|
|
|
|
|
|
def _git_root(git_dir: PathIsh) -> Path:
|
|
gd = Path(git_dir)
|
|
if gd.name == '.git':
|
|
return gd.parent
|
|
else:
|
|
return gd # must be bare
|
|
|
|
|
|
def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Commit]:
    """
    Yield the commits reachable from *rev* in *gr* that were made by the user.

    Args:
        gr: repository to read from.
        rev: revision to walk; it is also recorded as ``ref`` on each result.
        emitted: hashes that were already yielded. Commits whose hash is in
            this set are skipped, and every yielded hash is added to it, so
            the caller can share one set across several revisions.
    """
    # the repository root is the same for every commit, so resolve it once
    # instead of once per yielded commit
    repo = str(_git_root(gr.git_dir))

    # without path might not handle pull heads properly
    for c in gr.iter_commits(rev=rev):
        if not by_me(c):
            continue
        sha = c.hexsha
        if sha in emitted:
            continue
        emitted.add(sha)

        yield Commit(
            commited_dt=fix_datetime(c.committed_datetime),
            authored_dt=fix_datetime(c.authored_datetime),
            message=c.message.strip(),
            repo=repo,
            sha=sha,
            ref=rev,
        )
|
|
|
|
|
|
def repo_commits(repo: PathIsh):
    """
    Yield the user's commits from the repository at *repo*.

    Every reference of the repository is walked; a commit reachable from
    several references is yielded only once.
    """
    # Use the repository as a context manager so that whatever it holds open
    # is released as soon as the iteration finishes, instead of whenever the
    # object happens to be garbage collected.
    with git.Repo(str(repo)) as gr:
        emitted: Set[str] = set()
        for r in gr.references:
            yield from _repo_commits_aux(gr=gr, rev=r.path, emitted=emitted)
|
|
|
|
|
|
def canonical_name(repo: Path) -> str:
    """
    Return a short name for the repository located at *repo*.

    Paths ending in ``github/repositories/<name>/repository`` are named after
    ``<name>``; every other path is named after its last component.
    """
    # TODO could determine origin?
    # TODO repos under 'backups/github' are not handled specially yet
    in_github_layout = repo.match('github/repositories/*/repository')
    return repo.parent.name if in_github_layout else repo.name
|
|
|
|
|
|
def _fd_path() -> str:
    """
    Locate the 'fd' executable.

    The names ``fdfind``, ``fd-find`` and ``fd`` are looked up on PATH in that
    order and the first hit is returned. When none is found a warning is
    emitted and the bare name ``"fd"`` is returned.
    """
    for binary in ("fdfind", "fd-find", "fd"):
        located = shutil.which(binary)
        if located:
            return located
    high("my.coding.commits requires 'fd' to be installed, See https://github.com/sharkdp/fd#installation")
    # TODO: this just causes it to fail if 'fd' can't be found, but the warning is still sent... seems fine?
    return "fd"
|
|
|
|
|
|
# TODO could reuse in clustergit?..
|
|
def git_repos_in(roots: List[Path]) -> List[Path]:
    """
    Find the git repositories located under any of *roots*.

    'fd' is used to list every file whose full path ends in ``/HEAD``; the
    directories holding those files are kept when they pass ``is_git_dir``,
    and each one is mapped to its repository root.

    Returns:
        Sorted list of repository roots; empty when *roots* is empty.
    """
    # Without any search path 'fd' would scan the current working directory,
    # which is never what an empty configuration means.
    if not roots:
        return []

    from subprocess import check_output
    outputs = check_output([
        _fd_path(),
        # '--follow', # right, not so sure about follow... make configurable?
        '--hidden',
        '--full-path',
        '--type', 'f',
        '/HEAD',  # judging by is_git_dir, it should always be here..
        *roots,
    ]).decode('utf8').splitlines()

    # resolve() already yields an absolute path
    candidates = {Path(o).resolve().parent for o in outputs}

    # exclude stuff within .git dirs (can happen for submodules?)
    candidates = {c for c in candidates if '.git' not in c.parts[:-1]}

    candidates = {c for c in candidates if is_git_dir(c)}

    return sorted(map(_git_root, candidates))
|
|
|
|
|
|
def repos():
    """Return the git repositories found under the configured roots (``config.roots``)."""
    search_roots = config.roots
    return git_repos_in(search_roots)
|
|
|
|
|
|
# returns modification time for an index to use as hash function
|
|
def _repo_depends_on(_repo: Path) -> int:
|
|
for pp in {
|
|
".git/FETCH_HEAD",
|
|
".git/HEAD",
|
|
"FETCH_HEAD", # bare
|
|
"HEAD", # bare
|
|
}:
|
|
ff = _repo / pp
|
|
if ff.exists():
|
|
return int(ff.stat().st_mtime)
|
|
else:
|
|
raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}")
|
|
|
|
|
|
def _commits(_repos: List[Path]) -> Iterator[Commit]:
    """Yield the commits of every repository in *_repos*, one repository after another."""
    for repo_root in _repos:
        for commit in _cached_commits(repo_root):
            yield commit
|
|
|
|
# characters kept when a repository path is flattened into a cache directory name
_allowed_letters: str = string.ascii_letters + string.digits


def _cached_commits_path(p: Path) -> str:
    """
    Return the cache location used for the commits of the repository at *p*.

    The absolute path of the repo is reduced to its ASCII letters and digits;
    the result names a directory under ``cache_dir()`` in which the entry
    ``_cached_commits`` lives.
    """
    # compute a reduced simple filepath using the absolute path of the repo
    simple_path = ''.join(ch for ch in str(p.absolute()) if ch in _allowed_letters)
    # the '/' joins produce a path object (presumably a pathlib.Path), while
    # the signature promises a str -- convert so the two agree
    return str(cache_dir() / simple_path / '_cached_commits')
|
|
|
|
|
|
# commits are cached per repository, so that cachew can be used
@mcachew(
    depends_on=_repo_depends_on,
    logger=log,
    cache_path=lambda p: _cached_commits_path(p)
)
def _cached_commits(repo: Path) -> Iterator[Commit]:
    """
    Yield the user's commits from the repository at *repo*.

    The decorator is configured with ``_repo_depends_on`` as the dependency
    function and ``_cached_commits_path`` as the cache location.
    """
    log.debug('processing %s', repo)
    for commit in repo_commits(repo):
        yield commit
|
|
|
|
def commits() -> Iterator[Commit]:
    """Iterate over the user's commits across all discovered repositories."""
    discovered = repos()
    return _commits(discovered)
|
|
|
|
|
|
def print_all():
    """Print every commit produced by ``commits()``, one per line."""
    for entry in commits():
        print(entry)
|
|
|
|
|
|
# TODO enforce read only? although it doesn't touch index
|