"""
Git commits data for repositories on your filesystem
"""
REQUIRES = [
    'gitpython',
]


import shutil
from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass, field
from typing import List, Optional, Iterator, Set, Sequence


from my.core import PathIsh, LazyLogger, make_config
from my.core.cachew import cache_dir
from my.core.common import mcachew
from my.core.warnings import high


from my.config import commits as user_config


@dataclass
class commits_cfg(user_config):
    """User configuration: where to look for repositories and how to recognise the user's commits."""
    # directories that are searched for git repositories
    roots: Sequence[PathIsh] = field(default_factory=list)
    # author emails/names that count as "me" (see by_me)
    emails: Optional[Sequence[str]] = None
    names: Optional[Sequence[str]] = None


# experiment to make it lazy?
# would be nice to have a nicer syntax for it... maybe make_config could return a 'lazy' object
def config() -> commits_cfg:
    """Build the config object; warns when neither 'emails' nor 'names' is set."""
    res = make_config(commits_cfg)
    if res.emails is None and res.names is None:
        # todo error policy? throw/warn/ignore
        high("Set either 'emails' or 'names', otherwise you'll get no commits")
    return res

##########################

import git # type: ignore
from git.repo.fun import is_git_dir # type: ignore


log = LazyLogger(__name__, level='info')


def by_me(c: git.objects.commit.Commit) -> bool:
    """Return True if the commit's author email or name is listed in the config."""
    # resolve the config once per call rather than once per attribute checked
    cfg = config()
    actor = c.author
    if actor.email in (cfg.emails or ()):
        return True
    if actor.name in (cfg.names or ()):
        return True
    return False


@dataclass
class Commit:
    """A single commit, as yielded by this module."""
    # NOTE: 'commited_dt' (sic) is kept as is -- renaming the field would break callers
    commited_dt: datetime
    authored_dt: datetime
    message: str
    repo: str # TODO put canonical name here straightaway??
    sha: str
    ref: Optional[str] = None
    # TODO filter so they are authored by me

    @property
    def dt(self) -> datetime:
        """Alias for the commit (not author) timestamp."""
        return self.commited_dt


# TODO not sure, maybe a better idea to move it to timeline?
def fix_datetime(dt: datetime) -> datetime:
    """Replace the tzinfo of an aware datetime with a stdlib fixed-offset ``timezone``.

    Raises AssertionError if ``dt`` is naive or its tzinfo reports no UTC offset.
    """
    # git module got its own tzinfo object.. and it's pretty weird
    tz = dt.tzinfo
    assert tz is not None, dt
    # use the public tzinfo API instead of poking at the private _name/_offset attributes,
    # so any tzinfo implementation (not only the one named 'fixed') is handled
    offset = dt.utcoffset()
    assert offset is not None, dt
    ntz = timezone(offset)
    return dt.replace(tzinfo=ntz)


def _git_root(git_dir: PathIsh) -> Path:
    """Map a git directory to the repository root: the parent of '.git', or the dir itself."""
    gd = Path(git_dir)
    if gd.name == '.git':
        return gd.parent
    else:
        return gd # must be bare


def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Commit]:
    """Yield commits reachable from ``rev`` that pass by_me and were not emitted yet.

    ``emitted`` is mutated: the sha of every yielded commit is added to it, which lets
    the caller deduplicate across several revs.
    """
    # the repository root does not change between commits, so compute it once
    repo = str(_git_root(gr.git_dir))
    # without path might not handle pull heads properly
    for c in gr.iter_commits(rev=rev):
        if not by_me(c):
            continue
        sha = c.hexsha
        if sha in emitted:
            continue
        emitted.add(sha)

        yield Commit(
            commited_dt=fix_datetime(c.committed_datetime),
            authored_dt=fix_datetime(c.authored_datetime),
            message=c.message.strip(),
            repo=repo,
            sha=sha,
            ref=rev,
        )


def repo_commits(repo: PathIsh) -> Iterator[Commit]:
    """Yield matching commits from every reference of the repository at ``repo``, each sha once."""
    gr = git.Repo(str(repo))
    emitted: Set[str] = set()
    for r in gr.references:
        yield from _repo_commits_aux(gr=gr, rev=r.path, emitted=emitted)


def canonical_name(repo: Path) -> str:
    """Return a short name for the repository path.

    Paths matching 'github/repositories/*/repository' use the parent directory name;
    everything else uses the last path component.
    """
    # TODO could determine origin?
    if repo.match('github/repositories/*/repository'):
        return repo.parent.name
    else:
        return repo.name
        # if r.name == 'repository': # 'repository' thing from github..
        #     rname = r.parent.name
        # else:
        #     rname = r.name
        # if 'backups/github' in repo:
        #     pass # TODO


def _fd_path() -> str:
    """Locate the 'fd' executable under any of its known names.

    Emits a warning and raises AssertionError when none of them is on PATH.
    """
    # todo move it to core
    fd_path: Optional[str] = shutil.which("fdfind") or shutil.which("fd-find") or shutil.which("fd")
    if fd_path is None:
        high("my.coding.commits requires 'fd' to be installed, See https://github.com/sharkdp/fd#installation")
    assert fd_path is not None, "could not find 'fd' executable"
    return fd_path


def git_repos_in(roots: List[Path]) -> List[Path]:
    """Return the sorted roots of all git repositories found under ``roots``.

    Uses 'fd' to list files whose full path matches '/HEAD' and keeps the parent
    directories that is_git_dir accepts.
    """
    if not roots:
        # without explicit search paths fd would scan the current working directory,
        # which is never what an empty configuration means
        return []

    from subprocess import check_output
    outputs = check_output([
        _fd_path(),
        # '--follow', # right, not so sure about follow... make configurable?
        '--hidden',
        '--full-path',
        '--type', 'f',
        '/HEAD', # judging by is_git_dir, it should always be here..
        *roots,
    ]).decode('utf8').splitlines()

    candidates = {Path(o).resolve().absolute().parent for o in outputs}

    # exclude stuff within .git dirs (can happen for submodules?)
    candidates = {c for c in candidates if '.git' not in c.parts[:-1]}

    candidates = {c for c in candidates if is_git_dir(c)}

    repos = sorted(map(_git_root, candidates))
    return repos


def repos() -> List[Path]:
    """Return all git repositories under the configured roots."""
    return git_repos_in(list(map(Path, config().roots)))


# returns modification time for an index to use as hash function
def _repo_depends_on(_repo: Path) -> int:
    """Return the mtime (whole seconds) of the first existing FETCH_HEAD/HEAD file.

    Raises RuntimeError if none of the candidate files exists under ``_repo``.
    """
    # an ordered tuple, not a set: candidates must be probed in a fixed priority order,
    # otherwise the value may differ between runs depending on string hash randomisation
    for pp in (
        ".git/FETCH_HEAD",
        ".git/HEAD",
        "FETCH_HEAD", # bare
        "HEAD", # bare
    ):
        ff = _repo / pp
        if ff.exists():
            return int(ff.stat().st_mtime)
    raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}")


def _commits(_repos: List[Path]) -> Iterator[Commit]:
    """Yield the (cached) commits of every repository in ``_repos``, in order."""
    for r in _repos:
        yield from _cached_commits(r)


def _cached_commits_path(p: Path) -> str:
    """Return (creating it if needed) the per-repository cache directory for ``p``."""
    p = cache_dir() / 'my.coding.commits:_cached_commits' / str(p.absolute()).strip("/")
    p.mkdir(parents=True, exist_ok=True)
    return str(p)


# per-repo commits, to use cachew
@mcachew(
    depends_on=_repo_depends_on,
    logger=log,
    cache_path=_cached_commits_path,
)
def _cached_commits(repo: Path) -> Iterator[Commit]:
    """Yield the commits of a single repository; wrapped by mcachew."""
    log.debug('processing %s', repo)
    yield from repo_commits(repo)


def commits() -> Iterator[Commit]:
    """Yield commits from all configured repositories."""
    return _commits(repos())


# TODO enforce read only? although it doesn't touch index