"""
Git commits data for repositories on your filesystem
"""
REQUIRES = [
    'gitpython',
]


import shutil
import string
from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass, field
from typing import List, NamedTuple, Optional, Iterator, Set, Sequence


from my.core import PathIsh, LazyLogger, make_config
from my.core.cachew import cache_dir
from my.core.common import mcachew
from my.core.warnings import high


from my.config import commits as user_config


@dataclass
class commits_cfg(user_config):
    """User configuration for this module, layered on top of ``my.config.commits``."""

    # directories that are searched for git repositories
    roots: Sequence[PathIsh] = field(default_factory=list)
    # author emails that identify "my" commits
    emails: Optional[Sequence[str]] = None
    # author names that identify "my" commits
    names: Optional[Sequence[str]] = None


# experiment to make it lazy?
# would be nice to have a nicer syntax for it... maybe make_config could return a 'lazy' object
def config() -> commits_cfg:
    """Build the module configuration.

    Emits a high-priority warning when neither ``emails`` nor ``names`` is set,
    because in that case no commit can ever match in ``by_me``.
    """
    res = make_config(commits_cfg)
    if res.emails is None and res.names is None:
        # todo error policy? throw/warn/ignore
        high("Set either 'emails' or 'names', otherwise you'll get no commits")
    return res


##########################

import git  # type: ignore
from git.repo.fun import is_git_dir, find_worktree_git_dir  # type: ignore


log = LazyLogger(__name__, level='info')


def by_me(c: git.objects.commit.Commit) -> bool:
    """Return True if the commit's author email or name is listed in the config."""
    # build the config once per call instead of once per check
    cfg = config()
    actor = c.author
    if actor.email in (cfg.emails or ()):
        return True
    return actor.name in (cfg.names or ())


@dataclass
class Commit:
    """A single commit, flattened to plain values."""

    commited_dt: datetime
    authored_dt: datetime
    message: str
    repo: str  # TODO put canonical name here straightaway??
    sha: str
    ref: Optional[str] = None
    # TODO filter so they are authored by me

    @property
    def dt(self) -> datetime:
        """Alias for the commit (not author) timestamp."""
        return self.commited_dt


# TODO not sure, maybe a better idea to move it to timeline?
def fix_datetime(dt: datetime) -> datetime:
    """Replace the tzinfo of ``dt`` with a stdlib fixed-offset ``timezone``.

    The git module has its own tzinfo object, and it's pretty weird, so it is
    swapped for ``datetime.timezone`` carrying the same UTC offset.

    Raises AssertionError if ``dt`` is naive or its tzinfo reports no offset.
    """
    assert dt.tzinfo is not None, dt
    # utcoffset() is the public tzinfo API; it works for whatever tzinfo git
    # attaches instead of depending on its private '_name'/'_offset' attributes
    offset = dt.utcoffset()
    assert offset is not None, dt
    return dt.replace(tzinfo=timezone(offset))


def _git_root(git_dir: PathIsh) -> Path:
    """Map a git directory to the repository root.

    For ``<root>/.git`` this is ``<root>``; any other directory is returned
    unchanged (it must be a bare repository).
    """
    gd = Path(git_dir)
    if gd.name == '.git':
        return gd.parent
    return gd  # must be bare


def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Commit]:
    """Yield my commits reachable from ``rev`` that were not emitted before.

    ``emitted`` is the set of already seen commit hashes; it is updated in
    place so that a commit reachable from several refs is only yielded once.
    """
    # the repository root does not change between commits, compute it once
    repo = str(_git_root(gr.git_dir))
    # without path might not handle pull heads properly
    for c in gr.iter_commits(rev=rev):
        if not by_me(c):
            continue
        sha = c.hexsha
        if sha in emitted:
            continue
        emitted.add(sha)

        yield Commit(
            commited_dt=fix_datetime(c.committed_datetime),
            authored_dt=fix_datetime(c.authored_datetime),
            message=c.message.strip(),
            repo=repo,
            sha=sha,
            ref=rev,
        )


def repo_commits(repo: PathIsh) -> Iterator[Commit]:
    """Yield my commits from every reference of the repository at ``repo``."""
    # the context manager closes the Repo (and the resources it holds) once the
    # generator is exhausted or closed
    with git.Repo(str(repo)) as gr:
        emitted: Set[str] = set()
        for r in gr.references:
            yield from _repo_commits_aux(gr=gr, rev=r.path, emitted=emitted)


def canonical_name(repo: Path) -> str:
    """Return a short name for the repository path.

    Paths matching ``github/repositories/*/repository`` are named after their
    parent directory; everything else after its own last component.
    """
    # TODO could determine origin?
    if repo.match('github/repositories/*/repository'):
        return repo.parent.name
    return repo.name
    # if r.name == 'repository': # 'repository' thing from github..
    #     rname = r.parent.name
    # else:
    #     rname = r.name
    # if 'backups/github' in repo:
    #     pass # TODO


def _fd_path() -> str:
    """Locate the ``fd`` executable under any of its known names.

    Warns and then raises AssertionError when none is found on PATH.
    """
    # todo move it to core
    fd_path: Optional[str] = shutil.which("fdfind") or shutil.which("fd-find") or shutil.which("fd")
    if fd_path is None:
        high("my.coding.commits requires 'fd' to be installed, See https://github.com/sharkdp/fd#installation")
    assert fd_path is not None, "could not find 'fd' executable"
    return fd_path


def git_repos_in(roots: List[Path]) -> List[Path]:
    """Find git repositories (regular and bare) under the given roots.

    Uses ``fd`` to look for ``HEAD`` files and keeps the parent directories
    that ``is_git_dir`` accepts. Returns the sorted repository roots.
    """
    if not roots:
        # without explicit paths 'fd' falls back to searching the current
        # working directory, which is not what "no roots" should mean
        return []

    from subprocess import check_output
    outputs = check_output([
        _fd_path(),
        # '--follow', # right, not so sure about follow... make configurable?
        '--hidden',
        '--full-path',
        '--type', 'f',
        '/HEAD',  # judging by is_git_dir, it should always be here..
        *roots,
    ]).decode('utf8').splitlines()

    candidates = {Path(o).resolve().absolute().parent for o in outputs}

    # exclude stuff within .git dirs (can happen for submodules?)
    candidates = {c for c in candidates if '.git' not in c.parts[:-1]}

    candidates = {c for c in candidates if is_git_dir(c)}

    return sorted(map(_git_root, candidates))


def repos() -> List[Path]:
    """Return the repositories found under the configured roots."""
    return git_repos_in(list(map(Path, config().roots)))


# returns modification time for an index to use as hash function
def _repo_depends_on(_repo: Path) -> int:
    """Return the mtime (whole seconds) of the first existing marker file.

    The candidates are checked in a fixed order so that the same file is used
    on every run; otherwise the cache key could change without the repository
    changing.

    Raises RuntimeError when none of the files exists.
    """
    for pp in (
        ".git/FETCH_HEAD",
        ".git/HEAD",
        "FETCH_HEAD",  # bare
        "HEAD",  # bare
    ):
        ff = _repo / pp
        if ff.exists():
            return int(ff.stat().st_mtime)
    raise RuntimeError(f"Could not find a FETCH_HEAD/HEAD file in {_repo}")


def _commits(_repos: List[Path]) -> Iterator[Commit]:
    """Yield the (cached) commits of every repository in ``_repos``."""
    for r in _repos:
        yield from _cached_commits(r)


_allowed_letters: str = string.ascii_letters + string.digits


def _cached_commits_path(p: Path) -> str:
    """Return the cache location for the repository at ``p``."""
    # compute a reduced simple filepath using the absolute path of the repo
    simple_path = ''.join(c for c in str(p.absolute()) if c in _allowed_letters)
    return str(cache_dir() / 'commits' / simple_path / '_cached_commits')


# per-repo commits, to use cachew
@mcachew(
    depends_on=_repo_depends_on,
    logger=log,
    cache_path=lambda p: _cached_commits_path(p)
)
def _cached_commits(repo: Path) -> Iterator[Commit]:
    """Yield the commits of a single repository, wrapped in the ``mcachew`` cache."""
    log.debug('processing %s', repo)
    yield from repo_commits(repo)


def commits() -> Iterator[Commit]:
    """Yield my commits from all repositories under the configured roots."""
    return _commits(repos())


# TODO enforce read only? although it doesn't touch index