"""
Git commits data: crawls the filesystem for git repositories and extracts their commits
"""
|
|
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, NamedTuple, Optional, Dict, Any, Iterator, Set
|
|
|
|
from ..common import PathIsh, LazyLogger
|
|
from mycfg import commits as config
|
|
|
|
# pip3 install gitpython
|
|
import git # type: ignore
|
|
from git.repo.fun import is_git_dir, find_worktree_git_dir # type: ignore
|
|
|
|
|
|
log = LazyLogger('my.commits', level='info')

# Every configured identity string (emails and names alike);
# by_me() scans for these as substrings of the commit author.
_things = set(config.emails) | set(config.names)
|
|
|
|
|
|
def by_me(c) -> bool:
    """
    Tell whether commit ``c`` was authored by one of the configured identities.

    Returns True when the author's email is in ``config.emails`` or the author's
    name is in ``config.names``. When neither matches exactly but one of the
    configured strings occurs inside "<email> <name>", a RuntimeError is raised
    so that the near-miss does not pass silently. Returns False otherwise.
    """
    author = c.author
    if author.email in config.emails or author.name in config.names:
        return True

    identity = f"{author.email} {author.name}"
    # TODO this is probably useless
    if any(known in identity for known in _things):
        raise RuntimeError("WARNING!!!", author, c, c.repo)
    return False
|
|
|
|
|
|
class Commit(NamedTuple):
    """
    A single commit record: both timestamps, the message, the repository it
    came from, its hash and (optionally) the reference it was found under.
    """
    # NOTE: the 'commited' spelling is part of the field name, callers rely on it
    commited_dt: datetime
    authored_dt: datetime
    message: str
    repo: str  # TODO put canonical name here straightaway??
    sha: str
    ref: Optional[str] = None
    # TODO filter so they are authored by me

    @property
    def dt(self) -> datetime:
        """Timestamp of the commit; this is the commit time, not the authoring time."""
        return self.commited_dt
|
|
|
|
|
|
# TODO not sure, maybe a better idea to move it to timeline?
|
|
def fix_datetime(dt: datetime) -> datetime:
    """
    Return ``dt`` with its tzinfo swapped for a standard library ``timezone``
    that carries the same UTC offset.

    The git module attaches its own tzinfo object to the datetimes it produces;
    this normalises it. The offset is read through the public
    ``datetime.utcoffset()`` API rather than the private attributes of that
    tzinfo object, so any timezone-aware datetime is accepted.

    Raises:
        ValueError: if ``dt`` is naive (has no UTC offset).
    """
    offset = dt.utcoffset()
    if offset is None:
        raise ValueError(f"expected a timezone-aware datetime, got {dt!r}")
    return dt.replace(tzinfo=timezone(offset))
|
|
|
|
|
|
def _git_root(git_dir: PathIsh) -> Path:
|
|
gd = Path(git_dir)
|
|
if gd.name == '.git':
|
|
return gd.parent
|
|
else:
|
|
return gd # must be bare
|
|
|
|
|
|
def _repo_commits_aux(gr: git.Repo, rev: str, emitted: Set[str]) -> Iterator[Commit]:
    """
    Yield commits authored by me that are reachable from ``rev`` and have not
    been yielded before.

    ``emitted`` is the set of hexshas yielded so far. It is updated in place,
    so the caller can pass the same set for several revisions of one
    repository to avoid duplicates.
    """
    # identical for every commit of this repository, so compute it only once
    repo = str(_git_root(gr.git_dir))

    # without path might not handle pull heads properly
    for c in gr.iter_commits(rev=rev):
        if not by_me(c):
            continue
        sha = c.hexsha
        if sha in emitted:
            continue
        emitted.add(sha)

        yield Commit(
            commited_dt=fix_datetime(c.committed_datetime),
            authored_dt=fix_datetime(c.authored_datetime),
            message=c.message.strip(),
            repo=repo,
            sha=sha,
            ref=rev,
        )
|
|
|
|
|
|
def repo_commits(repo: PathIsh) -> Iterator[Commit]:
    """
    Yield my commits from the repository at ``repo``, walking every reference
    and yielding each commit at most once.
    """
    # A Repo object holds on to system resources (git helper processes, file
    # handles) until it is closed; release them once the iteration is over.
    with git.Repo(str(repo)) as gr:
        emitted: Set[str] = set()
        for r in gr.references:
            yield from _repo_commits_aux(gr=gr, rev=r.path, emitted=emitted)
|
|
|
|
|
|
def canonical_name(repo: Path) -> str:
    """
    Short name for a repository path.

    Paths ending in 'github/repositories/<name>/repository' are named after
    <name>; every other path is named after its last component.
    """
    # TODO could determine origin?
    # TODO special handling for paths under 'backups/github'?
    github_layout = repo.match('github/repositories/*/repository')
    named_after = repo.parent if github_layout else repo
    return named_after.name
|
|
|
|
|
|
# TODO could reuse in clustergit?..
|
|
def git_repos_in(roots: List[Path]) -> List[Path]:
    """
    Find the git repositories located under ``roots``.

    Runs the external ``fdfind`` tool to locate files whose full path matches
    '/HEAD', treats the directory of each hit as a candidate git directory, and
    keeps the ones GitPython's ``is_git_dir`` accepts.

    Returns:
        Sorted repository roots: the directory containing '.git' for regular
        repositories, the git directory itself for bare ones. An empty
        ``roots`` yields an empty list.

    Raises:
        Whatever ``subprocess.check_output`` raises when ``fdfind`` is missing
        or exits with a non-zero status.
    """
    if not roots:
        # with no search path at all fdfind would crawl the current working
        # directory, which is never what the caller asked for
        return []

    from subprocess import check_output
    outputs = check_output([
        'fdfind',
        # '--follow', # right, not so sure about follow... make configurable?
        '--hidden',
        '--full-path',
        '--type', 'f',
        '/HEAD',  # judging by is_git_dir, it should always be here..
        *roots,
    ]).decode('utf8').splitlines()
    candidates = {Path(o).resolve().absolute().parent for o in outputs}

    # exclude stuff within .git dirs (can happen for submodules?)
    candidates = {c for c in candidates if '.git' not in c.parts[:-1]}

    candidates = {c for c in candidates if is_git_dir(c)}

    return sorted(map(_git_root, candidates))
|
|
|
|
|
|
def repos():
    """
    Git repositories found under the configured search roots (``config.roots``).
    """
    search_roots = config.roots
    return git_repos_in(search_roots)
|
|
|
|
|
|
# TODO cachew for all commits?
|
|
def commits() -> Iterator[Commit]:
    """
    Iterate over the commits of every repository returned by repos(),
    logging each repository as its processing starts.
    """
    for repo_root in repos():
        log.info('processing %s', repo_root)
        yield from repo_commits(repo_root)
|
|
|
|
|
|
def print_all():
    """
    Print every commit produced by commits(), one per line.
    """
    for commit in commits():
        print(commit)
|
|
|
|
|
|
# TODO enforce read only? although it doesn't touch index
|