blob: 67f19345d45e4d51eb0b075aef4eb1fb77388c16 [file] [log] [blame] [edit]
#!/usr/bin/env python3
#
# Copyright 2020, Data61, CSIRO (ABN 41 687 119 230)
#
# SPDX-License-Identifier: GPL-2.0-only
#
'''
A tool for determining provenance.
Occasionally one encounters a directory of source code that was derived from an upstream repository
with history either squashed or discarded. To pull in upstream changes, it is desirable to know what
commit the source code originated from. This script helps you determine that by looking for the
upstream commit with the smallest diff to the downstream files. Sample usage:
whence.py -u https://github.com/torvalds/linux --upstream-subdir scripts/kconfig \
-d https://github.com/seL4/seL4_tools --downstream-subdir kbuild-tool/kconfig
'''
import argparse
import os
import shutil
import subprocess
import sys
import tempfile
class GitRepo(object):
def __init__(self, url):
self.tmp = tempfile.mkdtemp()
subprocess.check_call(['git', 'clone', url, self.tmp])
def checkout(self, commit):
subprocess.check_call(['git', 'checkout', commit], cwd=self.tmp)
def log(self, subdir):
# Reverse the commit list to test them chronologically, just for consistency.
return reversed(subprocess.check_output(['git', 'log', '--pretty=tformat:%H', '.'],
cwd=os.path.join(self.tmp, subdir)).split())
def __del__(self):
shutil.rmtree(self.tmp)
def main(argv):
parser = argparse.ArgumentParser(description='locate a Git commit in an upstream project from '
'which downstream source was derived')
parser.add_argument('--upstream', '-u', required=True, help='URL of upstream repository to '
'search')
parser.add_argument('--upstream-subdir', default='', help='subdirectory within upstream '
'repository to consider (root by default)')
parser.add_argument('--downstream', '-d', required=True, help='URL of downstream repository '
'to analyse')
parser.add_argument('--downstream-subdir', default='', help='subdirectory within downstream '
'repository to analyse (root by default)')
parser.add_argument('--downstream-commit', help='commit in downstream repository to consider '
'(HEAD of master by default)')
opts = parser.parse_args(argv[1:])
sys.stderr.write('Cloning %s into a temporary directory...\n' % opts.upstream)
try:
upstream = GitRepo(opts.upstream)
except subprocess.CalledProcessError:
return -1
sys.stderr.write('Cloning %s into a temporary directory...\n' % opts.downstream)
try:
downstream = GitRepo(opts.downstream)
except subprocess.CalledProcessError:
return -1
if opts.downstream_commit is not None:
sys.stderr.write('Updating downstream to %s...\n' % opts.downstream_commit)
try:
downstream.checkout(opts.downstream_commit)
except subprocess.CalledProcessError:
return -1
sys.stderr.write('Retrieving candidate commit list...\n')
try:
commits = list(upstream.log(opts.upstream_subdir))
except subprocess.CalledProcessError:
return -1
sys.stderr.write('%d commits to consider\n' % len(commits))
# We now have everything we need. Examine each commit, tracking the smallest diff we've seen.
min_diff = None
min_commit = None
for index, commit in enumerate(commits):
sys.stderr.write('Considering %s (%d of %d)...\n' % (commit, index, len(commits)))
try:
upstream.checkout(commit)
except subprocess.CalledProcessError:
return -1
src = os.path.join(upstream.tmp, opts.upstream_subdir)
dst = os.path.join(downstream.tmp, opts.downstream_subdir)
p = subprocess.Popen(['diff', src, dst], stdout=subprocess.PIPE)
stdout, _ = p.communicate()
diff = len(stdout.split('\n'))
sys.stderr.write('This commit has a difference metric of %d\n' % diff)
if min_diff is None or min_diff > diff:
min_diff = diff
min_commit = commit
sys.stderr.write('The most likely commit is %s\n' % min_commit)
return 0
if __name__ == '__main__':
sys.exit(main(sys.argv))