diff options
Diffstat (limited to 'rewrite-commit-dump.py')
-rwxr-xr-x | rewrite-commit-dump.py | 54 |
1 files changed, 41 insertions, 13 deletions
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py index 7678406..f657a8e 100755 --- a/rewrite-commit-dump.py +++ b/rewrite-commit-dump.py @@ -1,5 +1,7 @@ #!/usr/bin/python import functools +import operator +import os import re import sys from collections import namedtuple @@ -12,10 +14,10 @@ mangler.append(functools.partial( re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub, r"Package-Manager: portage-\1")) -fields = ('mark', 'author', 'committer', 'msg', 'files') +fields = ('author', 'committer', 'msg', 'files', 'timestamp') record = namedtuple('record', fields) -def deserialize_records(source): +def deserialize_records(source, blob_idx): line = source.readline() while line: while line.split()[0] in ('reset', 'progress'): @@ -28,9 +30,9 @@ def deserialize_records(source): line = source.readline() chunks = line.split(None, 1) assert len(chunks) == 2, line - if chunks[0] == 'from': + if chunks[0] in ('from', 'mark'): continue - assert chunks[0] in ('mark', 'author', 'committer', 'data') + assert chunks[0] in ('author', 'committer', 'data') if chunks[0] != 'data': d[chunks[0]] = chunks[1].strip() continue @@ -63,28 +65,39 @@ def deserialize_records(source): files[mode[1]] = (mode[0], line) elif mode[0] == 'M': # M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog - chunks = mode[1].split(None, 3) - assert len(chunks) == 3, line - files[chunks[2]] = (mode[0], line) + # if it's not a sha1, but startswith ':'... then it's an index. + chunks = line.split(None, 4) + assert len(chunks) == 4, line + fname = chunks[3] + if chunks[2][0] == ':': + line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname]) + files[fname] = (mode[0], line) else: raise AssertionError("got unknown file op: mode=%r, line:\n%r" % (mode[0], line)) line = source.readline() d['files'] = files # Basic sanity check for the code above... assert set(fields).issuperset(d), d + d.setdefault('author', d.get('committer')) + assert d['author'] is not None + # Skank the timestamp out... + chunks = d['author'].rsplit(None, 1) + assert len(chunks) == 2 and chunks[1] == '+0000', d['author'] + d['timestamp'] = long(chunks[0].rsplit(None, 1)[1]) yield record(*[d.get(x) for x in fields]) # Bleh... of course namedtuple doesn't make this easy. line = source.readline() -def serialize_records(records, handle, target='refs/heads/master', progress=1000): +def serialize_records(records, handle, target='refs/heads/master', progress=5000): write = handle.write write('reset %s\n' % target) total = len(records) for idx, record in enumerate(records, 1): if idx % progress == 0: write('progress %02.1f%%: %i of %i commits\n' - % ((100 * float(idx))//total, idx, total)) + % ((100 * float(idx))/total, idx, total)) write('commit %s\n' % target) + write('mark :%i\n' % idx) # fields = ('mark', 'author', 'committer', 'msg', 'files') for name, value in zip(fields, record): if name == 'files': @@ -94,17 +107,32 @@ def serialize_records(records, handle, target='refs/heads/master', progress=1000 write("%s %s\n" % (name, value)) elif name == 'msg': write("data %i\n%s" % (len(value), value)) + elif name == 'timestamp': + continue else: raise AssertionError("serialize is out of sync; don't know field %s" % name) write("\n") +def deserialize_blob_map(source): + source = (x.strip().split() for x in source) + return dict((int(x[0].lstrip(':')), x[1]) for x in source) + def main(argv): - source = open(argv[0], 'r') if argv else sys.stdin - records = list(deserialize_records(source)) + records = [] + source = argv if argv else sys.stdin + directories = [x.strip() for x in source] + for directory in directories: + tmp = os.path.join(directory, 'cvs2svn-tmp') + commits = os.path.join(tmp, 'git-dump.dat') + if not os.path.exists(commits): + sys.stderr.write("skipping %s; no commit data\n" % directory) + continue + blob_index = deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx'))) + records.extend(deserialize_records(open(commits, 'r'), blob_index)) + records.sort(key=operator.attrgetter('timestamp')) + #records = list(deserialize_records(source)) serialize_records(records, sys.stdout) return 0 if __name__ == '__main__': - if len(sys.argv) not in (1, 2): - raise SystemExit("args must be either none, or path to fast-export stream to read", code=1) sys.exit(main(sys.argv[1:])) |