aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrian Harring <ferringb@google.com>2012-10-15 23:13:53 -0700
committerBrian Harring <ferringb@google.com>2012-10-16 13:28:49 -0700
commit333dd4c0fd75240be5f0329c837d8effdf705d67 (patch)
treeec2caf51dd8e62961aefeb733c968638bed15a24
parentSuppress known missing names, leaving just the email address. (diff)
downloadgit-conversion-tools-333dd4c0fd75240be5f0329c837d8effdf705d67.tar.gz
git-conversion-tools-333dd4c0fd75240be5f0329c837d8effdf705d67.tar.bz2
git-conversion-tools-333dd4c0fd75240be5f0329c837d8effdf705d67.zip
ongoing work
-rwxr-xr-xrewrite-commit-dump.py123
1 files changed, 97 insertions, 26 deletions
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 4784cb5..7678406 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -2,6 +2,7 @@
import functools
import re
import sys
+from collections import namedtuple
mangler = []
mangler.append(functools.partial(
@@ -11,29 +12,99 @@ mangler.append(functools.partial(
re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub,
r"Package-Manager: portage-\1"))
-write = sys.stdout.write
-source = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
-write('reset refs/heads/master\n')
-while True:
- x = source.readline()
- if not x:
- break
- chunked = x.split()
- if not chunked:
- write(x)
- continue
- elif chunked[0] in ('reset', 'from'):
- continue
- elif chunked[0] == 'commit':
- write('commit refs/heads/master\n')
- continue
- elif chunked[0] != 'data':
- write(x)
- continue
- assert len(chunked) == 2
- size = int(chunked[1])
- data = source.read(size)
- assert len(data) == size
- for func in mangler:
- data = func(data)
- write("data %i\n%s" % (len(data), data))
# Attribute names of one parsed commit, in the order they are written back
# out when a record is serialized.
fields = (
    'mark',
    'author',
    'committer',
    'msg',
    'files',
)
# Lightweight immutable container holding one commit's parsed data.
record = namedtuple(typename='record', field_names=fields)
+
def deserialize_records(source):
    """Parse a fast-export style commit stream into ``record`` tuples.

    This is a generator.  For every ``commit`` command read from *source* it
    yields one ``record`` whose attributes are:

    * ``mark``, ``author``, ``committer`` -- the stripped remainder of the
      corresponding header line, or ``None`` when the header is absent;
    * ``msg`` -- the commit message after every callable in the module-level
      ``mangler`` list has been applied to it;
    * ``files`` -- a dict mapping path -> ``(op, raw_line)`` where ``op`` is
      ``'M'`` or ``'D'`` and ``raw_line`` is the original line without its
      trailing newline.

    ``reset`` and ``progress`` commands, blank lines between commits, and
    ``from`` lines inside a commit are dropped.

    :param source: file-like object providing ``readline()`` and ``read(n)``.
    :raises AssertionError: if the stream does not have the expected shape
        (unknown command, truncated header, unknown file operation, ...).
    """
    line = source.readline()
    while line:
        # Skip stream-level commands and blank separators that precede a
        # commit.  Stop cleanly at EOF rather than indexing into an empty
        # split() result.
        while line:
            head = line.split(None, 1)
            if head and head[0] not in ('reset', 'progress'):
                break
            line = source.readline()
        if not line:
            # The stream ended on a reset/progress/blank line; nothing left.
            return

        # First get the free form fields; stop after we get the commit msg.
        assert line.split()[0] == 'commit', line
        d = {}
        while True:
            line = source.readline()
            chunks = line.split(None, 1)
            assert len(chunks) == 2, line
            if chunks[0] == 'from':
                continue
            assert chunks[0] in ('mark', 'author', 'committer', 'data')
            if chunks[0] != 'data':
                d[chunks[0]] = chunks[1].strip()
                continue
            # Process the commit message...
            size = int(chunks[1])
            data = source.read(size)
            assert len(data) == size, (line, data)
            for func in mangler:
                data = func(data)
            d['msg'] = data
            line = source.readline()
            # Note that cvs2git writes slightly funky data statements; the
            # byte count doesn't necessarily include the trailing newline.
            if line == '\n':
                line = source.readline()
            break

        assert line
        # From can show up here on occasion... annoying.
        if line.split()[0:1] == ['from']:
            line = source.readline()

        files = {}
        # A blank line ends the file list; so does EOF, since the final
        # commit of a stream need not be followed by a blank line.
        while line and line != '\n':
            # Two types I can spot; M=modify, and D=delete.
            assert line[-1] == '\n'
            line = line[:-1]
            mode = line.split(None, 1)
            assert len(mode) == 2, line
            if mode[0] == 'D':
                files[mode[1]] = (mode[0], line)
            elif mode[0] == 'M':
                # M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog
                # Split off only the mode and the blob id so that a path
                # containing whitespace stays in one piece.
                chunks = mode[1].split(None, 2)
                assert len(chunks) == 3, line
                files[chunks[2]] = (mode[0], line)
            else:
                raise AssertionError("got unknown file op: mode=%r, line:\n%r" % (mode[0], line))
            line = source.readline()
        d['files'] = files

        # Basic sanity check for the code above...
        assert set(fields).issuperset(d), d
        # Headers that never appeared in the stream come out as None.
        yield record(*[d.get(x) for x in fields])
        line = source.readline()
+
def serialize_records(records, handle, target='refs/heads/master', progress=1000):
    """Write *records* to *handle* as a commit stream onto a single ref.

    Output starts with ``reset <target>``; every record then becomes a
    ``commit <target>`` command followed by its header lines, its message
    as a ``data`` block, its file operations sorted by path, and a blank
    line.

    :param records: sized sequence of record tuples laid out per the
        module-level ``fields`` tuple (``len()`` is taken for the progress
        messages).
    :param handle: object with a ``write(str)`` method.
    :param target: ref named in the ``reset`` and ``commit`` commands.
    :param progress: a ``progress`` line is written before every
        *progress*-th commit.
    :raises AssertionError: if ``fields`` contains a name this function does
        not know how to write.
    """
    write = handle.write
    write('reset %s\n' % target)
    total = len(records)
    for idx, rec in enumerate(records, 1):
        if idx % progress == 0:
            # True division: floor division made the one-decimal percentage
            # always end in ".0".
            write('progress %02.1f%%: %i of %i commits\n'
                  % ((100 * float(idx)) / total, idx, total))
        write('commit %s\n' % target)
        # fields = ('mark', 'author', 'committer', 'msg', 'files')
        for name, value in zip(fields, rec):
            if name == 'files':
                # Emit the stored raw file-op lines in sorted path order.
                for filename in sorted(value):
                    write("%s\n" % (value[filename][1],))
            elif name in ('mark', 'author', 'committer'):
                # A header that was absent from the input is stored as None;
                # skip it rather than writing a literal "None".
                if value is not None:
                    write("%s %s\n" % (name, value))
            elif name == 'msg':
                write("data %i\n%s" % (len(value), value))
            else:
                raise AssertionError("serialize is out of sync; don't know field %s" % name)
        write("\n")
+
def main(argv):
    """Read a commit stream, rewrite it, and write the result to stdout.

    :param argv: command-line arguments without the program name.  If it is
        non-empty, ``argv[0]`` is the path of the stream to read; otherwise
        the stream is read from stdin.
    :return: 0, for use as the process exit status.
    """
    if argv:
        # Every record is read into memory before anything is written, so
        # the input file can be closed as soon as parsing is done.
        with open(argv[0], 'r') as source:
            records = list(deserialize_records(source))
    else:
        records = list(deserialize_records(sys.stdin))
    serialize_records(records, sys.stdout)
    return 0
+
if __name__ == '__main__':
    if len(sys.argv) not in (1, 2):
        # SystemExit accepts no keyword arguments (passing code=1 raised
        # TypeError).  A string argument is printed to stderr and the
        # process exits with status 1.
        raise SystemExit("args must be either none, or path to fast-export stream to read")
    sys.exit(main(sys.argv[1:]))