author    Brian Harring <ferringb@google.com>  2012-10-16 03:40:16 -0700
committer Brian Harring <ferringb@google.com>  2012-10-16 13:28:49 -0700
commit    1c5a9777fcae0bc614afda80022790e6fcda2d78 (patch)
tree      b8be036b0cf26e6e0c377ebf1720e56942dd6281
parent    punt unsigned manifest commit messages (diff)
download  git-conversion-tools-1c5a9777fcae0bc614afda80022790e6fcda2d78.tar.gz
          git-conversion-tools-1c5a9777fcae0bc614afda80022790e6fcda2d78.tar.bz2
          git-conversion-tools-1c5a9777fcae0bc614afda80022790e6fcda2d78.zip
use custom function for breaking up portage version markers in commits
This gets all of them in history, best I can tell.
-rwxr-xr-x  rewrite-commit-dump.py  97
1 file changed, 80 insertions(+), 17 deletions(-)
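
For context, the change below registers a new mangle_portage callback that rewrites the old free-form "(Portage version: ...)" trailer into structured footer lines. A minimal sketch of the intended transformation, using an invented trailer (the version, RepoMan options, and key below are illustrative only, not taken from repository history):

import re

trailer = ("(Portage version: 2.1.11.30/cvs/Linux x86_64, "
           "RepoMan options: --force, signed Manifest commit with key 0x1234ABCD)")

# Same pattern the new mangler registers; mangle_portage is expected to turn a
# matched trailer into the structured footers shown in the comment below.
pattern = re.compile(r'^\(portage version: +(?:\((?:tm|r)\)|[^\)\n])+\)$', re.M | re.I)
assert pattern.match(trailer)

# Expected rewrite:
#   Package-Manager: portage-2.1.11.30/cvs/Linux x86_64
#   RepoMan-Options: --force
#   Manifest-Sign-Key: 0x1234ABCD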
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 0d1397e..e08a9cf 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -1,4 +1,5 @@
#!/usr/bin/python
+import collections
import functools
import itertools
import operator
@@ -11,18 +12,65 @@ mangler = []
mangler.append(functools.partial(
re.compile(r"^\(paludis (0.1.*)\)$", re.M|re.I).sub,
r"Package-Manager: paludis-\1/"))
+# Special case not covered by the main portage mangler.
mangler.append(functools.partial(
- re.compile(r'^\(portage version: *([^,\n)]*), +unsigned Manifest commit\)$', re.M|re.I).sub,
+ re.compile(r'^\(Portage (2\.1\.2[^\)]+)\)$', re.M|re.I).sub,
r'Package-Manager: portage-\1'))
mangler.append(functools.partial(
- re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub,
- r"Package-Manager: portage-\1"))
+ re.compile(r' *\((?:manifest +recommit|(?:un)?signed +manifest +commit)\) *$', re.M|re.I).sub,
+ r''))
-fields = ('author', 'committer', 'msg', 'files', 'timestamp')
+def mangle_portage(match, allowed=frozenset('abcdef0123456789')):
+ content = match.group()
+ assert isinstance(content, (unicode, str))
+ content = content.strip()
+ assert ('(', ')') == (content[0], content[-1]), content
+ content = content[1:-1]
+ values = [x.strip() for x in content.split(',')]
+ # portage version: blah
+ version = values[0].split(':', 1)[1].strip()
+ results = ['Package-Manager: portage-' + version]
+ values = [x for x in values if 'unsigned manifest' not in x.lower()]
+ repoman = [x for x in values if 'repoman options:' in x.lower()]
+ assert len(repoman) <= 1, content
+ if repoman:
+ repoman = ' '.join(repoman[0].split(':', 1)[1].split())
+ results.append('RepoMan-Options: ' + repoman)
+ values = [x.lower() for x in values]
+ signage = [x for x in values if 'key' in x and 'signed' in x and 'unsigned' not in x]
+ assert len(signage) <= 1, content
+ if signage:
+ signage = signage[0].rstrip().rsplit(None, 1)[1]
+ if signage.startswith('0x'):
+ signage = signage[2:]
+ if signage in ('key', 'ultrabug'):
+ # Known bad keys; this is why portage needs to do basic enforcement...
+ signage = None
+ elif '@' in signage:
+ # Bleh. Be paranoid: ensure the lowercasing above didn't change the address.
+ assert signage in content, (signage, content)
+ signage = '<%s>' % signage
+ elif signage.endswith('!'):
+ assert allowed.issuperset(signage[:-1]), content
+ else:
+ assert allowed.issuperset(signage), content
+ if signage:
+ results.append('Manifest-Sign-Key: 0x' + signage.upper())
+ return "\n".join(results)
+
+# the TM/R is for crap like this:
+# (Portage version: 2.2_pre7/cvs/Linux 2.6.25.4 Intel(R) Core(TM)2 Duo CPU E6750 @ 2.66GHz)
+mangler.append(functools.partial(
+ re.compile(r'^\(portage version: +(?:\((?:tm|r)\)|[^\)\n])+\)$', re.M|re.I).sub,
+ mangle_portage))
+
+known_footers = ('Package-Manager', 'RepoMan-Options', 'Manifest-Sign-Key')
+fields = ('author', 'committer', 'msg', 'files', 'timestamp', 'footerless_msg')
fields_map = dict((attr, idx) for idx, attr in enumerate(fields))
+fake_fields = ('footerless_msg', 'timestamp')
file_idx = fields_map['files']
class record(namedtuple('record', fields)):
- def safe_combine(self, other, file_idx=fields_map['files']):
+ def safe_combine(self, other):
files = self.files.copy()
assert not set(files).intersection(other.files), (files, other.files)
files.update(other.files)
@@ -30,6 +78,18 @@ class record(namedtuple('record', fields)):
items[file_idx] = files
return self.__class__(*items)
+ def update_files(self, other):
+ files = self.files.copy()
+ files.update(other.files)
+ items = list(self)
+ items[file_idx] = files
+ return self.__class__(*items)
+
+ @staticmethod
+ def calculate_footerless_msg(msg):
+ return tuple(x for x in msg.splitlines()
+ if x.split(':', 1)[0] not in known_footers)
+
def deserialize_records(source, blob_idx):
line = source.readline()
while line:
@@ -47,7 +107,7 @@ def deserialize_records(source, blob_idx):
continue
assert chunks[0] in ('author', 'committer', 'data')
if chunks[0] != 'data':
- d[chunks[0]] = chunks[1].strip()
+ d[chunks[0]] = intern(chunks[1].strip())
continue
# Process the commit message...
size = int(chunks[1])
@@ -55,7 +115,9 @@ def deserialize_records(source, blob_idx):
assert len(data) == size, (line, data)
for func in mangler:
data = func(data)
- d['msg'] = data
+ # Strip leading/trailing whitespace; some of our manglers unfortunately leave it behind.
+ # fast-export still needs a trailing newline, but that should be it.
+ d['msg'] = data.strip() + "\n"
line = source.readline()
# Note that cvs2git writes slightly funky data statements; the byte count
# doesn't necessarily include the trailing newline.
@@ -75,13 +137,13 @@ def deserialize_records(source, blob_idx):
mode = line.split(None, 1)
assert len(mode) == 2, line
if mode[0] == 'D':
- files[mode[1]] = (mode[0], line)
+ files[intern(os.path.normpath(mode[1]))] = (mode[0], line)
elif mode[0] == 'M':
# M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog
# if it's not a sha1, but startswith ':'... then it's an index.
chunks = line.split(None, 4)
assert len(chunks) == 4, line
- fname = chunks[3]
+ fname = intern(os.path.normpath(chunks[3]))
if chunks[2][0] == ':':
line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname])
files[fname] = (mode[0], line)
@@ -97,6 +159,7 @@ def deserialize_records(source, blob_idx):
chunks = d['author'].rsplit(None, 1)
assert len(chunks) == 2 and chunks[1] == '+0000', d['author']
d['timestamp'] = long(chunks[0].rsplit(None, 1)[1])
+ d['footerless_msg'] = record.calculate_footerless_msg(d['msg'])
yield record(*[d.get(x) for x in fields])
# Bleh... of course namedtuple doesn't make this easy.
line = source.readline()
@@ -113,15 +176,15 @@ def serialize_records(records, handle, target='refs/heads/master', progress=5000
write('mark :%i\n' % idx)
# fields = ('mark', 'author', 'committer', 'msg', 'files')
for name, value in zip(fields, record):
- if name == 'files':
- for filename in sorted(value):
- write("%s\n" % (value[filename][1],))
- elif name in ('mark', 'author', 'committer'):
+ if name in ('mark', 'author', 'committer'):
write("%s %s\n" % (name, value))
+ elif name in fake_fields:
+ continue
elif name == 'msg':
write("data %i\n%s" % (len(value), value))
- elif name == 'timestamp':
- continue
+ elif name == 'files':
+ for filename in sorted(value):
+ write("%s\n" % (value[filename][1],))
else:
raise AssertionError("serialize is out of sync; don't know field %s" % name)
write("\n")
@@ -132,9 +195,9 @@ def deserialize_blob_map(source):
def simple_dedup(records):
# dedup via timestamp/author/msg
- dupes = {}
+ dupes = collections.defaultdict(list)
for idx, record in enumerate(records):
- dupes.setdefault((record.timestamp, record.author, record.msg), []).append((idx, record))
+ dupes[(record.timestamp, record.author, record.footerless_msg)].append((idx, record))
mangled = []
for key, value in dupes.iteritems():
if len(value) == 1:
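
For illustration of the dedup change above: keying on footerless_msg instead of msg means two records whose messages differ only in known_footers lines now collapse to the same key. A minimal sketch with invented messages, assuming the record class from this script is in scope:

# Only the footer lines differ between these two invented messages.
msg_a = "Fix typo in ebuild\n\nPackage-Manager: portage-2.1.11.30\n"
msg_b = "Fix typo in ebuild\n\nManifest-Sign-Key: 0x1234ABCD\n"

# calculate_footerless_msg drops any line whose leading "Tag:" is in known_footers,
# so both messages reduce to the same tuple and the commits dedup together.
assert record.calculate_footerless_msg(msg_a) == record.calculate_footerless_msg(msg_b)
assert record.calculate_footerless_msg(msg_a) == ('Fix typo in ebuild', '')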