aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrian Harring <ferringb@google.com>2012-10-16 17:17:02 -0700
committerBrian Harring <ferringb@google.com>2012-10-16 17:17:02 -0700
commite5bf95661deb6caea04f29db8d0c52c7d52ef485 (patch)
tree381648f9387d587e99d6e75068cdc99fff93f6ca
parentMinor bit of speedups; use sed instead of whacky python script, add time'ing ... (diff)
downloadgit-conversion-tools-e5bf95661deb6caea04f29db8d0c52c7d52ef485.tar.gz
git-conversion-tools-e5bf95661deb6caea04f29db8d0c52c7d52ef485.tar.bz2
git-conversion-tools-e5bf95661deb6caea04f29db8d0c52c7d52ef485.zip
parallelize things further; add thin manifest converter in addition (no huge gain, but wrote it, so what the hell)
-rwxr-xr-xcreate-git.sh7
-rwxr-xr-xrewrite-commit-dump.py135
2 files changed, 113 insertions, 29 deletions
diff --git a/create-git.sh b/create-git.sh
index 79847b5..dab679e 100755
--- a/create-git.sh
+++ b/create-git.sh
@@ -23,11 +23,9 @@ update_alternates() {
echo "$l/git/objects" >> "${alternates}"
echo "$l"
done
- echo "starting history linearizing/rewriting" >&2
}
standalone_mode() {
- echo "loading all commits" >&2
find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
xargs -n1 readlink -f | update_alternates
}
@@ -36,7 +34,6 @@ if [ "$1" == --fast ]; then
command=update_alternates
else
command=standalone_mode
- echo "loading all commits in parallel to their generation..." >&2
fi
# Roughly; since alternates are updated as we go- and since rewrite-commit-dump
@@ -44,9 +41,11 @@ fi
# to delay fast-import's startup until we know we have data (meaning linearize
# has finished- thus the alternates are all in place).
# Bit tricky, but the gains have been worth it.
+# Regarding the misc cd'ing that occurs- this is to position things where the
+# scripts expect to be positioned.
time {
${command} | \
- "${root}/rewrite-commit-dump.py" | \
+ ( cd "${root}"; ./rewrite-commit-dump.py; ) | \
( read line; { echo "$line"; cat; } | \
tee ../export-stream-rewritten |\
time git fast-import
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 53a1bae..607c7a5 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -2,13 +2,14 @@
import contextlib
import collections
import functools
-import mmap
import itertools
+import mmap
+import multiprocessing
import operator
import os
import re
+import subprocess
import sys
-from collections import namedtuple
@contextlib.contextmanager
def mmap_open(path):
@@ -92,7 +93,7 @@ fields = ('author', 'msg', 'files', 'timestamp', 'footerless_msg')
fields_map = dict((attr, idx) for idx, attr in enumerate(fields))
fake_fields = ('footerless_msg', 'timestamp')
file_idx = fields_map['files']
-class record(namedtuple('record', fields)):
+class record(collections.namedtuple('record', fields)):
def safe_combine(self, other):
files = self.files.copy()
assert not set(files).intersection(other.files), (files, other.files)
@@ -103,7 +104,7 @@ class record(namedtuple('record', fields)):
def update_files(self, other):
files = self.files.copy()
- files.update(other.files)
+ files.update(other.files if isinstance(other, record) else other)
items = list(self)
items[file_idx] = files
return self.__class__(*items)
@@ -199,8 +200,8 @@ def serialize_records(records, handle, target='refs/heads/master', progress=100)
progress_interval = max(1, total // progress)
for idx, record in enumerate(records, 1):
if idx % progress_interval == 0:
- write('progress %02.0f%%: %s of %i commits\n'
- % ((100 * float(idx))/total, str(idx).rjust(total_len), total))
+ write('progress %s%%: %s of %i commits\n'
+ % (str((100 * float(idx))/total).rjust(2), str(idx).rjust(total_len), total))
write('commit %s\n' % target)
write('mark :%i\n' % idx)
# fields = ('mark', 'author', 'committer', 'msg', 'files')
@@ -287,8 +288,90 @@ def manifest_dedup(records, backwards=(5*60)):
# Sort by idx, but strip idx on the way out.
return itertools.imap(operator.itemgetter(1), sorted(l, key=operator.itemgetter(0)))
+def get_blob(sha1):
+ return subprocess.check_output(['git', 'show', sha1], cwd='git')
+
+import traceback
+def process_record(data):
+ try:
+ return _process_record(data)
+ except Exception, e:
+ return traceback.format_exc()
+
+def _process_record(data):
+ idx, manifests, record = data
+ rewritten_record = record
+ for fname, data in manifests:
+ # Hacky, but it's just a test..
+ chunked = data[1].split()
+ sha1 = chunked[2]
+ blob = get_blob(sha1)
+ if '-----BEGIN PGP SIGNATURE-----' in blob:
+ continue
+ # Don't touch any old v1 manifests...
+ blob = [x for x in blob.splitlines() if x]
+ if not blob:
+ # Empty manifest? The hell?
+ continue
+ if any(x.startswith('MD5') for x in blob):
+ continue
+ blob2 = [x for x in blob if x.startswith('DIST')]
+ if not blob or blob2 != blob:
+ if blob2:
+ p = subprocess.Popen(['git', 'hash-object', '-w', '--stdin', '--path', fname],
+ cwd='git', stdout=subprocess.PIPE, stdin=subprocess.PIPE)
+ stdout, _ = p.communicate("\n".join(blob2))
+ assert p.wait() == 0
+ new_sha1 = stdout.strip()
+ assert len(new_sha1) == 40, new_sha1
+ rewritten_record = rewritten_record.update_files(
+ {fname:(data[0], " ".join(chunked[:2] + [new_sha1, fname]))})
+ else:
+ rewritten_record = rewritten_record.update_files({})
+ del rewritten_record.files[fname]
+ if rewritten_record is not record:
+ return (idx, record)
+ else:
+ return None
+
+def thin_manifest_conversion(records, processing_pool):
+ potentials = []
+ for idx, record in enumerate(records):
+ manifests = [(fname, data) for fname, data in record.files.iteritems()
+ if fname.endswith('/Manifest') and data[0] != 'D']
+ if manifests:
+ potentials.append((idx, manifests, record))
+
+ rewrites = deletes = 0
+ processed = 0
+ for result in processing_pool.imap_unordered(
+ process_record, potentials, chunksize=30):
+ processed += 1
+ if result is not None:
+ if not isinstance(result, tuple):
+ raise Exception(result)
+
+ idx, value = result
+ if not value.files:
+ # Just drop the commit.
+ value = None
+ deletes += 1
+ else:
+ records[idx] = value
+ rewrites += 1
+ sys.stderr.write("potential:%i, deletes: %i, rewrites:%i\n" % (len(potentials), deletes, rewrites))
+ return itertools.ifilter(None, records)
+
+def process_directory(paths):
+ commit_path, idx_path = paths
+ with mmap_open(commit_path) as data:
+ return tuple(manifest_dedup(
+ deserialize_records(data, deserialize_blob_map(idx_path))))
+
def main(argv):
- records = []
+ # allocate the pool now, before we start getting memory abusive
+ clean_pool = multiprocessing.Pool()
+
# Be careful here to just iterate over source; doing so allows this script
# to do basic processing as it goes (specifically while it's being fed from
# the mainline cvs2git parallelized repo creator).
@@ -297,24 +380,25 @@ def main(argv):
# See python manpage for details; stdin buffers if you iterate over it;
# we want each line as they're available, thus use this form.
source = readline_iterate(sys.stdin)
- for directory in source:
- directory = directory.strip()
- tmp = os.path.join(directory, 'cvs2svn-tmp')
- commits = os.path.join(tmp, 'git-dump.dat')
- if not os.path.exists(commits):
- sys.stderr.write("skipping %s; no commit data\n" % directory)
- sys.stderr.flush()
- continue
- with mmap_open(commits) as data:
- records.extend(
- manifest_dedup(
- deserialize_records(data,
- deserialize_blob_map(
- os.path.join(tmp, 'git-blob.idx')
- )
- )
- )
- )
+ def consumable():
+ for directory in source:
+ directory = directory.strip()
+ tmp = os.path.join(directory, 'cvs2svn-tmp')
+ commits = os.path.join(tmp, 'git-dump.dat')
+ if not os.path.exists(commits):
+ sys.stderr.write("skipping %s; no commit data\n" % directory)
+ sys.stderr.flush()
+ continue
+ yield (commits, os.path.join(tmp, 'git-blob.idx'))
+ records = []
+ record_generator = multiprocessing.Pool()
+ for result in record_generator.imap_unordered(process_directory, consumable()):
+ records.extend(result)
+ record_generator.close()
+ record_generator.join()
+ del record_generator
+ sys.stderr.write("All commits loaded.. starting dedup runs\n")
+ sys.stderr.flush()
sorter = operator.attrgetter('timestamp')
# Get them into timestamp ordering first; this is abusing python stable
# sort pretty much since any commits to the same repo w/ the same timestamp
@@ -323,6 +407,7 @@ def main(argv):
records.sort(key=sorter)
records[:] = simple_dedup(records)
# records[:] = manifest_dedup(records)
+# records[:] = thin_manifest_conversion(records, clean_pool)
serialize_records(records, sys.stdout)
return 0