Diffstat (limited to 'cvs2svn_lib/database.py')
-rw-r--r--  cvs2svn_lib/database.py  322
1 files changed, 322 insertions, 0 deletions
diff --git a/cvs2svn_lib/database.py b/cvs2svn_lib/database.py
new file mode 100644
index 0000000..9db9be2
--- /dev/null
+++ b/cvs2svn_lib/database.py
@@ -0,0 +1,322 @@
+# (Be in -*- python -*- mode.)
+#
+# ====================================================================
+# Copyright (c) 2000-2009 CollabNet. All rights reserved.
+#
+# This software is licensed as described in the file COPYING, which
+# you should have received as part of this distribution. The terms
+# are also available at http://subversion.tigris.org/license-1.html.
+# If newer versions of this license are posted there, you may use a
+# newer version instead, at your option.
+#
+# This software consists of voluntary contributions made by many
+# individuals. For exact contribution history, see the revision
+# history and logs, available at http://cvs2svn.tigris.org/.
+# ====================================================================
+
+"""This module contains database facilities used by cvs2svn."""
+
+
+import sys
+import os
+import cPickle
+
+from cvs2svn_lib.common import DB_OPEN_READ
+from cvs2svn_lib.common import DB_OPEN_WRITE
+from cvs2svn_lib.common import DB_OPEN_NEW
+from cvs2svn_lib.common import warning_prefix
+from cvs2svn_lib.common import error_prefix
+from cvs2svn_lib.log import Log
+from cvs2svn_lib.record_table import FileOffsetPacker
+from cvs2svn_lib.record_table import RecordTable
+
+
+# DBM module selection
+
+# 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
+# so that the dbhash module used by anydbm will use bsddb3.
+try:
+ import bsddb3
+ sys.modules['bsddb'] = sys.modules['bsddb3']
+except ImportError:
+ pass
+
+# 2. These DBM modules are not good for cvs2svn.
+import anydbm
+if anydbm._defaultmod.__name__ in ['dumbdbm', 'dbm']:
+ Log().error(
+ '%s: cvs2svn uses the anydbm package, which depends on lower level '
+ 'dbm\n'
+ 'libraries. Your system has %s, with which cvs2svn is known to have\n'
+ 'problems. To use cvs2svn, you must install a Python dbm library '
+ 'other than\n'
+ 'dumbdbm or dbm. See '
+ 'http://python.org/doc/current/lib/module-anydbm.html\n'
+ 'for more information.\n'
+ % (error_prefix, anydbm._defaultmod.__name__,)
+ )
+ sys.exit(1)
+
+# 3. If we are using the old bsddb185 module, then try to use gdbm instead.
+# Unfortunately, gdbm appears not to be trouble free, either.
+if hasattr(anydbm._defaultmod, 'bsddb') \
+ and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
+ try:
+ gdbm = __import__('gdbm')
+ except ImportError:
+ Log().warn(
+ '%s: The version of the bsddb module found on your computer '
+ 'has been\n'
+ 'reported to malfunction on some datasets, causing KeyError '
+ 'exceptions.\n'
+ % (warning_prefix,)
+ )
+ else:
+ anydbm._defaultmod = gdbm
+
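+# For reference, the backend that anydbm selected can be inspected directly
+# (a diagnostic sketch, not part of cvs2svn):
+#
+#   import anydbm
+#   print anydbm._defaultmod.__name__  # e.g. 'dbhash', 'gdbm', or 'dumbdbm'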
+
+class Database:
+ """A database that uses a Serializer to store objects of a certain type.
+
+ The serializer is stored in the database under the key
+ self.serializer_key. (This implies that self.serializer_key may not
+ be used as a key for normal entries.)
+
+ The backing database is an anydbm-based DBM.
+
+ """
+
+ serializer_key = '_.%$1\t;_ '
+
+ def __init__(self, filename, mode, serializer=None):
+ """Constructor.
+
+ The database stores its Serializer, so none needs to be supplied
+ when opening an existing database."""
+
+ # pybsddb3 has a bug which prevents it from working with
+ # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
+ # causes the DB_TRUNCATE flag to be passed, which is disallowed
+ # for databases protected by lock and transaction support
+ # (bsddb databases use locking from bsddb version 4.2.4 onwards).
+ #
+ # Therefore, manually perform the removal (we can do this, because
+ # we know that for bsddb - but *not* anydbm in general - the database
+ # consists of one file with the name we specify, rather than several
+ # based on that name).
+ if mode == DB_OPEN_NEW and anydbm._defaultmod.__name__ == 'dbhash':
+ if os.path.isfile(filename):
+ os.unlink(filename)
+ self.db = anydbm.open(filename, 'c')
+ else:
+ self.db = anydbm.open(filename, mode)
+
+ # Import implementations for many mapping interface methods.
+ for meth_name in ('__delitem__',
+ '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
+ meth_ref = getattr(self.db, meth_name, None)
+ if meth_ref:
+ setattr(self, meth_name, meth_ref)
+
+ if mode == DB_OPEN_NEW:
+ self.serializer = serializer
+ self.db[self.serializer_key] = cPickle.dumps(self.serializer)
+ else:
+ self.serializer = cPickle.loads(self.db[self.serializer_key])
+
+ def __getitem__(self, key):
+ return self.serializer.loads(self.db[key])
+
+ def __setitem__(self, key, value):
+ self.db[key] = self.serializer.dumps(value)
+
+ def __delitem__(self, key):
+ # gdbm defines a __delitem__ method, but it cannot be assigned. So
+ # this method provides a fallback definition via explicit delegation:
+ del self.db[key]
+
+ def keys(self):
+ retval = self.db.keys()
+ retval.remove(self.serializer_key)
+ return retval
+
+ def __iter__(self):
+ for key in self.keys():
+ yield key
+
+ def has_key(self, key):
+ try:
+ self.db[key]
+ return True
+ except KeyError:
+ return False
+
+ def __contains__(self, key):
+ return self.has_key(key)
+
+ def iterkeys(self):
+ return self.__iter__()
+
+ def clear(self):
+ for key in self.keys():
+ del self[key]
+
+ def items(self):
+ return [(key, self[key],) for key in self.keys()]
+
+ def values(self):
+ return [self[key] for key in self.keys()]
+
+ def get(self, key, default=None):
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def close(self):
+ self.db.close()
+ self.db = None
+
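+# A minimal usage sketch for Database (hypothetical; not part of this
+# module). It assumes a serializer object exposing dumps() and loads(),
+# which is all that Database requires:
+#
+#   class PickleSerializer:
+#     def dumps(self, obj):
+#       return cPickle.dumps(obj, -1)
+#     def loads(self, data):
+#       return cPickle.loads(data)
+#
+#   db = Database('symbols.db', DB_OPEN_NEW, PickleSerializer())
+#   db['key'] = ['any', 'picklable', 'value']
+#   assert db['key'] == ['any', 'picklable', 'value']
+#   db.close()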
+
+class IndexedDatabase:
+ """A file of objects that are written sequentially and read randomly.
+
+ The objects are indexed by small non-negative integers, and a
+ RecordTable is used to store the index -> fileoffset map.
+ fileoffset=0 is used to represent an empty record. (An offset of 0
+ cannot occur for a legitimate record because the serializer is
+ written there.)
+
+ The main file consists of a sequence of pickles (or other serialized
+ data format). The zeroth record is a pickled Serializer.
+ Subsequent ones are objects serialized using the serializer. The
+ offset of each object in the file is stored to an index table so
+ that the data can later be retrieved randomly.
+
+ Objects are always stored to the end of the file. If an object is
+ deleted or overwritten, the fact is recorded in the index_table but
+ the space in the pickle file is not garbage collected. This has the
+ advantage that one can create a modified version of a database that
+ shares the main data file with an old version by copying the index
+ file. But it has the disadvantage that space is wasted whenever
+ objects are written multiple times."""
+
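+  # Illustrative layout of the two files (offsets hypothetical):
+  #
+  #   data file:   offset 0:  pickled Serializer
+  #                offset p1: serializer.dumps(obj_a)
+  #                offset p2: serializer.dumps(obj_b)
+  #   index file:  RecordTable mapping index -> offset (0 means no record)
+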
+ def __init__(self, filename, index_filename, mode, serializer=None):
+ """Initialize an IndexedDatabase, writing the serializer if necessary.
+
+ SERIALIZER is only used if MODE is DB_OPEN_NEW; otherwise the
+ serializer is read from the file."""
+
+ self.filename = filename
+ self.index_filename = index_filename
+ self.mode = mode
+ if self.mode == DB_OPEN_NEW:
+ self.f = open(self.filename, 'wb+')
+ elif self.mode == DB_OPEN_WRITE:
+ self.f = open(self.filename, 'rb+')
+ elif self.mode == DB_OPEN_READ:
+ self.f = open(self.filename, 'rb')
+ else:
+ raise RuntimeError('Invalid mode %r' % self.mode)
+
+ self.index_table = RecordTable(
+ self.index_filename, self.mode, FileOffsetPacker()
+ )
+
+ if self.mode == DB_OPEN_NEW:
+ assert serializer is not None
+ self.serializer = serializer
+ cPickle.dump(self.serializer, self.f, -1)
+ else:
+      # Read the serializer from the first pickle:
+ self.serializer = cPickle.load(self.f)
+
+ # Seek to the end of the file, and record that position:
+ self.f.seek(0, 2)
+ self.fp = self.f.tell()
+ self.eofp = self.fp
+
+ def __setitem__(self, index, item):
+ """Write ITEM into the database indexed by INDEX."""
+
+ # Make sure we're at the end of the file:
+ if self.fp != self.eofp:
+ self.f.seek(self.eofp)
+ self.index_table[index] = self.eofp
+ s = self.serializer.dumps(item)
+ self.f.write(s)
+ self.eofp += len(s)
+ self.fp = self.eofp
+
+ def _fetch(self, offset):
+ if self.fp != offset:
+ self.f.seek(offset)
+
+ # There is no easy way to tell how much data will be read, so just
+ # indicate that we don't know the current file pointer:
+ self.fp = None
+
+ return self.serializer.loadf(self.f)
+
+ def iterkeys(self):
+ return self.index_table.iterkeys()
+
+ def itervalues(self):
+ for offset in self.index_table.itervalues():
+ yield self._fetch(offset)
+
+ def __getitem__(self, index):
+ offset = self.index_table[index]
+ return self._fetch(offset)
+
+  def get(self, index, default=None):
+    try:
+      return self[index]
+ except KeyError:
+ return default
+
+ def get_many(self, indexes, default=None):
+ """Yield (index,item) tuples for INDEXES, in arbitrary order.
+
+ Yield (index,default) for indexes with no defined values."""
+
+ offsets = []
+ for (index, offset) in self.index_table.get_many(indexes):
+ if offset is None:
+ yield (index, default)
+ else:
+ offsets.append((offset, index))
+
+ # Sort the offsets to reduce disk seeking:
+ offsets.sort()
+    for (offset, index) in offsets:
+ yield (index, self._fetch(offset))
+
+ def __delitem__(self, index):
+ # We don't actually free the data in self.f.
+ del self.index_table[index]
+
+ def close(self):
+ self.index_table.close()
+ self.index_table = None
+ self.f.close()
+ self.f = None
+
+ def __str__(self):
+ return 'IndexedDatabase(%r)' % (self.filename,)
+
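+# A minimal usage sketch for IndexedDatabase (hypothetical). Here the
+# serializer must also expose loadf(), which reads one record from an open
+# file object:
+#
+#   class PickleSerializer:
+#     def dumps(self, obj):
+#       return cPickle.dumps(obj, -1)
+#     def loadf(self, f):
+#       return cPickle.load(f)
+#
+#   db = IndexedDatabase('revs.dat', 'revs.idx', DB_OPEN_NEW,
+#                        PickleSerializer())
+#   db[0] = 'first record'
+#   db[1] = 'second record'
+#   assert db[1] == 'second record'  # random access via the index table
+#   db.close()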
+
+class IndexedStore(IndexedDatabase):
+ """A file of items that is written sequentially and read randomly.
+
+ This is just like IndexedDatabase, except that it has an additional
+ add() method which assumes that the object to be written to the
+ database has an 'id' member, which is used as its database index.
+ See IndexedDatabase for more information."""
+
+ def add(self, item):
+ """Write ITEM into the database indexed by ITEM.id."""
+
+ self[item.id] = item
+
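+# A minimal usage sketch for IndexedStore (hypothetical): any picklable
+# object with an 'id' member can be stored via add(), using a serializer
+# like the PickleSerializer sketched above:
+#
+#   class Revision:
+#     def __init__(self, id, log):
+#       self.id = id
+#       self.log = log
+#
+#   store = IndexedStore('revs.dat', 'revs.idx', DB_OPEN_NEW,
+#                        PickleSerializer())
+#   store.add(Revision(1, 'initial import'))
+#   assert store[1].log == 'initial import'
+#   store.close()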
+