aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'cvs2svn_lib/collect_data.py')
-rw-r--r--cvs2svn_lib/collect_data.py1431
1 files changed, 1431 insertions, 0 deletions
diff --git a/cvs2svn_lib/collect_data.py b/cvs2svn_lib/collect_data.py
new file mode 100644
index 0000000..160d7b9
--- /dev/null
+++ b/cvs2svn_lib/collect_data.py
@@ -0,0 +1,1431 @@
+# (Be in -*- python -*- mode.)
+#
+# ====================================================================
+# Copyright (c) 2000-2009 CollabNet. All rights reserved.
+#
+# This software is licensed as described in the file COPYING, which
+# you should have received as part of this distribution. The terms
+# are also available at http://subversion.tigris.org/license-1.html.
+# If newer versions of this license are posted there, you may use a
+# newer version instead, at your option.
+#
+# This software consists of voluntary contributions made by many
+# individuals. For exact contribution history, see the revision
+# history and logs, available at http://cvs2svn.tigris.org/.
+# ====================================================================
+
+"""Data collection classes.
+
+This module contains the code used to collect data from the CVS
+repository. It parses *,v files, recording all useful information
+except for the actual file contents (though even the file contents
+might be recorded by the RevisionRecorder if one is configured).
+
+As a *,v file is parsed, the information pertaining to the file is
+accumulated in memory, mostly in _RevisionData, _BranchData, and
+_TagData objects. When parsing is complete, a final pass is made over
+the data to create some final dependency links, collect statistics,
+etc., then the _*Data objects are converted into CVSItem objects
+(CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
+dumped into databases.
+
+During the data collection, persistent unique ids are allocated to
+many types of objects: CVSFile, Symbol, and CVSItems. CVSItems are a
+special case. CVSItem ids are unique across all CVSItem types, and
+the ids are carried over from the corresponding data collection
+objects:
+
+ _RevisionData -> CVSRevision
+
+ _BranchData -> CVSBranch
+
+ _TagData -> CVSTag
+
+In a later pass it is possible to convert tags <-> branches. But even
+if this occurs, the new branch or tag uses the same id as the old tag
+or branch.
+
+"""
+
+
+import os
+import stat
+import re
+
+from cvs2svn_lib import config
+from cvs2svn_lib.common import DB_OPEN_NEW
+from cvs2svn_lib.common import FatalError
+from cvs2svn_lib.common import warning_prefix
+from cvs2svn_lib.common import error_prefix
+from cvs2svn_lib.common import IllegalSVNPathError
+from cvs2svn_lib.common import verify_svn_filename_legal
+from cvs2svn_lib.log import Log
+from cvs2svn_lib.context import Ctx
+from cvs2svn_lib.artifact_manager import artifact_manager
+from cvs2svn_lib.project import FileInAndOutOfAtticException
+from cvs2svn_lib.cvs_file import CVSPath
+from cvs2svn_lib.cvs_file import CVSDirectory
+from cvs2svn_lib.cvs_file import CVSFile
+from cvs2svn_lib.symbol import Symbol
+from cvs2svn_lib.symbol import Trunk
+from cvs2svn_lib.cvs_item import CVSRevision
+from cvs2svn_lib.cvs_item import CVSBranch
+from cvs2svn_lib.cvs_item import CVSTag
+from cvs2svn_lib.cvs_item import cvs_revision_type_map
+from cvs2svn_lib.cvs_file_items import VendorBranchError
+from cvs2svn_lib.cvs_file_items import CVSFileItems
+from cvs2svn_lib.key_generator import KeyGenerator
+from cvs2svn_lib.cvs_item_database import NewCVSItemStore
+from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
+from cvs2svn_lib.metadata_database import MetadataDatabase
+from cvs2svn_lib.metadata_database import MetadataLogger
+
+import cvs2svn_rcsparse
+
+
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).  Matches one or more
# dot-separated digit groups, e.g. '1.7' or '1.7.2.1'.
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+            # Digit groups with trailing dots
    \d+                   # And the last digit group.
    $
    ''', re.VERBOSE)

# Matches a branch revision number and captures the two pieces needed
# to canonicalize it: CVS writes "magic" branch numbers with an extra
# '0' component (e.g. '1.7.0.2' for branch '1.7.2'); substituting
# r'\1\2' strips that extra '0.' if present.
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)     # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?              # CVS sticks an extra 0 here; RCS does not
    (\d+)                 # And the last digit group
    $
    ''', re.VERBOSE)
+
+
def rev_tuple(rev):
  """Convert revision number REV into a tuple of integers.

  For example, '1.2.3.4' is converted to (1, 2, 3, 4)."""

  return tuple(map(int, rev.split('.')))
+
+
def is_trunk_revision(rev):
  """Return True iff REV is a trunk revision.

  REV must be the number of a specific revision (not of a whole
  branch).  Trunk revisions have exactly two components ('1.5');
  branch revisions have four or more ('1.5.2.1')."""

  return len(rev.split('.')) == 2
+
+
def is_branch_revision_number(rev):
  """Return True iff REV refers to a whole branch.

  REV is a CVS revision number in canonical form (i.e., with the
  magic '0' component removed).  A branch number has an odd number of
  components ('1.7.2'); a specific revision has an even number
  ('1.7' or '1.7.2.1')."""

  return len(rev.split('.')) % 2 == 1
+
+
def is_same_line_of_development(rev1, rev2):
  """Return True iff REV1 and REV2 lie on the same line of development.

  That is: both on trunk, or both on the same branch.  Either
  argument may be None, in which case False is returned."""

  if rev1 is None or rev2 is None:
    return False

  # Any two trunk revisions share the trunk line of development:
  if rev1.count('.') == 1 and rev2.count('.') == 1:
    return True

  # Otherwise the two revisions are on the same branch iff the
  # branch-number prefixes (everything before the final component)
  # are identical:
  prefix1 = rev1[0:rev1.rfind('.')]
  prefix2 = rev2[0:rev2.rfind('.')]
  if prefix1 == prefix2:
    return True
  return False
+
+
+class _RevisionData:
+ """We track the state of each revision so that in set_revision_info,
+ we can determine if our op is an add/change/delete. We can do this
+ because in set_revision_info, we'll have all of the _RevisionData
+ for a file at our fingertips, and we need to examine the state of
+ our prev_rev to determine if we're an add or a change. Without the
+ state of the prev_rev, we are unable to distinguish between an add
+ and a change."""
+
+ def __init__(self, cvs_rev_id, rev, timestamp, author, state):
+ # The id of this revision:
+ self.cvs_rev_id = cvs_rev_id
+ self.rev = rev
+ self.timestamp = timestamp
+ self.author = author
+ self.original_timestamp = timestamp
+ self.state = state
+
+ # If this is the first revision on a branch, then this is the
+ # branch_data of that branch; otherwise it is None.
+ self.parent_branch_data = None
+
+ # The revision number of the parent of this revision along the
+ # same line of development, if any. For the first revision R on a
+ # branch, we consider the revision from which R sprouted to be the
+ # 'parent'. If this is the root revision in the file's revision
+ # tree, then this field is None.
+ #
+ # Note that this revision can't be determined arithmetically (due
+ # to cvsadmin -o), which is why this field is necessary.
+ self.parent = None
+
+ # The revision number of the primary child of this revision (the
+ # child along the same line of development), if any; otherwise,
+ # None.
+ self.child = None
+
+ # The _BranchData instances of branches that sprout from this
+ # revision, sorted in ascending order by branch number. It would
+ # be inconvenient to initialize it here because we would have to
+ # scan through all branches known by the _SymbolDataCollector to
+ # find the ones having us as the parent. Instead, this
+ # information is filled in by
+ # _FileDataCollector._resolve_dependencies() and sorted by
+ # _FileDataCollector._sort_branches().
+ self.branches_data = []
+
+ # The revision numbers of the first commits on any branches on
+ # which commits occurred. This dependency is kept explicitly
+ # because otherwise a revision-only topological sort would miss
+ # the dependency that exists via branches_data.
+ self.branches_revs_data = []
+
+ # The _TagData instances of tags that are connected to this
+ # revision.
+ self.tags_data = []
+
+ # A token that may be returned from
+ # RevisionRecorder.record_text(). It can be used by
+ # RevisionReader to obtain the text again.
+ self.revision_recorder_token = None
+
+ def get_first_on_branch_id(self):
+ return self.parent_branch_data and self.parent_branch_data.id
+
+
+class _SymbolData:
+ """Collection area for information about a symbol in a single CVSFile.
+
+ SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
+ Tag regardless of whether self is a _BranchData or a _TagData."""
+
+ def __init__(self, id, symbol):
+ """Initialize an object for SYMBOL."""
+
+ # The unique id that will be used for this particular symbol in
+ # this particular file. This same id will be used for the CVSItem
+ # that is derived from this instance.
+ self.id = id
+
+ # An instance of Symbol.
+ self.symbol = symbol
+
+
class _BranchData(_SymbolData):
  """Per-file information about one branch symbol."""

  def __init__(self, id, symbol, branch_number):
    _SymbolData.__init__(self, id, symbol)

    # The canonical branch number, with an odd number of components
    # (e.g., '1.5.2').
    self.branch_number = branch_number

    # The number of the revision this branch sprouts from: the
    # branch number with its last component dropped (e.g., '1.5').
    self.parent = branch_number.rsplit('.', 1)[0]

    # The number of the first revision committed on this branch, if
    # any (e.g., '1.5.2.1'); otherwise, None.
    self.child = None
+
+
class _TagData(_SymbolData):
  """Per-file information about one tag symbol."""

  def __init__(self, id, symbol, rev):
    _SymbolData.__init__(self, id, symbol)

    # The number of the revision carrying this tag (e.g., '1.5.2.3').
    self.rev = rev
+
+
class _SymbolDataCollector(object):
  """Collect information about symbols in a single CVSFile."""

  def __init__(self, fdc, cvs_file):
    """Prepare to collect symbols for CVS_FILE on behalf of FDC.

    FDC is the _FileDataCollector for the same file; the project data
    collector and global CollectData are reached through it."""

    self.fdc = fdc
    self.cvs_file = cvs_file

    self.pdc = self.fdc.pdc
    self.collect_data = self.fdc.collect_data

    # A list [(name, revision), ...] of symbols defined in the header
    # of the file. The name has already been transformed using the
    # symbol transform rules. If the symbol transform rules indicate
    # that the symbol should be ignored, then it is never added to
    # this list. This list is processed then deleted in
    # process_symbols().
    self._symbol_defs = []

    # A set containing the transformed names of symbols in this file
    # (used to detect duplicates during processing of unlabeled
    # branches):
    self._defined_symbols = set()

    # Map { branch_number : _BranchData }, where branch_number has an
    # odd number of digits.
    self.branches_data = { }

    # Map { revision : [ tag_data ] }, where revision has an even
    # number of digits, and the value is a list of _TagData objects
    # for tags that apply to that revision.
    self.tags_data = { }

  def _add_branch(self, name, branch_number):
    """Record that BRANCH_NUMBER is the branch number for branch NAME,
    and derive and record the revision from which NAME sprouts.
    BRANCH_NUMBER is an RCS branch number with an odd number of
    components, for example '1.7.2' (never '1.7.0.2'). Return the
    _BranchData instance (which is usually newly-created)."""

    branch_data = self.branches_data.get(branch_number)

    # A branch number that is already known means the file defines
    # two names for the same branch; only the first name is kept.
    if branch_data is not None:
      Log().warn(
          "%s: in '%s':\n"
          "   branch '%s' already has name '%s',\n"
          "   cannot also have name '%s', ignoring the latter\n"
          % (warning_prefix,
             self.cvs_file.filename, branch_number,
             branch_data.symbol.name, name)
          )
      return branch_data

    symbol = self.pdc.get_symbol(name)
    branch_data = _BranchData(
        self.collect_data.item_key_generator.gen_id(), symbol, branch_number
        )
    self.branches_data[branch_number] = branch_data
    return branch_data

  def _construct_distinct_name(self, name, original_name):
    """Construct a distinct symbol name from NAME.

    If NAME is distinct, return it. If it is already used in this
    file (as determined from its presence in self._defined_symbols),
    construct and return a new name that is not already used."""

    if name not in self._defined_symbols:
      return name
    else:
      index = 1
      while True:
        dup_name = '%s-DUPLICATE-%d' % (name, index,)
        if dup_name not in self._defined_symbols:
          # A collision is considered fatal, but a unique fallback
          # name is still returned so conversion can continue:
          self.collect_data.record_fatal_error(
              "Symbol name '%s' is already used in '%s'.\n"
              "The unlabeled branch '%s' must be renamed using "
              "--symbol-transform."
              % (name, self.cvs_file.filename, original_name,)
              )
          return dup_name

  def _add_unlabeled_branch(self, branch_number):
    """Record a branch with BRANCH_NUMBER that has no symbolic name.

    A synthetic name of the form 'unlabeled-<branch_number>' is run
    through the symbol transforms and de-duplicated, then the branch
    is recorded via _add_branch().  Return the _BranchData."""

    original_name = "unlabeled-" + branch_number
    name = self.transform_symbol(original_name, branch_number)
    if name is None:
      self.collect_data.record_fatal_error(
          "The unlabeled branch '%s' in '%s' contains commits.\n"
          "It may not be ignored via a symbol transform.  (Use --exclude "
          "instead.)"
          % (original_name, self.cvs_file.filename,)
          )
      # Retain the original name to allow the conversion to continue:
      name = original_name

    distinct_name = self._construct_distinct_name(name, original_name)
    self._defined_symbols.add(distinct_name)
    return self._add_branch(distinct_name, branch_number)

  def _add_tag(self, name, revision):
    """Record that tag NAME refers to the specified REVISION."""

    symbol = self.pdc.get_symbol(name)
    tag_data = _TagData(
        self.collect_data.item_key_generator.gen_id(), symbol, revision
        )
    self.tags_data.setdefault(revision, []).append(tag_data)
    return tag_data

  def transform_symbol(self, name, revision):
    """Transform a symbol according to the project's symbol transforms.

    Transform the symbol with the original name NAME and canonicalized
    revision number REVISION. Return the new symbol name or None if
    the symbol should be ignored entirely.

    Log the results of the symbol transform if necessary."""

    old_name = name
    # Apply any user-defined symbol transforms to the symbol name:
    name = self.cvs_file.project.transform_symbol(
        self.cvs_file, name, revision
        )

    if name is None:
      # Ignore symbol:
      self.pdc.log_symbol_transform(old_name, None)
      Log().verbose(
          "   symbol '%s'=%s ignored in %s"
          % (old_name, revision, self.cvs_file.filename,)
          )
    else:
      if name != old_name:
        self.pdc.log_symbol_transform(old_name, name)
        Log().verbose(
            "   symbol '%s'=%s transformed to '%s' in %s"
            % (old_name, revision, name, self.cvs_file.filename,)
            )

    return name

  def define_symbol(self, name, revision):
    """Record a symbol definition for later processing."""

    # Canonicalize the revision number (strip the magic '0'
    # component that CVS inserts into branch numbers):
    revision = _branch_revision_re.sub(r'\1\2', revision)

    # Apply any user-defined symbol transforms to the symbol name:
    name = self.transform_symbol(name, revision)

    if name is not None:
      # Verify that the revision number is valid:
      if _valid_revision_re.match(revision):
        # The revision number is valid; record it for later processing:
        self._symbol_defs.append( (name, revision) )
      else:
        Log().warn(
            'In %r:\n'
            '    branch %r references invalid revision %s\n'
            '    and will be ignored.'
            % (self.cvs_file.filename, name, revision,)
            )

  def _eliminate_trivial_duplicate_defs(self, symbol_defs):
    """Iterate through SYMBOL_DEFS, removing identical duplicate definitions.

    Duplicate definitions of symbol names have been seen in the wild,
    and they can also happen when --symbol-transform is used. If a
    symbol is defined to the same revision number repeatedly, then
    ignore all but the last definition."""

    # Make a copy, since we have to iterate through the definitions
    # twice:
    symbol_defs = list(symbol_defs)

    # A map { (name, revision) : [index,...] } of the indexes where
    # symbol definitions name=revision were found:
    known_definitions = {}
    for (i, symbol_def) in enumerate(symbol_defs):
      known_definitions.setdefault(symbol_def, []).append(i)

    # A set of the indexes of entries that have to be removed from
    # symbol_defs:
    dup_indexes = set()
    for ((name, revision), indexes) in known_definitions.iteritems():
      if len(indexes) > 1:
        Log().verbose(
            "in %r:\n"
            "   symbol %s:%s defined multiple times; ignoring duplicates\n"
            % (self.cvs_file.filename, name, revision,)
            )
        dup_indexes.update(indexes[:-1])

    for (i, symbol_def) in enumerate(symbol_defs):
      if i not in dup_indexes:
        yield symbol_def

  def _process_duplicate_defs(self, symbol_defs):
    """Iterate through SYMBOL_DEFS, processing duplicate names.

    Duplicate definitions of symbol names have been seen in the wild,
    and they can also happen when --symbol-transform is used. If a
    symbol is defined multiple times, then it is a fatal error. This
    method should be called after _eliminate_trivial_duplicate_defs()."""

    # Make a copy, since we have to access multiple times:
    symbol_defs = list(symbol_defs)

    # A map {name : [index,...]} mapping the names of symbols to a
    # list of their definitions' indexes in symbol_defs:
    known_symbols = {}
    for (i, (name, revision)) in enumerate(symbol_defs):
      known_symbols.setdefault(name, []).append(i)

    known_symbols = known_symbols.items()
    known_symbols.sort()
    dup_indexes = set()
    for (name, indexes) in known_symbols:
      if len(indexes) > 1:
        # This symbol was defined multiple times.
        self.collect_data.record_fatal_error(
            "Multiple definitions of the symbol '%s' in '%s': %s" % (
                name, self.cvs_file.filename,
                ' '.join([symbol_defs[i][1] for i in indexes]),
                )
            )
        # Ignore all but the last definition for now, to allow the
        # conversion to proceed:
        dup_indexes.update(indexes[:-1])

    for (i, symbol_def) in enumerate(symbol_defs):
      if i not in dup_indexes:
        yield symbol_def

  def _process_symbol(self, name, revision):
    """Process a symbol called NAME, which is associated with REVISION.

    REVISION is a canonical revision number with zeros removed, for
    example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'. NAME is a
    transformed branch or tag name."""

    # Add symbol to our records:
    if is_branch_revision_number(revision):
      self._add_branch(name, revision)
    else:
      self._add_tag(name, revision)

  def process_symbols(self):
    """Process the symbol definitions from SELF._symbol_defs."""

    symbol_defs = self._symbol_defs
    del self._symbol_defs

    symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
    symbol_defs = self._process_duplicate_defs(symbol_defs)

    for (name, revision) in symbol_defs:
      self._defined_symbols.add(name)
      self._process_symbol(name, revision)

  @staticmethod
  def rev_to_branch_number(revision):
    """Return the branch_number of the branch on which REVISION lies.

    REVISION is a branch revision number with an even number of
    components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
    The return value is the branch number (for example, '1.7.2').
    Return None iff REVISION is a trunk revision such as '1.2'."""

    if is_trunk_revision(revision):
      return None
    return revision[:revision.rindex(".")]

  def rev_to_branch_data(self, revision):
    """Return the branch_data of the branch on which REVISION lies.

    REVISION must be a branch revision number with an even number of
    components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
    Raise KeyError iff REVISION is unknown."""

    assert not is_trunk_revision(revision)

    return self.branches_data[self.rev_to_branch_number(revision)]

  def rev_to_lod(self, revision):
    """Return the line of development on which REVISION lies.

    REVISION must be a revision number with an even number of
    components. Raise KeyError iff REVISION is unknown."""

    if is_trunk_revision(revision):
      return self.pdc.trunk
    else:
      return self.rev_to_branch_data(revision).symbol
+
+
class _FileDataCollector(cvs2svn_rcsparse.Sink):
  """Class responsible for collecting RCS data for a particular file.

  Any collected data that need to be remembered are stored into the
  referenced CollectData instance."""

  def __init__(self, pdc, cvs_file):
    """Create an object that is prepared to receive data for CVS_FILE.

    CVS_FILE is a CVSFile instance.  PDC is the _ProjectDataCollector
    for the file's project; the global CollectData (used to store the
    information collected about the file) is reached through it."""

    self.pdc = pdc
    self.cvs_file = cvs_file

    self.collect_data = self.pdc.collect_data
    self.project = self.cvs_file.project

    # A place to store information about the symbols in this file:
    self.sdc = _SymbolDataCollector(self, self.cvs_file)

    # { revision : _RevisionData instance }
    self._rev_data = { }

    # Lists [ (parent, child) ] of revision number pairs indicating
    # that revision child depends on revision parent along the main
    # line of development.
    self._primary_dependencies = []

    # If set, this is an RCS branch number -- rcsparse calls this the
    # "principal branch", but CVS and RCS refer to it as the "default
    # branch", so that's what we call it, even though the rcsparse API
    # setter method is still 'set_principal_branch'.
    self.default_branch = None

    # True iff revision 1.1 of the file appears to have been imported
    # (as opposed to added normally).
    self._file_imported = False

  def _get_rev_id(self, revision):
    """Return the cvs_rev_id assigned to revision number REVISION.

    Return None if REVISION is None.  Raise KeyError if REVISION is
    unknown."""

    if revision is None:
      return None
    return self._rev_data[revision].cvs_rev_id

  def set_principal_branch(self, branch):
    """This is a callback method declared in Sink."""

    if branch.find('.') == -1:
      # This just sets the default branch to trunk.  Normally this
      # shouldn't occur, but it has been seen in at least one CVS
      # repository.  Just ignore it.
      pass
    else:
      self.default_branch = branch

  def set_expansion(self, mode):
    """This is a callback method declared in Sink."""

    self.cvs_file.mode = mode

  def define_tag(self, name, revision):
    """Remember the symbol name and revision, but don't process them yet.

    This is a callback method declared in Sink."""

    self.sdc.define_symbol(name, revision)

  def admin_completed(self):
    """This is a callback method declared in Sink."""

    self.sdc.process_symbols()

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    """This is a callback method declared in Sink."""

    for branch in branches:
      try:
        branch_data = self.sdc.rev_to_branch_data(branch)
      except KeyError:
        # Normally we learn about the branches from the branch names
        # and numbers parsed from the symbolic name header.  But this
        # must have been an unlabeled branch that slipped through the
        # net.  Generate a name for it and create a _BranchData record
        # for it now.
        branch_data = self.sdc._add_unlabeled_branch(
            self.sdc.rev_to_branch_number(branch))

      assert branch_data.child is None
      branch_data.child = branch

    if revision in self._rev_data:
      # This revision has already been seen.
      Log().error('File %r contains duplicate definitions of revision %s.'
                  % (self.cvs_file.filename, revision,))
      raise RuntimeError

    # Record basic information about the revision:
    rev_data = _RevisionData(
        self.collect_data.item_key_generator.gen_id(),
        revision, int(timestamp), author, state)
    self._rev_data[revision] = rev_data

    # When on trunk, the RCS 'next' revision number points to what
    # humans might consider to be the 'previous' revision number.  For
    # example, 1.3's RCS 'next' is 1.2.
    #
    # However, on a branch, the RCS 'next' revision number really does
    # point to what humans would consider to be the 'next' revision
    # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
    #
    # In other words, in RCS, 'next' always means "where to find the next
    # deltatext that you need this revision to retrieve.
    #
    # That said, we don't *want* RCS's behavior here, so we determine
    # whether we're on trunk or a branch and set the dependencies
    # accordingly.
    if next:
      if is_trunk_revision(revision):
        self._primary_dependencies.append( (next, revision,) )
      else:
        self._primary_dependencies.append( (revision, next,) )

  def _resolve_primary_dependencies(self):
    """Resolve the dependencies listed in self._primary_dependencies."""

    for (parent, child,) in self._primary_dependencies:
      parent_data = self._rev_data[parent]
      assert parent_data.child is None
      parent_data.child = child

      child_data = self._rev_data[child]
      assert child_data.parent is None
      child_data.parent = parent

  def _resolve_branch_dependencies(self):
    """Resolve dependencies involving branches."""

    for branch_data in self.sdc.branches_data.values():
      # The branch_data's parent has the branch as a child regardless
      # of whether the branch had any subsequent commits:
      try:
        parent_data = self._rev_data[branch_data.parent]
      except KeyError:
        Log().warn(
            'In %r:\n'
            '    branch %r references non-existing revision %s\n'
            '    and will be ignored.'
            % (self.cvs_file.filename, branch_data.symbol.name,
               branch_data.parent,))
        del self.sdc.branches_data[branch_data.branch_number]
      else:
        parent_data.branches_data.append(branch_data)

        # If the branch has a child (i.e., something was committed on
        # the branch), then we store a reference to the branch_data
        # there, define the child's parent to be the branch's parent,
        # and list the child in the branch parent's branches_revs_data:
        if branch_data.child is not None:
          child_data = self._rev_data[branch_data.child]
          assert child_data.parent_branch_data is None
          child_data.parent_branch_data = branch_data
          assert child_data.parent is None
          child_data.parent = branch_data.parent
          parent_data.branches_revs_data.append(branch_data.child)

  def _sort_branches(self):
    """Sort the branches sprouting from each revision in creation order.

    Creation order is taken to be the reverse of the order that they
    are listed in the symbols part of the RCS file.  (If a branch is
    created then deleted, a later branch can be assigned the recycled
    branch number; therefore branch numbers are not an indication of
    creation order.)"""

    for rev_data in self._rev_data.values():
      # ids are allocated in symbol-definition order, so sorting by
      # descending id yields the reverse of definition order:
      rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))

  def _resolve_tag_dependencies(self):
    """Resolve dependencies involving tags."""

    for (rev, tag_data_list) in self.sdc.tags_data.items():
      try:
        parent_data = self._rev_data[rev]
      except KeyError:
        Log().warn(
            'In %r:\n'
            '    the following tag(s) reference non-existing revision %s\n'
            '    and will be ignored:\n'
            '    %s' % (
                self.cvs_file.filename, rev,
                ', '.join([repr(tag_data.symbol.name)
                           for tag_data in tag_data_list]),))
        del self.sdc.tags_data[rev]
      else:
        for tag_data in tag_data_list:
          assert tag_data.rev == rev
          # The tag_data's rev has the tag as a child:
          parent_data.tags_data.append(tag_data)

  def _determine_operation(self, rev_data):
    """Return the CVSRevision subclass appropriate for REV_DATA.

    The operation (add/change/delete) is determined by whether
    REV_DATA and its predecessor (if any) are in the 'dead' state."""

    prev_rev_data = self._rev_data.get(rev_data.parent)
    return cvs_revision_type_map[(
        rev_data.state != 'dead',
        prev_rev_data is not None and prev_rev_data.state != 'dead',
        )]

  def _get_cvs_revision(self, rev_data):
    """Create and return a CVSRevision for REV_DATA."""

    branch_ids = [
        branch_data.id
        for branch_data in rev_data.branches_data
        ]

    branch_commit_ids = [
        self._get_rev_id(rev)
        for rev in rev_data.branches_revs_data
        ]

    tag_ids = [
        tag_data.id
        for tag_data in rev_data.tags_data
        ]

    revision_type = self._determine_operation(rev_data)

    # NOTE(review): the None arguments are filled in later (the
    # metadata id is stored by set_revision_info()); the exact
    # meaning of each positional argument is defined by the
    # CVSRevision constructor -- confirm against cvs_item.py.
    return revision_type(
        self._get_rev_id(rev_data.rev), self.cvs_file,
        rev_data.timestamp, None,
        self._get_rev_id(rev_data.parent),
        self._get_rev_id(rev_data.child),
        rev_data.rev,
        True,
        self.sdc.rev_to_lod(rev_data.rev),
        rev_data.get_first_on_branch_id(),
        False, None, None,
        tag_ids, branch_ids, branch_commit_ids,
        rev_data.revision_recorder_token)

  def _get_cvs_revisions(self):
    """Generate the CVSRevisions present in this file."""

    for rev_data in self._rev_data.itervalues():
      yield self._get_cvs_revision(rev_data)

  def _get_cvs_branches(self):
    """Generate the CVSBranches present in this file."""

    for branch_data in self.sdc.branches_data.values():
      yield CVSBranch(
          branch_data.id, self.cvs_file, branch_data.symbol,
          branch_data.branch_number,
          self.sdc.rev_to_lod(branch_data.parent),
          self._get_rev_id(branch_data.parent),
          self._get_rev_id(branch_data.child),
          None,
          )

  def _get_cvs_tags(self):
    """Generate the CVSTags present in this file."""

    for tags_data in self.sdc.tags_data.values():
      for tag_data in tags_data:
        yield CVSTag(
            tag_data.id, self.cvs_file, tag_data.symbol,
            self.sdc.rev_to_lod(tag_data.rev),
            self._get_rev_id(tag_data.rev),
            None,
            )

  def tree_completed(self):
    """The revision tree has been parsed.

    Analyze it for consistency and connect some loose ends.

    This is a callback method declared in Sink."""

    self._resolve_primary_dependencies()
    self._resolve_branch_dependencies()
    self._sort_branches()
    self._resolve_tag_dependencies()

    # Compute the preliminary CVSFileItems for this file:
    cvs_items = []
    cvs_items.extend(self._get_cvs_revisions())
    cvs_items.extend(self._get_cvs_branches())
    cvs_items.extend(self._get_cvs_tags())
    self._cvs_file_items = CVSFileItems(
        self.cvs_file, self.pdc.trunk, cvs_items
        )

    self._cvs_file_items.check_link_consistency()

    # Tell the revision recorder about the file dependency tree.
    self.collect_data.revision_recorder.start_file(self._cvs_file_items)

  def set_revision_info(self, revision, log, text):
    """This is a callback method declared in Sink."""

    rev_data = self._rev_data[revision]
    cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]

    if cvs_rev.metadata_id is not None:
      # Users have reported problems with repositories in which the
      # deltatext block for revision 1.1 appears twice.  It is not
      # known whether this results from a CVS/RCS bug, or from botched
      # hand-editing of the repository.  In any case, empirically, cvs
      # and rcs both use the first version when checking out data, so
      # that's what we will do.  (For the record: "cvs log" fails on
      # such a file; "rlog" prints the log message from the first
      # block and ignores the second one.)
      Log().warn(
          "%s: in '%s':\n"
          "   Deltatext block for revision %s appeared twice;\n"
          "   ignoring the second occurrence.\n"
          % (warning_prefix, self.cvs_file.filename, revision,)
          )
      return

    if is_trunk_revision(revision):
      branch_name = None
    else:
      branch_name = self.sdc.rev_to_branch_data(revision).symbol.name

    cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
        self.project, branch_name, rev_data.author, log
        )
    cvs_rev.deltatext_exists = bool(text)

    # If this is revision 1.1, determine whether the file appears to
    # have been created via 'cvs add' instead of 'cvs import'.  The
    # test is that the log message CVS uses for 1.1 in imports is
    # "Initial revision\n" with no period.  (This fact helps determine
    # whether this file might have had a default branch in the past.)
    if revision == '1.1':
      self._file_imported = (log == 'Initial revision\n')

    cvs_rev.revision_recorder_token = \
        self.collect_data.revision_recorder.record_text(cvs_rev, log, text)

  def parse_completed(self):
    """Finish the processing of this file.

    This is a callback method declared in Sink."""

    # Make sure that there was an info section for each revision:
    for cvs_item in self._cvs_file_items.values():
      if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
        self.collect_data.record_fatal_error(
            '%r has no deltatext section for revision %s'
            % (self.cvs_file.filename, cvs_item.rev,)
            )

  def _process_ntdbrs(self):
    """Fix up any non-trunk default branch revisions (if present).

    If a non-trunk default branch is determined to have existed, yield
    the _RevisionData.ids for all revisions that were once non-trunk
    default revisions, in dependency order.

    There are two cases to handle:

    One case is simple.  The RCS file lists a default branch
    explicitly in its header, such as '1.1.1'.  In this case, we know
    that every revision on the vendor branch is to be treated as head
    of trunk at that point in time.

    But there's also a degenerate case.  The RCS file does not
    currently have a default branch, yet we can deduce that for some
    period in the past it probably *did* have one.  For example, the
    file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
    dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
    after 1.2.  In this case, we should record 1.1.1.96 as the last
    vendor revision to have been the head of the default branch.

    If any non-trunk default branch revisions are found:

    - Set their ntdbr members to True.

    - Connect the last one with revision 1.2.

    - Remove revision 1.1 if it is not needed.

    """

    try:
      if self.default_branch:
        # Explicit default branch recorded in the RCS header:
        vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
        vendor_lod_items = self._cvs_file_items.get_lod_items(
            self._cvs_file_items[vendor_cvs_branch_id]
            )
        if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
          return
      elif self._file_imported:
        # Degenerate case: deduce a historical default branch from
        # the standard vendor branch '1.1.1', if it exists:
        vendor_branch_data = self.sdc.branches_data.get('1.1.1')
        if vendor_branch_data is None:
          return
        else:
          vendor_lod_items = self._cvs_file_items.get_lod_items(
              self._cvs_file_items[vendor_branch_data.id]
              )
          if not self._cvs_file_items.process_historical_ntdb(
                vendor_lod_items
                ):
            return
      else:
        return
    except VendorBranchError, e:
      self.collect_data.record_fatal_error(str(e))
      return

    if self._file_imported:
      self._cvs_file_items.imported_remove_1_1(vendor_lod_items)

    self._cvs_file_items.check_link_consistency()

  def get_cvs_file_items(self):
    """Finish up and return a CVSFileItems instance for this file.

    This method must only be called once."""

    self._process_ntdbrs()

    # Break a circular reference loop, allowing the memory for self
    # and sdc to be freed.
    del self.sdc

    return self._cvs_file_items
+
+
class _ProjectDataCollector:
  """Collect the data for a single Project.

  One instance is created per Project; CollectData.process_project()
  feeds each of the project's ,v files to process_file()."""

  def __init__(self, collect_data, project):
    self.collect_data = collect_data
    self.project = project

    # The number of files parsed successfully so far:
    self.num_files = 0

    # The Trunk LineOfDevelopment object for this project:
    self.trunk = Trunk(
        self.collect_data.symbol_key_generator.gen_id(), self.project
        )
    self.project.trunk_id = self.trunk.id

    # This causes a record for self.trunk to spring into existence:
    self.collect_data.symbol_stats[self.trunk]

    # A map { name -> Symbol } for all known symbols in this project.
    # The symbols stored here are undifferentiated into Branches and
    # Tags because the same name might appear as a branch in one file
    # and a tag in another.
    self.symbols = {}

    # A map { (old_name, new_name) : count } recording how many files
    # were affected by each symbol name transformation:
    self.symbol_transform_counts = {}

  def get_symbol(self, name):
    """Return the Symbol object for the symbol named NAME in this project.

    If no such symbol exists yet, allocate a new symbol_id, create a
    Symbol instance, remember it in self.symbols, and return it."""

    try:
      return self.symbols[name]
    except KeyError:
      symbol = Symbol(
          self.collect_data.symbol_key_generator.gen_id(),
          self.project, name)
      self.symbols[name] = symbol
      return symbol

  def log_symbol_transform(self, old_name, new_name):
    """Record that OLD_NAME was transformed to NEW_NAME in one file.

    This information is used to generate a statistical summary of
    symbol transforms."""

    key = (old_name, new_name)
    self.symbol_transform_counts[key] = \
        self.symbol_transform_counts.get(key, 0) + 1

  def summarize_symbol_transforms(self):
    """Log a summary of the recorded symbol transforms, if any."""

    if not self.symbol_transform_counts or not Log().is_on(Log.NORMAL):
      return

    log = Log()
    log.normal('Summary of symbol transforms:')
    for ((old_name, new_name), count) \
            in sorted(self.symbol_transform_counts.items()):
      if new_name is None:
        log.normal('  "%s" ignored in %d files' % (old_name, count,))
      else:
        log.normal(
            '  "%s" transformed to "%s" in %d files'
            % (old_name, new_name, count,)
            )

  def _process_cvs_file_items(self, cvs_file_items):
    """Post-process and store the CVSFileItems from one CVSFile."""

    collect_data = self.collect_data

    # Remove CVSRevisionDeletes that are not needed:
    cvs_file_items.remove_unneeded_deletes(collect_data.metadata_db)

    # Remove initial branch deletes that are not needed:
    cvs_file_items.remove_initial_branch_deletes(collect_data.metadata_db)

    # A --trunk-only conversion discards all branches and tags, then
    # grafts any non-trunk default branch revisions onto trunk:
    if Ctx().trunk_only:
      cvs_file_items.exclude_non_trunk()

    collect_data.revision_recorder.finish_file(cvs_file_items)
    collect_data.add_cvs_file_items(cvs_file_items)
    collect_data.symbol_stats.register(cvs_file_items)

  def process_file(self, cvs_file):
    """Parse CVS_FILE and record the information collected from it.

    If CVS_FILE cannot be parsed as a ,v file, record a fatal error
    but let the pass continue with other files."""

    Log().normal(cvs_file.filename)
    fdc = _FileDataCollector(self, cvs_file)
    try:
      cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc)
    except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
      self.collect_data.record_fatal_error(
          "%r is not a valid ,v file" % (cvs_file.filename,)
          )
      # Abort the processing of this file, but let the pass continue
      # with other files:
      return
    except:
      # Anything else (including KeyboardInterrupt) aborts the run;
      # log which file triggered it, then re-raise:
      Log().warn("Exception occurred while parsing %s" % cvs_file.filename)
      raise

    self.num_files += 1

    cvs_file_items = fdc.get_cvs_file_items()
    del fdc
    self._process_cvs_file_items(cvs_file_items)
+
+
class CollectData:
  """Repository for data collected by parsing the CVS repository files.

  This class manages the databases into which information collected
  from the CVS repository is stored.  The data are stored into this
  class by _FileDataCollector instances, one of which is created for
  each file to be parsed."""

  def __init__(self, revision_recorder, stats_keeper):
    self.revision_recorder = revision_recorder
    # Persistent store for the CVSItems extracted from each file:
    self._cvs_item_store = NewCVSItemStore(
        artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
    self.metadata_db = MetadataDatabase(
        artifact_manager.get_temp_file(config.METADATA_STORE),
        artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
        DB_OPEN_NEW,
        )
    self.metadata_logger = MetadataLogger(self.metadata_db)
    # Fatal-error strings accumulated via record_fatal_error(), to be
    # returned by close():
    self.fatal_errors = []
    # Total number of files processed across all projects:
    self.num_files = 0
    self.symbol_stats = SymbolStatisticsCollector()
    self.stats_keeper = stats_keeper

    # Key generator for CVSFiles:
    self.file_key_generator = KeyGenerator()

    # Key generator for CVSItems:
    self.item_key_generator = KeyGenerator()

    # Key generator for Symbols:
    self.symbol_key_generator = KeyGenerator()

    # Give the revision recorder a chance to initialize itself before
    # any files are parsed:
    self.revision_recorder.start()

  def record_fatal_error(self, err):
    """Record that fatal error ERR was found.

    ERR is a string (without trailing newline) describing the error.
    Output the error to stderr immediately, and record a copy to be
    output again in a summary at the end of CollectRevsPass."""

    err = '%s: %s' % (error_prefix, err,)
    Log().error(err + '\n')
    self.fatal_errors.append(err)

  def add_cvs_directory(self, cvs_directory):
    """Record CVS_DIRECTORY to the CVSFileDatabase."""

    Ctx()._cvs_file_db.log_file(cvs_directory)

  def add_cvs_file_items(self, cvs_file_items):
    """Record the information from CVS_FILE_ITEMS.

    Store the CVSFile to _cvs_file_db under its persistent id, store
    the CVSItems, and record the CVSItems to self.stats_keeper."""

    Ctx()._cvs_file_db.log_file(cvs_file_items.cvs_file)
    self._cvs_item_store.add(cvs_file_items)

    self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
    for cvs_item in cvs_file_items.values():
      self.stats_keeper.record_cvs_item(cvs_item)

  def _get_cvs_file(
        self, parent_directory, basename, file_in_attic, leave_in_attic=False
        ):
    """Return a CVSFile describing the file with name BASENAME.

    PARENT_DIRECTORY is the CVSDirectory instance describing the
    directory that physically holds this file in the filesystem.
    BASENAME must be the base name of a *,v file within
    PARENT_DIRECTORY.

    FILE_IN_ATTIC is a boolean telling whether the specified file is
    in an Attic subdirectory.  If FILE_IN_ATTIC is True, then:

    - If LEAVE_IN_ATTIC is True, then leave the 'Attic' component in
      the filename.

    - Otherwise, raise FileInAndOutOfAtticException if a file with the
      same filename appears outside of Attic.

    The CVSFile is assigned a new unique id.  All of the CVSFile
    information is filled in except mode (which can only be determined
    by parsing the file).

    Raise FatalError if the resulting filename would not be legal in
    SVN."""

    filename = os.path.join(parent_directory.filename, basename)
    try:
      # basename[:-2] strips the ',v' suffix to get the SVN filename:
      verify_svn_filename_legal(basename[:-2])
    except IllegalSVNPathError, e:
      raise FatalError(
          'File %r would result in an illegal SVN filename: %s'
          % (filename, e,)
          )

    if file_in_attic and not leave_in_attic:
      in_attic = True
      # The logical parent is the directory *containing* the Attic
      # subdirectory:
      logical_parent_directory = parent_directory.parent_directory

      # If this file also exists outside of the attic, it's a fatal
      # error:
      non_attic_filename = os.path.join(
          logical_parent_directory.filename, basename,
          )
      if os.path.exists(non_attic_filename):
        raise FileInAndOutOfAtticException(non_attic_filename, filename)
    else:
      in_attic = False
      logical_parent_directory = parent_directory

    file_stat = os.stat(filename)

    # The size of the file in bytes:
    file_size = file_stat[stat.ST_SIZE]

    # Whether or not the executable bit is set (file_stat[0] is
    # st_mode; stat.ST_MODE == 0):
    file_executable = bool(file_stat[0] & stat.S_IXUSR)

    # mode is not known, so we temporarily set it to None.
    return CVSFile(
        self.file_key_generator.gen_id(),
        parent_directory.project, logical_parent_directory, basename[:-2],
        in_attic, file_executable, file_size, None
        )

  def _get_attic_file(self, parent_directory, basename):
    """Return a CVSFile object for the Attic file at BASENAME.

    PARENT_DIRECTORY is the CVSDirectory that physically contains the
    file on the filesystem (i.e., the Attic directory).  It is not
    necessarily the parent_directory of the CVSFile that will be
    returned.

    Return CVSFile, whose parent directory is usually
    PARENT_DIRECTORY.parent_directory, but might be PARENT_DIRECTORY
    iff CVSFile will remain in the Attic directory."""

    try:
      return self._get_cvs_file(parent_directory, basename, True)
    except FileInAndOutOfAtticException, e:
      # The same filename exists both in and out of Attic.  Depending
      # on the option settings this is either a warning (and the Attic
      # copy is kept in an 'Attic' subdirectory) or a fatal error:
      if Ctx().retain_conflicting_attic_files:
        Log().warn(
            "%s: %s;\n"
            "   storing the latter into 'Attic' subdirectory.\n"
            % (warning_prefix, e)
            )
      else:
        self.record_fatal_error(str(e))

      # Either way, return a CVSFile object so that the rest of the
      # file processing can proceed:
      return self._get_cvs_file(
          parent_directory, basename, True, leave_in_attic=True
          )

  def _generate_attic_cvs_files(self, cvs_directory):
    """Generate CVSFiles for the files in Attic directory CVS_DIRECTORY.

    Also add CVS_DIRECTORY to self if any files are being retained in
    that directory."""

    retained_attic_file = False

    # Sort for deterministic processing order:
    fnames = os.listdir(cvs_directory.filename)
    fnames.sort()
    for fname in fnames:
      pathname = os.path.join(cvs_directory.filename, fname)
      if os.path.isdir(pathname):
        Log().warn("Directory %s found within Attic; ignoring" % (pathname,))
      elif fname.endswith(',v'):
        cvs_file = self._get_attic_file(cvs_directory, fname)
        if cvs_file.parent_directory == cvs_directory:
          # This file will be retained in the Attic directory.
          retained_attic_file = True
        yield cvs_file

    if retained_attic_file:
      # If any files were retained in the Attic directory, then write
      # the Attic directory to CVSFileDatabase:
      self.add_cvs_directory(cvs_directory)

  def _get_non_attic_file(self, parent_directory, basename):
    """Return a CVSFile object for the non-Attic file at BASENAME."""

    return self._get_cvs_file(parent_directory, basename, False)

  def _generate_cvs_files(self, cvs_directory):
    """Generate the CVSFiles under non-Attic directory CVS_DIRECTORY.

    Process directories recursively, including Attic directories.
    Also create and register CVSDirectories as they are found, and
    look for conflicts between the filenames that will result from
    files, attic files, and subdirectories."""

    self.add_cvs_directory(cvs_directory)

    # Map {cvs_file.basename : cvs_file.filename} for files directly
    # in cvs_directory:
    rcsfiles = {}

    # Name of the 'Attic' subdirectory, if one is found:
    attic_dir = None

    # Non-Attic subdirectories of cvs_directory (to be recursed into):
    dirs = []

    # Sort for deterministic processing order:
    fnames = os.listdir(cvs_directory.filename)
    fnames.sort()
    for fname in fnames:
      pathname = os.path.join(cvs_directory.filename, fname)
      if os.path.isdir(pathname):
        if fname == 'Attic':
          attic_dir = fname
        else:
          dirs.append(fname)
      elif fname.endswith(',v'):
        cvs_file = self._get_non_attic_file(cvs_directory, fname)
        rcsfiles[cvs_file.basename] = cvs_file.filename
        yield cvs_file
      else:
        # Silently ignore other files:
        pass

    # Map {cvs_file.basename : cvs_file.filename} for files in an
    # Attic directory within cvs_directory:
    attic_rcsfiles = {}

    if attic_dir is not None:
      attic_directory = CVSDirectory(
          self.file_key_generator.gen_id(),
          cvs_directory.project, cvs_directory, 'Attic',
          )

      for cvs_file in self._generate_attic_cvs_files(attic_directory):
        if cvs_file.parent_directory == cvs_directory:
          # Only files logically moved out of Attic can conflict with
          # subdirectory names here:
          attic_rcsfiles[cvs_file.basename] = cvs_file.filename
        yield cvs_file

      alldirs = dirs + [attic_dir]
    else:
      alldirs = dirs

    # Check for conflicts between directory names and the filenames
    # that will result from the rcs files (both in this directory and
    # in attic).  (We recurse into the subdirectories nevertheless, to
    # try to detect more problems.)
    for fname in alldirs:
      pathname = os.path.join(cvs_directory.filename, fname)
      for rcsfile_list in [rcsfiles, attic_rcsfiles]:
        if fname in rcsfile_list:
          self.record_fatal_error(
              'Directory name conflicts with filename. Please remove or '
              'rename one\n'
              'of the following:\n'
              '    "%s"\n'
              '    "%s"'
              % (pathname, rcsfile_list[fname],)
              )

    # Now recurse into the other subdirectories:
    for fname in dirs:
      dirname = os.path.join(cvs_directory.filename, fname)

      # Verify that the directory name does not contain any illegal
      # characters:
      try:
        verify_svn_filename_legal(fname)
      except IllegalSVNPathError, e:
        raise FatalError(
            'Directory %r would result in an illegal SVN path name: %s'
            % (dirname, e,)
            )

      sub_directory = CVSDirectory(
          self.file_key_generator.gen_id(),
          cvs_directory.project, cvs_directory, fname,
          )

      for cvs_file in self._generate_cvs_files(sub_directory):
        yield cvs_file

  def process_project(self, project):
    """Process all of the files in PROJECT.

    Register PROJECT in the global context, walk its repository tree
    generating CVSFiles, and feed each one to a _ProjectDataCollector.
    Record a fatal error if no RCS file at all is found."""

    Ctx()._projects[project.id] = project

    root_cvs_directory = CVSDirectory(
        self.file_key_generator.gen_id(), project, None, ''
        )
    project.root_cvs_directory_id = root_cvs_directory.id
    pdc = _ProjectDataCollector(self, project)

    found_rcs_file = False
    for cvs_file in self._generate_cvs_files(root_cvs_directory):
      pdc.process_file(cvs_file)
      found_rcs_file = True

    if not found_rcs_file:
      self.record_fatal_error(
          'No RCS files found under %r!\n'
          'Are you absolutely certain you are pointing cvs2svn\n'
          'at a CVS repository?\n'
          % (project.project_cvs_repos_path,)
          )

    pdc.summarize_symbol_transforms()

    self.num_files += pdc.num_files
    Log().verbose('Processed', self.num_files, 'files')

  def _set_cvs_path_ordinals(self):
    """Assign an ordinal to every recorded CVSPath, in sorted order.

    Sort all paths in the CVSFileDatabase with CVSPath.slow_compare
    and store each path's position in its ordinal member."""

    cvs_files = list(Ctx()._cvs_file_db.itervalues())
    cvs_files.sort(CVSPath.slow_compare)
    for (i, cvs_file) in enumerate(cvs_files):
      cvs_file.ordinal = i

  def close(self):
    """Close the data structures associated with this instance.

    Return a list of fatal errors encountered while processing input.
    Each list entry is a string describing one fatal error."""

    # NOTE(review): teardown order appears deliberate (recorder first,
    # item store closed before ordinals are assigned) -- preserve it.
    self.revision_recorder.finish()
    self.symbol_stats.purge_ghost_symbols()
    self.symbol_stats.close()
    self.symbol_stats = None
    self.metadata_logger = None
    self.metadata_db.close()
    self.metadata_db = None
    self._cvs_item_store.close()
    self._cvs_item_store = None
    self._set_cvs_path_ordinals()
    self.revision_recorder = None
    # Hand ownership of the error list to the caller and drop our
    # reference:
    retval = self.fatal_errors
    self.fatal_errors = None
    return retval
+
+