changelog shortlog tags files raw

changeset: svn area and patched svndumpfilter2

changeset 0: f877178cd9f9
child 1:7b2f731a930e
author: Author Name <bill@dehora.net>
date: Wed Apr 18 20:36:14 2007 +0100 (5 years ago)
files: svn/svndumpfilter2
description: svn area and patched svndumpfilter2
       1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
       2+++ b/svn/svndumpfilter2	Wed Apr 18 20:36:14 2007 +0100
       3@@ -0,0 +1,356 @@
       4+#!/usr/bin/env python
       5+
       6+# Utility to filter a dump file of a Subversion repository to
       7+# produce a dump file describing only specified subdirectories of
       8+# the tree contained in the original one. This is similar in
       9+# concept to the official tool `svndumpfilter', but it's able to
      10+# cope with revisions which copy files into the area of interest
      11+# from outside it (in which situation a Node-copyfrom won't be
      12+# valid in the output dump file). However, in order to support
      13+# this, svndumpfilter2 requires access via `svnlook' to the
      14+# original repository from which the input dump file was produced.
      15+#
      16+# Usage:
      17+#
      18+#     svndumpfilter2 source-repository regexp [regexp...]
      19+#
      20+# This command expects to receive a Subversion dump file on
      21+# standard input, which must correspond to the Subversion
      22+# repository pointed to by the first argument. It outputs a
      23+# filtered dump file on standard output.
      24+#
      25+# `source-repository': The first argument must be a pathname to a
      26+# _local_ Subversion repository. That is, it isn't a Subversion URL
      27+# (beginning with http:// or svn:// or anything else like that);
      28+# it's a simple local pathname (absolute or relative). A simple
      29+# test to see if it's a valid pathname is to pass it as an argument
      30+# to `svnlook tree'. If that succeeds, it's also a valid first
      31+# argument to svndumpfilter2.
      32+#
      33+# `regexp': The remaining arguments are used to select directory
      34+# names from the top level of the repository's internal directory
      35+# tree. Any directory matching any of the regexps will be
      36+# considered `interesting' and copied into the output dump file;
      37+# any directory not matching will not. Matching is performed at the
      38+# top level only: it is not currently possible to selectively
      39+# include a subset of second-level directories with a common
      40+# parent.
      41+#
      42+# For example, this command...
      43+#
      44+#     svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x
      45+#
      46+# ... will read a dump file on standard input, and output one on
      47+# standard output which contains only the top-level directories whose
      48+# names begin with `foo', `bar', `baz', `quux', `quuux', `quuuux', etc.
      49+#
      50+# You will probably usually want to use svndumpfilter2 in
      51+# conjunction with the production of the dump file in the first
      52+# place, like this:
      53+#
      54+#     svnadmin dump /home/svnadmin/myrepos | \
      55+#         svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x > msv.dump
      56+
      57+import sys
      58+import os
      59+import re
      60+import string
      61+import types
      62+import md5
      63+
      64+# First, the sensible way to deal with a pathname is to split it
      65+# into pieces at the slashes and thereafter treat it as a list.
      66+def splitpath(s):
      67+    list = string.split(s, "/")
      68+    # Simplest way to remove all empty elements!
      69+    try:
      70+	while 1:
      71+	    list.remove("")
      72+    except ValueError:
      73+	pass
      74+    return list
      75+
      76+def joinpath(list, prefix=""):
      77+    return prefix + string.join(list, "/")
      78+
      79+def catpath(path1, path2, prefix=""):
      80+    return joinpath(splitpath(path1) + splitpath(path2), prefix)
      81+
      82+# Decide whether a pathname is interesting or not.
      83+class InterestingPaths:
      84+    def __init__(self, args):
      85+	self.res = []
      86+	for a in args:
      87+	    self.res.append(re.compile(a))
      88+    def interesting(self, path):
      89+	a = splitpath(path)
      90+	assert len(a) > 0
      91+	for r in self.res:
      92+	    if r.match(a[0]):
      93+		return 1
      94+	return 0
      95+
      96+# A class and some functions to handle a single lump of
      97+# RFC822-ish-headers-plus-data read from an SVN dump file.
      98+
      99+class Lump:
     100+    def __init__(self):
     101+	self.hdrlist = []
     102+	self.hdrdict = {}
     103+	self.prop = ""
     104+	self.text = ""
     105+	self.extant = 1
     106+	self.props = [[], {}]
     107+    def sethdr(self, key, val):
     108+	if not self.hdrdict.has_key(key):
     109+	    self.hdrlist.append(key)
     110+	self.hdrdict[key] = val
     111+    def delhdr(self, key):
     112+	if self.hdrdict.has_key(key):
     113+	    del self.hdrdict[key]
     114+	    self.hdrlist.remove(key)
     115+    def propparse(self):
     116+	index = 0
     117+	while 1:
     118+	    if self.prop[index:index+2] == "K ":
     119+		wantval = 1
     120+	    elif self.prop[index:index+2] == "D ":
     121+		wantval = 0
     122+	    elif self.prop[index:index+9] == "PROPS-END":
     123+		break
     124+	    else:
     125+		raise "Unrecognised record in props section"
     126+	    nlpos = string.find(self.prop, "\n", index)
     127+	    assert nlpos > 0
     128+	    namelen = string.atoi(self.prop[index+2:nlpos])
     129+	    assert self.prop[nlpos+1+namelen] == "\n"
     130+	    name = self.prop[nlpos+1:nlpos+1+namelen]
     131+	    index = nlpos+2+namelen
     132+	    if wantval:
     133+		assert self.prop[index:index+2] == "V "
     134+		nlpos = string.find(self.prop, "\n", index)
     135+		assert nlpos > 0
     136+		proplen = string.atoi(self.prop[index+2:nlpos])
     137+		assert self.prop[nlpos+1+proplen] == "\n"
     138+		prop = self.prop[nlpos+1:nlpos+1+proplen]
     139+		index = nlpos+2+proplen
     140+	    else:
     141+		prop = None
     142+	    self.props[0].append(name)
     143+	    self.props[1][name] = prop
     144+    def setprop(self, key, val):
     145+	if not self.props[1].has_key(key):
     146+	    self.props[0].append(key)
     147+	self.props[1][key] = val
     148+    def delprop(self, key):
     149+	if self.props[1].has_key(key):
     150+	    del self.props[1][key]
     151+	    self.props[0].remove(key)
     152+    def correct_headers(self):
     153+	# First reconstitute the properties block.
     154+	self.prop = ""
     155+	if len(self.props[0]) > 0:
     156+	    for key in self.props[0]:
     157+		val = self.props[1][key]
     158+		if val == None:
     159+		    self.prop = self.prop + "D %d" % len(key) + "\n" + key + "\n"
     160+		else:
     161+		    self.prop = self.prop + "K %d" % len(key) + "\n" + key + "\n"
     162+		    self.prop = self.prop + "V %d" % len(val) + "\n" + val + "\n"
     163+	    self.prop = self.prop + "PROPS-END\n"
     164+	# Now fix up the content length headers.
     165+	if len(self.prop) > 0:
     166+	    self.sethdr("Prop-content-length", str(len(self.prop)))
     167+	else:
     168+	    self.delhdr("Prop-content-length")
     169+	# Only fiddle with the md5 if we're not doing a delta.
     170+	if self.hdrdict.get("Text-delta", "false") != "true":
     171+	    if len(self.text) > 0:
     172+		self.sethdr("Text-content-length", str(len(self.text)))
     173+		m = md5.new()
     174+		m.update(self.text)
     175+		self.sethdr("Text-content-md5", m.hexdigest())
     176+	    else:
     177+		self.delhdr("Text-content-length")
     178+		self.delhdr("Text-content-md5")
     179+	if len(self.prop) > 0 or len(self.text) > 0:
     180+	    self.sethdr("Content-length", str(len(self.prop)+len(self.text)))
     181+	else:
     182+	    self.delhdr("Content-length")
     183+
     184+def read_rfc822_headers(f):
     185+    ret = Lump()
     186+    while 1:
     187+	s = f.readline()
     188+	if s == "":
     189+	    return None # end of file
     190+	if s == "\n":
     191+	    if len(ret.hdrlist) > 0:
     192+		break # newline after headers ends them
     193+	    else:
     194+		continue # newline before headers is simply ignored
     195+	if s[-1:] == "\n": s = s[:-1]
     196+	colon = string.find(s, ":")
     197+	assert colon > 0
     198+	assert s[colon:colon+2] == ": "
     199+	key = s[:colon]
     200+	val = s[colon+2:]
     201+	ret.sethdr(key, val)
     202+    return ret
     203+
     204+def read_lump(f):
     205+    lump = read_rfc822_headers(f)
     206+    if lump == None:
     207+	return None
     208+    pcl = string.atoi(lump.hdrdict.get("Prop-content-length", "0"))
     209+    tcl = string.atoi(lump.hdrdict.get("Text-content-length", "0"))
     210+    if pcl > 0:
     211+	lump.prop = f.read(pcl)
     212+	lump.propparse()
     213+    if tcl > 0:
     214+	lump.text = f.read(tcl)
     215+    return lump
     216+
     217+def write_lump(f, lump):
     218+    if not lump.extant:
     219+	return
     220+    lump.correct_headers()
     221+    for key in lump.hdrlist:
     222+	val = lump.hdrdict[key]
     223+	f.write(key + ": " + val + "\n")
     224+    f.write("\n")
     225+    f.write(lump.prop)
     226+    f.write(lump.text)
     227+    if lump.hdrdict.has_key("Prop-content-length") or \
     228+    lump.hdrdict.has_key("Text-content-length") or \
     229+    lump.hdrdict.has_key("Content-length"):
     230+	f.write("\n")
     231+
     232+# Higher-level class that makes use of the above to filter dump
     233+# file fragments a whole revision at a time.
     234+
     235+class Filter:
     236+    def __init__(self, paths):
     237+	self.revisions = {}
     238+	self.paths = paths
     239+
     240+    def tweak(self, revhdr, contents):
     241+	contents2 = []
     242+	for lump in contents:
     243+	    action = lump.hdrdict["Node-action"]
     244+	    path = lump.hdrdict["Node-path"]
     245+
     246+	    if not self.paths.interesting(path):
     247+		continue # boooring
     248+
     249+	    need = 1 # we need to do something about this lump
     250+
     251+	    if action == "add":
     252+		if lump.hdrdict.has_key("Node-copyfrom-path"):
     253+		    srcrev = string.atoi(lump.hdrdict["Node-copyfrom-rev"])
     254+		    srcpath = lump.hdrdict["Node-copyfrom-path"]
     255+		    if not self.paths.interesting(srcpath):
     256+			# Copy from a boring path to an interesting
     257+			# one, meaning we must use svnlook to
     258+			# extract the subtree and convert it into
     259+			# lumps.
     260+			treecmd = "svnlook tree -r%d %s '%s'" % \
     261+			(srcrev, repos, srcpath)
     262+			tree = os.popen(treecmd, "r")
     263+			pathcomponents = []
     264+			while 1:
     265+			    treeline = tree.readline()
     266+			    if treeline == "": break
     267+			    if treeline[-1:] == "\n": treeline = treeline[:-1]
     268+			    subdir = 0
     269+			    while treeline[-1:] == "/":
     270+				subdir = 1
     271+				treeline = treeline[:-1]
     272+			    depth = 0
     273+			    while treeline[:1] == " ":
     274+				depth = depth + 1
     275+				treeline = treeline[1:]
     276+			    pathcomponents[depth:] = [treeline]
     277+			    thissrcpath = string.join([srcpath] + pathcomponents[1:], "/")
     278+			    thisdstpath = string.join([path] + pathcomponents[1:], "/")
     279+			    newlump = Lump()
     280+			    newlump.sethdr("Node-path", thisdstpath)
     281+			    newlump.sethdr("Node-action", "add")
     282+			    props = os.popen("svnlook pl -r%d %s '%s'" % \
     283+			    (srcrev, repos, thissrcpath), "r")
     284+			    while 1:
     285+				propname = props.readline()
     286+				if propname == "": break
     287+				if propname[-1:] == "\n": propname = propname[:-1]
     288+				while propname[:1] == " ": propname = propname[1:]
     289+				propf = os.popen("svnlook pg -r%d %s %s '%s'" % \
     290+				(srcrev, repos, propname, thissrcpath), "r")
     291+				proptext = propf.read()
     292+				propf.close()
     293+				newlump.setprop(propname, proptext)
     294+			    props.close()
     295+			    if subdir:
     296+				newlump.sethdr("Node-kind", "dir")
     297+			    else:
     298+				newlump.sethdr("Node-kind", "file")
     299+				f = os.popen("svnlook cat -r%d %s '%s'" % \
     300+				(srcrev, repos, thissrcpath), "r")
     301+				newlump.text = f.read()
     302+				f.close()
     303+			    contents2.append(newlump)
     304+			tree.close()
     305+			need = 0 # we have now done something
     306+	    if need:
     307+		contents2.append(lump)
     308+
     309+	# Change the contents array.
     310+	contents[:] = contents2
     311+
     312+	# If we've just removed everything in this revision, leave
     313+	# out some revision properties as well.
     314+	if (len(contents) == 0):
     315+	    revhdr.delprop("svn:log")
     316+	    revhdr.delprop("svn:author")
     317+	    revhdr.delprop("svn:date")
     318+
     319+fr = sys.stdin
     320+fw = sys.stdout
     321+
     322+repos = sys.argv[1]
     323+paths = InterestingPaths(sys.argv[2:])
     324+
     325+# Pass the dump-file header through unchanged.
     326+lump = read_lump(fr)
     327+while not lump.hdrdict.has_key("Revision-number"):
     328+    write_lump(fw, lump)
     329+    lump = read_lump(fr)
     330+
     331+revhdr = lump
     332+
     333+filt = Filter(paths)
     334+
     335+while revhdr != None:
     336+    # Read revision header.
     337+    assert revhdr.hdrdict.has_key("Revision-number")
     338+    contents = []
     339+    # Read revision contents.
     340+    while 1:
     341+	lump = read_lump(fr)
     342+	if lump == None or lump.hdrdict.has_key("Revision-number"):
     343+	    newrevhdr = lump
     344+	    break
     345+	contents.append(lump)
     346+
     347+    # Alter the contents of the revision.
     348+    filt.tweak(revhdr, contents)
     349+
     350+    # Write out revision.
     351+    write_lump(fw, revhdr)
     352+    for lump in contents:
     353+	write_lump(fw, lump)
     354+
     355+    # And loop round again.
     356+    revhdr = newrevhdr
     357+
     358+fr.close()
     359+fw.close()