| changeset 0: |
f877178cd9f9 |
| child 1: | 7b2f731a930e |
| author: |
Author Name <bill@dehora.net> |
| date: |
Wed Apr 18 20:36:14 2007 +0100 (5 years ago) |
| files: |
svn/svndumpfilter2 |
| description: |
svn area and patched svndumpfilter2 |
1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2+++ b/svn/svndumpfilter2 Wed Apr 18 20:36:14 2007 +0100
3@@ -0,0 +1,356 @@
4+#!/usr/bin/env python
5+
6+# Utility to filter a dump file of a Subversion repository to
7+# produce a dump file describing only specified subdirectories of
8+# the tree contained in the original one. This is similar in
9+# concept to the official tool `svndumpfilter', but it's able to
10+# cope with revisions which copy files into the area of interest
11+# from outside it (in which situation a Node-copyfrom won't be
12+# valid in the output dump file). However, in order to support
13+# this, svndumpfilter2 requires access via `svnlook' to the
14+# original repository from which the input dump file was produced.
15+#
16+# Usage:
17+#
18+# svndumpfilter2 source-repository regexp [regexp...]
19+#
20+# This command expects to receive a Subversion dump file on
21+# standard input, which must correspond to the Subversion
22+# repository pointed to by the first argument. It outputs a
23+# filtered dump file on standard output.
24+#
25+# `source-repository': The first argument must be a pathname to a
26+# _local_ Subversion repository. That is, it isn't a Subversion URL
27+# (beginning with http:// or svn:// or anything else like that);
28+# it's a simple local pathname (absolute or relative). A simple
29+# test to see if it's a valid pathname is to pass it as an argument
30+# to `svnlook tree'. If that succeeds, it's also a valid first
31+# argument to svndumpfilter2.
32+#
33+# `regexp': The remaining arguments are used to select directory
34+# names from the top level of the repository's internal directory
35+# tree. Any directory matching any of the regexps will be
36+# considered `interesting' and copied into the output dump file;
37+# any directory not matching will not. Matching is performed at the
38+# top level only: it is not currently possible to selectively
39+# include a subset of second-level directories with a common
40+# parent.
41+#
42+# For example, this command...
43+#
44+# svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x
45+#
46+# ... will read a dump file on standard input, and output one on
47+# standard output which contains only the subdirectories `foo',
48+# `bar', `baz', `quux', `quuux', `quuuux', etc.
49+#
50+# You will probably usually want to use svndumpfilter2 in
51+# conjunction with the production of the dump file in the first
52+# place, like this:
53+#
54+# svnadmin dump /home/svnadmin/myrepos | \
55+# svndumpfilter2 /home/svnadmin/myrepos foo bar baz quu+x > msv.dump
56+
57+import sys
58+import os
59+import re
60+import string
61+import types
62+import md5
63+
64+# First, the sensible way to deal with a pathname is to split it
65+# into pieces at the slashes and thereafter treat it as a list.
def splitpath(s):
    """Split a slash-separated pathname into a list of its components.

    Empty components are discarded, so leading, trailing and repeated
    slashes are ignored: "/a//b/" gives ["a", "b"], and "" or "/"
    gives [].
    """
    # str.split() replaces the deprecated string.split(), and a single
    # filtering pass replaces the repeated list.remove("") loop.
    return [component for component in s.split("/") if component]
75+
def joinpath(list, prefix=""):
    """Join path components with "/" and prepend `prefix'.

    `list' is a sequence of component strings (the parameter name is
    kept for compatibility with existing callers). An empty sequence
    yields just `prefix'.
    """
    # str.join() replaces the deprecated string.join().
    return prefix + "/".join(list)
78+
def catpath(path1, path2, prefix=""):
    """Concatenate two pathnames into one normalised pathname.

    Both paths are split into components (dropping empty ones), the
    component lists are appended in order, and the result is joined
    with "/" and prefixed with `prefix'.
    """
    components = splitpath(path1)
    components = components + splitpath(path2)
    return joinpath(components, prefix)
81+
82+# Decide whether a pathname is interesting or not.
class InterestingPaths:
    """Decides whether a repository pathname is interesting.

    Constructed from a list of regexp strings. A path is interesting
    when its first (top-level) component is matched by at least one of
    the regexps. Matching uses re.match, so a pattern is anchored at
    the start of the component but not at its end.
    """

    def __init__(self, args):
        # Compile every pattern once, up front.
        self.res = [re.compile(pattern) for pattern in args]

    def interesting(self, path):
        """Return 1 if `path' is in an area of interest, otherwise 0.

        `path' must contain at least one non-empty component.
        """
        components = splitpath(path)
        assert len(components) > 0
        toplevel = components[0]
        for regexp in self.res:
            if regexp.match(toplevel):
                return 1
        return 0
95+
96+# A class and some functions to handle a single lump of
97+# RFC822-ish-headers-plus-data read from an SVN dump file.
98+
class Lump:
    """A single lump of RFC822-ish headers plus data from an SVN dump file.

    Attributes:
      hdrlist -- header names, in the order they were first set
      hdrdict -- mapping of header name to header value
      prop    -- the raw property section, as one string
      text    -- the raw text content, as one string
      extant  -- true if the lump should be written out by write_lump
      props   -- parsed properties as [names, {name: value}]; `names'
                 keeps the original order, and a value of None marks a
                 property deletion (a "D" record)
    """

    def __init__(self):
        self.hdrlist = []
        self.hdrdict = {}
        self.prop = ""
        self.text = ""
        self.extant = 1
        self.props = [[], {}]

    def sethdr(self, key, val):
        """Set header `key' to `val', appending it if it is new."""
        if key not in self.hdrdict:
            self.hdrlist.append(key)
        self.hdrdict[key] = val

    def delhdr(self, key):
        """Remove header `key' if present; do nothing otherwise."""
        if key in self.hdrdict:
            del self.hdrdict[key]
            self.hdrlist.remove(key)

    def propparse(self):
        """Parse self.prop into self.props.

        The property section is a sequence of "K <len>" name records
        (each followed by a "V <len>" value record) and "D <len>"
        deletion records, terminated by "PROPS-END".

        Raises ValueError on an unrecognised record. (The original
        code raised a bare string, which is itself a TypeError on
        Python 2.6 and later.)
        """
        index = 0
        while 1:
            marker = self.prop[index:index+2]
            if marker == "K ":
                wantval = 1
            elif marker == "D ":
                wantval = 0
            elif self.prop[index:index+9] == "PROPS-END":
                break
            else:
                raise ValueError("Unrecognised record in props section")
            # The record header is "<K|D> <length>\n", followed by exactly
            # <length> characters of name and a newline.
            nlpos = self.prop.find("\n", index)
            assert nlpos > 0
            namelen = int(self.prop[index+2:nlpos])
            assert self.prop[nlpos+1+namelen] == "\n"
            name = self.prop[nlpos+1:nlpos+1+namelen]
            index = nlpos+2+namelen
            if wantval:
                # A "K" record is always followed by its "V" record, laid
                # out the same way.
                assert self.prop[index:index+2] == "V "
                nlpos = self.prop.find("\n", index)
                assert nlpos > 0
                proplen = int(self.prop[index+2:nlpos])
                assert self.prop[nlpos+1+proplen] == "\n"
                prop = self.prop[nlpos+1:nlpos+1+proplen]
                index = nlpos+2+proplen
            else:
                prop = None
            self.props[0].append(name)
            self.props[1][name] = prop

    def setprop(self, key, val):
        """Set property `key' to `val', appending it if it is new."""
        if key not in self.props[1]:
            self.props[0].append(key)
        self.props[1][key] = val

    def delprop(self, key):
        """Forget property `key' if present; do nothing otherwise."""
        if key in self.props[1]:
            del self.props[1][key]
            self.props[0].remove(key)

    def correct_headers(self):
        """Rebuild self.prop from self.props and fix the length/md5 headers.

        Prop-content-length, Text-content-length, Text-content-md5 and
        Content-length are set from the current data, or removed when
        the corresponding data is empty. The text headers are left
        untouched when the lump has "Text-delta: true".
        """
        import hashlib  # replaces the deprecated `md5' module

        # First reconstitute the properties block.
        pieces = []
        for key in self.props[0]:
            val = self.props[1][key]
            if val is None:
                pieces.extend(["D %d" % len(key), "\n", key, "\n"])
            else:
                pieces.extend(["K %d" % len(key), "\n", key, "\n"])
                pieces.extend(["V %d" % len(val), "\n", val, "\n"])
        if pieces:
            pieces.append("PROPS-END\n")
        self.prop = "".join(pieces)

        # Now fix up the content length headers.
        if len(self.prop) > 0:
            self.sethdr("Prop-content-length", str(len(self.prop)))
        else:
            self.delhdr("Prop-content-length")
        # Only fiddle with the md5 if we're not doing a delta.
        if self.hdrdict.get("Text-delta", "false") != "true":
            if len(self.text) > 0:
                self.sethdr("Text-content-length", str(len(self.text)))
                data = self.text
                if not isinstance(data, bytes):
                    # hashlib only accepts byte strings; this branch is
                    # only taken for Python 3 text strings.
                    data = data.encode("utf-8")
                self.sethdr("Text-content-md5", hashlib.md5(data).hexdigest())
            else:
                self.delhdr("Text-content-length")
                self.delhdr("Text-content-md5")
        if len(self.prop) > 0 or len(self.text) > 0:
            self.sethdr("Content-length", str(len(self.prop)+len(self.text)))
        else:
            self.delhdr("Content-length")
183+
def read_rfc822_headers(f):
    """Read one block of "Key: value" headers from file object `f'.

    Blank lines before the first header are skipped; the first blank
    line after at least one header ends the block. Returns a Lump with
    the headers set, or None if end of file is reached before the
    terminating blank line.
    """
    ret = Lump()
    while 1:
        s = f.readline()
        if s == "":
            return None # end of file
        if s == "\n":
            if len(ret.hdrlist) > 0:
                break # newline after headers ends them
            else:
                continue # newline before headers is simply ignored
        if s[-1:] == "\n":
            s = s[:-1]
        # str.find() replaces the deprecated string.find().
        colon = s.find(":")
        assert colon > 0
        assert s[colon:colon+2] == ": "
        key = s[:colon]
        val = s[colon+2:]
        ret.sethdr(key, val)
    return ret
203+
def read_lump(f):
    """Read one complete lump (headers plus data) from file object `f'.

    The Prop-content-length and Text-content-length headers (each
    defaulting to 0) say how much property and text data follows the
    headers. A non-empty property section is also parsed into
    lump.props. Returns the Lump, or None at end of file.
    """
    lump = read_rfc822_headers(f)
    if lump is None:
        return None
    # int() replaces the deprecated string.atoi().
    pcl = int(lump.hdrdict.get("Prop-content-length", "0"))
    tcl = int(lump.hdrdict.get("Text-content-length", "0"))
    if pcl > 0:
        lump.prop = f.read(pcl)
        lump.propparse()
    if tcl > 0:
        lump.text = f.read(tcl)
    return lump
216+
def write_lump(f, lump):
    """Write `lump' to file object `f' in dump-file format.

    Nothing is written if lump.extant is false. Otherwise the lump's
    headers are first brought up to date with lump.correct_headers(),
    then the headers, a blank line, the property section and the text
    are written. A lump that carries any content-length header is
    followed by one extra newline.
    """
    if not lump.extant:
        return
    lump.correct_headers()
    for key in lump.hdrlist:
        val = lump.hdrdict[key]
        f.write(key + ": " + val + "\n")
    f.write("\n")
    f.write(lump.prop)
    f.write(lump.text)
    # `in' replaces dict.has_key(), which no longer exists in Python 3.
    if "Prop-content-length" in lump.hdrdict or \
       "Text-content-length" in lump.hdrdict or \
       "Content-length" in lump.hdrdict:
        f.write("\n")
231+
232+# Higher-level class that makes use of the above to filter dump
233+# file fragments a whole revision at a time.
234+
class Filter:
    """Filters dump file fragments a whole revision at a time.

    `paths' is an object whose interesting(path) method says whether a
    repository path should be kept (see InterestingPaths).
    """

    def __init__(self, paths):
        self.revisions = {}
        self.paths = paths

    def _svnlook(self, subcommand, rev, *args):
        """Run `svnlook SUBCOMMAND -rREV REPOS ARGS...' and return its stdout.

        The repository is the module-level `repos'. The command is run
        from an argument list, without a shell, so paths and property
        names containing quotes, spaces or shell metacharacters are
        passed through intact. Raises OSError if svnlook cannot be
        executed at all.
        """
        import subprocess
        argv = ["svnlook", subcommand, "-r%d" % rev, repos] + list(args)
        proc = subprocess.Popen(argv, stdout=subprocess.PIPE)
        # NOTE(review): the exit status is not checked, matching the
        # previous os.popen()-based code which ignored it as well.
        return proc.communicate()[0]

    def _lines(self, output):
        """Split command output into lines, without their newlines."""
        lines = output.split("\n")
        if lines and lines[-1] == "":
            # Output ending in a newline has no further line after it.
            lines.pop()
        return lines

    def _copy_to_lumps(self, dstpath, srcrev, srcpath):
        """Turn the subtree `srcpath' at revision `srcrev' into "add" lumps.

        Returns one new Lump per node listed by `svnlook tree', with
        its path rewritten to live under `dstpath', and with its
        properties and (for files) its contents fetched via svnlook.
        """
        newlumps = []
        pathcomponents = []
        for treeline in self._lines(self._svnlook("tree", srcrev, srcpath)):
            # A trailing slash marks a directory.
            subdir = 0
            while treeline[-1:] == "/":
                subdir = 1
                treeline = treeline[:-1]
            # Nesting depth is given by the number of leading spaces.
            depth = 0
            while treeline[:1] == " ":
                depth = depth + 1
                treeline = treeline[1:]
            # Replace everything at this depth and below with this name.
            pathcomponents[depth:] = [treeline]
            # Component 0 is the root of the copied subtree itself, which
            # is already named by srcpath / dstpath.
            thissrcpath = "/".join([srcpath] + pathcomponents[1:])
            thisdstpath = "/".join([dstpath] + pathcomponents[1:])
            newlump = Lump()
            newlump.sethdr("Node-path", thisdstpath)
            newlump.sethdr("Node-action", "add")
            for propname in self._lines(self._svnlook("pl", srcrev, thissrcpath)):
                # Strip the indentation of the property listing.
                while propname[:1] == " ":
                    propname = propname[1:]
                proptext = self._svnlook("pg", srcrev, propname, thissrcpath)
                newlump.setprop(propname, proptext)
            if subdir:
                newlump.sethdr("Node-kind", "dir")
            else:
                newlump.sethdr("Node-kind", "file")
                newlump.text = self._svnlook("cat", srcrev, thissrcpath)
            newlumps.append(newlump)
        return newlumps

    def tweak(self, revhdr, contents):
        """Filter the node lumps of one revision, in place.

        `contents' is the list of node lumps of the revision; it is
        modified so that lumps for boring paths are dropped, and an
        "add" that copies from a boring path into an interesting one is
        replaced by plain "add" lumps for the whole copied subtree. If
        nothing is left, svn:log, svn:author and svn:date are removed
        from the revision header lump `revhdr'.
        """
        contents2 = []
        for lump in contents:
            action = lump.hdrdict["Node-action"]
            path = lump.hdrdict["Node-path"]

            if not self.paths.interesting(path):
                continue # boooring

            if action == "add" and "Node-copyfrom-path" in lump.hdrdict:
                srcrev = int(lump.hdrdict["Node-copyfrom-rev"])
                srcpath = lump.hdrdict["Node-copyfrom-path"]
                if not self.paths.interesting(srcpath):
                    # Copy from a boring path to an interesting one: the
                    # copy source will not exist in the output, so the
                    # subtree is extracted with svnlook instead.
                    contents2.extend(
                        self._copy_to_lumps(path, srcrev, srcpath))
                    continue

            contents2.append(lump)

        # Change the contents array.
        contents[:] = contents2

        # If we've just removed everything in this revision, leave
        # out some revision properties as well.
        if len(contents) == 0:
            revhdr.delprop("svn:log")
            revhdr.delprop("svn:author")
            revhdr.delprop("svn:date")
318+
# Main program: read a dump file on standard input, filter it one
# revision at a time, and write the result to standard output.

fr = sys.stdin
fw = sys.stdout

# Fail with a usage message rather than an IndexError traceback when
# the repository argument is missing.
if len(sys.argv) < 2:
    sys.stderr.write(
        "usage: svndumpfilter2 source-repository regexp [regexp...]\n")
    sys.exit(1)

# `repos' is read as a global by Filter when it needs to run svnlook.
repos = sys.argv[1]
paths = InterestingPaths(sys.argv[2:])

# Pass the dump-file header through unchanged. The None check stops
# an empty or header-only input from crashing on `None.hdrdict'.
lump = read_lump(fr)
while lump is not None and "Revision-number" not in lump.hdrdict:
    write_lump(fw, lump)
    lump = read_lump(fr)

revhdr = lump

filt = Filter(paths)

while revhdr is not None:
    # Read revision header.
    assert "Revision-number" in revhdr.hdrdict
    contents = []
    # Read revision contents: every lump up to the next revision
    # header (or end of file) belongs to this revision.
    while 1:
        lump = read_lump(fr)
        if lump is None or "Revision-number" in lump.hdrdict:
            newrevhdr = lump
            break
        contents.append(lump)

    # Alter the contents of the revision.
    filt.tweak(revhdr, contents)

    # Write out revision.
    write_lump(fw, revhdr)
    for lump in contents:
        write_lump(fw, lump)

    # And loop round again.
    revhdr = newrevhdr

fr.close()
fw.close()