meta2deps.py revision 249033
1#!/usr/bin/env python
2
3"""
4This script parses each "meta" file and extracts the
5information needed to deduce build and src dependencies.
6
7It works much the same as the original shell script, but is
8*much* more efficient.
9
10The parsing work is handled by the class MetaFile.
11We only pay attention to a subset of the information in the
12"meta" files.  Specifically:
13
14'CWD'	to initialize our notion.
15
16'C'	to track chdir(2) on a per process basis
17
18'R'	files read are what we really care about.
19	directories read, provide a clue to resolving
20	subsequent relative paths.  That is if we cannot find
21	them relative to 'cwd', we check relative to the last
22	dir read.
23
24'W'	files opened for write or read-write,
25	for filemon V3 and earlier.
26
27'E'	files executed.
28
29'L'	files linked
30
31'V'	the filemon version, this record is used as a clue
32	that we have reached the interesting bit.
33
34"""
35
36"""
37RCSid:
38	$Id: meta2deps.py,v 1.12 2013/03/31 22:31:59 sjg Exp $
39
40	Copyright (c) 2011-2013, Juniper Networks, Inc.
41	All rights reserved.
42
43	Redistribution and use in source and binary forms, with or without
44	modification, are permitted provided that the following conditions
45	are met:
46	1. Redistributions of source code must retain the above copyright
47	   notice, this list of conditions and the following disclaimer.
48	2. Redistributions in binary form must reproduce the above copyright
49	   notice, this list of conditions and the following disclaimer in the
50	   documentation and/or other materials provided with the distribution.
51
52	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
53	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
54	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
55	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
56	OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
57	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
58	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
59	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
60	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
61	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
62	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63
64"""
65
66import os, re, sys
67
def getv(dict, key, d=None):
    """Return dict[key] when key is present, otherwise the default d."""
    return dict[key] if key in dict else d
73
def resolve(path, cwd, last_dir=None, debug=0, debug_out=sys.stderr):
    """
    Return an absolute path, resolving via cwd or last_dir if needed.

    path        the path to resolve; a trailing '/.' is dropped.
    cwd         directory that '.' and './...' are taken relative to.
    last_dir    optional directory tried before cwd for other
                relative paths.
    debug       when greater than 2 the lookups are traced.
    debug_out   open file the trace is written to.

    An absolute path is returned unchanged and is not checked for
    existence.  Any other relative path is only returned if it exists
    below last_dir or cwd (tried in that order).  None is returned
    when the path is empty or cannot be found.
    """
    if not path:
        # nothing to resolve, and path[0] below would raise IndexError
        return None
    if path.endswith('/.'):
        # '/.' is the root itself, do not strip it down to ''
        path = path[0:-2] or '/'
    if path[0] == '/':
        return path
    if path == '.':
        return cwd
    if path.startswith('./'):
        return cwd + path[1:]
    if last_dir == cwd:
        # no point looking in the same place twice
        last_dir = None
    for d in [last_dir, cwd]:
        if not d:
            continue
        p = '/'.join([d, path])
        if debug > 2:
            # the result of the lookup is appended to this line
            debug_out.write("looking for: %s " % (p,))
        if not os.path.exists(p):
            if debug > 2:
                debug_out.write("nope\n")
            continue
        if debug > 2:
            debug_out.write("found: %s\n" % (p,))
        return p
    return None
103
def abspath(path, cwd, last_dir=None, debug=0, debug_out=sys.stderr):
    """
    Resolve path (see resolve()) and canonicalize it when necessary.

    os.path.realpath() is only called when the resolved path contains
    './' past its first character, ends in '/..' or is a symlink;
    this function is called often enough that the shortcut matters.
    A path that could not be resolved is returned as is.
    """
    resolved = resolve(path, cwd, last_dir, debug, debug_out)
    if not resolved:
        return resolved
    needs_realpath = (resolved.find('./') > 0
                      or resolved.endswith('/..')
                      or os.path.islink(resolved))
    if needs_realpath:
        return os.path.realpath(resolved)
    return resolved
116
def sort_unique(list, cmp=None, key=None, reverse=False):
    """
    Sort list in place and return a new list without duplicates.

    cmp, key and reverse have the meaning they have for the Python 2
    list.sort(); cmp is converted with functools.cmp_to_key() so the
    same call also works where list.sort() no longer accepts it.
    Elements that compare equal to their predecessor in the sorted
    order are dropped.
    """
    if cmp is not None:
        from functools import cmp_to_key
        wrap = cmp_to_key(cmp)
        if key is None:
            key = wrap
        else:
            # cmp is applied to the extracted keys, as list.sort() did
            extract = key
            key = lambda item: wrap(extract(item))
    list.sort(key=key, reverse=reverse)
    nl = []
    for e in list:
        # compare with the last element kept, so runs collapse to one
        if nl and e == nl[-1]:
            continue
        nl.append(e)
    return nl
126
class MetaFile:
    """Parse the "meta" files generated by bmake and collect dependencies.

    Parsing fills three lists:

    obj_deps    directories found below one of the OBJROOTS,
                reported by dirdeps().
    src_deps    directories found below one of the SRCTOPS,
                reported by src_dirdeps().
    file_deps   files found below one of the SRCTOPS, only collected
                when DPDEPS is set, reported by file_depends().
    """

    # NOTE: these are class attributes.  Until reset() rebinds them on
    # an instance, the mutable ones are shared by every instance; that
    # is what lets main() create one MetaFile per meta file and report
    # the accumulated results from the last one.
    conf = None
    dirdep_re = None
    host_target = None
    srctops = []
    objroots = []

    seen = {}
    obj_deps = []
    src_deps = []
    file_deps = []

    def __init__(self, name, conf=None):
        """if name is set we will parse it now.
        conf can have the following keys:

        SRCTOPS list of tops of the src tree(s).

        CURDIR  the src directory 'bmake' was run from.

        RELDIR  the relative path from SRCTOP to CURDIR

        MACHINE the machine we built for.
                set to 'none' if we are not cross-building.
                More specifically if machine cannot be deduced from
                objdirs.

        HOST_TARGET
                when we build for the pseudo machine 'host'
                the object tree uses HOST_TARGET rather than MACHINE.

        OBJROOTS a list of the common prefix for all obj dirs it might
                end in '/' or '-'.

        DPDEPS  names an optional file to which per file dependencies
                will be appended.
                For example if 'some/path/foo.h' is read from SRCTOP
                then 'DPDEPS_some/path/foo.h +=' "RELDIR" is output.
                This can allow 'bmake' to learn all the dirs within
                the tree that depend on 'foo.h'

        debug   desired debug level

        debug_out open file to send debug output to (sys.stderr)

        """

        if conf is None:
            conf = {}

        self.name = name
        self.debug = getv(conf, 'debug', 0)
        self.debug_out = getv(conf, 'debug_out', sys.stderr)

        self.machine = getv(conf, 'MACHINE', '')
        self.curdir = getv(conf, 'CURDIR')
        self.reldir = getv(conf, 'RELDIR')
        self.dpdeps = getv(conf, 'DPDEPS')

        if not self.conf:
            # self.conf is bound on the instance, so this runs for
            # every new instance; the membership tests below keep the
            # shared srctops/objroots lists free of duplicates.
            self.conf = conf
            self.host_target = getv(conf, 'HOST_TARGET')
            for srctop in getv(conf, 'SRCTOPS', []):
                if not srctop:
                    continue            # nothing to match against
                if srctop[-1] != '/':
                    srctop += '/'
                if not srctop in self.srctops:
                    self.srctops.append(srctop)
                _srctop = os.path.realpath(srctop)
                if _srctop[-1] != '/':
                    _srctop += '/'
                if not _srctop in self.srctops:
                    self.srctops.append(_srctop)

            # Suffixes to trim from each objroot.  An empty or unset
            # name must not contribute: objroot.endswith('') is always
            # true and objroot[0:-0] is the empty string.
            trim_names = [self.machine]
            if self.machine == 'host':
                trim_names.append(self.host_target)
            trim_list = []
            for trim in trim_names:
                if trim:
                    trim_list += ['/' + trim + '/',
                                  '/' + trim,
                                  trim + '/',
                                  trim]

            for objroot in getv(conf, 'OBJROOTS', []):
                for e in trim_list:
                    if objroot.endswith(e):
                        # this is not what we want - fix it
                        objroot = objroot[0:-len(e)]
                        if e.endswith('/'):
                            objroot += '/'
                if not objroot:
                    continue            # nothing left to match against
                if not objroot in self.objroots:
                    self.objroots.append(objroot)
                    _objroot = os.path.realpath(objroot)
                    if objroot[-1] == '/':
                        _objroot += '/'
                    if not _objroot in self.objroots:
                        self.objroots.append(_objroot)

            # we want the longest match
            self.srctops.sort(reverse=True)
            self.objroots.sort(reverse=True)

            if self.debug:
                self._trace("host_target= %s" % (self.host_target,))
                self._trace("srctops= %s" % (self.srctops,))
                self._trace("objroots= %s" % (self.objroots,))

            self.dirdep_re = re.compile(r'([^/]+)/(.+)')

        if self.dpdeps and not self.reldir:
            if self.debug:
                # the reldir, if we find one, is added to this line
                self._trace("need reldir: ", end='')
            if self.curdir:
                srctop = self.find_top(self.curdir, self.srctops)
                if srctop:
                    self.reldir = self._strip_top(self.curdir, srctop)
                    if self.debug:
                        self._trace("%s" % (self.reldir,))
            if not self.reldir:
                self.dpdeps = None      # we cannot do it?

        self.cwd = os.getcwd()          # make sure this is initialized

        if name:
            self.parse()

    def _trace(self, text, end='\n'):
        """write text followed by end to the debug output."""
        self.debug_out.write(text + end)

    def _strip_top(self, path, top):
        """return path without its leading top.
        path is returned unchanged if it does not start with top,
        occurrences of top further into path are never touched."""
        if path.startswith(top):
            return path[len(top):]
        return path

    def reset(self):
        """reset state if we are being passed meta files from multiple directories."""
        self.seen = {}
        self.obj_deps = []
        self.src_deps = []
        self.file_deps = []

    def dirdeps(self, sep='\n'):
        """return DIRDEPS"""
        return sep.strip() + sep.join(self.obj_deps)

    def src_dirdeps(self, sep='\n'):
        """return SRC_DIRDEPS"""
        return sep.strip() + sep.join(self.src_deps)

    def file_depends(self, out=None):
        """Append DPDEPS_${file} += ${RELDIR}
        for each file we saw, to the output file.
        Nothing is written if we have no reldir,
        sys.stdout is used if out is None."""
        if not self.reldir:
            return None
        if out is None:
            out = sys.stdout
        for f in sort_unique(self.file_deps):
            out.write('DPDEPS_%s += %s\n' % (f, self.reldir))

    def seenit(self, dir):
        """remember that we have seen dir."""
        self.seen[dir] = 1

    def add(self, list, data, clue=''):
        """add data to list if it isn't already there.
        clue is only used to label the debug output."""
        if data not in list:
            list.append(data)
            if self.debug:
                self._trace("%s: %sAdd: %s" % (self.name, clue, data))

    def find_top(self, path, list):
        """the logical tree may be split across multiple trees,
        return the first entry of list that path starts with, or None."""
        for top in list:
            if path.startswith(top):
                if self.debug > 2:
                    self._trace("found in %s" % (top,))
                return top
        return None

    def find_obj(self, objroot, dir, path, input):
        """return path within objroot, taking care of .dirdep files

        objroot the entry of objroots that dir was found under.
        dir     the directory that holds path.
        path    the file the record refers to.
        input   the raw path from the meta file, remembered as seen
                if no .dirdep file is found.

        The first non-empty line found in 'path.dirdep' or
        'dir/.dirdep' wins, with a trailing MACHINE (and the character
        before it) removed.  Otherwise the result is derived from dir
        itself and may be None.
        """
        ddep = None
        for ddepf in [path + '.dirdep', dir + '/.dirdep']:
            if not ddep and os.path.exists(ddepf):
                with open(ddepf, 'r') as f:
                    ddep = f.readline().strip('# \n')
                if self.debug > 1:
                    self._trace("found %s: %s\n" % (ddepf, ddep))
                # an empty machine would match any name and cost us
                # its last character
                if self.machine and ddep.endswith(self.machine):
                    ddep = ddep[0:-(1+len(self.machine))]

        if not ddep:
            # no .dirdeps, so remember that we've seen the raw input
            self.seenit(input)
            self.seenit(dir)
            if self.machine == 'none':
                if dir.startswith(objroot):
                    return self._strip_top(dir, objroot)
                return None
            # what is left should look like "machine/dir"
            m = self.dirdep_re.match(self._strip_top(dir, objroot))
            if m:
                ddep = m.group(2)
                dmachine = m.group(1)
                if dmachine != self.machine:
                    if not (self.machine == 'host' and
                            dmachine == self.host_target):
                        if self.debug > 2:
                            self._trace("adding .%s to %s" % (dmachine, ddep))
                        ddep += '.' + dmachine

        return ddep

    def parse(self, name=None, file=None):
        """A meta file looks like:

        # Meta data file "path"
        CMD "command-line"
        CWD "cwd"
        TARGET "target"
        -- command output --
        -- filemon acquired metadata --
        # buildmon version 3
        V 3
        C "pid" "cwd"
        E "pid" "path"
        F "pid" "child"
        R "pid" "path"
        W "pid" "path"
        X "pid" "status"
        D "pid" "path"
        L "pid" "src" "target"
        M "pid" "old" "new"
        S "pid" "path"
        # Bye bye

        We go to some effort to avoid processing a dependency more than once.
        Only lines starting with one of C,E,F,L,R or V are looked at,
        'W' records are currently ignored.

        name    if set replaces self.name as the file to parse.
        file    if set is iterated for the lines to parse rather than
                opening self.name; it is left open.
        """

        version = 0                     # unknown
        if name:
            self.name = name
        if file:
            f = file
        else:
            f = open(self.name, 'r')
        cwd = last_dir = self.cwd
        skip = True
        pid_cwd = {}
        pid_last_dir = {}
        last_pid = 0

        if self.curdir:
            self.seenit(self.curdir)    # we ignore this

        interesting = 'CEFLRV'
        try:
            for line in f:
                # ignore anything we don't care about
                if not line or line[0] not in interesting:
                    continue
                if self.debug > 2:
                    self._trace("input: " + line, end='')
                w = line.split()

                if skip:
                    # nothing but CWD matters until we see the V record
                    if w[0] == 'V':
                        skip = False
                        # not used yet: 'W' records would have to be
                        # made interesting for version < 4 as they
                        # may be 'rw'
                        version = int(w[1])
                    elif w[0] == 'CWD':
                        self.cwd = cwd = last_dir = w[1]
                        self.seenit(cwd)    # ignore this
                        if self.debug:
                            self._trace("%s: CWD=%s" % (self.name, cwd))
                    continue

                pid = int(w[1])
                if pid != last_pid:
                    # save the state of the process we were following
                    # and pick up the state of this one
                    if last_pid:
                        pid_cwd[last_pid] = cwd
                        pid_last_dir[last_pid] = last_dir
                    cwd = getv(pid_cwd, pid, self.cwd)
                    last_dir = getv(pid_last_dir, pid, self.cwd)
                    last_pid = pid

                # process operations
                if w[0] == 'F':
                    # the child starts out where its parent is
                    npid = int(w[2])
                    pid_cwd[npid] = cwd
                    pid_last_dir[npid] = cwd
                    last_pid = npid
                    continue
                elif w[0] == 'C':
                    cwd = abspath(w[2], cwd, None, self.debug, self.debug_out)
                    if not cwd:
                        # could not be resolved, take it as given
                        cwd = w[2]
                    if cwd.endswith('/.'):
                        cwd = cwd[0:-2]
                    last_dir = cwd
                    if self.debug > 1:
                        self._trace("cwd= %s" % (cwd,))
                    continue

                if w[2] in self.seen:
                    if self.debug > 2:
                        self._trace("seen: %s" % (w[2],))
                    continue
                # file operations
                if w[0] in 'ML':
                    path = w[2].strip("'")
                else:
                    path = w[2]
                # we are never interested in .dirdep files as dependencies
                if path.endswith('.dirdep'):
                    continue
                # we don't want to resolve the last component if it is
                # a symlink
                path = resolve(path, cwd, last_dir, self.debug, self.debug_out)
                if not path:
                    continue
                dir, base = os.path.split(path)
                if dir in self.seen:
                    if self.debug > 2:
                        self._trace("seen: %s" % (dir,))
                    continue
                # we can have a path in an objdir which is a link
                # to the src dir, we may need to add dependencies for each
                rdir = dir
                dir = abspath(dir, cwd, last_dir, self.debug, self.debug_out)
                if rdir == dir or rdir.find('./') > 0:
                    rdir = None
                # now put path back together
                path = '/'.join([dir, base])
                if self.debug > 1:
                    self._trace("raw=%s rdir=%s dir=%s path=%s" %
                                (w[2], rdir, dir, path))
                if w[0] in 'SRWL':
                    if w[0] == 'W' and path.endswith('.dirdep'):
                        continue
                    if path in [last_dir, cwd, self.cwd, self.curdir]:
                        if self.debug > 1:
                            self._trace("skipping: %s" % (path,))
                        continue
                    if os.path.isdir(path):
                        # a directory is only of use as a clue for
                        # resolving later relative paths
                        if w[0] in 'RW':
                            last_dir = path
                        if self.debug > 1:
                            self._trace("ldir= %s" % (last_dir,))
                        continue

                if w[0] in 'REWML':
                    # finally, we get down to it
                    if dir == self.cwd or dir == self.curdir:
                        continue
                    srctop = self.find_top(path, self.srctops)
                    if srctop:
                        if self.dpdeps:
                            self.add(self.file_deps,
                                     self._strip_top(path, srctop), 'file')
                        self.add(self.src_deps,
                                 self._strip_top(dir, srctop), 'src')
                        self.seenit(w[2])
                        self.seenit(dir)
                        if rdir and not rdir.startswith(srctop):
                            dir = rdir      # for below
                            rdir = None
                        else:
                            continue

                    objroot = None
                    for dir in [dir, rdir]:
                        if not dir:
                            continue
                        objroot = self.find_top(dir, self.objroots)
                        if objroot:
                            break
                    if objroot:
                        ddep = self.find_obj(objroot, dir, path, w[2])
                        if ddep:
                            self.add(self.obj_deps, ddep, 'obj')
                    else:
                        # don't waste time looking again
                        self.seenit(w[2])
                        # dir is None if the loop above ended on an
                        # unset rdir, that is not worth remembering
                        if dir:
                            self.seenit(dir)
        finally:
            # only close what we opened
            if not file:
                f.close()
502
503
def main(argv, klass=MetaFile, xopts='', xoptf=None):
    """Simple driver for class MetaFile.

    Usage:
        script [options] [key=value ...] "meta" ...

    Options and key=value pairs contribute to the
    dictionary passed to MetaFile.

    -S "SRCTOP"
                add "SRCTOP" to the "SRCTOPS" list.

    -C "CURDIR"

    -O "OBJROOT"
                add "OBJROOT" to the "OBJROOTS" list.

    -R "RELDIR"

    -m "MACHINE"

    -H "HOST_TARGET"

    -D "DPDEPS"

    -d  bumps debug level

    -q  do not output the results

    MACHINE, SB_SRC and SB_OBJROOT are taken from the environment,
    each one independently of the others, before the options are
    looked at.

    klass is instantiated once for each "meta" file.  xopts is
    appended to the getopt option string and xoptf(option, value,
    conf) is called for any option not handled here.

    Returns the instance created for the last "meta" file, or None
    if none were given.
    """
    import getopt

    # import Psyco if we can
    # it can speed things up quite a bit
    have_psyco = 0
    try:
        import psyco
        psyco.full()
        have_psyco = 1
    except Exception:
        # strictly optional, we can do without it
        pass

    conf = {
        'SRCTOPS': [],
        'OBJROOTS': [],
        }

    # one missing variable must not stop us looking at the others
    machine = os.environ.get('MACHINE')
    if machine:
        conf['MACHINE'] = machine
    srctop = os.environ.get('SB_SRC')
    if srctop:
        conf['SRCTOPS'].append(srctop)
    objroot = os.environ.get('SB_OBJROOT')
    if objroot:
        conf['OBJROOTS'].append(objroot)

    debug = 0
    output = True

    opts, args = getopt.getopt(argv[1:], 'dS:C:O:R:m:D:H:q' + xopts)
    for o, a in opts:
        if o == '-d':
            debug += 1
        elif o == '-q':
            output = False
        elif o == '-H':
            conf['HOST_TARGET'] = a
        elif o == '-S':
            if a not in conf['SRCTOPS']:
                conf['SRCTOPS'].append(a)
        elif o == '-C':
            conf['CURDIR'] = a
        elif o == '-O':
            if a not in conf['OBJROOTS']:
                conf['OBJROOTS'].append(a)
        elif o == '-R':
            conf['RELDIR'] = a
        elif o == '-D':
            conf['DPDEPS'] = a
        elif o == '-m':
            conf['MACHINE'] = a
        elif xoptf:
            xoptf(o, a, conf)

    conf['debug'] = debug

    # get any var=val assignments, they must precede the meta files
    consumed = 0
    for a in args:
        if a.find('=') <= 0:
            break
        # only the first '=' separates, the value may contain more
        k, v = a.split('=', 1)
        if k in ['SRCTOP', 'OBJROOT', 'SRCTOPS', 'OBJROOTS']:
            if k == 'SRCTOP':
                k = 'SRCTOPS'
            elif k == 'OBJROOT':
                k = 'OBJROOTS'
            if v not in conf[k]:
                conf[k].append(v)
        else:
            conf[k] = v
        consumed += 1
    args = args[consumed:]

    debug_out = getv(conf, 'debug_out', sys.stderr)

    if debug:
        debug_out.write("config:\n")
        debug_out.write("psyco= %s\n" % (have_psyco,))
        for k, v in conf.items():
            debug_out.write("%s=%s\n" % (k, v))

    m = None                            # we may not be given any files
    for a in args:
        m = klass(a, conf)

    if output and m is not None:
        sys.stdout.write(m.dirdeps() + '\n')

        sys.stdout.write(m.src_dirdeps('\nsrc:') + '\n')

        dpdeps = getv(conf, 'DPDEPS')
        if dpdeps:
            with open(dpdeps, 'w') as out:
                m.file_depends(out)

    return m
632
if __name__ == '__main__':
    try:
        main(sys.argv)
    except:
        # yes, this goes to stdout
        # whatever went wrong is reported here and then re-raised
        sys.stdout.write("ERROR:  %s\n" % (sys.exc_info()[1],))
        raise
640
641