"""Summarise the progress of a DAGMan-style workflow.

Usage: pass the path of the DAG file as the only argument.  The script reads
the DAG file, the highest-numbered consecutive ``<dag>.rescueNNN`` file next
to it, and (when the DAG names one via ``JOBSTATE_LOG``) the jobstate log.
It then prints how many jobs are complete, how many have failed, and the ten
failed jobs that block the largest number of downstream jobs.

The code runs unchanged on Python 2.6+ and Python 3.
"""
import os
import sys

# Job name -> record with keys 'done', 'failed', 'last_state', 'deps'
# (parent job names) and 'children' (names of not-yet-done jobs that list
# this job as a parent).  Kept at module level because deepchildren() and
# missing_deps() look jobs up here.
jobs = {}

_FAILURE_STATES = ('POST_SCRIPT_FAILURE', 'PRE_SCRIPT_FAILURE')


def _new_job():
    """Return a fresh, empty job record."""
    return {'done': False, 'children': [], 'deps': [], 'failed': False,
            'last_state': ''}


def _read_lines(path):
    """Return all lines of the text file at *path*, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.readlines()


def _find_rescue_dag(dag_path):
    """Return the last of the consecutive rescue files for *dag_path*.

    Candidates are ``<dag_path>.rescue001`` .. ``<dag_path>.rescue999``; the
    search stops at the first number that does not exist.  Returns None when
    not even ``.rescue001`` is present.
    """
    latest = None
    for index in range(1, 1000):
        candidate = '%s.rescue%03d' % (dag_path, index)
        if not os.path.isfile(candidate):
            break
        latest = candidate
    return latest


def _parse_dag(lines):
    """Fill ``jobs`` from the DAG file *lines*.

    Returns the path given by the last ``JOBSTATE_LOG`` line, or None when
    the DAG has none.  Raises KeyError when a PARENT line names a child that
    has not been declared with JOB.
    """
    jobstate_path = None
    for line in lines:
        # split() (rather than split(' ')) copes with tabs, repeated spaces
        # and the trailing newline.
        tokens = line.split()
        if len(tokens) < 2:
            continue
        keyword = tokens[0]
        if keyword == 'JOB':
            jobs[tokens[1]] = _new_job()
        elif keyword == 'PARENT':
            if 'CHILD' not in tokens[1:]:
                continue  # malformed dependency line; nothing to record
            split_at = tokens.index('CHILD', 1)
            parents = tokens[1:split_at]
            # Every name after CHILD is a child, and a child may appear on
            # several PARENT lines, so accumulate instead of overwriting.
            for child in tokens[split_at + 1:]:
                deps = jobs[child]['deps']
                for parent in parents:
                    if parent not in deps:
                        deps.append(parent)
        elif keyword == 'JOBSTATE_LOG':
            jobstate_path = tokens[1]
    return jobstate_path


def _apply_rescue(lines):
    """Mark every job named on a ``DONE <job>`` line as done."""
    for line in lines:
        tokens = line.split()
        if len(tokens) >= 2 and tokens[0] == 'DONE':
            jobs[tokens[1]]['done'] = True


def _apply_jobstate(lines):
    """Replay the jobstate log *lines* onto ``jobs``.

    The second field of each line is the job name (or ``INTERNAL``, which is
    skipped) and the third is the state.  The last state seen for a job wins.
    """
    for line in lines:
        tokens = line.split()
        if len(tokens) < 3 or tokens[1] == 'INTERNAL':
            continue  # blank/short line or a record not tied to a job
        job = jobs[tokens[1]]
        job['last_state'] = tokens[2]
        if job['last_state'] in _FAILURE_STATES:
            job['failed'] = True
        if job['last_state'] == 'POST_SCRIPT_SUCCESS':
            job['done'] = True
            job['failed'] = False


def _summarise():
    """Link children to parents and return ``(jobs_done, jobs_failed)``.

    Only jobs that are not done are examined: each is registered as a child
    of its parents, gets a 'deps_met' flag, and is counted as failed when its
    'failed' flag is set.
    """
    jdone = 0
    jfailed = 0
    for name, job in jobs.items():
        if job['done']:
            jdone += 1
            continue
        deps_met = True
        for dep_name in job['deps']:
            dep = jobs[dep_name]
            if name not in dep['children']:
                dep['children'].append(name)
            if not dep['done']:
                deps_met = False
        job['deps_met'] = deps_met
        # A job whose parents are all done but which never logged any state
        # is treated as failed.
        if deps_met and job['last_state'] == '':
            job['failed'] = True
        if job['failed']:
            jfailed += 1
    return jdone, jfailed


def deepchildren(job, children):
    """Append every transitive child of *job* to *children* and return it.

    Names are appended in depth-first pre-order and each name at most once;
    names already in *children* are not revisited.  The walk is iterative so
    that long dependency chains cannot exhaust the recursion limit.
    """
    seen = set(children)
    stack = [iter(jobs[job]['children'])]
    while stack:
        for child in stack[-1]:
            if child not in seen:
                seen.add(child)
                children.append(child)
                # Descend into this child before resuming its siblings.
                stack.append(iter(jobs[child]['children']))
                break
        else:
            stack.pop()
    return children


def missing_deps(port, depth=0):
    """Print the tree of failed/incomplete jobs that *port* depends on.

    Each level of the dependency tree is indented by one more tab.  Jobs that
    are done and not failed are neither printed nor descended into.  Nothing
    in this script calls the function.
    """
    p = jobs[port]
    if p['failed'] or not p['done']:
        indent = '\t' * depth
        if p['failed']:
            print('%s%s failed' % (indent, port))
        else:
            print('%s%s incomplete' % (indent, port))
        for d in p['deps']:
            missing_deps(d, depth + 1)


def main(argv=None):
    """Analyse the DAG named by ``argv[0]`` and print the report.

    *argv* defaults to ``sys.argv[1:]``.  Exits with an error message when no
    DAG path is given or no rescue DAG exists for it.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        sys.exit('usage: %s DAGFILE' % sys.argv[0])
    dag_path = argv[0]

    dag_lines = _read_lines(dag_path)
    rescue_path = _find_rescue_dag(dag_path)
    if rescue_path is None:
        sys.exit('No rescue DAG (%s.rescue001) found' % dag_path)
    print('Operating on rescue DAG %s' % rescue_path)

    jobs.clear()
    jobstate_path = _parse_dag(dag_lines)
    _apply_rescue(_read_lines(rescue_path))
    if jobstate_path is not None:
        _apply_jobstate(_read_lines(jobstate_path))

    jdone, jfailed = _summarise()
    print('%d jobs complete' % jdone)
    print('%d jobs failed' % jfailed)

    blocking = [
        (name, len(deepchildren(name, [])), len(job['children']),
         job['last_state'])
        for name, job in jobs.items()
        if job['failed']
    ]
    # Most transitive dependents first.
    blocking.sort(key=lambda entry: entry[1], reverse=True)
    print('Top ten failed jobs blocking other jobs:')
    for entry in blocking[:10]:
        print('\t%s blocking %d dependent jobs (%d directly). Last seen: %s'
              % entry)


if __name__ == '__main__':
    main()