Project

General

Profile

Story #13376 » get_switch_errors.py

Jiayong Li, 06/20/2018 08:35 PM

 
#!/usr/bin python

import gzip

def is_header(line):
return line.startswith('#')

def get_GT(line):
fields = line.strip().split('\t')
FORMAT, sample = fields[8], fields[9]
FORMAT_fields = FORMAT.split(':')
sample_fields = sample.split(':')

GT_index = FORMAT_fields.index('GT')
GT = sample_fields[GT_index]
return GT

def is_phased_het(line):
GT = get_GT(line)

if '|' in GT:
GT_fields = GT.split('|')
return GT_fields[0] != GT_fields[1]
else:
return False

def is_phase_switch(bline, cline):
bGT = get_GT(bline)
cGT = get_GT(cline)

if '|' in bGT and '|' in cGT:
bGT_fields = bGT.split('|')
cGT_fields = cGT.split('|')
if bGT_fields[0] != bGT_fields[1] and cGT_fields[0] != cGT_fields[1]:
return bGT_fields[0] == cGT_fields[1] and bGT_fields[1] == cGT_fields[0]
else:
return False
else:
return False

def print_switch_errors(bfile, cfile):
b = gzip.open(bfile) if bfile.endswith(".gz") else open(bfile)
c = gzip.open(cfile) if cfile.endswith(".gz") else open(cfile)

bline = b.readline()
cline = c.readline()

last_het_is_switch = False

while not (bline == '' or cline == ''):
if is_header(bline):
bline = b.readline()
elif is_header(cline):
cline = c.readline()
else:
bfields = bline.strip().split('\t')
cfields = cline.strip().split('\t')
bCHROM, bPOS = bfields[0], int(bfields[1])
cCHROM, cPOS = cfields[0], int(cfields[1])
if bCHROM < cCHROM or (bCHROM == cCHROM and bPOS < cPOS):
if is_phased_het(bline):
if last_het_is_switch:
print bline,
last_het_is_switch = False
bline = b.readline()
elif bCHROM == cCHROM and bPOS == cPOS:
if is_phased_het(bline):
if last_het_is_switch:
if not is_phase_switch(bline, cline):
print bline,
last_het_is_switch = False
else:
if is_phase_switch(bline, cline):
print bline,
last_het_is_switch = True
bline = b.readline()
cline = c.readline()
else:
cline = c.readline()

b.close()
c.close()

def main():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('bfile', metavar='BFILE', help='baseline file')
parser.add_argument('cfile', metavar='CFILE', help='calls file')
args = parser.parse_args()

print_switch_errors(args.bfile, args.cfile)

if __name__ == '__main__':
main()
(2-2/2)