Untitled diff

Created Diff never expires
#LOH
#LOH
probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt'
probefile=r'C:\Users\ly091\Documents\probes_SNP6.0_hg18.txt'
##SampleScoreFile = r'C:\Users\ly091\Documents\result8\Loss\tumor.Loss.Score.txt'
##SampleScoreFile = r'C:\Users\ly091\Documents\result8\Gain\tumor.Gain.Score.txt'
##ControlScoreFile = r'C:\Users\ly091\Documents\result8\Loss\Germline.Loss.Score.txt'
##ControlScoreFile = r'C:\Users\ly091\Documents\result8\Gain\Germline.Gain.Score.txt'
GScoreThreshold=0.01#minimum Gscore to be recorded
GScoreThreshold=0.01#minimum Gscore to be recorded
Recordstoread=10000000#for debug
Recordstoread=10000000#for debug
EchoPerLines=1000#verbose
EchoPerLines=1000#verbose
GaptoMerge=500#minimun width of a gap between two peaks to be merged, unit basepair
GaptoMerge=500#minimun width of a gap between two peaks to be merged, unit basepair
MinimumPeakWidth=2000#minimum Peak Width, unit base pair
MinimumPeakWidth=2000#minimum Peak Width, unit base pair
pvaluelimit=0.01
pvaluelimit=0.01
#Global variables
#Global variables
probes=list()#from the probe file, will be filled and sorted
probes=list()#from the probe file, will be filled and sorted
FirstMarkersonChromosomes=dict()#First markers of chromosomes
FirstMarkersonChromosomes=dict()#First markers of chromosomes
LastMarkersonChromosomes=dict() #Last markers of chromosomes
LastMarkersonChromosomes=dict() #Last markers of chromosomes
#Functions
#Functions
def mcnemar(a,b,c,d):
def mcnemar(a,b,c,d):
"""
"""
Input args:
Input args:
a, b, c, d- frequencies (a: ++, b: +-, c: -+, d: --) b and c is interchangable
a, b, c, d- frequencies (a: ++, b: +-, c: -+, d: --) b and c is interchangable
Output:
Output:
pvalue of test.
pvalue of test.
"""
"""
if b+c>0:
if b+c>0:
chi2testval = (abs(b-c) - 1)**2/ (b + c)
chi2testval = (abs(b-c) - 1)**2/ (b + c)
df = 1
df = 1
pvalue = 1 - stats.chi2.cdf(chi2testval, df)
pvalue = 1 - stats.chi2.cdf(chi2testval, df)
else:
else:
pvalue=1.0
pvalue=1.0
return pvalue
return pvalue
def AbeforeB(a,b):
def AbeforeB(a,b):
#compare postions of two probes, a and b contain: [chromosome, base pair position]
#compare postions of two probes, a and b contain: [chromosome, base pair position]
if a[0]<b[0]:
if a[0]<b[0]:
return True
return True
elif a[0]>b[0]:
elif a[0]>b[0]:
return False
return False
else: #case: in the same chromosome
else: #case: in the same chromosome
if a[1]<b[1]:
if a[1]<b[1]:
return True
return True
elif a[1]>b[1]:
elif a[1]>b[1]:
return False
return False
else:
else:
return True #same position, return True here
return True #same position, return True here
def LocateProbe(probe):#probe has: chromosome number, start size
def LocateProbe(probe):#probe has: chromosome number, start size
p1=0
p1=0
p2=len(probes)-1
p2=len(probes)-1
while True:
while True:
pcenter=int((p1+p2)/2)
pcenter=int((p1+p2)/2)
#print("p1="+str(p1)+"\tp2="+str(p2)+"\tpcenter="+str(pcenter))
#print("p1="+str(p1)+"\tp2="+str(p2)+"\tpcenter="+str(pcenter))
if probes[pcenter]==probe:
if probes[pcenter]==probe:
return pcenter
return pcenter
elif probes[p1]==probe:
elif probes[p1]==probe:
return p1
return p1
elif probes[p2]==probe:
elif probes[p2]==probe:
return p2
return p2
elif AbeforeB(probes[pcenter],probe):#take the right half
elif AbeforeB(probes[pcenter],probe):#take the right half
if p1!=pcenter:
if p1!=pcenter:
p1=pcenter
p1=pcenter
else:
else:
print("cannot find probe")
print("cannot find probe")
print(str(probe))
print(str(probe))
exit(577)
exit(577)
else: #take the left half
else: #take the left half
if p2!=pcenter:
if p2!=pcenter:
p2=pcenter
p2=pcenter
else:
else:
print("cannot find probe")
print("cannot find probe")
print(str(probe))
print(str(probe))
exit(577)
exit(577)
def LeftTarget(probe):#probe has: chromosome number, start size
def LeftTarget(probe):#probe has: chromosome number, start size
## print(probe)
## print(probe)
p1=0
p1=0
## print(len(gtarget_numbers))
## print(len(gtarget_numbers))
while p1<len(gtarget_numbers):
while p1<len(gtarget_numbers):
if gtarget_numbers[p1][1]==probe[0]:#same chromosome
if gtarget_numbers[p1][1]==probe[0]:#same chromosome
#print("goes here")
#print("goes here")
if gtarget_numbers[p1][2]>=probe[1]:#first meet, note gtarget_numbers is ordered
if gtarget_numbers[p1][2]>=probe[1]:#first meet, note gtarget_numbers is ordered
if p1==0:
if p1==0:
return 0
return 0
else:
else:
return p1-1
return p1-1
p1+=1
p1+=1
if p1==len(gtarget_numbers):
if p1==len(gtarget_numbers):
return -111111111111
return -111111111111
def RightTarget(probe):#probe has: chromosome number, start size
def RightTarget(probe):#probe has: chromosome number, start size
# print(probe)
# print(probe)
p1=1
p1=1
while p1<len(gtarget_numbers):
while p1<len(gtarget_numbers):
## print(gtarget_numbers[-p1])
## print(gtarget_numbers[-p1])
## print(p1)
## print(p1)
## print(probe)
## print(probe)
if gtarget_numbers[-p1][1]==probe[0]:#same chromosome
if gtarget_numbers[-p1][1]==probe[0]:#same chromosome
if gtarget_numbers[-p1][3]<=probe[1]:#first meet from right, note gtarget_numbers is ordered
if gtarget_numbers[-p1][3]<=probe[1]:#first meet from right, note gtarget_numbers is ordered
if p1==1:
if p1==1:
return len(gtarget_numbers)-1
return len(gtarget_numbers)-1
else:
else:
return len(gtarget_numbers)-p1+1
return len(gtarget_numbers)-p1+1
p1+=1
p1+=1
if p1==len(gtarget_numbers):
if p1==len(gtarget_numbers):
return p1
return p1
def CountMarkers(nchr,nstart,nend):
def CountMarkers(nchr,nstart,nend):
## print("in countmarkers")
## print("in countmarkers")
## print(nchr)
## print(nchr)
## print(nstart)
## print(nstart)
## print(nend)
## print(nend)
return LocateProbe([nchr,nend])-LocateProbe([nchr,nstart])+1
return LocateProbe([nchr,nend])-LocateProbe([nchr,nstart])+1
def PreviousProbeOf(element):
def PreviousProbeOf(element):
#element is a list with chromosome and probe position, e.g.[2,242953866]
#element is a list with chromosome and probe position, e.g.[2,242953866]
n=LocateProbe(element)-1
n=LocateProbe(element)-1
if n<0:
if n<0:
return 0
return 0
else:
else:
return n
return n
def NextProbeOf(element):
def NextProbeOf(element):
n= LocateProbe(element)+1
n= LocateProbe(element)+1
if n>len(probes)-1:
if n>len(probes)-1:
return len(probes)-1
return len(probes)-1
else:
else:
return n
return n
def RelationshipOf(seg1,seg2):
def RelationshipOf(seg1,seg2):
## function RelationshipOf(seg1 as list, seg2 as list)
## function RelationshipOf(seg1 as list, seg2 as list)
## compare two segments seg1 and seg2,
## compare two segments seg1 and seg2,
## seg1 and seg2 are two lists, each with the Min and Max values of the segment,
## seg1 and seg2 are two lists, each with the Min and Max values of the segment,
## Min should be the first value in the list and Max the second.
## Min should be the first value in the list and Max the second.
## return a string indicating the relationship between the two segments:
## return a string indicating the relationship between the two segments:
## "equal" : identical
## "equal" : identical
## "no relationship" : seg1 and seg2 are not in the same locus, they don't overlap
## "no relationship" : seg1 and seg2 are not in the same locus, they don't overlap
## "in": seg1 is entirely within seg2
## "in": seg1 is entirely within seg2
## "cover": seg1 is bigger than seg2 and completely contains seg2
## "cover": seg1 is bigger than seg2 and completely contains seg2
## "overlap": seg1 partially overlaps with seg2
## "overlap": seg1 partially overlaps with seg2
if seg1[0]==seg2[0] and seg1[1]==seg2[1]:
if seg1[0]==seg2[0] and seg1[1]==seg2[1]:
return "equal"
return "equal"
elif seg1[0]>=seg2[0] and seg1[1]<=seg2[1]:
elif seg1[0]>=seg2[0] and seg1[1]<=seg2[1]:
return "in"
return "in"
elif seg1[0]<=seg2[0] and seg1[1]>=seg2[1]:
elif seg1[0]<=seg2[0] and seg1[1]>=seg2[1]:
return "cover"
return "cover"
if (seg1[0]<seg2[0] and seg1[1]<seg2[1] and seg1[1]>seg2[0]) or (seg1[0]>seg2[0] and seg1[1]>seg2[1] and seg1[0]<seg2[1]) :
if (seg1[0]<seg2[0] and seg1[1]<seg2[1] and seg1[1]>seg2[0]) or (seg1[0]>seg2[0] and seg1[1]>seg2[1] and seg1[0]<seg2[1]) :
return "overlap"
return "overlap"
else :
else :
return "no relationship"
return "no relationship"
def ZeroFillSpaces(StartProbeIndex,EndProbeIndex):
def ZeroFillSpaces(StartProbeIndex,EndProbeIndex):
#return a list of lists each is a properly formated segment with value of 0.0
#return a list of lists each is a properly formated segment with value of 0.0
#StartProbeINdex and EndProbeIndex are two pointers to positions in the "probes" list which is sorted
#StartProbeINdex and EndProbeIndex are two pointers to positions in the "probes" list which is sorted
lists_of_zerofills=list()
lists_of_zerofills=list()
#print("dump in ZeroFillSpaces, StartProbeIndex, EndProbeIndex:"+str(StartProbeIndex)+","+str(EndProbeIndex))
#print("dump in ZeroFillSpaces, StartProbeIndex, EndProbeIndex:"+str(StartProbeIndex)+","+str(EndProbeIndex))
if StartProbeIndex< 0 or StartProbeIndex>= len(probes):
if StartProbeIndex< 0 or StartProbeIndex>= len(probes):
print("Invalid StartProbeIndex in function ZeroFillSpaces: "+str(StartProbeIndex))
print("Invalid StartProbeIndex in function ZeroFillSpaces: "+str(StartProbeIndex))
quit(767)
quit(767)
elif EndProbeIndex <0 or EndProbeIndex>=len(probes):
elif EndProbeIndex <0 or EndProbeIndex>=len(probes):
print("Invalid EndProbeIndex in function ZeroFillSpaces: "+str(EndProbeIndex))
print("Invalid EndProbeIndex in function ZeroFillSpaces: "+str(EndProbeIndex))
quit(768)
quit(768)
elif StartProbeIndex>=EndProbeIndex:
elif StartProbeIndex>=EndProbeIndex:
return lists_of_zerofills
return lists_of_zerofills
else:
else:
pass
pass
stChr=probes[StartProbeIndex][0]
stChr=probes[StartProbeIndex][0]
stBase=probes[StartProbeIndex][1]
stBase=probes[StartProbeIndex][1]
endChr=probes[EndProbeIndex][0]
endChr=probes[EndProbeIndex][0]
endBase=probes[EndProbeIndex][1]
endBase=probes[EndProbeIndex][1]
while stChr<=endChr:
while stChr<=endChr:
if stChr==endChr:
if stChr==endChr:
if stBase<endBase:
if stBase<endBase:
n=0
n=0
lists_of_zerofills.append([stChr,stBase,endBase,0.0,0.0,0.0,0.0])
lists_of_zerofills.append([stChr,stBase,endBase,0.0,0.0,0.0,0.0])
else:
else:
break
break
else:
else:
endMark=LastMarkersonChromosomes[stChr]
endMark=LastMarkersonChromosomes[stChr]
lists_of_zerofills.append([stChr,stBase,endMark,0.0,0.0,0.0,0.0])
lists_of_zerofills.append([stChr,stBase,endMark,0.0,0.0,0.0,0.0])
#move on to next chromosome
#move on to next chromosome
stChr+=1
stChr+=1
if stChr>24:
if stChr>24:
break
break
else:
else:
stBase=FirstMarkersonChromosomes[stChr]
stBase=FirstMarkersonChromosomes[stChr]
return lists_of_zerofills
return lists_of_zerofills
def getCytobands(chromosome,startn,endn):
def getCytobands(chromosome,startn,endn):
#note: chromosome is 1-22 and X or Y
#note: chromosome is 1-22 and X or Y
try:
try:
## cnx = mysql.connector.connect(user='genome', host='genome-mysql.cse.ucsc.edu',
## cnx = mysql.connector.connect(user='genome', host='genome-mysql.cse.ucsc.edu',
## database='hg19')
## database='hg19')
cnx = mysql.connector.connect(user='root', password='ssqll', host='127.0.0.1',port='3306',
cnx = mysql.connector.connect(user='root', password='ssqll', host='127.0.0.1',port='3306',
database='ucsc_hg18')
database='ucsc_hg18')
except mysql.connector.Error as err:
except mysql.connector.Error as err:
print("Error occured during obtaining cytoband information from local database.")
print("Error occured during obtaining cytoband information from local database.")
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("Something is wrong with your user name or password")
print("Something is wrong with your user name or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database does not exists")
print("Database does not exists")
else:
else:
print(err)
print(err)
cursor = cnx.cursor()
cursor = cnx.cursor()
query = ("SELECT name"
query = ("SELECT name"
" FROM cytoBand "
" FROM cytoBand "
" WHERE chrom=\"chr"+str(chromosome)+"\" "
" WHERE chrom=\"chr"+str(chromosome)+"\" "
" and ((chromStart<="+str(endn)+" AND chromEndStart>="+str(startn)+")"
" and ((chromStart<="+str(endn)+" AND chromEndStart>="+str(startn)+")"
" ORDER BY name")
" ORDER BY name")
#print(query)
#print(query)
cursor.execute(query)
cursor.execute(query)
cytobands=''
cytobands=''
for bandname in cursor:
for bandname in cursor:
cytobands=cytobands+''+bandname
cytobands=cytobands+''+bandname
cursor.close()
cursor.close()
cnx.close()
cnx.close()
return cytobands
return cytobands
def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap):
def getmatrix(region,SampleSegments,ControlSegments,codemap,arraymap):
#region is a peak, as a list, in order: chromosome, start, end
#region is a peak, as a list, in order: chromosome, start, end
peak=region
peak=region
#first initiate codemap to 0
#first initiate codemap to 0
for x in codemap.keys():
for x in codemap.keys():
codemap[x]=[0,0]
codemap[x]=[0,0]
for Array in SampleSegments:
for Array in SampleSegments:
cumulativelength=0
cumulativelength=0
for row in Array[1]:
for row in Array[1]:
if row[1]==peak[0]:#same chromosome
if row[1]==peak[0]:#same chromosome
segA=[peak[1],peak[2]]
segA=[peak[1],peak[2]]
segB=[row[2],row[3]]
segB=[row[2],row[3]]
re=RelationshipOf(segA,segB)
re=RelationshipOf(segA,segB)
if re=="no relationship":#do nothing "no relationship"
if re=="no relationship":#do nothing "no relationship"
#print("goes here")
#print("goes here")
pass
pass
elif re=="in" or re=="equal":
elif re=="in" or re=="equal":
codemap[arraymap[Array[0]]][0]=1
codemap[arraymap[Array[0]]][0]=1
break #this sample has been counted, so no need to check other segments in this sample
break #this sample has been counted, so no need to check other segments in this sample
elif re=="cover":
elif re=="cover":
sa=segA[1]-segA[0]
sa=segA[1]-segA[0]
sb=segB[1]-segB[0]
sb=segB[1]-segB[0]
if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"
if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"
codemap[arraymap[Array[0]]][0]=1
codemap[arraymap[Array[0]]][0]=1
break #this sample has been counted, so no need to check other segments in this sample
break #this sample has been counted, so no need to check other segments in this sample
else:
else:
cumulativelength+=sb
cumulativelength+=sb
#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
elif re=="overlap":
elif re=="overlap":
sa=segA[1]-segA[0]
sa=segA[1]-segA[0]
sb=segB[1]-segB[0]
sb=segB[1]-segB[0]
if segA[0]<segB[0]: #left overlap
if segA[0]<segB[0]: #left overlap
sb_overlap=segA[1]-segB[0]
sb_overlap=segA[1]-segB[0]
sb_outside=segB[1]-segA[1]
sb_outside=segB[1]-segA[1]
else: #right overlap
else: #right overlap
sb_overlap=segB[1]-segA[0]
sb_overlap=segB[1]-segA[0]
sb_outside=segA[0]-segA[0]
sb_outside=segA[0]-segA[0]
#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
codemap[arraymap[Array[0]]][0]=1
codemap[arraymap[Array[0]]][0]=1
break #this sample has been counted, so no need to check other segments in this sample
break #this sample has been counted, so no need to check other segments in this sample
else:
else:
cumulativelength+=sb_overlap
cumulativelength+=sb_overlap
#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
#print("\nSample not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
else:
else:
print("Strange relationship: "+re)
print("Strange relationship: "+re)
exit(436)
exit(436)
for Array in ControlSegments:
for Array in ControlSegments:
cumulativelength=0
cumulativelength=0
for row in Array[1]:
for row in Array[1]:
if row[1]==peak[0]:#same chromosome
if row[1]==peak[0]:#same chromosome
segA=[peak[1],peak[2]]
segA=[peak[1],peak[2]]
segB=[row[2],row[3]]
segB=[row[2],row[3]]
re=RelationshipOf(segA,segB)
re=RelationshipOf(segA,segB)
if re=="no relationship":#do nothing
if re=="no relationship":#do nothing
pass
pass
elif re=="in" or re=="equal":
elif re=="in" or re=="equal":
codemap[arraymap[Array[0]]][1]=1
codemap[arraymap[Array[0]]][1]=1
break #this Control has been counted, so no need to check other segments in this Control
break #this Control has been counted, so no need to check other segments in this Control
elif re=="cover":
elif re=="cover":
sa=segA[1]-segA[0]
sa=segA[1]-segA[0]
sb=segB[1]-segB[0]
sb=segB[1]-segB[0]
if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"
if 10*sb+cumulativelength>sa: #set the limit how small segB can be compared to segA in "cover"
codemap[arraymap[Array[0]]][1]=1
codemap[arraymap[Array[0]]][1]=1
break #this Control has been counted, so no need to check other segments in this Control
break #this Control has been counted, so no need to check other segments in this Control
else:
else:
cumulativelength+=sb
cumulativelength+=sb
#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.))
elif re=="overlap":
elif re=="overlap":
sa=segA[1]-segA[0]
sa=segA[1]-segA[0]
sb=segB[1]-segB[0]
sb=segB[1]-segB[0]
if segA[0]<segB[0]: #left overlap
if segA[0]<segB[0]: #left overlap
sb_overlap=segA[1]-segB[0]
sb_overlap=segA[1]-segB[0]
sb_outside=segB[1]-segA[1]
sb_outside=segB[1]-segA[1]
else: #right overlap
else: #right overlap
sb_overlap=segB[1]-segA[0]
sb_overlap=segB[1]-segA[0]
sb_outside=segA[0]-segA[0]
sb_outside=segA[0]-segA[0]
#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
#if 4*sb_overlap>sa and 3*sb_overlap>sb: #test of substantial overlap
if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
if cumulativelength+10*sb_overlap>sa: #test of substantial overlap
codemap[arraymap[Array[0]]][1]=1
codemap[arraymap[Array[0]]][1]=1
break #this Control has been counted, so no need to check other segments in this Control
break #this Control has been counted, so no need to check other segments in this Control
else:
else:
cumulativelength+=sb_overlap
cumulativelength+=sb_overlap
#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
#print("\nControl not counted. Relationship: "+re+". \nsa: "+str(sa/1000.)+". sb: "+str(sb/1000.)+ ". sb_overlap: "+str(sb_overlap/1000.)+". sb_outside: "+str(sb_outside/1000.))
else:
else:
print("Strange relationship: "+re)
print("Strange relationship: "+re)
exit(436)
exit(436)
#calculate a,b,c,d
#calculate a,b,c,d
a=b=c=d=0
a=b=c=d=0
for x in codemap.keys():
for x in codemap.keys():
if codemap[x][0]==1 and codemap[x][1]==1:# + +
if codemap[x][0]==1 and codemap[x][1]==1:# + +
a+=1
a+=1
elif codemap[x][0]==1 and codemap[x][1]==0:# + -
elif codemap[x][0]==1 and codemap[x][1]==0:# + -
b+=1
b+=1
elif codemap[x][0]==0 and codemap[x][1]==1:# + -
elif codemap[x][0]==0 and codemap[x][1]==1:# + -
c+=1
c+=1
elif codemap[x][0]==0 and codemap[x][1]==0:# + -
elif codemap[x][0]==0 and codemap[x][1]==0:# + -
d+=1
d+=1
return [a,b,c,d]
return [a,b,c,d]
# main execution starts here
# main execution starts here
import os
import os
import os.path
import os.path
import csv
import csv
import shutil
import shutil
import time
import time
import operator
import operator
import copy
import copy
import scipy
import scipy
import math
import math
from scipy import stats
from scipy import stats
import mysql.connector
import mysql.connector
from mysql.connector import errorcode
from mysql.connector import errorcode
startclock=time.clock()
startclock=time.clock()
print ("\n"*5)
print ("\n"*5)
print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
print("The program starts at: "+time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
# read probes
# read probes
print("Now read probes.")
print("Now read probes.")
ifile = open(probefile, "r")
ifile = open(probefile, "r")
reader = csv.reader(ifile,delimiter='\t')
reader = csv.reader(ifile,delimiter='\t')
for row in reader:
for row in reader:
try:
try:
n=0
n=0
if row[1]=='X':
if row[1]=='X':
#print("probes on X found")
#print("probes on X found")
n=23
n=23
elif row[1]=='Y':
elif row[1]=='Y':
n=24
n=24
elif row[1]=='MT':#ignore MT
elif row[1]=='MT':#ignore MT
continue
continue
else:
else:
n=int(row[1])
n=int(row[1])
#note: start size position has been added by 1 as below!!
#note: start size position has been added by 1 as below!!
probes.append([n,int(row[2])])
probes.append([n,int(row[2])])
except:
except:
print(row)
print(row)
quit(963)
quit(963)
ifile.close()
ifile.close()
probes=sorted(probes,key=operator.itemgetter(0,1))
probes=sorted(probes,key=operator.itemgetter(0,1))
#initiate FirstMarkers and LastMarkers on chromosomes
#initiate FirstMarkers and LastMarkers on chromosomes
##FirstMarkersonChromosomes=dict()#First markers of chromosomes
##FirstMarkersonChromosomes=dict()#First markers of chromosomes
##LastMarkersonChromosomes=dict() #Last markers of chromosomes
##LastMarkersonChromosomes=dict() #Last markers of chromosomes
n=0
n=0
for index,p in enumerate(probes):
for index,p in enumerate(probes):
if p[0]!=n: #first occation of a chromosome
if p[0]!=n: #first occation of a chromosome
FirstMarkersonChromosomes[p[0]]=(p[1])
FirstMarkersonChromosomes[p[0]]=(p[1])
if n!=0: #not just start
if n!=0: #not just start
LastMarkersonChromosomes[probes[index-1][0]]=probes[index-1][1]
LastMarkersonChromosomes[probes[index-1][0]]=probes[index-1][1]
n=p[0]
n=p[0]
LastMarkersonChromosomes[probes[-1][0]]=probes[-1][1] #for Y chromosome, which is the last element in the probes list
LastMarkersonChromosomes[probes[-1][0]]=probes[-1][1] #for Y chromosome, which is the last element in the probes list
SegmentsDict=dict()
SegmentsDict=dict()
ScoresDict=dict()
ScoresDict=dict()
sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt'
sampleinformationfile=r'C:\Users\ly091\Documents\FixedOPtimalPairs_LY7.15.txt'
for loopcontrol in range(1,4):
for loopcontrol in range(1,4):
if loopcontrol ==1:
if loopcontrol ==1:
#FamilialTumor-FamilialGermline
#FamilialTumor-FamilialGermline
SampleSize99=ControlSize99=42
SampleSize99=ControlSize99=42
Samples = r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor.Loss.centroless.seg'
Samples = r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor.Gain.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\Loss\FamilialGermline.Loss.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\Gain\FamilialGermline.Gain.centroless.seg'
outputfile=r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor-FamiliarGermline.Loss.mc.gistic'
outputfile=r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor-FamiliarGermline.Gain.mc.gistic'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\FamilialTumor-FamiliarGermline.Loss.mc.statistics.txt'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\FamilialTumor-FamiliarGermline.Gain.mc.statistics.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\FamilialTumor-FamilialGermline.codes.txt'
elif loopcontrol ==2:
elif loopcontrol ==2:
#SporadicTumor - SporadicGermline
#SporadicTumor - SporadicGermline
SampleSize99=ControlSize99=91
SampleSize99=ControlSize99=91
Samples = r'C:\Users\ly091\Documents\result8\Loss\sporadicTumor.Loss.centroless.seg'
Samples = r'C:\Users\ly091\Documents\result8\Gain\sporadicTumor.Gain.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\Loss\sporadicGermline.Loss.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\Gain\sporadicGermline.Gain.centroless.seg'
outputfile=r'C:\Users\ly091\Documents\result8\Loss\SporadicTumor-SporadicGermline.Loss.mc.gistic'
outputfile=r'C:\Users\ly091\Documents\result8\Gain\SporadicTumor-SporadicGermline.Gain.mc.gistic'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\SporadicTumor-SporadicGermline.Loss.mc.statistics.txt'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\SporadicTumor-SporadicGermline.Gain.mc.statistics.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\SporadicTumor-SporadicGermline.codes.txt'
elif loopcontrol ==3:
elif loopcontrol ==3:
#Tumor-Germline
#Tumor-Germline
SampleSize99=ControlSize99=133
SampleSize99=ControlSize99=133
Samples = r'C:\Users\ly091\Documents\result8\Loss\tumor.Loss.centroless.seg'
Samples = r'C:\Users\ly091\Documents\result8\Gain\tumor.Gain.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\Loss\germline.Loss.centroless.seg'
Controls= r'C:\Users\ly091\Documents\result8\Gain\germline.Gain.centroless.seg'
outputfile=r'C:\Users\ly091\Documents\result8\Loss\Tumor-Germline.Loss.mc.gistic'
outputfile=r'C:\Users\ly091\Documents\result8\Gain\Tumor-Germline.Gain.mc.gistic'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\Loss\Tumor-Germline.Loss.mc.statistics.txt'
StatisticsOutput=r'C:\Users\ly091\Documents\result8\Gain\Tumor-Germline.Gain.mc.statistics.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt'
patientcodes=r'C:\Users\ly091\Documents\result8\paired samples\Tumor-Germline.codes.txt'
else:
else:
print("Error in control.")
print("Error in control.")
exit(444)
exit(444)
print("Now build the code map and the array map...")
print("Now build the code map and the array map...")
#build the code map
#build the code map
codemap=dict()#to store the sample information file
codemap=dict()#to store the sample information file
ifile = open(patientcodes, "r")
ifile = open(patientcodes, "r")
reader = csv.reader(ifile,delimiter='\t')
reader = csv.reader(ifile,delimiter='\t')
for row in reader:
for row in reader:
if len(row)==0:
if len(row)==0:
pass
pass
else:
else:
codemap[row[0]]=[0,0]#this is initialized to store the sample and control of this patients have segment in a particular genomic region
codemap[row[0]]=[0,0]#this is initialized to store the sample and control of this patients have segment in a particular genomic region
#0 is no, 1 is yes
#0 is no, 1 is yes
ifile.close()
ifile.close()
#build the array map
#build the array map
arraymap=dict()
arraymap=dict()
ifile = open(sampleinformationfile, "r")
ifile = open(sampleinformationfile, "r")
reader = csv.reader(ifile,delimiter='\t')
reader = csv.reader(ifile,delimiter='\t')
rownum = 0
rownum = 0
for row in reader:
for row in reader:
if rownum == 0:
if rownum == 0:
rownum+=1
rownum+=1
if row[17]!='Sample2':
if row[17]!='Sample2':
print("File error!")
print("File error!")
quit(883)
quit(883)
else:
else:
sample2=row[17]
sample2=row[17]
if sample2 in codemap:
if sample2 in codemap:
arraymap[row[0].split('.')[0]]=sample2
arraymap[row[0].split('.')[0]]=sample2
else:
else:
arraymap[row[0].split('.')[0]]='unused' #this is a tag for arrays that should not be used.
arraymap[row[0].split('.')[0]]='unused' #this is a tag for arrays that should not be used.
ifile.close()
ifile.close()
if Samples in SegmentsDict:
if Samples in SegmentsDict:
print("\n*** Use saved results for "+Samples +" ***\n")
print("\n*** Use saved results for "+Samples +" ***\n")
SampleSegments=copy.deepcopy(SegmentsDict[Samples])
SampleSegments=copy.deepcopy(SegmentsDict[Samples])
SampleScores=copy.deepcopy(ScoresDict[Samples])
SampleScores=copy.deepcopy(ScoresDict[Samples])
else:
else:
#read Samples:
#read Samples:
print("Now read Samples.")
print("Now read Samples.")
SampleSegments=list()
SampleSegments=list()
allsegments_unsorted=list()
allsegments_unsorted=list()
ifile = open(Samples, "r")
ifile = open(Samples, "r")
reader = csv.reader(ifile,delimiter='\t')
reader = csv.reader(ifile,delimiter='\t')
rownum = 0
rownum = 0
ndebug=0
ndebug=0
for row in reader:
for row in reader:
if rownum == 0:
if rownum == 0:
rownum+=1
rownum+=1
else:
else:
chromosome=0
chromosome=0
if row[1]=='X':
if row[1]=='X':
chromosome=23
chromosome=23
elif row[1]=='Y':
elif row[1]=='Y':
chromosome=24
chromosome=24
else:
else:
chromosome=int(row[1])
chromosome=int(row[1])
# if chromosome==4:
# if chromosome==4:
if True:
if True:
if ndebug>Recordstoread:
if ndebug>Recordstoread:
print("Warning: Sample file is not read to the end.")
print("Warning: Sample file is not read to the end.")
break
break
if arraymap[row[0]]!='unused':
if arraymap[row[0]]!='unused':
ndebug+=1
ndebug+=1
allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])])
allsegments_unsorted.append([row[0],chromosome, int(row[2]), int(row[3])])
ifile.close()
ifile.close()
allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3))
allsegments=sorted(allsegments_unsorted,key=operator.itemgetter(0,1,2,3))
array=''
array=''
for row in allsegments:
for row in allsegments:
if row[0]!=array:
if row[0]!=array:
array=row[0]
array=row[0]
newarray=[array,[row]]
newarray=[array,[row]]
SampleSegments.append(newarray)#order: array, [array, chromosome, start, end]
SampleSegments.append(newarray)#order: array, [array, chromosome, start, end]
else:
else:
SampleSegments[-1][1].append(row)
SampleSegments[-1][1].append(row)
SampleSegmentsBackup=copy.deepcopy(SampleSegments)#order: array, [array, chromosome, start, end], it is sorted
SampleSegmentsBackup=copy.deepcopy(SampleSegments)#order: array, [array, chromosome, start, end], it is sorted
cp=0
cp=0
SliceA=[]
SliceA=[]
SliceB=[]
SliceB=[]
pt=list() #list of pointers to the currently working segment in SampleSegments,
pt=list() #list of pointers to the currently working segment in SampleSegments,
#Calculate G-scores for Samples
#Calculate G-scores for Samples
SampleScores=list() #chromosome, start, end, p-value, G-score
SampleScores=list() #chromosome, start, end, p-value, G-score
#remember this is for LOH
#remember this is for LOH
print("Now Calculate G-Scores for Samples.")
print("Now Calculate G-Scores for Samples.")
#print(len(SampleSegments))
#print(len(SampleSegments))
for row in SampleSegments:
for row in SampleSegments:
pt.append(0)#initiate the pointers to 0
pt.append(0)#initiate the pointers to 0
#Find the slice
#Find the slice
while True:
while True:
#pre-set SliceA
#pre-set SliceA
#print("main while")
#print("main while")
empty=True
empty=True
for index, row in enumerate(SampleSegments):
for index, row in enumerate(SampleSegments):
if pt[index]<len(row[1]):
if pt[index]<len(row[1]):
## print(row[1][pt[index]])
## print(row[1][pt[index]])
lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
if lengthofsegment<2000000:#ignore very long segments:
if lengthofsegment<2000000:#ignore very long segments:
SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array
SliceA=row[1][pt[index]][:] #order: array, chromosome, start, end, just ignore array
empty=False
empty=False
break
break
#print(empty)
#print(empty)
if empty:
if empty:
break
break
## print("\nBefore refinement,SliceA"+str(SliceA))
## print("\nBefore refinement,SliceA"+str(SliceA))
#refine SliceA
#refine SliceA
for index,row in enumerate(SampleSegments):
for index,row in enumerate(SampleSegments):
if pt[index]<len(row[1]):
if pt[index]<len(row[1]):
lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
lengthofsegment=row[1][pt[index]][3]-row[1][pt[index]][2]
## print(lengthofsegment)
## print(lengthofsegment)
if lengthofsegment>=2000000:#ignore very long segments
if lengthofsegment>=2000000:#ignore very long segments
# pt[index]+=1
# pt[index]+=1
continue
continue
SliceB=row[1][pt[index]][:]
SliceB=row[1][pt[index]][:]
## print("\nIn refinement,SliceA:"+str(SliceA))
## print("\nIn refinement,SliceA:"+str(SliceA))
## print("In refinement, SliceB:"+str(SliceB))
## print("In refinement, SliceB:"+str(SliceB))
if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
pass
pass
elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
SliceA=SliceB[:]
SliceA=SliceB[:]
pass
pass
else: #same chromosome
else: #same chromosome
Chrom=SliceA[1]
Chrom=SliceA[1]
segA=SliceA[2:4]
segA=SliceA[2:4]
segB=SliceB[2:4]
segB=SliceB[2:4]
re=RelationshipOf(segA,segB)
re=RelationshipOf(segA,segB)
if re=='equal':
if re=='equal':
pass
pass
elif re=='no relationship':
elif re=='no relationship':
if segA[0]<segB[0]: #A is before B
if segA[0]<segB[0]: #A is before B
pass
pass
else: #A is after B
else: #A is after B
SliceA=SliceB[:]
SliceA=SliceB[:]
elif re=='cover':
elif re=='cover':
if SliceA[2]<SliceB[2]:
if SliceA[2]<SliceB[2]:
SliceA[3]=SliceB[2]
SliceA[3]=SliceB[2]
elif SliceA[2]==SliceB[2]:
elif SliceA[2]==SliceB[2]:
SliceA=SliceB[:]
SliceA=SliceB[:]
else:
else:
print("This is impossible!")
print("This is impossible!")
exit(388)
exit(388)
elif re=='in':
elif re=='in':
if SliceA[2]>SliceB[2]:
if SliceA[2]>SliceB[2]:
SliceA[2]=segB[0]
SliceA[2]=segB[0]
SliceA[3]=segA[0]
SliceA[3]=segA[0]
elif SliceA[2]==SliceB[2]:
elif SliceA[2]==SliceB[2]:
pass
pass
else:
else:
print("This is impossible!")
print("This is impossible!")
exit(554)
exit(554)
elif re=='overlap':
elif re=='overlap':
if SliceA[2]<SliceB[2]: #A before B
if SliceA[2]<SliceB[2]: #A before B
SliceA[3]=SliceB[2]
SliceA[3]=SliceB[2]
else: #A after B
else: #A after B
SliceA[3]=SliceA[2]
SliceA[3]=SliceA[2]
SliceA[2]=SliceB[2]
SliceA[2]=SliceB[2]
else:
else:
print(re)
print(re)
print("This is impossible!")
print("This is impossible!")
exit(323)
exit(323)
#use SliceA to go through all segments
#use SliceA to go through all segments
if cp%EchoPerLines==0:
if cp%EchoPerLines==0:
## if True:
## if True:
print(str(cp)+": Now working on Samples. SliceA after refinery: "+str(SliceA))
print(str(cp)+": Now working on Samples. SliceA after refinery: "+str(SliceA))
cp+=1
cp+=1
SumSample=0
SumSample=0
for index, row in enumerate(SampleSegments):
for index, row in enumerate(SampleSegments):
if pt[index]<len(row[1]):
if pt[index]<len(row[1]):
SliceB=row[1][pt[index]]#segment to be tested
SliceB=row[1][pt[index]]#segment to be tested
if SliceB[2]==SliceB[3]:#remove 0 length segments from the que.
if SliceB[2]==SliceB[3]:#remove 0 length segments from the que.
pt[index]+=1
pt[index]+=1
continue
continue
## print("\nSliceA: "+str(SliceA))
## print("\nSliceA: "+str(SliceA))
## print("SliceB: "+str(SliceB))
## print("SliceB: "+str(SliceB))
if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
if SliceA[1]<SliceB[1]: #A is before B in different chromosomes
## print("A is before B in different chromosomes.")
## print("A is before B in different chromosomes.")
continue
continue
elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
elif SliceA[1]>SliceB[1]:#A is after B in different chromsomes
pt[index]+=1
pt[index]+=1
continue
continue
else: #same chromosome
else: #same chromosome
Chrom=Slice
Chrom=SliceA[1]