Diff
checker
텍스트
텍스트
이미지
문서
Excel
폴더
Legal
Enterprise
데스크톱
요금제
로그인
데스크톱 앱 다운로드
텍스트 비교
두 텍스트 파일의 차이점을 찾아보세요
도구
기록
실시간 편집
변경 없는 행 숨기기
줄바꿈 비활성화
레이아웃
나란히 보기
합쳐 보기
비교 단위
스마트
단어
글자
구문 강조
언어 선택
제외
텍스트 변환
첫 변경으로
수정
Diffchecker Desktop
가장 안전하게 Diffchecker를 사용하는 방법. 데스크톱 앱을 사용하면 비교 데이터가 외부로 전송되지 않습니다!
데스크톱 앱 받기
Untitled diff
생성일
2년 전
비교 결과 만료 없음
초기화
내보내기
공유
설명
386 삭제
행
총
삭제
글자
총
삭제
이 기능을 계속 사용하려면 업그레이드해 주세요
Diff
checker
Pro
요금제 보기
563 행
복사
272 추가
행
총
추가
글자
총
추가
이 기능을 계속 사용하려면 업그레이드해 주세요
Diff
checker
Pro
요금제 보기
464 행
복사
복사
복사됨
복사
복사됨
import re
복사
복사됨
복사
복사됨
def getAnnotateInfoRow
_2(
from .utils_1 import Variant
row,
from .utils_for_marrvel_flatfile import (
genomeRef,
getClinVarUsingMarrvelFlatFile,
clinvarGeneDf,
getHGMDUsingFlatFile,
clinvarAlleleDf,
getAnnotateInfoRow_2,
omimGeneSortedDf,
)
omimAlleleList,
hgmdDf,
moduleList,
def getAnnotateInfoRow
s
_2(
decipherSortedDf,
varDf,
gnomadMetricsGeneSortedDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
):
복사
복사됨
복사
복사됨
# NOTE(JL): This is the old implementation and is no longer used.
# It is kept only for tracing purposes; feel free to remove it.
def f(row):
return getAnnotateInfoRow_2(
row,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
)
annotateInfoDf = varDf.apply(f, axis=1, result_type='expand')
return annotateInfoDf
복사
복사됨
복사
복사됨
# CL 03-14-2023: commented all printing lines
# print('type of row:', type(row))
def getAnnotateInfoRow_3_1(row, genomeRef):
varObj = Variant()
varObj = Variant()
transcriptId = row.Feature
transcriptId = row.Feature
복사
복사됨
복사
복사됨
# s=row.Uploaded_variation.split('_') '1_10204_-/T' 1_1588250_T_A
####row[0]: 21_11039079_C/A
####s: ['21', '11039079', 'C/A']
# print('row[0]:', row[0])
# two ways of input of first column either 1_1588250_T_A OR 21_11039079_C/A, so use the option flag
optFlag = 0
optFlag = 0
if row[0].find("/") != -1:
if row[0].find("/") != -1:
optFlag = 1
optFlag = 1
if optFlag == 0:
if optFlag == 0:
s = row[0].split("_")
s = row[0].split("_")
# print('s:', s)
# print('s:', s)
chrom = s[0]
chrom = s[0]
pos = int(s[1])
pos = int(s[1])
ref = s[2]
ref = s[2]
alt = s[3]
alt = s[3]
elif optFlag == 1:
elif optFlag == 1:
s = row[0].split("_")
s = row[0].split("_")
# print('s:', s)
# print('s:', s)
chrom = s[0]
chrom = s[0]
pos = int(s[1])
pos = int(s[1])
s = s[2].split("/")
s = s[2].split("/")
ref = s[0]
ref = s[0]
alt = s[1]
alt = s[1]
# get the start and stop from second column like '1:10203-10204'
# get the start and stop from second column like '1:10203-10204'
if "-" in row[1]:
if "-" in row[1]:
s = row[1].split(":")
s = row[1].split(":")
tmp = s[1]
tmp = s[1]
s = tmp.split("-")
s = tmp.split("-")
# print('s:',s)
# print('s:',s)
start = int(s[0])
start = int(s[0])
stop = int(s[1])
stop = int(s[1])
else:
else:
# start and stop the same
# start and stop the same
s = row[1].split(":")
s = row[1].split(":")
start = int(s[1])
start = int(s[1])
stop = int(s[1])
stop = int(s[1])
# print('chrom:', chrom,'pos:',pos,'ref:',ref,'alt:',alt,'start:',start,'stop:',stop)
# print('chrom:', chrom,'pos:',pos,'ref:',ref,'alt:',alt,'start:',start,'stop:',stop)
# change chrom X and Y and MT to numbers
# change chrom X and Y and MT to numbers
if chrom == "X":
if chrom == "X":
chrom = 23
chrom = 23
elif chrom == "Y":
elif chrom == "Y":
chrom = 24
chrom = 24
elif chrom == "MT":
elif chrom == "MT":
chrom = 25
chrom = 25
elif re.search(r"GL", chrom):
elif re.search(r"GL", chrom):
chrom = 26
chrom = 26
chrom = int(chrom)
chrom = int(chrom)
# if it is hg38 get its hg19 coordinates
# if it is hg38 get its hg19 coordinates
# CL 03-14-2023: we have separate database for hg19 and hg38,
# CL 03-14-2023: we have separate database for hg19 and hg38,
# we don't need to use LiftOver which is inaccurate
# we don't need to use LiftOver which is inaccurate
# related codes commented and modified
# related codes commented and modified
if genomeRef == "hg38":
if genomeRef == "hg38":
varObj.hg38Chrom = chrom
varObj.hg38Chrom = chrom
varObj.hg38Pos = pos
varObj.hg38Pos = pos
varObj.chrom = chrom
varObj.chrom = chrom
varObj.pos = pos
varObj.pos = pos
varObj.start = start
varObj.start = start
varObj.stop = stop
varObj.stop = stop
복사
복사됨
복사
복사됨
"""
retList=gethg19LocFromHg38(chrom, pos)#called from the utils_1.py
# retList=[newChrom, newPos]
varObj.hg19Chrom=retList[0]
varObj.hg19Pos=retList[1]
varObj.chrom=retList[0]
varObj.pos=retList[1]
#get the start
retList=gethg19LocFromHg38(chrom, start)
varObj.start=int(retList[1])
#get the stop
retList=gethg19LocFromHg38(chrom, stop)
varObj.stop=int(retList[1])
"""
else:
else:
varObj.hg19Chrom = chrom
varObj.hg19Chrom = chrom
varObj.hg19Pos = pos
varObj.hg19Pos = pos
varObj.chrom = chrom
varObj.chrom = chrom
varObj.pos = pos
varObj.pos = pos
varObj.start = start
varObj.start = start
varObj.stop = stop
varObj.stop = stop
geneSymbol = row.SYMBOL
geneSymbol = row.SYMBOL
# print('gene:', geneSymbol)
# print('gene:', geneSymbol)
varObj.geneSymbol = geneSymbol
varObj.geneSymbol = geneSymbol
varObj.CADD_phred = row.CADD_phred
varObj.CADD_phred = row.CADD_phred
varObj.CADD_PHRED = row.CADD_PHRED
varObj.CADD_PHRED = row.CADD_PHRED
# assign
# assign
varObj.ref = ref
varObj.ref = ref
varObj.alt = alt
varObj.alt = alt
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
# print('varId dash:', varObj.varId_dash)
# print('varId dash:', varObj.varId_dash)
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varObj.varId = varId
varObj.varId = varId
if "ZYG" in row:
if "ZYG" in row:
varObj.zyg = row.ZYG
varObj.zyg = row.ZYG
varObj.geneEnsId = row.Gene
varObj.geneEnsId = row.Gene
varObj.rsId = row.Existing_variation
varObj.rsId = row.Existing_variation
varObj.GERPpp_RS = row.GERPpp_RS
varObj.GERPpp_RS = row.GERPpp_RS
varObj.featureType = row.Feature_type
varObj.featureType = row.Feature_type
varObj.gnomadAF = row.gnomAD_AF
varObj.gnomadAF = row.gnomAD_AF
varObj.gnomadAFg = row.gnomADg_AF
varObj.gnomadAFg = row.gnomADg_AF
varObj.CLIN_SIG = row.CLIN_SIG # CL: useless but kept for now
varObj.CLIN_SIG = row.CLIN_SIG # CL: useless but kept for now
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_score = row.LRT_score
varObj.LRT_score = row.LRT_score
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.IMPACT = row.IMPACT
varObj.IMPACT = row.IMPACT
varObj.Consequence = row.Consequence
varObj.Consequence = row.Consequence
varObj.HGVSc = row.HGVSc
varObj.HGVSc = row.HGVSc
varObj.HGVSp = row.HGVSp
varObj.HGVSp = row.HGVSp
# dbnsfp attributes
# dbnsfp attributes
varObj.GERPpp_NR = row.GERPpp_NR
varObj.GERPpp_NR = row.GERPpp_NR
varObj.DANN_score = row.DANN_score
varObj.DANN_score = row.DANN_score
varObj.FATHMM_pred = row.FATHMM_pred
varObj.FATHMM_pred = row.FATHMM_pred
varObj.FATHMM_score = row.FATHMM_score
varObj.FATHMM_score = row.FATHMM_score
varObj.GTEx_V8_gene = row.GTEx_V8_gene
varObj.GTEx_V8_gene = row.GTEx_V8_gene
varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
varObj.REVEL_score = row.REVEL_score
varObj.REVEL_score = row.REVEL_score
varObj.SIFT_score = row.SIFT_score
varObj.SIFT_score = row.SIFT_score
varObj.clinvar_AlleleID = row.clinvar # Clinvar allele ID from clinvar.vcf.gz
varObj.clinvar_AlleleID = row.clinvar # Clinvar allele ID from clinvar.vcf.gz
varObj.clinvar_clnsig = (
varObj.clinvar_clnsig = (
row.clinvar_CLNSIG
row.clinvar_CLNSIG
) # CL: Clinvar SIG from clinvar.vcf.gz
) # CL: Clinvar SIG from clinvar.vcf.gz
# varObj.clinvar_clnsig = row.clinvar_clnsig #CL: Clinvar SIG from VEP, deleted
# varObj.clinvar_clnsig = row.clinvar_clnsig #CL: Clinvar SIG from VEP, deleted
varObj.clinvar_CLNREVSTAT = (
varObj.clinvar_CLNREVSTAT = (
row.clinvar_CLNREVSTAT
row.clinvar_CLNREVSTAT
) # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
) # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
varObj.clinvar_CLNSIGCONF = (
varObj.clinvar_CLNSIGCONF = (
row.clinvar_CLNSIGCONF
row.clinvar_CLNSIGCONF
) # CL: Clinvar SIGCONF from clinvar.vcf.gz
) # CL: Clinvar SIGCONF from clinvar.vcf.gz
varObj.clin_code = row.clinvar_CLNSIG # CL: feature name for ai
varObj.clin_code = row.clinvar_CLNSIG # CL: feature name for ai
varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
varObj.LRT_score = row.LRT_score
varObj.LRT_score = row.LRT_score
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_Omega = row.LRT_Omega
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.M_CAP_score = row.M_CAP_score
varObj.M_CAP_score = row.M_CAP_score
varObj.MutationAssessor_score = row.MutationAssessor_score
varObj.MutationAssessor_score = row.MutationAssessor_score
varObj.MutationTaster_score = row.MutationTaster_score
varObj.MutationTaster_score = row.MutationTaster_score
varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
varObj.VARIANT_CLASS = row.VARIANT_CLASS
varObj.VARIANT_CLASS = row.VARIANT_CLASS
varObj.Feature = row.Feature
varObj.Feature = row.Feature
varObj.hom = row.gnomADg_controls_nhomalt
varObj.hom = row.gnomADg_controls_nhomalt
varObj.hgmd_id = row.hgmd # CL added
varObj.hgmd_id = row.hgmd # CL added
varObj.hgmd_symbol = row.hgmd_GENE # CL added
varObj.hgmd_symbol = row.hgmd_GENE # CL added
varObj.hgmd_rs = row.hgmd_RANKSCORE
varObj.hgmd_rs = row.hgmd_RANKSCORE
varObj.hgmd_PHEN = row.hgmd_PHEN # CL added
varObj.hgmd_PHEN = row.hgmd_PHEN # CL added
varObj.hgmd_CLASS = row.hgmd_CLASS # CL added
varObj.hgmd_CLASS = row.hgmd_CLASS # CL added
if row.clinvar_CLNSIGCONF != "-":
if row.clinvar_CLNSIGCONF != "-":
clin_dict = dict()
clin_dict = dict()
for ro in row.clinvar_CLNSIGCONF.split("|_"):
for ro in row.clinvar_CLNSIGCONF.split("|_"):
temp = ro.split("(")
temp = ro.split("(")
clin_dict[temp[0]] = int(temp[1][0])
clin_dict[temp[0]] = int(temp[1][0])
PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
"Likely_pathogenic", 0
"Likely_pathogenic", 0
)
)
varObj.clin_dict = clin_dict
varObj.clin_dict = clin_dict
varObj.clin_PLP = PLP_sum
varObj.clin_PLP = PLP_sum
varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
else:
else:
if "benign" in row.clinvar_clnsig.lower():
if "benign" in row.clinvar_clnsig.lower():
varObj.clin_PLP_perc = 0
varObj.clin_PLP_perc = 0
elif "pathogenic" in row.clinvar_clnsig.lower():
elif "pathogenic" in row.clinvar_clnsig.lower():
varObj.clin_PLP_perc = 1
varObj.clin_PLP_perc = 1
else:
else:
varObj.clin_PLP_perc = "-"
varObj.clin_PLP_perc = "-"
varObj.clin_PLP = "-"
varObj.clin_PLP = "-"
varObj.clin_dict = "-"
varObj.clin_dict = "-"
if row.SpliceAI_pred != "-":
if row.SpliceAI_pred != "-":
varObj.spliceAI = row.SpliceAI_pred
varObj.spliceAI = row.SpliceAI_pred
temp = row.SpliceAI_pred.split("|")
temp = row.SpliceAI_pred.split("|")
varObj.spliceAImax = max(
varObj.spliceAImax = max(
float(temp[1]), float(temp[2]), float(temp[3]), float(temp[4])
float(temp[1]), float(temp[2]), float(temp[3]), float(temp[4])
)
)
else:
else:
varObj.spliceAI = "-"
varObj.spliceAI = "-"
varObj.spliceAImax = "-"
varObj.spliceAImax = "-"
복사
복사됨
복사
복사됨
if "conserve" in moduleList:
return vars(varObj)
# # get dgv: 1.3s
# # print('\nGetting DGV')
# dgvDictList = []
# typeList = []
# subtypeList = []
# dgvVarFound = 0
# dgvType = "-"
# dgvSubtype = "-"
# chromVal = int(varObj.chrom)
# posVal = int(varObj.pos)
# startVal = int(varObj.start)
# stopVal = int(varObj.stop)
# # CL 03-14-2023: changed column names to be compatible with hg38
# # vals=dgvDf[ ( dgvDf['hg19Chr'] == chromVal ) & ( dgvDf['hg19Start']<=startVal ) & (dgvDf['hg19Stop']>=stopVal) ]
# vals = dgvSortedDf.loc[chromVal].loc[:(startVal+1)].loc[:stopVal]
# numRows = len(vals.index)
# if numRows > 0:
# dgvVarFound = 1
# # print('\tnumrows:',numRows)
# # print('\t type of vals:', type(vals))
# # print('\tvals:', vals)
# dgvType = vals.iloc[0]["type"]
# dgvSubtype = vals.iloc[0]["subType"]
# # print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
# # print('\tdgvVarFound:',dgvVarFound,'dgvType:', dgvType, 'dgvsubtype:', dgvSubtype)
# typeList.append(dgvType)
# subtypeList.append(dgvSubtype)
# retList = [dgvDictList, typeList, subtypeList, dgvVarFound]
복사
복사됨
복사
복사됨
# varObj.dgvDictList = retList[0]
# varObj.dgvTypeList = retList[1]
# varObj.dgvSubtypeList = retList[2]
# varObj.dgvVarFound = retList[3]
복사
복사됨
복사
복사됨
# get decipher: 0.6s
def getAnnotateInfoRow_3_2(
decipherDictList = []
varObj,
decipherDeletionObsList = []
decipherSortedDf,
decipherStudyList = []
):
decipherVarFound = 0
# get decipher: 0.6s
deletionObs = "-"
decipherDictList = []
# get the variant object info from varObj
decipherDeletionObsList = []
chromVal = int(varObj.chrom)
decipherStudyList = []
posVal = int(varObj.pos)
decipherVarFound = 0
startVal = int(varObj.start)
deletionObs = "-"
stopVal = int(varObj.stop)
# get the variant object info from varObj
chromVal = int(varObj.chrom)
# CL 03-14-2023: changed column names to be compatible with hg38
posVal = int(varObj.pos)
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
startVal = int(varObj.start)
if (chromVal, startVal, stopVal) in decipherSortedDf:
stopVal = int(varObj.stop)
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
복사
복사됨
복사
복사됨
decipherVarFound = 1
# CL 03-14-2023: changed column names to be compatible with hg38
deletionObs = vals.iloc[0]["deletion.obs"]
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
decipherDeletionObsList.append(deletionObs)
if (chromVal, startVal, stopVal) in decipherSortedDf:
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
복사
복사됨
복사
복사됨
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
decipherVarFound = 1
# print('\tdecipherVarFound:',decipherVarFound,'decipherDeletionObs:', deletionObs)
deletionObs = vals.iloc[0]["deletion.obs"]
retList = [
decipherDeletionObsList
.append(deletionObs)
decipherDictList,
decipherDeletionObsList
,
decipherStudyList,
decipherVarFound,
]
복사
복사됨
복사
복사됨
#
[decipherDictList,
decipherDeletionObs
List,decipherStudyList, decipherVarFound]
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
varObj.
decipherDictList
= retList[0]
#
print('\tdecipherVarFound:',decipherVarFound,'
decipherDeletionObs
:', deletionObs)
varObj.
decipherDeletionObsList
= retList[1]
retList = [
varObj.decipherStudyList = retList[2]
decipherDictList
,
varObj.decipherVarFound = retList[3]
decipherDeletionObsList
,
decipherStudyList,
# get gnomad gene metrics from gnomad file: 3.1s
decipherVarFound,
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
]
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
복사
복사됨
복사
복사됨
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
varObj.gnomadGeneZscore =
retList[0]
return {
varObj.gnomadGenePLI =
retList[1]
"decipherDictList":
retList[0]
,
varObj.gnomadGeneOELof =
retList[2]
# O/E lof
"decipherDeletionObsList":
retList[1]
,
varObj.gnomadGeneOELofUpper =
retList[3]
# O/E lof upper
"decipherStudyList":
retList[2]
,
"decipherVarFound":
retList[3]
,
}
복사
복사됨
복사
복사됨
if "curate" in moduleList:
# get
OMIM: 2s
def getAnnotateInfoRow_3_3(
# print('\nGetting OMIM')
varObj,
# varObj.omimList=jsonDict['omim']
gnomadMetricsGeneSortedDf,
# retList=[varFound, geneFound, omimDict,
omimGeneDict
,
omimAlleleDict
]
):
inputSnpList
= []
# get gnomad gene metrics from gnomad file: 3.1s
if "," in
varObj.
rsId
:
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
inputSnpList
=
varObj.
rsId.split(",")
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get
the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
return {
"gnomadGeneZscore": retList[0],
"gnomadGenePLI": retList[1],
"gnomadGeneOELof": retList[2], # O/E lof
"gnomadGeneOELofUpper": retList[3], # O/E lof upper
}
def getAnnotateInfoRow_3_4(
varObj,
omimGeneSortedDf,
):
# get OMIM: 2s
inputSnpList = []
if "," in varObj.rsId:
inputSnpList = varObj.rsId.split(",")
else:
inputSnpList = varObj.rsId
varFound = 0
geneFound = 0
omimDict = {}
omimGeneDict
= {}
omimAlleleDict
= {}
phenoList = []
phenoInhList = []
phenoMimList
= []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if
varObj.
geneSymbol in omimGeneSortedDf.index
:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict
=
omimGeneSortedDf.loc[
varObj.
geneSymbol]
snpList = []
for a in omimGeneDict["allelicVariants"]:
if "dbSnps" in a:
snpList.append(a["dbSnps"])
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
복사
복사됨
복사
복사됨
inputSnpList = varObj.rsId
varFound = 0
# print('\tinputSnpList:', inputSnpList)
varFound = 0
# get disease info from OMIM
geneFound = 0
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
omimDict = {}
for a in omimGeneDict["
phenotypes
"]:
omimGeneDict = {}
# print('type:', type(a))
omimAlleleDict = {}
pheno = a["phenotype"]
phenoList = []
if "
phenotypeMimNumber
" in a:
phenoInhList = []
phenoMim = a["phenotypeMimNumber"]
phenoMimList = []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if varObj.geneSymbol in omimGeneSortedDf.index:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
snpList = []
for a in omimGeneDict["
allelicVariants
"]:
# print('a:', a)
# print('type:', type(a))
if "
dbSnps
" in a:
snpList.append(a["dbSnps"])
# print('\tsnpList:', snpList)
# print('\tlen snpList:', len(snpList))
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
복사
복사됨
복사
복사됨
varFound = 0
phenoMim = "-"
if "phenotypeInheritance" in a:
# get disease info from OMIM
phenoInh = a["phenotypeInheritance"]
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
else:
for a in omimGeneDict["phenotypes"]:
phenoInh = "-"
# print('type:', type(a))
phenoList.append(pheno)
pheno = a["phenotype"]
phenoInhList.append(phenoInh)
if "phenotypeMimNumber" in a:
phenoMimList.append(str(phenoMim))
phenoMim = a["phenotypeMimNumber"]
# print('phenotype:', pheno,phenoMim,phenoInh)
else:
phenoMim = "-"
if "phenotypeInheritance" in a:
phenoInh = a["phenotypeInheritance"]
else:
phenoInh = "-"
phenoList.append(pheno)
phenoInhList.append(phenoInh)
phenoMimList.append(str(phenoMim))
# print('phenotype:', pheno,phenoMim,phenoInh)
# print('\tvarFound:', varFound)
# print('\tphenoList:', phenoList)
# print('\tphenoInhList:', phenoInhList)
# print('\tphenoMimList:', phenoMimList)
omimRet = [
varFound,
geneFound,
omimDict,
omimGeneDict,
omimAlleleDict,
phenoList,
phenoInhList,
phenoMimList,
]
복사
복사됨
복사
복사됨
varObj.omimVarFound = omimRet[0]
omimRet = [
varObj.omimG
eneFound
= omimRet[1]
varFound,
varObj.
omimDict
= omimRet[2]
g
eneFound
,
varObj.
omimGeneDict
= omimRet[3]
omimDict
,
varObj.
omimAlleleDict
= omimRet[4]
omimGeneDict
,
varObj.
phenoList
= omimRet[5]
omimAlleleDict
,
varObj.
phenoInhList
= omimRet[6]
phenoList
,
varObj.
phenoMimList
= omimRet[7]
phenoInhList
,
# print('OMIM res:')
phenoMimList
,
# print('\tgeneFound:',varObj.omimGeneFound,'varFound:',varObj.omimVarFound )
]
복사
복사됨
복사
복사됨
# get clinvar: 0.1s
return {
# print('\nReading clinVar')
"omimVarFound": omimRet[0],
clinVarRet = getClinVarUsingMarrvelFlatFile(
"omimGeneFound": omimRet[1],
varObj, clinvarAlleleDf, clinvarGeneDf
"omimDict": omimRet[2],
"omimGeneDict": omimRet[3],
"omimAlleleDict": omimRet[4],
"phenoList": omimRet[5],
"phenoInhList": omimRet[6],
"phenoMimList": omimRet[7],
}
def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Build the ClinVar annotation fields for a single variant.

    Calls ``getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf,
    clinvarGeneDf)`` and maps the first twelve positions of its result onto
    named keys.

    Position 10 of the helper's result is overwritten in place with
    ``varObj.clinvar_clnsig`` before the mapping is built, so
    ``"clinvarSignDesc"`` carries that value rather than the one the helper
    produced.

    Returns:
        dict: the twelve ClinVar fields keyed by name, in positional order.
    """
    # Key names, in the same order as the positions of the helper's result.
    fieldNames = (
        "clinVarVarFound",
        "clinVarVarDict",
        "clinVarGeneFound",
        "clinVarGeneDict",
        "clinvarTotalNumVars",
        "clinvarNumP",
        "clinvarNumLP",
        "clinvarNumLB",
        "clinvarNumB",
        "clinvarTitle",
        "clinvarSignDesc",
        "clinvarCondition",
    )

    lookup = getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf, clinvarGeneDf)

    # CL: the significance description was changed to the clinvar.vcf.gz annotation
    lookup[10] = varObj.clinvar_clnsig

    return {name: lookup[idx] for idx, name in enumerate(fieldNames)}
def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Build the HGMD annotation fields for a single variant.

    Calls ``getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)`` and maps the first
    five positions of its result onto named keys.

    Returns:
        dict: ``hgmdVarFound``, ``hgmdGeneFound``, ``hgmdVarPhenIdList``,
        ``hgmdVarHPOIdList`` and ``hgmdVarHPOStrList``, in that order.
    """
    hgmdFields = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)

    # Key names, in the same order as the positions of the helper's result.
    keys = (
        "hgmdVarFound",
        "hgmdGeneFound",
        "hgmdVarPhenIdList",
        "hgmdVarHPOIdList",
        "hgmdVarHPOStrList",
    )
    return {key: hgmdFields[pos] for pos, key in enumerate(keys)}
def getAnnotateInfoRows_3(
vepDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
def f1(row):
return getAnnotateInfoRow_3_1(row, genomeRef)
def f2(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_2(row, decipherSortedDf)
def f3(row):
if "conserve" not in moduleList:
return row
return getAnnotateInfoRow_3_3(row, gnomadMetricsGeneSortedDf)
def f4(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_4(row, omimGeneSortedDf)
def f5(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_5(row, clinvarGeneDf, clinvarAlleleDf)
def f6(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_6(
row, hgmdHPOScoreDf
)
)
복사
복사됨
복사
복사됨
varObj.clinVarVarFound = clinVarRet[0]
varObj.clinVarVarDict = clinVarRet[1]
varObj.clinVarGeneFound = clinVarRet[2]
varObj.clinVarGeneDict = clinVarRet[3]
varObj.clinvarTotalNumVars = clinVarRet[4]
varObj.clinvarNumP = clinVarRet[5]
varObj.clinvarNumLP = clinVarRet[6]
varObj.clinvarNumLB = clinVarRet[7]
varObj.clinvarNumB = clinVarRet[8]
varObj.clinvarTitle = clinVarRet[9]
varObj.clinvarSignDesc = (
row.clinvar_CLNSIG
) # clinVarRet[10] #CL: changed to clinvar.vcf.gz annotation
varObj.clinvarCondition = clinVarRet[11]
# print('clinVar res:')
"""
if debugFlag==1:
print('\tgeneFound::',varObj.clinVarGeneFound,'varFound:',varObj.clinVarVarFound)
print('\tnumVars:',varObj.clinvarTotalNumVars,'numPathologic:',varObj.clinvarNumP,'numBenign:',varObj.clinvarNumB)
print('\tsignDesc:', varObj.clinvarSignDesc)
"""
# get HGMD: 0.3s
if "curate" in moduleList:
# print('\nReading HGMD')
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
# hgmdVarFound,hgmdGeneFound,hgmdVarPhenIdList,hgmdVarHPOIdList,hgmdVarHPOStrList
varObj.hgmdVarFound = hgmdRet[0]
varObj.hgmdGeneFound = hgmdRet[1]
varObj.hgmdVarPhenIdList = hgmdRet[2]
varObj.hgmdVarHPOIdList = hgmdRet[3]
varObj.hgmdVarHPOStrList = hgmdRet[4]
# print('HGMD results:')
# print('\thgmdVarFound:',varObj.hgmdVarFound,'hgmdGeneFound:',varObj.hgmdGeneFound,
# 'hgmdVarPhenIdList:',varObj.hgmdVarPhenIdList,'hgmdVarHPOIdList:',
# varObj.hgmdVarHPOIdList,
# 'hgmdVarHPOStrList:',varObj.hgmdVarHPOStrList)
return {
"hg19Chrom": varObj.hg19Chrom,
"hg19Pos": varObj.hg19Pos,
"chrom": varObj.chrom,
"pos": varObj.pos,
"start": varObj.start,
"stop": varObj.stop,
"geneSymbol": varObj.geneSymbol,
"CADD_phred": varObj.CADD_phred,
"CADD_PHRED": varObj.CADD_PHRED,
"ref": varObj.ref,
"alt": varObj.alt,
"varId": varObj.varId,
"ZYG": varObj.zyg,
"HGVSc": varObj.HGVSc,
"HGVSp": varObj.HGVSp,
"Gene": varObj.geneEnsId,
"Existing_variation": varObj.rsId,
"GERPpp_RS": varObj.GERPpp_RS,
"Feature_type": varObj.featureType,
"gnomadAF": varObj.gnomadAF,
"gnomadAFg": varObj.gnomadAFg,
"CLIN_SIG": varObj.CLIN_SIG,
"LRT_Omega": varObj.LRT_Omega,
"LRT_score": varObj.LRT_score,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
# dbnsfp attributes
"GERPpp_NR": varObj.GERPpp_NR,
"DANN_score": varObj.DANN_score,
"FATHMM_pred": varObj.FATHMM_pred,
"FATHMM_score": varObj.FATHMM_score,
"GTEx_V8_gene": varObj.GTEx_V8_gene,
"GTEx_V8_tissue": varObj.GTEx_V8_tissue,
"Polyphen2_HDIV_score": varObj.Polyphen2_HDIV_score,
"Polyphen2_HVAR_score": varObj.Polyphen2_HVAR_score,
"REVEL_score": varObj.REVEL_score,
"SIFT_score": varObj.SIFT_score,
"clinvar_AlleleID": varObj.clinvar_AlleleID, # Clinvar allele ID from clinvar.vcf.gz
"clinvar_clnsig": varObj.clinvar_clnsig, # CL: Clinvar SIG from clinvar.vcf.gz
"clinvar_CLNREVSTAT": varObj.clinvar_CLNREVSTAT, # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
"clinvar_CLNSIGCONF": varObj.clinvar_CLNSIGCONF, # CL: Clinvar SIGCONF from clinvar.vcf.gz
"clin_code": varObj.clin_code, # CL: feature for ai
"fathmm_MKL_coding_score": varObj.fathmm_MKL_coding_score,
"LRT_score": varObj.LRT_score,
"LRT_Omega": varObj.LRT_Omega,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
"M_CAP_score": varObj.M_CAP_score,
"MutationAssessor_score": varObj.MutationAssessor_score,
"MutationTaster_score": varObj.MutationTaster_score,
"ESP6500_AA_AC": varObj.ESP6500_AA_AC,
"ESP6500_AA_AF": varObj.ESP6500_AA_AF,
"ESP6500_EA_AC": varObj.ESP6500_EA_AC,
"ESP6500_EA_AF": varObj.ESP6500_EA_AF,
# dbnsfp
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof, # O/E lof
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper, # O/E lof upper,
"IMPACT": varObj.IMPACT,
"Consequence": varObj.Consequence,
"omimVarFound": varObj.omimVarFound,
"omimGeneFound": varObj.omimGeneFound,
"omimDict": varObj.omimDict,
"omimGeneDict": varObj.omimGeneDict,
"omimAlleleDict": varObj.omimAlleleDict,
"phenoList": varObj.phenoList,
"phenoInhList": varObj.phenoInhList,
"phenoMimList": varObj.phenoMimList,
"clinVarVarFound": varObj.clinVarVarFound,
"clinVarVarDict": varObj.clinVarVarDict,
"clinVarGeneFound": varObj.clinVarGeneFound,
"clinVarGeneDict": varObj.clinVarGeneDict,
"clinvarTotalNumVars": varObj.clinvarTotalNumVars,
"clinvarNumP": varObj.clinvarNumP,
"clinvarNumLP": varObj.clinvarNumLP,
"clinvarNumLB": varObj.clinvarNumLB,
"clinvarNumB": varObj.clinvarNumB,
"clinvarTitle": varObj.clinvarTitle,
"clinvarSignDesc": varObj.clinvarSignDesc,
"clinvarCondition": varObj.clinvarCondition,
"hgmdVarFound": varObj.hgmdVarFound,
"hgmdGeneFound": varObj.hgmdGeneFound,
"hgmdVarPhenIdList": varObj.hgmdVarPhenIdList,
"hgmdVarHPOIdList": varObj.hgmdVarHPOIdList,
"hgmdVarHPOStrList": varObj.hgmdVarHPOStrList,
"varId_dash": varObj.varId_dash,
"dgvDictList": varObj.dgvDictList,
"dgvTypeList": varObj.dgvTypeList,
"dgvSubtypeList": varObj.dgvSubtypeList,
"dgvVarFound": varObj.dgvVarFound,
"decipherDictList": varObj.decipherDictList,
"decipherDeletionObsList": varObj.decipherDeletionObsList,
"decipherStudyList": varObj.decipherStudyList,
"decipherVarFound": varObj.decipherVarFound,
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof,
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper,
# symptom
"SymptomMatched": varObj.SymptomMatched,
"symptomScore": varObj.symptomScore,
"symptomName": varObj.symptomName,
"omimSymptomSimScore": varObj.omimSymptomSimScore,
"omimSymMatchFlag": varObj.omimSymMatchFlag,
"hgmdSymptomScore": varObj.hgmdSymptomScore,
"hgmdSymptomSimScore": varObj.hgmdSymptomSimScore,
"hgmdSymMatchFlag": varObj.hgmdSymMatchFlag,
"clinVarSymMatchFlag": varObj.clinVarSymMatchFlag,
"VARIANT_CLASS": varObj.VARIANT_CLASS,
"Feature": varObj.Feature,
"hom": varObj.hom,
"hgmd_rs": varObj.hgmd_rs,
"hgmd_id": varObj.hgmd_id, # CL added
"hgmd_symbol": varObj.hgmd_symbol, # CL added
"hgmd_PHEN": varObj.hgmd_PHEN, # CL added
"hgmd_CLASS": varObj.hgmd_CLASS, # CL added
"clin_dict": varObj.clin_dict,
"clin_PLP": varObj.clin_PLP,
"clin_PLP_perc": varObj.clin_PLP_perc,
"spliceAI": varObj.spliceAI,
"spliceAImax": varObj.spliceAImax,
복사
복사됨
복사
복사됨
"zyg": varObj.zyg,
annotateInfoDf = vepDf.apply(f1, axis=1, result_type='expand')
'geneEnsId': varObj.geneEnsId,
df = annotateInfoDf.apply(f2, axis=1, result_type='expand')
'rsId': varObj.rsId
annotateInfoDf[df.columns] = df
}
df = annotateInfoDf.apply(f3, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f4, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f5, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f6, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
return annotateInfoDf
저장된 비교 결과
원본
파일 열기
def getAnnotateInfoRow_2(
    row,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Build the annotation record for one VEP output row.

    The row is parsed into a ``Variant`` object, optionally enriched with
    DECIPHER / gnomAD gene metrics (``"conserve"`` module) and OMIM / ClinVar /
    HGMD lookups (``"curate"`` module), and finally flattened into a dict.

    Args:
        row: One row of the VEP table. Column 0 is the variant id
            (``1_1588250_T_A`` or ``21_11039079_C/A``), column 1 the location
            (``1:10203-10204`` or ``1:10203``); all other values are read by
            column name.
        genomeRef: ``"hg38"`` stores the coordinates in ``hg38Chrom``/``hg38Pos``;
            any other value stores them in ``hg19Chrom``/``hg19Pos``.
        clinvarGeneDf, clinvarAlleleDf: Passed to
            ``getClinVarUsingMarrvelFlatFile``.
        omimGeneSortedDf: OMIM table looked up by gene symbol through ``.loc``.
        omimAlleleList: Unused; kept for interface compatibility.
        hgmdDf: Passed to ``getHGMDUsingFlatFile``.
        moduleList: Container of enabled module names.
        decipherSortedDf: DECIPHER table looked up by ``(chrom, start, stop)``.
        gnomadMetricsGeneSortedDf: gnomAD gene metrics looked up by gene symbol.

    Returns:
        dict mapping output column names to values.

    NOTE(review): keys belonging to a module that did not run (and the dgv /
    symptom keys, which are never set here) are read straight from ``varObj``;
    this presumably relies on ``Variant()`` providing defaults -- confirm.
    NOTE(review): the source this was recovered from had lost its indentation.
    The gnomAD section is placed under ``"conserve"`` and the ClinVar section
    under ``"curate"`` by inference -- confirm against the real file.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # Column 0 comes in two layouts: '1_1588250_T_A' or '21_11039079_C/A'.
    # .iloc is used because positional Series[int] access is deprecated.
    variantId = row.iloc[0]
    fields = variantId.split("_")
    chrom = fields[0]
    pos = int(fields[1])
    if "/" in variantId:
        alleles = fields[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = fields[2]
        alt = fields[3]

    # Column 1 is '1:10203-10204', or '1:10203' when start == stop.
    location = row.iloc[1]
    span = location.split(":")[1]
    if "-" in location:
        bounds = span.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        start = int(span)
        stop = int(span)

    # Encode non-numeric chromosomes as integers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif "GL" in chrom:
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: separate hg19 and hg38 databases are used, so no LiftOver
    # conversion is done; coordinates are stored as given.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG

    # (Variant attribute, row column) pairs copied verbatim.
    attrColumnPairs = (
        ("geneEnsId", "Gene"),
        ("rsId", "Existing_variation"),
        ("GERPpp_RS", "GERPpp_RS"),
        ("featureType", "Feature_type"),
        ("gnomadAF", "gnomAD_AF"),
        ("gnomadAFg", "gnomADg_AF"),
        ("CLIN_SIG", "CLIN_SIG"),  # CL: useless but kept for now
        ("LRT_Omega", "LRT_Omega"),
        ("LRT_score", "LRT_score"),
        ("phyloP100way_vertebrate", "phyloP100way_vertebrate"),
        ("IMPACT", "IMPACT"),
        ("Consequence", "Consequence"),
        ("HGVSc", "HGVSc"),
        ("HGVSp", "HGVSp"),
        # dbnsfp attributes
        ("GERPpp_NR", "GERPpp_NR"),
        ("DANN_score", "DANN_score"),
        ("FATHMM_pred", "FATHMM_pred"),
        ("FATHMM_score", "FATHMM_score"),
        ("GTEx_V8_gene", "GTEx_V8_gene"),
        ("GTEx_V8_tissue", "GTEx_V8_tissue"),
        ("Polyphen2_HDIV_score", "Polyphen2_HDIV_score"),
        ("Polyphen2_HVAR_score", "Polyphen2_HVAR_score"),
        ("REVEL_score", "REVEL_score"),
        ("SIFT_score", "SIFT_score"),
        ("clinvar_AlleleID", "clinvar"),  # allele ID from clinvar.vcf.gz
        ("clinvar_clnsig", "clinvar_CLNSIG"),  # CL: SIG from clinvar.vcf.gz
        ("clinvar_CLNREVSTAT", "clinvar_CLNREVSTAT"),  # CL: for interface only
        ("clinvar_CLNSIGCONF", "clinvar_CLNSIGCONF"),
        ("clin_code", "clinvar_CLNSIG"),  # CL: feature name for ai
        ("fathmm_MKL_coding_score", "fathmm_MKL_coding_score"),
        ("M_CAP_score", "M_CAP_score"),
        ("MutationAssessor_score", "MutationAssessor_score"),
        ("MutationTaster_score", "MutationTaster_score"),
        ("ESP6500_AA_AC", "ESP6500_AA_AC"),
        ("ESP6500_AA_AF", "ESP6500_AA_AF"),
        ("ESP6500_EA_AC", "ESP6500_EA_AC"),
        ("ESP6500_EA_AF", "ESP6500_EA_AF"),
        ("VARIANT_CLASS", "VARIANT_CLASS"),
        ("Feature", "Feature"),
        ("hom", "gnomADg_controls_nhomalt"),
        ("hgmd_id", "hgmd"),  # CL added
        ("hgmd_symbol", "hgmd_GENE"),  # CL added
        ("hgmd_rs", "hgmd_RANKSCORE"),
        ("hgmd_PHEN", "hgmd_PHEN"),  # CL added
        ("hgmd_CLASS", "hgmd_CLASS"),  # CL added
    )
    for attr, column in attrColumnPairs:
        setattr(varObj, attr, getattr(row, column))

    if row.clinvar_CLNSIGCONF != "-":
        # Entries look like 'Pathogenic(12)'. Parse the whole number up to ')':
        # taking only the first character would turn 12 into 1.
        clin_dict = {}
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            parts = entry.split("(")
            clin_dict[parts[0]] = int(parts[1].split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # NOTE(review): this reads the lowercase `clinvar_clnsig` column while
        # everything else uses `clinvar_CLNSIG` -- confirm which is intended.
        clnsig = row.clinvar_clnsig.lower()
        if "benign" in clnsig:
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in clnsig:
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        spliceFields = row.SpliceAI_pred.split("|")
        # Fields 1-4 are the numeric scores; keep the largest.
        varObj.spliceAImax = max(float(value) for value in spliceFields[1:5])
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    if "conserve" in moduleList:
        # DECIPHER lookup by exact (chrom, start, stop).
        decipherKey = (int(varObj.chrom), int(varObj.start), int(varObj.stop))
        decipherDeletionObsList = []
        decipherVarFound = 0
        # NOTE(review): if decipherSortedDf is a DataFrame, `in` tests column
        # labels, not the row index (the gnomAD/OMIM checks below use `.index`)
        # -- confirm this membership test does what is intended.
        if decipherKey in decipherSortedDf:
            vals = decipherSortedDf.loc[decipherKey]
            decipherVarFound = 1
            decipherDeletionObsList.append(vals.iloc[0]["deletion.obs"])
        varObj.decipherDictList = []
        varObj.decipherDeletionObsList = decipherDeletionObsList
        varObj.decipherStudyList = []
        varObj.decipherVarFound = decipherVarFound

        # gnomAD gene-level metrics; "-" when the gene is absent.
        if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
            metrics = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
            varObj.gnomadGeneZscore = metrics["mis_z"]
            varObj.gnomadGenePLI = metrics["pLI"]
            varObj.gnomadGeneOELof = metrics["oe_lof"]  # O/E lof
            varObj.gnomadGeneOELofUpper = metrics["oe_lof_upper"]  # O/E lof upper
        else:
            varObj.gnomadGeneZscore = "-"
            varObj.gnomadGenePLI = "-"
            varObj.gnomadGeneOELof = "-"
            varObj.gnomadGeneOELofUpper = "-"

    if "curate" in moduleList:
        # OMIM. split(",") yields a one-element list for a single id; passing
        # the bare string to set() would compare individual characters.
        inputSnpList = varObj.rsId.split(",")
        varFound = 0
        geneFound = 0
        omimGeneDict = {}
        phenoList = []
        phenoInhList = []
        phenoMimList = []
        if varObj.geneSymbol in omimGeneSortedDf.index:
            geneFound = 1
            omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
            snpList = [
                allele["dbSnps"]
                for allele in omimGeneDict["allelicVariants"]
                if "dbSnps" in allele
            ]
            varFound = 1 if set(inputSnpList).intersection(snpList) else 0
            for phenotype in omimGeneDict["phenotypes"]:
                phenoList.append(phenotype["phenotype"])
                phenoInhList.append(
                    phenotype["phenotypeInheritance"]
                    if "phenotypeInheritance" in phenotype
                    else "-"
                )
                phenoMimList.append(
                    str(
                        phenotype["phenotypeMimNumber"]
                        if "phenotypeMimNumber" in phenotype
                        else "-"
                    )
                )
        varObj.omimVarFound = varFound
        varObj.omimGeneFound = geneFound
        varObj.omimDict = {}
        varObj.omimGeneDict = omimGeneDict
        varObj.omimAlleleDict = {}
        varObj.phenoList = phenoList
        varObj.phenoInhList = phenoInhList
        varObj.phenoMimList = phenoMimList

        # ClinVar (MARRVEL flat file).
        clinVarRet = getClinVarUsingMarrvelFlatFile(
            varObj, clinvarAlleleDf, clinvarGeneDf
        )
        varObj.clinVarVarFound = clinVarRet[0]
        varObj.clinVarVarDict = clinVarRet[1]
        varObj.clinVarGeneFound = clinVarRet[2]
        varObj.clinVarGeneDict = clinVarRet[3]
        varObj.clinvarTotalNumVars = clinVarRet[4]
        varObj.clinvarNumP = clinVarRet[5]
        varObj.clinvarNumLP = clinVarRet[6]
        varObj.clinvarNumLB = clinVarRet[7]
        varObj.clinvarNumB = clinVarRet[8]
        varObj.clinvarTitle = clinVarRet[9]
        # CL: taken from the clinvar.vcf.gz annotation instead of clinVarRet[10]
        varObj.clinvarSignDesc = row.clinvar_CLNSIG
        varObj.clinvarCondition = clinVarRet[11]

        # HGMD.
        hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
        varObj.hgmdVarFound = hgmdRet[0]
        varObj.hgmdGeneFound = hgmdRet[1]
        varObj.hgmdVarPhenIdList = hgmdRet[2]
        varObj.hgmdVarHPOIdList = hgmdRet[3]
        varObj.hgmdVarHPOStrList = hgmdRet[4]

    # Output keys in their original order (duplicate literal keys removed;
    # a repeated key keeps its first position, so the order is unchanged).
    outputKeys = (
        "hg19Chrom", "hg19Pos", "chrom", "pos", "start", "stop",
        "geneSymbol", "CADD_phred", "CADD_PHRED", "ref", "alt", "varId",
        "ZYG", "HGVSc", "HGVSp", "Gene", "Existing_variation", "GERPpp_RS",
        "Feature_type", "gnomadAF", "gnomadAFg", "CLIN_SIG", "LRT_Omega",
        "LRT_score", "phyloP100way_vertebrate",
        # dbnsfp attributes
        "GERPpp_NR", "DANN_score", "FATHMM_pred", "FATHMM_score",
        "GTEx_V8_gene", "GTEx_V8_tissue", "Polyphen2_HDIV_score",
        "Polyphen2_HVAR_score", "REVEL_score", "SIFT_score",
        "clinvar_AlleleID", "clinvar_clnsig", "clinvar_CLNREVSTAT",
        "clinvar_CLNSIGCONF", "clin_code", "fathmm_MKL_coding_score",
        "M_CAP_score", "MutationAssessor_score", "MutationTaster_score",
        "ESP6500_AA_AC", "ESP6500_AA_AF", "ESP6500_EA_AC", "ESP6500_EA_AF",
        "gnomadGeneZscore", "gnomadGenePLI", "gnomadGeneOELof",
        "gnomadGeneOELofUpper", "IMPACT", "Consequence",
        "omimVarFound", "omimGeneFound", "omimDict", "omimGeneDict",
        "omimAlleleDict", "phenoList", "phenoInhList", "phenoMimList",
        "clinVarVarFound", "clinVarVarDict", "clinVarGeneFound",
        "clinVarGeneDict", "clinvarTotalNumVars", "clinvarNumP",
        "clinvarNumLP", "clinvarNumLB", "clinvarNumB", "clinvarTitle",
        "clinvarSignDesc", "clinvarCondition",
        "hgmdVarFound", "hgmdGeneFound", "hgmdVarPhenIdList",
        "hgmdVarHPOIdList", "hgmdVarHPOStrList", "varId_dash",
        "dgvDictList", "dgvTypeList", "dgvSubtypeList", "dgvVarFound",
        "decipherDictList", "decipherDeletionObsList", "decipherStudyList",
        "decipherVarFound",
        # symptom
        "SymptomMatched", "symptomScore", "symptomName",
        "omimSymptomSimScore", "omimSymMatchFlag", "hgmdSymptomScore",
        "hgmdSymptomSimScore", "hgmdSymMatchFlag", "clinVarSymMatchFlag",
        "VARIANT_CLASS", "Feature", "hom", "hgmd_rs", "hgmd_id",
        "hgmd_symbol", "hgmd_PHEN", "hgmd_CLASS", "clin_dict", "clin_PLP",
        "clin_PLP_perc", "spliceAI", "spliceAImax", "zyg", "geneEnsId",
        "rsId",
    )
    # Output keys whose Variant attribute has a different name.
    renamedKeys = {
        "ZYG": "zyg",
        "Gene": "geneEnsId",
        "Existing_variation": "rsId",
        "Feature_type": "featureType",
    }
    return {key: getattr(varObj, renamedKeys.get(key, key)) for key in outputKeys}
수정본
파일 열기
import re

from .utils_1 import Variant
from .utils_for_marrvel_flatfile import (
    getClinVarUsingMarrvelFlatFile,
    getHGMDUsingFlatFile,
    getAnnotateInfoRow_2,
)

# (Variant attribute, VEP column) pairs copied verbatim by
# getAnnotateInfoRow_3_1, in the order the attributes are assigned.
_ATTR_COLUMN_PAIRS = (
    ("geneEnsId", "Gene"),
    ("rsId", "Existing_variation"),
    ("GERPpp_RS", "GERPpp_RS"),
    ("featureType", "Feature_type"),
    ("gnomadAF", "gnomAD_AF"),
    ("gnomadAFg", "gnomADg_AF"),
    ("CLIN_SIG", "CLIN_SIG"),  # CL: useless but kept for now
    ("LRT_Omega", "LRT_Omega"),
    ("LRT_score", "LRT_score"),
    ("phyloP100way_vertebrate", "phyloP100way_vertebrate"),
    ("IMPACT", "IMPACT"),
    ("Consequence", "Consequence"),
    ("HGVSc", "HGVSc"),
    ("HGVSp", "HGVSp"),
    # dbnsfp attributes
    ("GERPpp_NR", "GERPpp_NR"),
    ("DANN_score", "DANN_score"),
    ("FATHMM_pred", "FATHMM_pred"),
    ("FATHMM_score", "FATHMM_score"),
    ("GTEx_V8_gene", "GTEx_V8_gene"),
    ("GTEx_V8_tissue", "GTEx_V8_tissue"),
    ("Polyphen2_HDIV_score", "Polyphen2_HDIV_score"),
    ("Polyphen2_HVAR_score", "Polyphen2_HVAR_score"),
    ("REVEL_score", "REVEL_score"),
    ("SIFT_score", "SIFT_score"),
    ("clinvar_AlleleID", "clinvar"),  # allele ID from clinvar.vcf.gz
    ("clinvar_clnsig", "clinvar_CLNSIG"),  # CL: SIG from clinvar.vcf.gz
    ("clinvar_CLNREVSTAT", "clinvar_CLNREVSTAT"),  # CL: for interface only
    ("clinvar_CLNSIGCONF", "clinvar_CLNSIGCONF"),
    ("clin_code", "clinvar_CLNSIG"),  # CL: feature name for ai
    ("fathmm_MKL_coding_score", "fathmm_MKL_coding_score"),
    ("M_CAP_score", "M_CAP_score"),
    ("MutationAssessor_score", "MutationAssessor_score"),
    ("MutationTaster_score", "MutationTaster_score"),
    ("ESP6500_AA_AC", "ESP6500_AA_AC"),
    ("ESP6500_AA_AF", "ESP6500_AA_AF"),
    ("ESP6500_EA_AC", "ESP6500_EA_AC"),
    ("ESP6500_EA_AF", "ESP6500_EA_AF"),
    ("VARIANT_CLASS", "VARIANT_CLASS"),
    ("Feature", "Feature"),
    ("hom", "gnomADg_controls_nhomalt"),
    ("hgmd_id", "hgmd"),  # CL added
    ("hgmd_symbol", "hgmd_GENE"),  # CL added
    ("hgmd_rs", "hgmd_RANKSCORE"),
    ("hgmd_PHEN", "hgmd_PHEN"),  # CL added
    ("hgmd_CLASS", "hgmd_CLASS"),  # CL added
)


def getAnnotateInfoRows_2(
    varDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Apply ``getAnnotateInfoRow_2`` to every row of ``varDf``.

    NOTE(JL): This is the old implementation and is not used; it is kept for
    tracing purposes. Feel free to remove it.

    Returns:
        DataFrame with one column per key of the per-row dict.
    """

    def f(row):
        return getAnnotateInfoRow_2(
            row,
            genomeRef,
            clinvarGeneDf,
            clinvarAlleleDf,
            omimGeneSortedDf,
            omimAlleleList,
            hgmdHPOScoreDf,
            moduleList,
            decipherSortedDf,
            gnomadMetricsGeneSortedDf,
        )

    annotateInfoDf = varDf.apply(f, axis=1, result_type='expand')
    return annotateInfoDf


def getAnnotateInfoRow_3_1(row, genomeRef):
    """Parse one VEP row into a ``Variant`` and return its attributes.

    Args:
        row: One row of the VEP table. Column 0 is the variant id
            (``1_1588250_T_A`` or ``21_11039079_C/A``), column 1 the location
            (``1:10203-10204`` or ``1:10203``); other values are read by name.
        genomeRef: ``"hg38"`` stores coordinates in ``hg38Chrom``/``hg38Pos``;
            any other value stores them in ``hg19Chrom``/``hg19Pos``.

    Returns:
        ``vars(varObj)`` -- the attribute dict of the populated ``Variant``.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # .iloc is used because positional Series[int] access is deprecated.
    variantId = row.iloc[0]
    fields = variantId.split("_")
    chrom = fields[0]
    pos = int(fields[1])
    if "/" in variantId:
        alleles = fields[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = fields[2]
        alt = fields[3]

    # Column 1 is '1:10203-10204', or '1:10203' when start == stop.
    location = row.iloc[1]
    span = location.split(":")[1]
    if "-" in location:
        bounds = span.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        start = int(span)
        stop = int(span)

    # Encode non-numeric chromosomes as integers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: separate hg19 and hg38 databases are used, so no LiftOver
    # conversion is done; coordinates are stored as given.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG

    for attr, column in _ATTR_COLUMN_PAIRS:
        setattr(varObj, attr, getattr(row, column))

    if row.clinvar_CLNSIGCONF != "-":
        # Entries look like 'Pathogenic(12)'. Parse the whole number up to ')':
        # taking only the first character would turn 12 into 1.
        clin_dict = {}
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            parts = entry.split("(")
            clin_dict[parts[0]] = int(parts[1].split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # NOTE(review): this reads the lowercase `clinvar_clnsig` column while
        # everything else uses `clinvar_CLNSIG` -- confirm which is intended.
        clnsig = row.clinvar_clnsig.lower()
        if "benign" in clnsig:
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in clnsig:
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        spliceFields = row.SpliceAI_pred.split("|")
        # Fields 1-4 are the numeric scores; keep the largest.
        varObj.spliceAImax = max(float(value) for value in spliceFields[1:5])
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    return vars(varObj)


def getAnnotateInfoRow_3_2(
    varObj,
    decipherSortedDf,
):
    """Look up DECIPHER data for one variant by exact (chrom, start, stop).

    Args:
        varObj: Object exposing ``chrom``, ``start`` and ``stop`` attributes
            (getAnnotateInfoRows_3 passes a DataFrame row here).
        decipherSortedDf: DECIPHER table addressed through ``.loc`` with a
            ``(chrom, start, stop)`` key.

    Returns:
        dict with ``decipherDictList``, ``decipherDeletionObsList``,
        ``decipherStudyList`` and ``decipherVarFound`` (0 or 1).
    """
    decipherKey = (int(varObj.chrom), int(varObj.start), int(varObj.stop))
    decipherDeletionObsList = []
    decipherVarFound = 0
    # NOTE(review): if decipherSortedDf is a DataFrame, `in` tests column
    # labels, not the row index (the gnomAD/OMIM lookups use `.index`) --
    # confirm this membership test does what is intended.
    if decipherKey in decipherSortedDf:
        vals = decipherSortedDf.loc[decipherKey]
        decipherVarFound = 1
        decipherDeletionObsList.append(vals.iloc[0]["deletion.obs"])

    return {
        "decipherDictList": [],
        "decipherDeletionObsList": decipherDeletionObsList,
        "decipherStudyList": [],
        "decipherVarFound": decipherVarFound,
    }


def getAnnotateInfoRow_3_3(
    varObj,
    gnomadMetricsGeneSortedDf,
):
    """Fetch gnomAD gene-level metrics for the variant's gene.

    Returns:
        dict with ``gnomadGeneZscore``, ``gnomadGenePLI``, ``gnomadGeneOELof``
        and ``gnomadGeneOELofUpper``; every value is "-" when
        ``varObj.geneSymbol`` is not in the table's index.
    """
    if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
        metrics = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
        return {
            "gnomadGeneZscore": metrics["mis_z"],
            "gnomadGenePLI": metrics["pLI"],
            "gnomadGeneOELof": metrics["oe_lof"],  # O/E lof
            "gnomadGeneOELofUpper": metrics["oe_lof_upper"],  # O/E lof upper
        }
    return {
        "gnomadGeneZscore": "-",
        "gnomadGenePLI": "-",
        "gnomadGeneOELof": "-",
        "gnomadGeneOELofUpper": "-",
    }


def getAnnotateInfoRow_3_4(
    varObj,
    omimGeneSortedDf,
):
    """Collect OMIM gene, variant and phenotype information for one variant.

    Args:
        varObj: Object exposing ``rsId`` (comma-separated ids) and
            ``geneSymbol``.
        omimGeneSortedDf: OMIM table indexed by gene symbol; the selected entry
            must provide ``allelicVariants`` and ``phenotypes`` iterables.

    Returns:
        dict with the OMIM found-flags, the gene entry and three parallel
        phenotype lists (name, inheritance, MIM number as str).
    """
    # split(",") yields a one-element list for a single id; passing the bare
    # string to set() would compare individual characters instead of ids.
    inputSnpList = varObj.rsId.split(",")
    varFound = 0
    geneFound = 0
    omimGeneDict = {}
    phenoList = []
    phenoInhList = []
    phenoMimList = []

    if varObj.geneSymbol in omimGeneSortedDf.index:
        geneFound = 1
        omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
        snpList = [
            allele["dbSnps"]
            for allele in omimGeneDict["allelicVariants"]
            if "dbSnps" in allele
        ]
        varFound = 1 if set(inputSnpList).intersection(snpList) else 0
        for phenotype in omimGeneDict["phenotypes"]:
            phenoList.append(phenotype["phenotype"])
            phenoInhList.append(
                phenotype["phenotypeInheritance"]
                if "phenotypeInheritance" in phenotype
                else "-"
            )
            phenoMimList.append(
                str(
                    phenotype["phenotypeMimNumber"]
                    if "phenotypeMimNumber" in phenotype
                    else "-"
                )
            )

    return {
        "omimVarFound": varFound,
        "omimGeneFound": geneFound,
        "omimDict": {},
        "omimGeneDict": omimGeneDict,
        "omimAlleleDict": {},
        "phenoList": phenoList,
        "phenoInhList": phenoInhList,
        "phenoMimList": phenoMimList,
    }


def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Run the MARRVEL ClinVar lookup and name its positional results.

    ``clinvarSignDesc`` is taken from ``varObj.clinvar_clnsig`` (the
    clinvar.vcf.gz annotation) instead of element 10 of the helper's result.
    The helper's return value is not modified, so this also works if it
    returns a tuple.
    """
    clinVarRet = getClinVarUsingMarrvelFlatFile(
        varObj, clinvarAlleleDf, clinvarGeneDf
    )
    return {
        "clinVarVarFound": clinVarRet[0],
        "clinVarVarDict": clinVarRet[1],
        "clinVarGeneFound": clinVarRet[2],
        "clinVarGeneDict": clinVarRet[3],
        "clinvarTotalNumVars": clinVarRet[4],
        "clinvarNumP": clinVarRet[5],
        "clinvarNumLP": clinVarRet[6],
        "clinvarNumLB": clinVarRet[7],
        "clinvarNumB": clinVarRet[8],
        "clinvarTitle": clinVarRet[9],
        "clinvarSignDesc": varObj.clinvar_clnsig,
        "clinvarCondition": clinVarRet[11],
    }


def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Run the HGMD flat-file lookup and name its positional results."""
    hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
    return {
        "hgmdVarFound": hgmdRet[0],
        "hgmdGeneFound": hgmdRet[1],
        "hgmdVarPhenIdList": hgmdRet[2],
        "hgmdVarHPOIdList": hgmdRet[3],
        "hgmdVarHPOStrList": hgmdRet[4],
    }


def getAnnotateInfoRows_3(
    vepDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate every row of ``vepDf`` in stages.

    The rows are first parsed by ``getAnnotateInfoRow_3_1``. Each later stage
    runs only when its module name is in ``moduleList`` and adds (or
    overwrites) the columns it returns.

    Args:
        omimAlleleList: Unused; kept for interface compatibility.

    Returns:
        DataFrame with the parsed Variant attributes plus the columns of every
        enabled stage.
    """
    # (module that enables the stage, per-row annotator).
    # DECIPHER is gated on "conserve", matching getAnnotateInfoRow_2, where the
    # DECIPHER lookup runs inside the "conserve" section.
    stages = (
        ("conserve", lambda r: getAnnotateInfoRow_3_2(r, decipherSortedDf)),
        ("conserve", lambda r: getAnnotateInfoRow_3_3(r, gnomadMetricsGeneSortedDf)),
        ("curate", lambda r: getAnnotateInfoRow_3_4(r, omimGeneSortedDf)),
        ("curate", lambda r: getAnnotateInfoRow_3_5(r, clinvarGeneDf, clinvarAlleleDf)),
        ("curate", lambda r: getAnnotateInfoRow_3_6(r, hgmdHPOScoreDf)),
    )

    annotateInfoDf = vepDf.apply(
        lambda r: getAnnotateInfoRow_3_1(r, genomeRef),
        axis=1,
        result_type='expand',
    )
    for module, annotate in stages:
        # A disabled stage contributes no columns, so skip the pass entirely.
        if module not in moduleList:
            continue
        df = annotateInfoDf.apply(annotate, axis=1, result_type='expand')
        annotateInfoDf[df.columns] = df
    return annotateInfoDf
비교하기