Diff
checker
文本
文本
圖像
文檔
Excel
文件夾
Legal
Enterprise
桌面版
定價
登入
下載 Diffchecker 桌面版
比較文本
尋找兩個文字檔案之間的差異
工具
歷史
即時編輯器
摺疊未變更行
關閉換行
檢視
拆分
統一
比對精度
智能
單詞
字符
語法突出顯示
選擇語法
忽略
文字轉換
前往第一個差異
編輯輸入
Diffchecker Desktop
執行Diffchecker最安全的方式。取得Diffchecker桌面應用程式:您的差異永遠不會離開您的電腦!
取得桌面版
Untitled diff
建立於
2 年前
差異永不過期
清除
匯出
分享
解釋
386 刪除
行
總計
刪除
字符
總計
刪除
要繼續使用此功能,請升級到
Diff
checker
Pro
查看價格
563 行
全部複製
272 新增
行
總計
新增
字符
總計
新增
要繼續使用此功能,請升級到
Diff
checker
Pro
查看價格
464 行
全部複製
複製
已複製
複製
已複製
import re
複製
已複製
複製
已複製
def getAnnotateInfoRow
_2(
from .utils_1 import Variant
row,
from .utils_for_marrvel_flatfile import (
genomeRef,
getClinVarUsingMarrvelFlatFile,
clinvarGeneDf,
getHGMDUsingFlatFile,
clinvarAlleleDf,
getAnnotateInfoRow_2,
omimGeneSortedDf,
)
omimAlleleList,
hgmdDf,
moduleList,
def getAnnotateInfoRow
s
_2(
decipherSortedDf,
varDf,
gnomadMetricsGeneSortedDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
):
複製
已複製
複製
已複製
# NOTE(JL): This is the old implementation and is no longer used.
# It is kept for tracing purposes only; feel free to remove it.
def f(row):
return getAnnotateInfoRow_2(
row,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
)
annotateInfoDf = varDf.apply(f, axis=1, result_type='expand')
return annotateInfoDf
複製
已複製
複製
已複製
# CL 03-14-2023: commented all printing lines
# print('type of row:', type(row))
def getAnnotateInfoRow_3_1(row, genomeRef):
varObj = Variant()
varObj = Variant()
transcriptId = row.Feature
transcriptId = row.Feature
複製
已複製
複製
已複製
# s=row.Uploaded_variation.split('_') '1_10204_-/T' 1_1588250_T_A
####row[0]: 21_11039079_C/A
####s: ['21', '11039079', 'C/A']
# print('row[0]:', row[0])
# two ways of input of first column either 1_1588250_T_A OR 21_11039079_C/A, so use the option flag
optFlag = 0
optFlag = 0
if row[0].find("/") != -1:
if row[0].find("/") != -1:
optFlag = 1
optFlag = 1
if optFlag == 0:
if optFlag == 0:
s = row[0].split("_")
s = row[0].split("_")
# print('s:', s)
# print('s:', s)
chrom = s[0]
chrom = s[0]
pos = int(s[1])
pos = int(s[1])
ref = s[2]
ref = s[2]
alt = s[3]
alt = s[3]
elif optFlag == 1:
elif optFlag == 1:
s = row[0].split("_")
s = row[0].split("_")
# print('s:', s)
# print('s:', s)
chrom = s[0]
chrom = s[0]
pos = int(s[1])
pos = int(s[1])
s = s[2].split("/")
s = s[2].split("/")
ref = s[0]
ref = s[0]
alt = s[1]
alt = s[1]
# get the start and stop from second column like '1:10203-10204'
# get the start and stop from second column like '1:10203-10204'
if "-" in row[1]:
if "-" in row[1]:
s = row[1].split(":")
s = row[1].split(":")
tmp = s[1]
tmp = s[1]
s = tmp.split("-")
s = tmp.split("-")
# print('s:',s)
# print('s:',s)
start = int(s[0])
start = int(s[0])
stop = int(s[1])
stop = int(s[1])
else:
else:
# start and stop the same
# start and stop the same
s = row[1].split(":")
s = row[1].split(":")
start = int(s[1])
start = int(s[1])
stop = int(s[1])
stop = int(s[1])
# print('chrom:', chrom,'pos:',pos,'ref:',ref,'alt:',alt,'start:',start,'stop:',stop)
# print('chrom:', chrom,'pos:',pos,'ref:',ref,'alt:',alt,'start:',start,'stop:',stop)
# change chrom X and Y and MT to numbers
# change chrom X and Y and MT to numbers
if chrom == "X":
if chrom == "X":
chrom = 23
chrom = 23
elif chrom == "Y":
elif chrom == "Y":
chrom = 24
chrom = 24
elif chrom == "MT":
elif chrom == "MT":
chrom = 25
chrom = 25
elif re.search(r"GL", chrom):
elif re.search(r"GL", chrom):
chrom = 26
chrom = 26
chrom = int(chrom)
chrom = int(chrom)
# if it is hg38 get its hg19 coordinates
# if it is hg38 get its hg19 coordinates
# CL 03-14-2023: we have separate database for hg19 and hg38,
# CL 03-14-2023: we have separate database for hg19 and hg38,
# we don't need to use LiftOver which is inaccurate
# we don't need to use LiftOver which is inaccurate
# related codes commented and modified
# related codes commented and modified
if genomeRef == "hg38":
if genomeRef == "hg38":
varObj.hg38Chrom = chrom
varObj.hg38Chrom = chrom
varObj.hg38Pos = pos
varObj.hg38Pos = pos
varObj.chrom = chrom
varObj.chrom = chrom
varObj.pos = pos
varObj.pos = pos
varObj.start = start
varObj.start = start
varObj.stop = stop
varObj.stop = stop
複製
已複製
複製
已複製
"""
retList=gethg19LocFromHg38(chrom, pos)#called from the utils_1.py
# retList=[newChrom, newPos]
varObj.hg19Chrom=retList[0]
varObj.hg19Pos=retList[1]
varObj.chrom=retList[0]
varObj.pos=retList[1]
#get the start
retList=gethg19LocFromHg38(chrom, start)
varObj.start=int(retList[1])
#get the stop
retList=gethg19LocFromHg38(chrom, stop)
varObj.stop=int(retList[1])
"""
else:
else:
varObj.hg19Chrom = chrom
varObj.hg19Chrom = chrom
varObj.hg19Pos = pos
varObj.hg19Pos = pos
varObj.chrom = chrom
varObj.chrom = chrom
varObj.pos = pos
varObj.pos = pos
varObj.start = start
varObj.start = start
varObj.stop = stop
varObj.stop = stop
geneSymbol = row.SYMBOL
geneSymbol = row.SYMBOL
# print('gene:', geneSymbol)
# print('gene:', geneSymbol)
varObj.geneSymbol = geneSymbol
varObj.geneSymbol = geneSymbol
varObj.CADD_phred = row.CADD_phred
varObj.CADD_phred = row.CADD_phred
varObj.CADD_PHRED = row.CADD_PHRED
varObj.CADD_PHRED = row.CADD_PHRED
# assign
# assign
varObj.ref = ref
varObj.ref = ref
varObj.alt = alt
varObj.alt = alt
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
# print('varId dash:', varObj.varId_dash)
# print('varId dash:', varObj.varId_dash)
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varObj.varId = varId
varObj.varId = varId
if "ZYG" in row:
if "ZYG" in row:
varObj.zyg = row.ZYG
varObj.zyg = row.ZYG
varObj.geneEnsId = row.Gene
varObj.geneEnsId = row.Gene
varObj.rsId = row.Existing_variation
varObj.rsId = row.Existing_variation
varObj.GERPpp_RS = row.GERPpp_RS
varObj.GERPpp_RS = row.GERPpp_RS
varObj.featureType = row.Feature_type
varObj.featureType = row.Feature_type
varObj.gnomadAF = row.gnomAD_AF
varObj.gnomadAF = row.gnomAD_AF
varObj.gnomadAFg = row.gnomADg_AF
varObj.gnomadAFg = row.gnomADg_AF
varObj.CLIN_SIG = row.CLIN_SIG # CL: useless but kept for now
varObj.CLIN_SIG = row.CLIN_SIG # CL: useless but kept for now
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_score = row.LRT_score
varObj.LRT_score = row.LRT_score
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.IMPACT = row.IMPACT
varObj.IMPACT = row.IMPACT
varObj.Consequence = row.Consequence
varObj.Consequence = row.Consequence
varObj.HGVSc = row.HGVSc
varObj.HGVSc = row.HGVSc
varObj.HGVSp = row.HGVSp
varObj.HGVSp = row.HGVSp
# dbnsfp attributes
# dbnsfp attributes
varObj.GERPpp_NR = row.GERPpp_NR
varObj.GERPpp_NR = row.GERPpp_NR
varObj.DANN_score = row.DANN_score
varObj.DANN_score = row.DANN_score
varObj.FATHMM_pred = row.FATHMM_pred
varObj.FATHMM_pred = row.FATHMM_pred
varObj.FATHMM_score = row.FATHMM_score
varObj.FATHMM_score = row.FATHMM_score
varObj.GTEx_V8_gene = row.GTEx_V8_gene
varObj.GTEx_V8_gene = row.GTEx_V8_gene
varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
varObj.REVEL_score = row.REVEL_score
varObj.REVEL_score = row.REVEL_score
varObj.SIFT_score = row.SIFT_score
varObj.SIFT_score = row.SIFT_score
varObj.clinvar_AlleleID = row.clinvar # Clinvar allele ID from clinvar.vcf.gz
varObj.clinvar_AlleleID = row.clinvar # Clinvar allele ID from clinvar.vcf.gz
varObj.clinvar_clnsig = (
varObj.clinvar_clnsig = (
row.clinvar_CLNSIG
row.clinvar_CLNSIG
) # CL: Clinvar SIG from clinvar.vcf.gz
) # CL: Clinvar SIG from clinvar.vcf.gz
# varObj.clinvar_clnsig = row.clinvar_clnsig #CL: Clinvar SIG from VEP, deleted
# varObj.clinvar_clnsig = row.clinvar_clnsig #CL: Clinvar SIG from VEP, deleted
varObj.clinvar_CLNREVSTAT = (
varObj.clinvar_CLNREVSTAT = (
row.clinvar_CLNREVSTAT
row.clinvar_CLNREVSTAT
) # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
) # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
varObj.clinvar_CLNSIGCONF = (
varObj.clinvar_CLNSIGCONF = (
row.clinvar_CLNSIGCONF
row.clinvar_CLNSIGCONF
) # CL: Clinvar SIGCONF from clinvar.vcf.gz
) # CL: Clinvar SIGCONF from clinvar.vcf.gz
varObj.clin_code = row.clinvar_CLNSIG # CL: feature name for ai
varObj.clin_code = row.clinvar_CLNSIG # CL: feature name for ai
varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
varObj.LRT_score = row.LRT_score
varObj.LRT_score = row.LRT_score
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_Omega = row.LRT_Omega
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.M_CAP_score = row.M_CAP_score
varObj.M_CAP_score = row.M_CAP_score
varObj.MutationAssessor_score = row.MutationAssessor_score
varObj.MutationAssessor_score = row.MutationAssessor_score
varObj.MutationTaster_score = row.MutationTaster_score
varObj.MutationTaster_score = row.MutationTaster_score
varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
varObj.VARIANT_CLASS = row.VARIANT_CLASS
varObj.VARIANT_CLASS = row.VARIANT_CLASS
varObj.Feature = row.Feature
varObj.Feature = row.Feature
varObj.hom = row.gnomADg_controls_nhomalt
varObj.hom = row.gnomADg_controls_nhomalt
varObj.hgmd_id = row.hgmd # CL added
varObj.hgmd_id = row.hgmd # CL added
varObj.hgmd_symbol = row.hgmd_GENE # CL added
varObj.hgmd_symbol = row.hgmd_GENE # CL added
varObj.hgmd_rs = row.hgmd_RANKSCORE
varObj.hgmd_rs = row.hgmd_RANKSCORE
varObj.hgmd_PHEN = row.hgmd_PHEN # CL added
varObj.hgmd_PHEN = row.hgmd_PHEN # CL added
varObj.hgmd_CLASS = row.hgmd_CLASS # CL added
varObj.hgmd_CLASS = row.hgmd_CLASS # CL added
if row.clinvar_CLNSIGCONF != "-":
if row.clinvar_CLNSIGCONF != "-":
clin_dict = dict()
clin_dict = dict()
for ro in row.clinvar_CLNSIGCONF.split("|_"):
for ro in row.clinvar_CLNSIGCONF.split("|_"):
temp = ro.split("(")
temp = ro.split("(")
clin_dict[temp[0]] = int(temp[1][0])
clin_dict[temp[0]] = int(temp[1][0])
PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
"Likely_pathogenic", 0
"Likely_pathogenic", 0
)
)
varObj.clin_dict = clin_dict
varObj.clin_dict = clin_dict
varObj.clin_PLP = PLP_sum
varObj.clin_PLP = PLP_sum
varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
else:
else:
if "benign" in row.clinvar_clnsig.lower():
if "benign" in row.clinvar_clnsig.lower():
varObj.clin_PLP_perc = 0
varObj.clin_PLP_perc = 0
elif "pathogenic" in row.clinvar_clnsig.lower():
elif "pathogenic" in row.clinvar_clnsig.lower():
varObj.clin_PLP_perc = 1
varObj.clin_PLP_perc = 1
else:
else:
varObj.clin_PLP_perc = "-"
varObj.clin_PLP_perc = "-"
varObj.clin_PLP = "-"
varObj.clin_PLP = "-"
varObj.clin_dict = "-"
varObj.clin_dict = "-"
if row.SpliceAI_pred != "-":
if row.SpliceAI_pred != "-":
varObj.spliceAI = row.SpliceAI_pred
varObj.spliceAI = row.SpliceAI_pred
temp = row.SpliceAI_pred.split("|")
temp = row.SpliceAI_pred.split("|")
varObj.spliceAImax = max(
varObj.spliceAImax = max(
float(temp[1]), float(temp[2]), float(temp[3]), float(temp[4])
float(temp[1]), float(temp[2]), float(temp[3]), float(temp[4])
)
)
else:
else:
varObj.spliceAI = "-"
varObj.spliceAI = "-"
varObj.spliceAImax = "-"
varObj.spliceAImax = "-"
複製
已複製
複製
已複製
if "conserve" in moduleList:
return vars(varObj)
# # get dgv: 1.3s
# # print('\nGetting DGV')
# dgvDictList = []
# typeList = []
# subtypeList = []
# dgvVarFound = 0
# dgvType = "-"
# dgvSubtype = "-"
# chromVal = int(varObj.chrom)
# posVal = int(varObj.pos)
# startVal = int(varObj.start)
# stopVal = int(varObj.stop)
# # CL 03-14-2023: changed column names to be compatible with hg38
# # vals=dgvDf[ ( dgvDf['hg19Chr'] == chromVal ) & ( dgvDf['hg19Start']<=startVal ) & (dgvDf['hg19Stop']>=stopVal) ]
# vals = dgvSortedDf.loc[chromVal].loc[:(startVal+1)].loc[:stopVal]
# numRows = len(vals.index)
# if numRows > 0:
# dgvVarFound = 1
# # print('\tnumrows:',numRows)
# # print('\t type of vals:', type(vals))
# # print('\tvals:', vals)
# dgvType = vals.iloc[0]["type"]
# dgvSubtype = vals.iloc[0]["subType"]
# # print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
# # print('\tdgvVarFound:',dgvVarFound,'dgvType:', dgvType, 'dgvsubtype:', dgvSubtype)
# typeList.append(dgvType)
# subtypeList.append(dgvSubtype)
# retList = [dgvDictList, typeList, subtypeList, dgvVarFound]
複製
已複製
複製
已複製
# varObj.dgvDictList = retList[0]
# varObj.dgvTypeList = retList[1]
# varObj.dgvSubtypeList = retList[2]
# varObj.dgvVarFound = retList[3]
複製
已複製
複製
已複製
# get decipher: 0.6s
def getAnnotateInfoRow_3_2(
decipherDictList = []
varObj,
decipherDeletionObsList = []
decipherSortedDf,
decipherStudyList = []
):
decipherVarFound = 0
# get decipher: 0.6s
deletionObs = "-"
decipherDictList = []
# get the variant object info from varObj
decipherDeletionObsList = []
chromVal = int(varObj.chrom)
decipherStudyList = []
posVal = int(varObj.pos)
decipherVarFound = 0
startVal = int(varObj.start)
deletionObs = "-"
stopVal = int(varObj.stop)
# get the variant object info from varObj
chromVal = int(varObj.chrom)
# CL 03-14-2023: changed column names to be compatible with hg38
posVal = int(varObj.pos)
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
startVal = int(varObj.start)
if (chromVal, startVal, stopVal) in decipherSortedDf:
stopVal = int(varObj.stop)
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
複製
已複製
複製
已複製
decipherVarFound = 1
# CL 03-14-2023: changed column names to be compatible with hg38
deletionObs = vals.iloc[0]["deletion.obs"]
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
decipherDeletionObsList.append(deletionObs)
if (chromVal, startVal, stopVal) in decipherSortedDf:
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
複製
已複製
複製
已複製
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
decipherVarFound = 1
# print('\tdecipherVarFound:',decipherVarFound,'decipherDeletionObs:', deletionObs)
deletionObs = vals.iloc[0]["deletion.obs"]
retList = [
decipherDeletionObsList
.append(deletionObs)
decipherDictList,
decipherDeletionObsList
,
decipherStudyList,
decipherVarFound,
]
複製
已複製
複製
已複製
#
[decipherDictList,
decipherDeletionObs
List,decipherStudyList, decipherVarFound]
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
varObj.
decipherDictList
= retList[0]
#
print('\tdecipherVarFound:',decipherVarFound,'
decipherDeletionObs
:', deletionObs)
varObj.
decipherDeletionObsList
= retList[1]
retList = [
varObj.decipherStudyList = retList[2]
decipherDictList
,
varObj.decipherVarFound = retList[3]
decipherDeletionObsList
,
decipherStudyList,
# get gnomad gene metrics from gnomad file: 3.1s
decipherVarFound,
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
]
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
複製
已複製
複製
已複製
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
varObj.gnomadGeneZscore =
retList[0]
return {
varObj.gnomadGenePLI =
retList[1]
"decipherDictList":
retList[0]
,
varObj.gnomadGeneOELof =
retList[2]
# O/E lof
"decipherDeletionObsList":
retList[1]
,
varObj.gnomadGeneOELofUpper =
retList[3]
# O/E lof upper
"decipherStudyList":
retList[2]
,
"decipherVarFound":
retList[3]
,
}
複製
已複製
複製
已複製
if "curate" in moduleList:
# get
OMIM: 2s
def getAnnotateInfoRow_3_3(
# print('\nGetting OMIM')
varObj,
# varObj.omimList=jsonDict['omim']
gnomadMetricsGeneSortedDf,
# retList=[varFound, geneFound, omimDict,
omimGeneDict
,
omimAlleleDict
]
):
inputSnpList
= []
# get gnomad gene metrics from gnomad file: 3.1s
if "," in
varObj.
rsId
:
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
inputSnpList
=
varObj.
rsId.split(",")
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get
the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
return {
"gnomadGeneZscore": retList[0],
"gnomadGenePLI": retList[1],
"gnomadGeneOELof": retList[2], # O/E lof
"gnomadGeneOELofUpper": retList[3], # O/E lof upper
}
def getAnnotateInfoRow_3_4(
varObj,
omimGeneSortedDf,
):
# get OMIM: 2s
inputSnpList = []
if "," in varObj.rsId:
inputSnpList = varObj.rsId.split(",")
else:
inputSnpList = varObj.rsId
varFound = 0
geneFound = 0
omimDict = {}
omimGeneDict
= {}
omimAlleleDict
= {}
phenoList = []
phenoInhList = []
phenoMimList
= []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if
varObj.
geneSymbol in omimGeneSortedDf.index
:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict
=
omimGeneSortedDf.loc[
varObj.
geneSymbol]
snpList = []
for a in omimGeneDict["allelicVariants"]:
if "dbSnps" in a:
snpList.append(a["dbSnps"])
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
複製
已複製
複製
已複製
inputSnpList = varObj.rsId
varFound = 0
# print('\tinputSnpList:', inputSnpList)
varFound = 0
# get disease info from OMIM
geneFound = 0
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
omimDict = {}
for a in omimGeneDict["
phenotypes
"]:
omimGeneDict = {}
# print('type:', type(a))
omimAlleleDict = {}
pheno = a["phenotype"]
phenoList = []
if "
phenotypeMimNumber
" in a:
phenoInhList = []
phenoMim = a["phenotypeMimNumber"]
phenoMimList = []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if varObj.geneSymbol in omimGeneSortedDf.index:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
snpList = []
for a in omimGeneDict["
allelicVariants
"]:
# print('a:', a)
# print('type:', type(a))
if "
dbSnps
" in a:
snpList.append(a["dbSnps"])
# print('\tsnpList:', snpList)
# print('\tlen snpList:', len(snpList))
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
複製
已複製
複製
已複製
varFound = 0
phenoMim = "-"
if "phenotypeInheritance" in a:
# get disease info from OMIM
phenoInh = a["phenotypeInheritance"]
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
else:
for a in omimGeneDict["phenotypes"]:
phenoInh = "-"
# print('type:', type(a))
phenoList.append(pheno)
pheno = a["phenotype"]
phenoInhList.append(phenoInh)
if "phenotypeMimNumber" in a:
phenoMimList.append(str(phenoMim))
phenoMim = a["phenotypeMimNumber"]
# print('phenotype:', pheno,phenoMim,phenoInh)
else:
phenoMim = "-"
if "phenotypeInheritance" in a:
phenoInh = a["phenotypeInheritance"]
else:
phenoInh = "-"
phenoList.append(pheno)
phenoInhList.append(phenoInh)
phenoMimList.append(str(phenoMim))
# print('phenotype:', pheno,phenoMim,phenoInh)
# print('\tvarFound:', varFound)
# print('\tphenoList:', phenoList)
# print('\tphenoInhList:', phenoInhList)
# print('\tphenoMimList:', phenoMimList)
omimRet = [
varFound,
geneFound,
omimDict,
omimGeneDict,
omimAlleleDict,
phenoList,
phenoInhList,
phenoMimList,
]
複製
已複製
複製
已複製
varObj.omimVarFound = omimRet[0]
omimRet = [
varObj.omimG
eneFound
= omimRet[1]
varFound,
varObj.
omimDict
= omimRet[2]
g
eneFound
,
varObj.
omimGeneDict
= omimRet[3]
omimDict
,
varObj.
omimAlleleDict
= omimRet[4]
omimGeneDict
,
varObj.
phenoList
= omimRet[5]
omimAlleleDict
,
varObj.
phenoInhList
= omimRet[6]
phenoList
,
varObj.
phenoMimList
= omimRet[7]
phenoInhList
,
# print('OMIM res:')
phenoMimList
,
# print('\tgeneFound:',varObj.omimGeneFound,'varFound:',varObj.omimVarFound )
]
複製
已複製
複製
已複製
# get clinvar: 0.1s
return {
# print('\nReading clinVar')
"omimVarFound": omimRet[0],
clinVarRet = getClinVarUsingMarrvelFlatFile(
"omimGeneFound": omimRet[1],
varObj, clinvarAlleleDf, clinvarGeneDf
"omimDict": omimRet[2],
"omimGeneDict": omimRet[3],
"omimAlleleDict": omimRet[4],
"phenoList": omimRet[5],
"phenoInhList": omimRet[6],
"phenoMimList": omimRet[7],
}
def getAnnotateInfoRow_3_5(varObj, clinvarGeneDf, clinvarAlleleDf):
    """Collect the ClinVar annotation fields for one variant as a dict.

    Calls ``getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf,
    clinvarGeneDf)``, overwrites element 10 of its result with
    ``varObj.clinvar_clnsig``, and returns the first twelve elements
    under fixed key names.

    Args:
        varObj: Object exposing a ``clinvar_clnsig`` attribute; it is also
            forwarded unchanged to the ClinVar lookup helper.
            NOTE(review): presumably a row of the annotated DataFrame --
            confirm against the caller.
        clinvarGeneDf: Forwarded to the lookup helper as its third argument.
        clinvarAlleleDf: Forwarded to the lookup helper as its second
            argument.

    Returns:
        dict: Twelve entries, ``clinVarVarFound`` through
        ``clinvarCondition``, in the order listed below.
    """
    clinvar_fields = getClinVarUsingMarrvelFlatFile(
        varObj, clinvarAlleleDf, clinvarGeneDf
    )
    # CL: the significance description comes from the clinvar.vcf.gz
    # annotation rather than from the flat-file lookup.
    clinvar_fields[10] = varObj.clinvar_clnsig

    # Key order mirrors the positional layout of the helper's result.
    output_keys = (
        "clinVarVarFound",
        "clinVarVarDict",
        "clinVarGeneFound",
        "clinVarGeneDict",
        "clinvarTotalNumVars",
        "clinvarNumP",
        "clinvarNumLP",
        "clinvarNumLB",
        "clinvarNumB",
        "clinvarTitle",
        "clinvarSignDesc",
        "clinvarCondition",
    )
    return {
        key: clinvar_fields[position]
        for position, key in enumerate(output_keys)
    }
def getAnnotateInfoRow_3_6(varObj, hgmdHPOScoreDf):
    """Collect the HGMD annotation fields for one variant as a dict.

    Calls ``getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)`` and returns the
    first five elements of its result under fixed key names.

    Args:
        varObj: Forwarded unchanged to the HGMD lookup helper.
            NOTE(review): presumably a row of the annotated DataFrame --
            confirm against the caller.
        hgmdHPOScoreDf: Forwarded unchanged to the HGMD lookup helper.

    Returns:
        dict: ``hgmdVarFound``, ``hgmdGeneFound``, ``hgmdVarPhenIdList``,
        ``hgmdVarHPOIdList`` and ``hgmdVarHPOStrList``, in that order.
    """
    hgmd_fields = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)

    # Key order mirrors the positional layout of the helper's result.
    output_keys = (
        "hgmdVarFound",
        "hgmdGeneFound",
        "hgmdVarPhenIdList",
        "hgmdVarHPOIdList",
        "hgmdVarHPOStrList",
    )
    return {
        key: hgmd_fields[position]
        for position, key in enumerate(output_keys)
    }
def getAnnotateInfoRows_3(
vepDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
def f1(row):
return getAnnotateInfoRow_3_1(row, genomeRef)
def f2(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_2(row, decipherSortedDf)
def f3(row):
if "conserve" not in moduleList:
return row
return getAnnotateInfoRow_3_3(row, gnomadMetricsGeneSortedDf)
def f4(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_4(row, omimGeneSortedDf)
def f5(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_5(row, clinvarGeneDf, clinvarAlleleDf)
def f6(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_6(
row, hgmdHPOScoreDf
)
)
複製
已複製
複製
已複製
varObj.clinVarVarFound = clinVarRet[0]
varObj.clinVarVarDict = clinVarRet[1]
varObj.clinVarGeneFound = clinVarRet[2]
varObj.clinVarGeneDict = clinVarRet[3]
varObj.clinvarTotalNumVars = clinVarRet[4]
varObj.clinvarNumP = clinVarRet[5]
varObj.clinvarNumLP = clinVarRet[6]
varObj.clinvarNumLB = clinVarRet[7]
varObj.clinvarNumB = clinVarRet[8]
varObj.clinvarTitle = clinVarRet[9]
varObj.clinvarSignDesc = (
row.clinvar_CLNSIG
) # clinVarRet[10] #CL: changed to clinvar.vcf.gz annotation
varObj.clinvarCondition = clinVarRet[11]
# print('clinVar res:')
"""
if debugFlag==1:
print('\tgeneFound::',varObj.clinVarGeneFound,'varFound:',varObj.clinVarVarFound)
print('\tnumVars:',varObj.clinvarTotalNumVars,'numPathologic:',varObj.clinvarNumP,'numBenign:',varObj.clinvarNumB)
print('\tsignDesc:', varObj.clinvarSignDesc)
"""
# get HGMD: 0.3s
if "curate" in moduleList:
# print('\nReading HGMD')
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
# hgmdVarFound,hgmdGeneFound,hgmdVarPhenIdList,hgmdVarHPOIdList,hgmdVarHPOStrList
varObj.hgmdVarFound = hgmdRet[0]
varObj.hgmdGeneFound = hgmdRet[1]
varObj.hgmdVarPhenIdList = hgmdRet[2]
varObj.hgmdVarHPOIdList = hgmdRet[3]
varObj.hgmdVarHPOStrList = hgmdRet[4]
# print('HGMD results:')
# print('\thgmdVarFound:',varObj.hgmdVarFound,'hgmdGeneFound:',varObj.hgmdGeneFound,
# 'hgmdVarPhenIdList:',varObj.hgmdVarPhenIdList,'hgmdVarHPOIdList:',
# varObj.hgmdVarHPOIdList,
# 'hgmdVarHPOStrList:',varObj.hgmdVarHPOStrList)
return {
"hg19Chrom": varObj.hg19Chrom,
"hg19Pos": varObj.hg19Pos,
"chrom": varObj.chrom,
"pos": varObj.pos,
"start": varObj.start,
"stop": varObj.stop,
"geneSymbol": varObj.geneSymbol,
"CADD_phred": varObj.CADD_phred,
"CADD_PHRED": varObj.CADD_PHRED,
"ref": varObj.ref,
"alt": varObj.alt,
"varId": varObj.varId,
"ZYG": varObj.zyg,
"HGVSc": varObj.HGVSc,
"HGVSp": varObj.HGVSp,
"Gene": varObj.geneEnsId,
"Existing_variation": varObj.rsId,
"GERPpp_RS": varObj.GERPpp_RS,
"Feature_type": varObj.featureType,
"gnomadAF": varObj.gnomadAF,
"gnomadAFg": varObj.gnomadAFg,
"CLIN_SIG": varObj.CLIN_SIG,
"LRT_Omega": varObj.LRT_Omega,
"LRT_score": varObj.LRT_score,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
# dbnsfp attributes
"GERPpp_NR": varObj.GERPpp_NR,
"DANN_score": varObj.DANN_score,
"FATHMM_pred": varObj.FATHMM_pred,
"FATHMM_score": varObj.FATHMM_score,
"GTEx_V8_gene": varObj.GTEx_V8_gene,
"GTEx_V8_tissue": varObj.GTEx_V8_tissue,
"Polyphen2_HDIV_score": varObj.Polyphen2_HDIV_score,
"Polyphen2_HVAR_score": varObj.Polyphen2_HVAR_score,
"REVEL_score": varObj.REVEL_score,
"SIFT_score": varObj.SIFT_score,
"clinvar_AlleleID": varObj.clinvar_AlleleID, # Clinvar allele ID from clinvar.vcf.gz
"clinvar_clnsig": varObj.clinvar_clnsig, # CL: Clinvar SIG from clinvar.vcf.gz
"clinvar_CLNREVSTAT": varObj.clinvar_CLNREVSTAT, # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
"clinvar_CLNSIGCONF": varObj.clinvar_CLNSIGCONF, # CL: Clinvar SIGCONF from clinvar.vcf.gz
"clin_code": varObj.clin_code, # CL: feature for ai
"fathmm_MKL_coding_score": varObj.fathmm_MKL_coding_score,
"LRT_score": varObj.LRT_score,
"LRT_Omega": varObj.LRT_Omega,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
"M_CAP_score": varObj.M_CAP_score,
"MutationAssessor_score": varObj.MutationAssessor_score,
"MutationTaster_score": varObj.MutationTaster_score,
"ESP6500_AA_AC": varObj.ESP6500_AA_AC,
"ESP6500_AA_AF": varObj.ESP6500_AA_AF,
"ESP6500_EA_AC": varObj.ESP6500_EA_AC,
"ESP6500_EA_AF": varObj.ESP6500_EA_AF,
# dbnsfp
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof, # O/E lof
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper, # O/E lof upper,
"IMPACT": varObj.IMPACT,
"Consequence": varObj.Consequence,
"omimVarFound": varObj.omimVarFound,
"omimGeneFound": varObj.omimGeneFound,
"omimDict": varObj.omimDict,
"omimGeneDict": varObj.omimGeneDict,
"omimAlleleDict": varObj.omimAlleleDict,
"phenoList": varObj.phenoList,
"phenoInhList": varObj.phenoInhList,
"phenoMimList": varObj.phenoMimList,
"clinVarVarFound": varObj.clinVarVarFound,
"clinVarVarDict": varObj.clinVarVarDict,
"clinVarGeneFound": varObj.clinVarGeneFound,
"clinVarGeneDict": varObj.clinVarGeneDict,
"clinvarTotalNumVars": varObj.clinvarTotalNumVars,
"clinvarNumP": varObj.clinvarNumP,
"clinvarNumLP": varObj.clinvarNumLP,
"clinvarNumLB": varObj.clinvarNumLB,
"clinvarNumB": varObj.clinvarNumB,
"clinvarTitle": varObj.clinvarTitle,
"clinvarSignDesc": varObj.clinvarSignDesc,
"clinvarCondition": varObj.clinvarCondition,
"hgmdVarFound": varObj.hgmdVarFound,
"hgmdGeneFound": varObj.hgmdGeneFound,
"hgmdVarPhenIdList": varObj.hgmdVarPhenIdList,
"hgmdVarHPOIdList": varObj.hgmdVarHPOIdList,
"hgmdVarHPOStrList": varObj.hgmdVarHPOStrList,
"varId_dash": varObj.varId_dash,
"dgvDictList": varObj.dgvDictList,
"dgvTypeList": varObj.dgvTypeList,
"dgvSubtypeList": varObj.dgvSubtypeList,
"dgvVarFound": varObj.dgvVarFound,
"decipherDictList": varObj.decipherDictList,
"decipherDeletionObsList": varObj.decipherDeletionObsList,
"decipherStudyList": varObj.decipherStudyList,
"decipherVarFound": varObj.decipherVarFound,
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof,
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper,
# symptom
"SymptomMatched": varObj.SymptomMatched,
"symptomScore": varObj.symptomScore,
"symptomName": varObj.symptomName,
"omimSymptomSimScore": varObj.omimSymptomSimScore,
"omimSymMatchFlag": varObj.omimSymMatchFlag,
"hgmdSymptomScore": varObj.hgmdSymptomScore,
"hgmdSymptomSimScore": varObj.hgmdSymptomSimScore,
"hgmdSymMatchFlag": varObj.hgmdSymMatchFlag,
"clinVarSymMatchFlag": varObj.clinVarSymMatchFlag,
"VARIANT_CLASS": varObj.VARIANT_CLASS,
"Feature": varObj.Feature,
"hom": varObj.hom,
"hgmd_rs": varObj.hgmd_rs,
"hgmd_id": varObj.hgmd_id, # CL added
"hgmd_symbol": varObj.hgmd_symbol, # CL added
"hgmd_PHEN": varObj.hgmd_PHEN, # CL added
"hgmd_CLASS": varObj.hgmd_CLASS, # CL added
"clin_dict": varObj.clin_dict,
"clin_PLP": varObj.clin_PLP,
"clin_PLP_perc": varObj.clin_PLP_perc,
"spliceAI": varObj.spliceAI,
"spliceAImax": varObj.spliceAImax,
複製
已複製
複製
已複製
"zyg": varObj.zyg,
annotateInfoDf = vepDf.apply(f1, axis=1, result_type='expand')
'geneEnsId': varObj.geneEnsId,
df = annotateInfoDf.apply(f2, axis=1, result_type='expand')
'rsId': varObj.rsId
annotateInfoDf[df.columns] = df
}
df = annotateInfoDf.apply(f3, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f4, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f5, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f6, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
return annotateInfoDf
已保存差異
原始文本
開啟檔案
def getAnnotateInfoRow_2(
    row,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Build the annotation record for a single VEP output row.

    The row is copied field by field onto a fresh ``Variant`` object, a few
    derived values are computed (variant IDs, ClinVar conflict summary,
    SpliceAI maximum), optional database lookups are run depending on
    ``moduleList``, and the collected attributes are returned as a flat dict.

    Args:
        row: One annotated variant. ``row[0]`` is the variant key, either
            ``'1_1588250_T_A'`` or ``'21_11039079_C/A'``; ``row[1]`` is the
            location, ``'1:10203-10204'`` or ``'1:10203'``. All other fields
            are read by attribute (``row.SYMBOL``, ``row.Feature``, ...).
        genomeRef: ``"hg38"`` stores the coordinates in ``hg38Chrom`` /
            ``hg38Pos``; any other value stores them in ``hg19Chrom`` /
            ``hg19Pos``.
        clinvarGeneDf: Passed through to ``getClinVarUsingMarrvelFlatFile``.
        clinvarAlleleDf: Passed through to ``getClinVarUsingMarrvelFlatFile``.
        omimGeneSortedDf: OMIM table indexed by gene symbol; each entry is
            read through its ``"allelicVariants"`` and ``"phenotypes"`` keys.
        omimAlleleList: Not used by this function.
        hgmdDf: Passed through to ``getHGMDUsingFlatFile``.
        moduleList: Container of enabled module names; ``"conserve"`` and
            ``"curate"`` are the ones checked here.
        decipherSortedDf: DECIPHER table looked up by
            ``(chrom, start, stop)``.
        gnomadMetricsGeneSortedDf: gnomAD gene-metrics table indexed by gene
            symbol with ``mis_z``, ``pLI``, ``oe_lof`` and ``oe_lof_upper``.

    Returns:
        dict: Annotation name -> value for this variant.

    Raises:
        ValueError / IndexError: If ``row[0]`` or ``row[1]`` does not follow
            one of the formats described above.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # --- variant key: '1_1588250_T_A' or '21_11039079_C/A' -----------------
    keyParts = row[0].split("_")
    chrom = keyParts[0]
    pos = int(keyParts[1])
    if "/" in row[0]:
        alleles = keyParts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = keyParts[2]
        alt = keyParts[3]

    # --- location: '1:10203-10204', or '1:10203' when start == stop --------
    span = row[1].split(":")[1]
    if "-" in row[1]:
        bounds = span.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        start = int(span)
        stop = int(span)

    # Map non-numeric chromosome names to numbers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: separate databases exist for hg19 and hg38, so the
    # coordinates are used as-is and no LiftOver conversion is performed.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG
    varObj.geneEnsId = row.Gene
    varObj.rsId = row.Existing_variation
    varObj.GERPpp_RS = row.GERPpp_RS
    varObj.featureType = row.Feature_type
    varObj.gnomadAF = row.gnomAD_AF
    varObj.gnomadAFg = row.gnomADg_AF
    varObj.CLIN_SIG = row.CLIN_SIG  # CL: useless but kept for now
    varObj.LRT_Omega = row.LRT_Omega
    varObj.LRT_score = row.LRT_score
    varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
    varObj.IMPACT = row.IMPACT
    varObj.Consequence = row.Consequence
    varObj.HGVSc = row.HGVSc
    varObj.HGVSp = row.HGVSp

    # dbnsfp attributes
    varObj.GERPpp_NR = row.GERPpp_NR
    varObj.DANN_score = row.DANN_score
    varObj.FATHMM_pred = row.FATHMM_pred
    varObj.FATHMM_score = row.FATHMM_score
    varObj.GTEx_V8_gene = row.GTEx_V8_gene
    varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
    varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
    varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
    varObj.REVEL_score = row.REVEL_score
    varObj.SIFT_score = row.SIFT_score

    # ClinVar fields taken from clinvar.vcf.gz
    varObj.clinvar_AlleleID = row.clinvar
    varObj.clinvar_clnsig = row.clinvar_CLNSIG
    varObj.clinvar_CLNREVSTAT = row.clinvar_CLNREVSTAT  # for interface only
    varObj.clinvar_CLNSIGCONF = row.clinvar_CLNSIGCONF
    varObj.clin_code = row.clinvar_CLNSIG  # CL: feature name for ai

    varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
    varObj.M_CAP_score = row.M_CAP_score
    varObj.MutationAssessor_score = row.MutationAssessor_score
    varObj.MutationTaster_score = row.MutationTaster_score
    varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
    varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
    varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
    varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
    varObj.VARIANT_CLASS = row.VARIANT_CLASS
    varObj.Feature = row.Feature
    varObj.hom = row.gnomADg_controls_nhomalt
    varObj.hgmd_id = row.hgmd  # CL added
    varObj.hgmd_symbol = row.hgmd_GENE  # CL added
    varObj.hgmd_rs = row.hgmd_RANKSCORE
    varObj.hgmd_PHEN = row.hgmd_PHEN  # CL added
    varObj.hgmd_CLASS = row.hgmd_CLASS  # CL added

    # --- ClinVar conflicting-interpretation summary -------------------------
    if row.clinvar_CLNSIGCONF != "-":
        clin_dict = dict()
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            label, _, rest = entry.partition("(")
            # Parse the whole number before ')'. Taking only the first
            # character would turn 'Pathogenic(12)' into a count of 1.
            clin_dict[label] = int(rest.split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # NOTE(review): this reads row.clinvar_clnsig (lower case) while the
        # rest of the function reads row.clinvar_CLNSIG -- confirm that both
        # columns exist in the input.
        # 'benign' is tested first, so 'Benign/Likely_pathogenic'-style
        # strings resolve to 0.
        if "benign" in row.clinvar_clnsig.lower():
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in row.clinvar_clnsig.lower():
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    # --- SpliceAI: keep the raw string and the max of fields 1..4 -----------
    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        fields = row.SpliceAI_pred.split("|")
        varObj.spliceAImax = max(
            float(fields[1]), float(fields[2]), float(fields[3]), float(fields[4])
        )
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    if "conserve" in moduleList:
        # The DGV lookup that used to live here is disabled; the dgv*
        # attributes returned below are whatever Variant() initialises.

        # --- DECIPHER -------------------------------------------------------
        decipherDeletionObsList = []
        decipherVarFound = 0
        deletionObs = "-"
        chromVal = int(varObj.chrom)
        startVal = int(varObj.start)
        stopVal = int(varObj.stop)
        # NOTE(review): if decipherSortedDf is a pandas DataFrame, `in`
        # tests column labels, not the index -- confirm the container type.
        if (chromVal, startVal, stopVal) in decipherSortedDf:
            vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
            decipherVarFound = 1
            deletionObs = vals.iloc[0]["deletion.obs"]
        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; the "-" default suggests the append is unconditional.
        decipherDeletionObsList.append(deletionObs)
        varObj.decipherDictList = []
        varObj.decipherDeletionObsList = decipherDeletionObsList
        varObj.decipherStudyList = []
        varObj.decipherVarFound = decipherVarFound

        # --- gnomAD gene-level constraint metrics ---------------------------
        if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
            val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
            varObj.gnomadGeneZscore = val["mis_z"]
            varObj.gnomadGenePLI = val["pLI"]
            varObj.gnomadGeneOELof = val["oe_lof"]  # O/E lof
            varObj.gnomadGeneOELofUpper = val["oe_lof_upper"]  # O/E lof upper
        else:
            varObj.gnomadGeneZscore = "-"
            varObj.gnomadGenePLI = "-"
            varObj.gnomadGeneOELof = "-"
            varObj.gnomadGeneOELofUpper = "-"

    if "curate" in moduleList:
        # --- OMIM -----------------------------------------------------------
        # Always build a list of IDs. Passing a bare string to set() would
        # compare single characters and a lone rsID could never match.
        inputSnpList = varObj.rsId.split(",")
        varFound = 0
        geneFound = 0
        omimGeneDict = {}
        phenoList = []
        phenoInhList = []
        phenoMimList = []
        if varObj.geneSymbol in omimGeneSortedDf.index:
            geneFound = 1
            omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
            snpList = [
                a["dbSnps"] for a in omimGeneDict["allelicVariants"] if "dbSnps" in a
            ]
            # The variant is "found" when any input rsID is listed by OMIM.
            varFound = 1 if set(inputSnpList).intersection(snpList) else 0
            for a in omimGeneDict["phenotypes"]:
                phenoList.append(a["phenotype"])
                phenoInhList.append(a.get("phenotypeInheritance", "-"))
                phenoMimList.append(str(a.get("phenotypeMimNumber", "-")))
        varObj.omimVarFound = varFound
        varObj.omimGeneFound = geneFound
        varObj.omimDict = {}
        varObj.omimGeneDict = omimGeneDict
        varObj.omimAlleleDict = {}
        varObj.phenoList = phenoList
        varObj.phenoInhList = phenoInhList
        varObj.phenoMimList = phenoMimList

        # --- ClinVar (MARRVEL flat file) ------------------------------------
        clinVarRet = getClinVarUsingMarrvelFlatFile(
            varObj, clinvarAlleleDf, clinvarGeneDf
        )
        varObj.clinVarVarFound = clinVarRet[0]
        varObj.clinVarVarDict = clinVarRet[1]
        varObj.clinVarGeneFound = clinVarRet[2]
        varObj.clinVarGeneDict = clinVarRet[3]
        varObj.clinvarTotalNumVars = clinVarRet[4]
        varObj.clinvarNumP = clinVarRet[5]
        varObj.clinvarNumLP = clinVarRet[6]
        varObj.clinvarNumLB = clinVarRet[7]
        varObj.clinvarNumB = clinVarRet[8]
        varObj.clinvarTitle = clinVarRet[9]
        # CL: element 10 is replaced by the clinvar.vcf.gz annotation.
        varObj.clinvarSignDesc = row.clinvar_CLNSIG
        varObj.clinvarCondition = clinVarRet[11]

        # --- HGMD -----------------------------------------------------------
        hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
        varObj.hgmdVarFound = hgmdRet[0]
        varObj.hgmdGeneFound = hgmdRet[1]
        varObj.hgmdVarPhenIdList = hgmdRet[2]
        varObj.hgmdVarHPOIdList = hgmdRet[3]
        varObj.hgmdVarHPOStrList = hgmdRet[4]

    # Attributes not assigned above (e.g. hg19Chrom under hg38, the dgv* and
    # symptom fields, or lookups skipped by moduleList) are read as Variant()
    # left them. Keys keep their original first-occurrence order; repeated
    # keys from the old literal were dropped (they carried identical values).
    return {
        "hg19Chrom": varObj.hg19Chrom,
        "hg19Pos": varObj.hg19Pos,
        "chrom": varObj.chrom,
        "pos": varObj.pos,
        "start": varObj.start,
        "stop": varObj.stop,
        "geneSymbol": varObj.geneSymbol,
        "CADD_phred": varObj.CADD_phred,
        "CADD_PHRED": varObj.CADD_PHRED,
        "ref": varObj.ref,
        "alt": varObj.alt,
        "varId": varObj.varId,
        "ZYG": varObj.zyg,
        "HGVSc": varObj.HGVSc,
        "HGVSp": varObj.HGVSp,
        "Gene": varObj.geneEnsId,
        "Existing_variation": varObj.rsId,
        "GERPpp_RS": varObj.GERPpp_RS,
        "Feature_type": varObj.featureType,
        "gnomadAF": varObj.gnomadAF,
        "gnomadAFg": varObj.gnomadAFg,
        "CLIN_SIG": varObj.CLIN_SIG,
        "LRT_Omega": varObj.LRT_Omega,
        "LRT_score": varObj.LRT_score,
        "phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
        # dbnsfp attributes
        "GERPpp_NR": varObj.GERPpp_NR,
        "DANN_score": varObj.DANN_score,
        "FATHMM_pred": varObj.FATHMM_pred,
        "FATHMM_score": varObj.FATHMM_score,
        "GTEx_V8_gene": varObj.GTEx_V8_gene,
        "GTEx_V8_tissue": varObj.GTEx_V8_tissue,
        "Polyphen2_HDIV_score": varObj.Polyphen2_HDIV_score,
        "Polyphen2_HVAR_score": varObj.Polyphen2_HVAR_score,
        "REVEL_score": varObj.REVEL_score,
        "SIFT_score": varObj.SIFT_score,
        "clinvar_AlleleID": varObj.clinvar_AlleleID,
        "clinvar_clnsig": varObj.clinvar_clnsig,
        "clinvar_CLNREVSTAT": varObj.clinvar_CLNREVSTAT,
        "clinvar_CLNSIGCONF": varObj.clinvar_CLNSIGCONF,
        "clin_code": varObj.clin_code,  # CL: feature for ai
        "fathmm_MKL_coding_score": varObj.fathmm_MKL_coding_score,
        "M_CAP_score": varObj.M_CAP_score,
        "MutationAssessor_score": varObj.MutationAssessor_score,
        "MutationTaster_score": varObj.MutationTaster_score,
        "ESP6500_AA_AC": varObj.ESP6500_AA_AC,
        "ESP6500_AA_AF": varObj.ESP6500_AA_AF,
        "ESP6500_EA_AC": varObj.ESP6500_EA_AC,
        "ESP6500_EA_AF": varObj.ESP6500_EA_AF,
        "gnomadGeneZscore": varObj.gnomadGeneZscore,
        "gnomadGenePLI": varObj.gnomadGenePLI,
        "gnomadGeneOELof": varObj.gnomadGeneOELof,  # O/E lof
        "gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper,  # O/E lof upper
        "IMPACT": varObj.IMPACT,
        "Consequence": varObj.Consequence,
        "omimVarFound": varObj.omimVarFound,
        "omimGeneFound": varObj.omimGeneFound,
        "omimDict": varObj.omimDict,
        "omimGeneDict": varObj.omimGeneDict,
        "omimAlleleDict": varObj.omimAlleleDict,
        "phenoList": varObj.phenoList,
        "phenoInhList": varObj.phenoInhList,
        "phenoMimList": varObj.phenoMimList,
        "clinVarVarFound": varObj.clinVarVarFound,
        "clinVarVarDict": varObj.clinVarVarDict,
        "clinVarGeneFound": varObj.clinVarGeneFound,
        "clinVarGeneDict": varObj.clinVarGeneDict,
        "clinvarTotalNumVars": varObj.clinvarTotalNumVars,
        "clinvarNumP": varObj.clinvarNumP,
        "clinvarNumLP": varObj.clinvarNumLP,
        "clinvarNumLB": varObj.clinvarNumLB,
        "clinvarNumB": varObj.clinvarNumB,
        "clinvarTitle": varObj.clinvarTitle,
        "clinvarSignDesc": varObj.clinvarSignDesc,
        "clinvarCondition": varObj.clinvarCondition,
        "hgmdVarFound": varObj.hgmdVarFound,
        "hgmdGeneFound": varObj.hgmdGeneFound,
        "hgmdVarPhenIdList": varObj.hgmdVarPhenIdList,
        "hgmdVarHPOIdList": varObj.hgmdVarHPOIdList,
        "hgmdVarHPOStrList": varObj.hgmdVarHPOStrList,
        "varId_dash": varObj.varId_dash,
        "dgvDictList": varObj.dgvDictList,
        "dgvTypeList": varObj.dgvTypeList,
        "dgvSubtypeList": varObj.dgvSubtypeList,
        "dgvVarFound": varObj.dgvVarFound,
        "decipherDictList": varObj.decipherDictList,
        "decipherDeletionObsList": varObj.decipherDeletionObsList,
        "decipherStudyList": varObj.decipherStudyList,
        "decipherVarFound": varObj.decipherVarFound,
        # symptom
        "SymptomMatched": varObj.SymptomMatched,
        "symptomScore": varObj.symptomScore,
        "symptomName": varObj.symptomName,
        "omimSymptomSimScore": varObj.omimSymptomSimScore,
        "omimSymMatchFlag": varObj.omimSymMatchFlag,
        "hgmdSymptomScore": varObj.hgmdSymptomScore,
        "hgmdSymptomSimScore": varObj.hgmdSymptomSimScore,
        "hgmdSymMatchFlag": varObj.hgmdSymMatchFlag,
        "clinVarSymMatchFlag": varObj.clinVarSymMatchFlag,
        "VARIANT_CLASS": varObj.VARIANT_CLASS,
        "Feature": varObj.Feature,
        "hom": varObj.hom,
        "hgmd_rs": varObj.hgmd_rs,
        "hgmd_id": varObj.hgmd_id,  # CL added
        "hgmd_symbol": varObj.hgmd_symbol,  # CL added
        "hgmd_PHEN": varObj.hgmd_PHEN,  # CL added
        "hgmd_CLASS": varObj.hgmd_CLASS,  # CL added
        "clin_dict": varObj.clin_dict,
        "clin_PLP": varObj.clin_PLP,
        "clin_PLP_perc": varObj.clin_PLP_perc,
        "spliceAI": varObj.spliceAI,
        "spliceAImax": varObj.spliceAImax,
        "zyg": varObj.zyg,
        "geneEnsId": varObj.geneEnsId,
        "rsId": varObj.rsId,
    }
更改後文本
開啟檔案
import re

from .utils_1 import Variant
from .utils_for_marrvel_flatfile import (
    getClinVarUsingMarrvelFlatFile,
    getHGMDUsingFlatFile,
    getAnnotateInfoRow_2,
)


def getAnnotateInfoRows_2(
    varDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate every row of ``varDf`` with ``getAnnotateInfoRow_2``.

    NOTE(JL): This is the old implementation and is no longer used. It is
    kept for tracing purposes only; feel free to remove it.

    Returns:
        DataFrame with one column per key of the per-row result dict.
    """

    def f(row):
        return getAnnotateInfoRow_2(
            row,
            genomeRef,
            clinvarGeneDf,
            clinvarAlleleDf,
            omimGeneSortedDf,
            omimAlleleList,
            hgmdHPOScoreDf,
            moduleList,
            decipherSortedDf,
            gnomadMetricsGeneSortedDf,
        )

    annotateInfoDf = varDf.apply(f, axis=1, result_type='expand')
    return annotateInfoDf


def getAnnotateInfoRow_3_1(row, genomeRef):
    """Copy one VEP row onto a ``Variant`` and return its attributes.

    Args:
        row: One annotated variant. ``row[0]`` is the variant key, either
            ``'1_1588250_T_A'`` or ``'21_11039079_C/A'``; ``row[1]`` is the
            location, ``'1:10203-10204'`` or ``'1:10203'``. Other fields are
            read by attribute.
        genomeRef: ``"hg38"`` stores the coordinates in ``hg38Chrom`` /
            ``hg38Pos``; any other value uses ``hg19Chrom`` / ``hg19Pos``.

    Returns:
        dict: ``vars()`` of the populated ``Variant`` instance.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # Variant key: '1_1588250_T_A' or '21_11039079_C/A'.
    keyParts = row[0].split("_")
    chrom = keyParts[0]
    pos = int(keyParts[1])
    if "/" in row[0]:
        alleles = keyParts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = keyParts[2]
        alt = keyParts[3]

    # Location: '1:10203-10204', or '1:10203' when start == stop.
    span = row[1].split(":")[1]
    if "-" in row[1]:
        bounds = span.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        start = int(span)
        stop = int(span)

    # Map non-numeric chromosome names to numbers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: separate databases exist for hg19 and hg38, so the
    # coordinates are used as-is and no LiftOver conversion is performed.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG
    varObj.geneEnsId = row.Gene
    varObj.rsId = row.Existing_variation
    varObj.GERPpp_RS = row.GERPpp_RS
    varObj.featureType = row.Feature_type
    varObj.gnomadAF = row.gnomAD_AF
    varObj.gnomadAFg = row.gnomADg_AF
    varObj.CLIN_SIG = row.CLIN_SIG  # CL: useless but kept for now
    varObj.LRT_Omega = row.LRT_Omega
    varObj.LRT_score = row.LRT_score
    varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
    varObj.IMPACT = row.IMPACT
    varObj.Consequence = row.Consequence
    varObj.HGVSc = row.HGVSc
    varObj.HGVSp = row.HGVSp

    # dbnsfp attributes
    varObj.GERPpp_NR = row.GERPpp_NR
    varObj.DANN_score = row.DANN_score
    varObj.FATHMM_pred = row.FATHMM_pred
    varObj.FATHMM_score = row.FATHMM_score
    varObj.GTEx_V8_gene = row.GTEx_V8_gene
    varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
    varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
    varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
    varObj.REVEL_score = row.REVEL_score
    varObj.SIFT_score = row.SIFT_score

    # ClinVar fields taken from clinvar.vcf.gz
    varObj.clinvar_AlleleID = row.clinvar
    varObj.clinvar_clnsig = row.clinvar_CLNSIG
    varObj.clinvar_CLNREVSTAT = row.clinvar_CLNREVSTAT  # for interface only
    varObj.clinvar_CLNSIGCONF = row.clinvar_CLNSIGCONF
    varObj.clin_code = row.clinvar_CLNSIG  # CL: feature name for ai

    varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
    varObj.M_CAP_score = row.M_CAP_score
    varObj.MutationAssessor_score = row.MutationAssessor_score
    varObj.MutationTaster_score = row.MutationTaster_score
    varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
    varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
    varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
    varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
    varObj.VARIANT_CLASS = row.VARIANT_CLASS
    varObj.Feature = row.Feature
    varObj.hom = row.gnomADg_controls_nhomalt
    varObj.hgmd_id = row.hgmd  # CL added
    varObj.hgmd_symbol = row.hgmd_GENE  # CL added
    varObj.hgmd_rs = row.hgmd_RANKSCORE
    varObj.hgmd_PHEN = row.hgmd_PHEN  # CL added
    varObj.hgmd_CLASS = row.hgmd_CLASS  # CL added

    # ClinVar conflicting-interpretation summary.
    if row.clinvar_CLNSIGCONF != "-":
        clin_dict = dict()
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            label, _, rest = entry.partition("(")
            # Parse the whole number before ')'. Taking only the first
            # character would turn 'Pathogenic(12)' into a count of 1.
            clin_dict[label] = int(rest.split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # NOTE(review): this reads row.clinvar_clnsig (lower case) while the
        # rest of the function reads row.clinvar_CLNSIG -- confirm that both
        # columns exist in the input.
        if "benign" in row.clinvar_clnsig.lower():
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in row.clinvar_clnsig.lower():
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    # SpliceAI: keep the raw string and the max of fields 1..4.
    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        fields = row.SpliceAI_pred.split("|")
        varObj.spliceAImax = max(
            float(fields[1]), float(fields[2]), float(fields[3]), float(fields[4])
        )
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    return vars(varObj)


def getAnnotateInfoRow_3_2(
    varObj,
    decipherSortedDf,
):
    """Look up the variant's ``(chrom, start, stop)`` in the DECIPHER table.

    Returns:
        dict with ``decipherDictList`` and ``decipherStudyList`` (always
        empty lists), ``decipherDeletionObsList`` and ``decipherVarFound``
        (1 when the key is present, else 0).
    """
    decipherVarFound = 0
    deletionObs = "-"
    chromVal = int(varObj.chrom)
    startVal = int(varObj.start)
    stopVal = int(varObj.stop)
    # NOTE(review): if decipherSortedDf is a pandas DataFrame, `in` tests
    # column labels, not the index -- confirm the container type.
    if (chromVal, startVal, stopVal) in decipherSortedDf:
        vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
        decipherVarFound = 1
        deletionObs = vals.iloc[0]["deletion.obs"]
    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # the "-" default suggests the value is recorded unconditionally.
    return {
        "decipherDictList": [],
        "decipherDeletionObsList": [deletionObs],
        "decipherStudyList": [],
        "decipherVarFound": decipherVarFound,
    }


def getAnnotateInfoRow_3_3(
    varObj,
    gnomadMetricsGeneSortedDf,
):
    """Fetch gnomAD gene-level metrics for ``varObj.geneSymbol``.

    Returns:
        dict with ``mis_z``, ``pLI``, ``oe_lof`` and ``oe_lof_upper`` under
        the ``gnomadGene*`` names, or ``"-"`` for each when the gene symbol
        is not in the table's index.
    """
    if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
        val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
        return {
            "gnomadGeneZscore": val["mis_z"],
            "gnomadGenePLI": val["pLI"],
            "gnomadGeneOELof": val["oe_lof"],  # O/E lof
            "gnomadGeneOELofUpper": val["oe_lof_upper"],  # O/E lof upper
        }
    return {
        "gnomadGeneZscore": "-",
        "gnomadGenePLI": "-",
        "gnomadGeneOELof": "-",
        "gnomadGeneOELofUpper": "-",
    }


def getAnnotateInfoRow_3_4(
    varObj,
    omimGeneSortedDf,
):
    """Collect OMIM gene/variant matches and phenotype info for a variant.

    Returns:
        dict with ``omimVarFound``, ``omimGeneFound``, ``omimDict``,
        ``omimGeneDict``, ``omimAlleleDict``, ``phenoList``,
        ``phenoInhList`` and ``phenoMimList``.
    """
    # Always build a list of IDs. Passing a bare string to set() would
    # compare single characters and a lone rsID could never match.
    inputSnpList = varObj.rsId.split(",")
    varFound = 0
    geneFound = 0
    omimGeneDict = {}
    phenoList = []
    phenoInhList = []
    phenoMimList = []
    if varObj.geneSymbol in omimGeneSortedDf.index:
        geneFound = 1
        omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
        snpList = [
            a["dbSnps"] for a in omimGeneDict["allelicVariants"] if "dbSnps" in a
        ]
        # The variant is "found" when any input rsID is listed by OMIM.
        varFound = 1 if set(inputSnpList).intersection(snpList) else 0
        for a in omimGeneDict["phenotypes"]:
            phenoList.append(a["phenotype"])
            phenoInhList.append(a.get("phenotypeInheritance", "-"))
            phenoMimList.append(str(a.get("phenotypeMimNumber", "-")))
    return {
        "omimVarFound": varFound,
        "omimGeneFound": geneFound,
        "omimDict": {},
        "omimGeneDict": omimGeneDict,
        "omimAlleleDict": {},
        "phenoList": phenoList,
        "phenoInhList": phenoInhList,
        "phenoMimList": phenoMimList,
    }


def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Run the MARRVEL ClinVar lookup and name its positional results.

    ``clinvarSignDesc`` is taken from ``varObj.clinvar_clnsig`` (the
    clinvar.vcf.gz annotation) instead of element 10 of the lookup result.
    """
    clinVarRet = getClinVarUsingMarrvelFlatFile(
        varObj, clinvarAlleleDf, clinvarGeneDf
    )
    return {
        "clinVarVarFound": clinVarRet[0],
        "clinVarVarDict": clinVarRet[1],
        "clinVarGeneFound": clinVarRet[2],
        "clinVarGeneDict": clinVarRet[3],
        "clinvarTotalNumVars": clinVarRet[4],
        "clinvarNumP": clinVarRet[5],
        "clinvarNumLP": clinVarRet[6],
        "clinvarNumLB": clinVarRet[7],
        "clinvarNumB": clinVarRet[8],
        "clinvarTitle": clinVarRet[9],
        # Set directly rather than by assigning into clinVarRet[10], so the
        # helper's return value is not mutated (and may be a tuple).
        "clinvarSignDesc": varObj.clinvar_clnsig,
        "clinvarCondition": clinVarRet[11],
    }


def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Run the HGMD flat-file lookup and name its positional results."""
    hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
    return {
        "hgmdVarFound": hgmdRet[0],
        "hgmdGeneFound": hgmdRet[1],
        "hgmdVarPhenIdList": hgmdRet[2],
        "hgmdVarHPOIdList": hgmdRet[3],
        "hgmdVarHPOStrList": hgmdRet[4],
    }


def getAnnotateInfoRows_3(
    vepDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate ``vepDf`` in six row-wise passes.

    Pass 1 builds the base annotation from the VEP row. Passes 2-6 each run
    one lookup on the growing annotation frame and write the resulting
    columns back. A pass whose module is not in ``moduleList`` returns the
    row unchanged, so it adds nothing.

    ``omimAlleleList`` is accepted for signature compatibility but unused.

    Returns:
        DataFrame with the base annotation columns plus the columns of
        every enabled pass.
    """

    def f1(row):
        return getAnnotateInfoRow_3_1(row, genomeRef)

    def f2(row):
        # NOTE(review): the legacy getAnnotateInfoRow_2 appears to run the
        # DECIPHER lookup under "conserve"; here it is gated on "curate" --
        # confirm which is intended.
        if "curate" not in moduleList:
            return row
        return getAnnotateInfoRow_3_2(row, decipherSortedDf)

    def f3(row):
        if "conserve" not in moduleList:
            return row
        return getAnnotateInfoRow_3_3(row, gnomadMetricsGeneSortedDf)

    def f4(row):
        if "curate" not in moduleList:
            return row
        return getAnnotateInfoRow_3_4(row, omimGeneSortedDf)

    def f5(row):
        if "curate" not in moduleList:
            return row
        return getAnnotateInfoRow_3_5(row, clinvarGeneDf, clinvarAlleleDf)

    def f6(row):
        if "curate" not in moduleList:
            return row
        return getAnnotateInfoRow_3_6(row, hgmdHPOScoreDf)

    annotateInfoDf = vepDf.apply(f1, axis=1, result_type='expand')
    # Order matters only in that every pass reads columns produced by f1.
    for step in (f2, f3, f4, f5, f6):
        df = annotateInfoDf.apply(step, axis=1, result_type='expand')
        annotateInfoDf[df.columns] = df
    return annotateInfoDf
尋找差異