Diff
checker
Texto
Texto
Imagens
Documentos
Excel
Pastas
Legal
Enterprise
Aplicativo para desktop
Preços
Fazer login
Baixar o Diffchecker Desktop
Comparar texto
Encontre a diferença entre dois arquivos de texto
Ferramentas
Histórico
Editor live
Recolher inalteradas
Sem quebra de linha
Layout
Dividido
Unificado
Nível de detalhe
Inteligente
Palavra
Caractere
Realce de sintaxe
Escolher sintaxe
Ignorar
Transformar texto
Ir à primeira mudança
Editar entrada
Diffchecker Desktop
A maneira mais segura de usar o Diffchecker. Obtenha o aplicativo Diffchecker Desktop: seus diffs nunca saem do seu computador!
Obter Desktop
Untitled diff
Criado
há 2 anos
O diff nunca expira
Limpar
Exportar
Compartilhar
Explicar
386 remoções
Linhas
Total
Removido
Caracteres
Total
Removido
Para continuar usando este recurso, atualize para
Diff
checker
Pro
Ver preços
563 linhas
Copiar tudo
272 adições
Linhas
Total
Adicionado
Caracteres
Total
Adicionado
Para continuar usando este recurso, atualize para
Diff
checker
Pro
Ver preços
464 linhas
Copiar tudo
Copiar
Copiado
Copiar
Copiado
import re
Copiar
Copiado
Copiar
Copiado
def getAnnotateInfoRow
_2(
from .utils_1 import Variant
row,
from .utils_for_marrvel_flatfile import (
genomeRef,
getClinVarUsingMarrvelFlatFile,
clinvarGeneDf,
getHGMDUsingFlatFile,
clinvarAlleleDf,
getAnnotateInfoRow_2,
omimGeneSortedDf,
)
omimAlleleList,
hgmdDf,
moduleList,
def getAnnotateInfoRow
s
_2(
decipherSortedDf,
varDf,
gnomadMetricsGeneSortedDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
):
Copiar
Copiado
Copiar
Copiado
# NOTE(JL): This is the old implementation and is no longer used.
# It is kept for tracing purposes; feel free to remove it.
def f(row):
return getAnnotateInfoRow_2(
row,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
)
annotateInfoDf = varDf.apply(f, axis=1, result_type='expand')
return annotateInfoDf
Copiar
Copiado
Copiar
Copiado
# CL 03-14-2023: commented all printing lines
# print('type of row:', type(row))
def getAnnotateInfoRow_3_1(row, genomeRef):
varObj = Variant()
varObj = Variant()
transcriptId = row.Feature
transcriptId = row.Feature
Copiar
Copiado
Copiar
Copiado
# s=row.Uploaded_variation.split('_') '1_10204_-/T' 1_1588250_T_A
####row[0]: 21_11039079_C/A
####s: ['21', '11039079', 'C/A']
# print('row[0]:', row[0])
# two ways of input of first column either 1_1588250_T_A OR 21_11039079_C/A, so use the option flag
optFlag = 0
optFlag = 0
if row[0].find("/") != -1:
if row[0].find("/") != -1:
optFlag = 1
optFlag = 1
if optFlag == 0:
if optFlag == 0:
s = row[0].split("_")
s = row[0].split("_")
# print('s:', s)
# print('s:', s)
chrom = s[0]
chrom = s[0]
pos = int(s[1])
pos = int(s[1])
ref = s[2]
ref = s[2]
alt = s[3]
alt = s[3]
elif optFlag == 1:
elif optFlag == 1:
s = row[0].split("_")
s = row[0].split("_")
# print('s:', s)
# print('s:', s)
chrom = s[0]
chrom = s[0]
pos = int(s[1])
pos = int(s[1])
s = s[2].split("/")
s = s[2].split("/")
ref = s[0]
ref = s[0]
alt = s[1]
alt = s[1]
# get the start and stop from second column like '1:10203-10204'
# get the start and stop from second column like '1:10203-10204'
if "-" in row[1]:
if "-" in row[1]:
s = row[1].split(":")
s = row[1].split(":")
tmp = s[1]
tmp = s[1]
s = tmp.split("-")
s = tmp.split("-")
# print('s:',s)
# print('s:',s)
start = int(s[0])
start = int(s[0])
stop = int(s[1])
stop = int(s[1])
else:
else:
# start and stop the same
# start and stop the same
s = row[1].split(":")
s = row[1].split(":")
start = int(s[1])
start = int(s[1])
stop = int(s[1])
stop = int(s[1])
# print('chrom:', chrom,'pos:',pos,'ref:',ref,'alt:',alt,'start:',start,'stop:',stop)
# print('chrom:', chrom,'pos:',pos,'ref:',ref,'alt:',alt,'start:',start,'stop:',stop)
# change chrom X and Y and MT to numbers
# change chrom X and Y and MT to numbers
if chrom == "X":
if chrom == "X":
chrom = 23
chrom = 23
elif chrom == "Y":
elif chrom == "Y":
chrom = 24
chrom = 24
elif chrom == "MT":
elif chrom == "MT":
chrom = 25
chrom = 25
elif re.search(r"GL", chrom):
elif re.search(r"GL", chrom):
chrom = 26
chrom = 26
chrom = int(chrom)
chrom = int(chrom)
# if it is hg38 get its hg19 coordinates
# if it is hg38 get its hg19 coordinates
# CL 03-14-2023: we have separate database for hg19 and hg38,
# CL 03-14-2023: we have separate database for hg19 and hg38,
# we don't need to use LiftOver which is inaccurate
# we don't need to use LiftOver which is inaccurate
# related codes commented and modified
# related codes commented and modified
if genomeRef == "hg38":
if genomeRef == "hg38":
varObj.hg38Chrom = chrom
varObj.hg38Chrom = chrom
varObj.hg38Pos = pos
varObj.hg38Pos = pos
varObj.chrom = chrom
varObj.chrom = chrom
varObj.pos = pos
varObj.pos = pos
varObj.start = start
varObj.start = start
varObj.stop = stop
varObj.stop = stop
Copiar
Copiado
Copiar
Copiado
"""
retList=gethg19LocFromHg38(chrom, pos)#called from the utils_1.py
# retList=[newChrom, newPos]
varObj.hg19Chrom=retList[0]
varObj.hg19Pos=retList[1]
varObj.chrom=retList[0]
varObj.pos=retList[1]
#get the start
retList=gethg19LocFromHg38(chrom, start)
varObj.start=int(retList[1])
#get the stop
retList=gethg19LocFromHg38(chrom, stop)
varObj.stop=int(retList[1])
"""
else:
else:
varObj.hg19Chrom = chrom
varObj.hg19Chrom = chrom
varObj.hg19Pos = pos
varObj.hg19Pos = pos
varObj.chrom = chrom
varObj.chrom = chrom
varObj.pos = pos
varObj.pos = pos
varObj.start = start
varObj.start = start
varObj.stop = stop
varObj.stop = stop
geneSymbol = row.SYMBOL
geneSymbol = row.SYMBOL
# print('gene:', geneSymbol)
# print('gene:', geneSymbol)
varObj.geneSymbol = geneSymbol
varObj.geneSymbol = geneSymbol
varObj.CADD_phred = row.CADD_phred
varObj.CADD_phred = row.CADD_phred
varObj.CADD_PHRED = row.CADD_PHRED
varObj.CADD_PHRED = row.CADD_PHRED
# assign
# assign
varObj.ref = ref
varObj.ref = ref
varObj.alt = alt
varObj.alt = alt
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
# print('varId dash:', varObj.varId_dash)
# print('varId dash:', varObj.varId_dash)
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varObj.varId = varId
varObj.varId = varId
if "ZYG" in row:
if "ZYG" in row:
varObj.zyg = row.ZYG
varObj.zyg = row.ZYG
varObj.geneEnsId = row.Gene
varObj.geneEnsId = row.Gene
varObj.rsId = row.Existing_variation
varObj.rsId = row.Existing_variation
varObj.GERPpp_RS = row.GERPpp_RS
varObj.GERPpp_RS = row.GERPpp_RS
varObj.featureType = row.Feature_type
varObj.featureType = row.Feature_type
varObj.gnomadAF = row.gnomAD_AF
varObj.gnomadAF = row.gnomAD_AF
varObj.gnomadAFg = row.gnomADg_AF
varObj.gnomadAFg = row.gnomADg_AF
varObj.CLIN_SIG = row.CLIN_SIG # CL: useless but kept for now
varObj.CLIN_SIG = row.CLIN_SIG # CL: useless but kept for now
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_score = row.LRT_score
varObj.LRT_score = row.LRT_score
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.IMPACT = row.IMPACT
varObj.IMPACT = row.IMPACT
varObj.Consequence = row.Consequence
varObj.Consequence = row.Consequence
varObj.HGVSc = row.HGVSc
varObj.HGVSc = row.HGVSc
varObj.HGVSp = row.HGVSp
varObj.HGVSp = row.HGVSp
# dbnsfp attributes
# dbnsfp attributes
varObj.GERPpp_NR = row.GERPpp_NR
varObj.GERPpp_NR = row.GERPpp_NR
varObj.DANN_score = row.DANN_score
varObj.DANN_score = row.DANN_score
varObj.FATHMM_pred = row.FATHMM_pred
varObj.FATHMM_pred = row.FATHMM_pred
varObj.FATHMM_score = row.FATHMM_score
varObj.FATHMM_score = row.FATHMM_score
varObj.GTEx_V8_gene = row.GTEx_V8_gene
varObj.GTEx_V8_gene = row.GTEx_V8_gene
varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
varObj.REVEL_score = row.REVEL_score
varObj.REVEL_score = row.REVEL_score
varObj.SIFT_score = row.SIFT_score
varObj.SIFT_score = row.SIFT_score
varObj.clinvar_AlleleID = row.clinvar # Clinvar allele ID from clinvar.vcf.gz
varObj.clinvar_AlleleID = row.clinvar # Clinvar allele ID from clinvar.vcf.gz
varObj.clinvar_clnsig = (
varObj.clinvar_clnsig = (
row.clinvar_CLNSIG
row.clinvar_CLNSIG
) # CL: Clinvar SIG from clinvar.vcf.gz
) # CL: Clinvar SIG from clinvar.vcf.gz
# varObj.clinvar_clnsig = row.clinvar_clnsig #CL: Clinvar SIG from VEP, deleted
# varObj.clinvar_clnsig = row.clinvar_clnsig #CL: Clinvar SIG from VEP, deleted
varObj.clinvar_CLNREVSTAT = (
varObj.clinvar_CLNREVSTAT = (
row.clinvar_CLNREVSTAT
row.clinvar_CLNREVSTAT
) # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
) # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
varObj.clinvar_CLNSIGCONF = (
varObj.clinvar_CLNSIGCONF = (
row.clinvar_CLNSIGCONF
row.clinvar_CLNSIGCONF
) # CL: Clinvar SIGCONF from clinvar.vcf.gz
) # CL: Clinvar SIGCONF from clinvar.vcf.gz
varObj.clin_code = row.clinvar_CLNSIG # CL: feature name for ai
varObj.clin_code = row.clinvar_CLNSIG # CL: feature name for ai
varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
varObj.LRT_score = row.LRT_score
varObj.LRT_score = row.LRT_score
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_Omega = row.LRT_Omega
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.M_CAP_score = row.M_CAP_score
varObj.M_CAP_score = row.M_CAP_score
varObj.MutationAssessor_score = row.MutationAssessor_score
varObj.MutationAssessor_score = row.MutationAssessor_score
varObj.MutationTaster_score = row.MutationTaster_score
varObj.MutationTaster_score = row.MutationTaster_score
varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
varObj.VARIANT_CLASS = row.VARIANT_CLASS
varObj.VARIANT_CLASS = row.VARIANT_CLASS
varObj.Feature = row.Feature
varObj.Feature = row.Feature
varObj.hom = row.gnomADg_controls_nhomalt
varObj.hom = row.gnomADg_controls_nhomalt
varObj.hgmd_id = row.hgmd # CL added
varObj.hgmd_id = row.hgmd # CL added
varObj.hgmd_symbol = row.hgmd_GENE # CL added
varObj.hgmd_symbol = row.hgmd_GENE # CL added
varObj.hgmd_rs = row.hgmd_RANKSCORE
varObj.hgmd_rs = row.hgmd_RANKSCORE
varObj.hgmd_PHEN = row.hgmd_PHEN # CL added
varObj.hgmd_PHEN = row.hgmd_PHEN # CL added
varObj.hgmd_CLASS = row.hgmd_CLASS # CL added
varObj.hgmd_CLASS = row.hgmd_CLASS # CL added
if row.clinvar_CLNSIGCONF != "-":
if row.clinvar_CLNSIGCONF != "-":
clin_dict = dict()
clin_dict = dict()
for ro in row.clinvar_CLNSIGCONF.split("|_"):
for ro in row.clinvar_CLNSIGCONF.split("|_"):
temp = ro.split("(")
temp = ro.split("(")
clin_dict[temp[0]] = int(temp[1][0])
clin_dict[temp[0]] = int(temp[1][0])
PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
"Likely_pathogenic", 0
"Likely_pathogenic", 0
)
)
varObj.clin_dict = clin_dict
varObj.clin_dict = clin_dict
varObj.clin_PLP = PLP_sum
varObj.clin_PLP = PLP_sum
varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
else:
else:
if "benign" in row.clinvar_clnsig.lower():
if "benign" in row.clinvar_clnsig.lower():
varObj.clin_PLP_perc = 0
varObj.clin_PLP_perc = 0
elif "pathogenic" in row.clinvar_clnsig.lower():
elif "pathogenic" in row.clinvar_clnsig.lower():
varObj.clin_PLP_perc = 1
varObj.clin_PLP_perc = 1
else:
else:
varObj.clin_PLP_perc = "-"
varObj.clin_PLP_perc = "-"
varObj.clin_PLP = "-"
varObj.clin_PLP = "-"
varObj.clin_dict = "-"
varObj.clin_dict = "-"
if row.SpliceAI_pred != "-":
if row.SpliceAI_pred != "-":
varObj.spliceAI = row.SpliceAI_pred
varObj.spliceAI = row.SpliceAI_pred
temp = row.SpliceAI_pred.split("|")
temp = row.SpliceAI_pred.split("|")
varObj.spliceAImax = max(
varObj.spliceAImax = max(
float(temp[1]), float(temp[2]), float(temp[3]), float(temp[4])
float(temp[1]), float(temp[2]), float(temp[3]), float(temp[4])
)
)
else:
else:
varObj.spliceAI = "-"
varObj.spliceAI = "-"
varObj.spliceAImax = "-"
varObj.spliceAImax = "-"
Copiar
Copiado
Copiar
Copiado
if "conserve" in moduleList:
return vars(varObj)
# # get dgv: 1.3s
# # print('\nGetting DGV')
# dgvDictList = []
# typeList = []
# subtypeList = []
# dgvVarFound = 0
# dgvType = "-"
# dgvSubtype = "-"
# chromVal = int(varObj.chrom)
# posVal = int(varObj.pos)
# startVal = int(varObj.start)
# stopVal = int(varObj.stop)
# # CL 03-14-2023: changed column names to be compatible with hg38
# # vals=dgvDf[ ( dgvDf['hg19Chr'] == chromVal ) & ( dgvDf['hg19Start']<=startVal ) & (dgvDf['hg19Stop']>=stopVal) ]
# vals = dgvSortedDf.loc[chromVal].loc[:(startVal+1)].loc[:stopVal]
# numRows = len(vals.index)
# if numRows > 0:
# dgvVarFound = 1
# # print('\tnumrows:',numRows)
# # print('\t type of vals:', type(vals))
# # print('\tvals:', vals)
# dgvType = vals.iloc[0]["type"]
# dgvSubtype = vals.iloc[0]["subType"]
# # print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
# # print('\tdgvVarFound:',dgvVarFound,'dgvType:', dgvType, 'dgvsubtype:', dgvSubtype)
# typeList.append(dgvType)
# subtypeList.append(dgvSubtype)
# retList = [dgvDictList, typeList, subtypeList, dgvVarFound]
Copiar
Copiado
Copiar
Copiado
# varObj.dgvDictList = retList[0]
# varObj.dgvTypeList = retList[1]
# varObj.dgvSubtypeList = retList[2]
# varObj.dgvVarFound = retList[3]
Copiar
Copiado
Copiar
Copiado
# get decipher: 0.6s
def getAnnotateInfoRow_3_2(
decipherDictList = []
varObj,
decipherDeletionObsList = []
decipherSortedDf,
decipherStudyList = []
):
decipherVarFound = 0
# get decipher: 0.6s
deletionObs = "-"
decipherDictList = []
# get the variant object info from varObj
decipherDeletionObsList = []
chromVal = int(varObj.chrom)
decipherStudyList = []
posVal = int(varObj.pos)
decipherVarFound = 0
startVal = int(varObj.start)
deletionObs = "-"
stopVal = int(varObj.stop)
# get the variant object info from varObj
chromVal = int(varObj.chrom)
# CL 03-14-2023: changed column names to be compatible with hg38
posVal = int(varObj.pos)
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
startVal = int(varObj.start)
if (chromVal, startVal, stopVal) in decipherSortedDf:
stopVal = int(varObj.stop)
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
Copiar
Copiado
Copiar
Copiado
decipherVarFound = 1
# CL 03-14-2023: changed column names to be compatible with hg38
deletionObs = vals.iloc[0]["deletion.obs"]
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
decipherDeletionObsList.append(deletionObs)
if (chromVal, startVal, stopVal) in decipherSortedDf:
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
Copiar
Copiado
Copiar
Copiado
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
decipherVarFound = 1
# print('\tdecipherVarFound:',decipherVarFound,'decipherDeletionObs:', deletionObs)
deletionObs = vals.iloc[0]["deletion.obs"]
retList = [
decipherDeletionObsList
.append(deletionObs)
decipherDictList,
decipherDeletionObsList
,
decipherStudyList,
decipherVarFound,
]
Copiar
Copiado
Copiar
Copiado
#
[decipherDictList,
decipherDeletionObs
List,decipherStudyList, decipherVarFound]
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
varObj.
decipherDictList
= retList[0]
#
print('\tdecipherVarFound:',decipherVarFound,'
decipherDeletionObs
:', deletionObs)
varObj.
decipherDeletionObsList
= retList[1]
retList = [
varObj.decipherStudyList = retList[2]
decipherDictList
,
varObj.decipherVarFound = retList[3]
decipherDeletionObsList
,
decipherStudyList,
# get gnomad gene metrics from gnomad file: 3.1s
decipherVarFound,
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
]
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
Copiar
Copiado
Copiar
Copiado
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
varObj.gnomadGeneZscore =
retList[0]
return {
varObj.gnomadGenePLI =
retList[1]
"decipherDictList":
retList[0]
,
varObj.gnomadGeneOELof =
retList[2]
# O/E lof
"decipherDeletionObsList":
retList[1]
,
varObj.gnomadGeneOELofUpper =
retList[3]
# O/E lof upper
"decipherStudyList":
retList[2]
,
"decipherVarFound":
retList[3]
,
}
Copiar
Copiado
Copiar
Copiado
if "curate" in moduleList:
# get
OMIM: 2s
def getAnnotateInfoRow_3_3(
# print('\nGetting OMIM')
varObj,
# varObj.omimList=jsonDict['omim']
gnomadMetricsGeneSortedDf,
# retList=[varFound, geneFound, omimDict,
omimGeneDict
,
omimAlleleDict
]
):
inputSnpList
= []
# get gnomad gene metrics from gnomad file: 3.1s
if "," in
varObj.
rsId
:
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
inputSnpList
=
varObj.
rsId.split(",")
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get
the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
return {
"gnomadGeneZscore": retList[0],
"gnomadGenePLI": retList[1],
"gnomadGeneOELof": retList[2], # O/E lof
"gnomadGeneOELofUpper": retList[3], # O/E lof upper
}
def getAnnotateInfoRow_3_4(
varObj,
omimGeneSortedDf,
):
# get OMIM: 2s
inputSnpList = []
if "," in varObj.rsId:
inputSnpList = varObj.rsId.split(",")
else:
inputSnpList = varObj.rsId
varFound = 0
geneFound = 0
omimDict = {}
omimGeneDict
= {}
omimAlleleDict
= {}
phenoList = []
phenoInhList = []
phenoMimList
= []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if
varObj.
geneSymbol in omimGeneSortedDf.index
:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict
=
omimGeneSortedDf.loc[
varObj.
geneSymbol]
snpList = []
for a in omimGeneDict["allelicVariants"]:
if "dbSnps" in a:
snpList.append(a["dbSnps"])
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
Copiar
Copiado
Copiar
Copiado
inputSnpList = varObj.rsId
varFound = 0
# print('\tinputSnpList:', inputSnpList)
varFound = 0
# get disease info from OMIM
geneFound = 0
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
omimDict = {}
for a in omimGeneDict["
phenotypes
"]:
omimGeneDict = {}
# print('type:', type(a))
omimAlleleDict = {}
pheno = a["phenotype"]
phenoList = []
if "
phenotypeMimNumber
" in a:
phenoInhList = []
phenoMim = a["phenotypeMimNumber"]
phenoMimList = []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if varObj.geneSymbol in omimGeneSortedDf.index:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
snpList = []
for a in omimGeneDict["
allelicVariants
"]:
# print('a:', a)
# print('type:', type(a))
if "
dbSnps
" in a:
snpList.append(a["dbSnps"])
# print('\tsnpList:', snpList)
# print('\tlen snpList:', len(snpList))
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
Copiar
Copiado
Copiar
Copiado
varFound = 0
phenoMim = "-"
if "phenotypeInheritance" in a:
# get disease info from OMIM
phenoInh = a["phenotypeInheritance"]
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
else:
for a in omimGeneDict["phenotypes"]:
phenoInh = "-"
# print('type:', type(a))
phenoList.append(pheno)
pheno = a["phenotype"]
phenoInhList.append(phenoInh)
if "phenotypeMimNumber" in a:
phenoMimList.append(str(phenoMim))
phenoMim = a["phenotypeMimNumber"]
# print('phenotype:', pheno,phenoMim,phenoInh)
else:
phenoMim = "-"
if "phenotypeInheritance" in a:
phenoInh = a["phenotypeInheritance"]
else:
phenoInh = "-"
phenoList.append(pheno)
phenoInhList.append(phenoInh)
phenoMimList.append(str(phenoMim))
# print('phenotype:', pheno,phenoMim,phenoInh)
# print('\tvarFound:', varFound)
# print('\tphenoList:', phenoList)
# print('\tphenoInhList:', phenoInhList)
# print('\tphenoMimList:', phenoMimList)
omimRet = [
varFound,
geneFound,
omimDict,
omimGeneDict,
omimAlleleDict,
phenoList,
phenoInhList,
phenoMimList,
]
Copiar
Copiado
Copiar
Copiado
varObj.omimVarFound = omimRet[0]
omimRet = [
varObj.omimG
eneFound
= omimRet[1]
varFound,
varObj.
omimDict
= omimRet[2]
g
eneFound
,
varObj.
omimGeneDict
= omimRet[3]
omimDict
,
varObj.
omimAlleleDict
= omimRet[4]
omimGeneDict
,
varObj.
phenoList
= omimRet[5]
omimAlleleDict
,
varObj.
phenoInhList
= omimRet[6]
phenoList
,
varObj.
phenoMimList
= omimRet[7]
phenoInhList
,
# print('OMIM res:')
phenoMimList
,
# print('\tgeneFound:',varObj.omimGeneFound,'varFound:',varObj.omimVarFound )
]
Copiar
Copiado
Copiar
Copiado
# get clinvar: 0.1s
return {
# print('\nReading clinVar')
"omimVarFound": omimRet[0],
clinVarRet = getClinVarUsingMarrvelFlatFile(
"omimGeneFound": omimRet[1],
varObj, clinvarAlleleDf, clinvarGeneDf
"omimDict": omimRet[2],
"omimGeneDict": omimRet[3],
"omimAlleleDict": omimRet[4],
"phenoList": omimRet[5],
"phenoInhList": omimRet[6],
"phenoMimList": omimRet[7],
}
def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Return the ClinVar annotations for one variant as a labelled dict.

    The lookup itself is delegated to ``getClinVarUsingMarrvelFlatFile``;
    this function only names the first twelve entries of its result.

    Args:
        varObj: Variant record; must expose ``clinvar_clnsig`` and whatever
            ``getClinVarUsingMarrvelFlatFile`` reads from it.
        clinvarGeneDf: Passed through to the lookup helper (third argument).
        clinvarAlleleDf: Passed through to the lookup helper (second argument).

    Returns:
        dict mapping the ``clinVar*`` / ``clinvar*`` column names to entries
        0-11 of the helper's result, in that order.
    """
    lookup = getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf, clinvarGeneDf)

    # CL: changed to clinvar.vcf.gz annotation -- slot 10 (the significance
    # description) is replaced by the value already stored on the variant.
    lookup[10] = varObj.clinvar_clnsig

    # Output keys, listed in the same order as the helper's result slots.
    columnNames = (
        "clinVarVarFound",
        "clinVarVarDict",
        "clinVarGeneFound",
        "clinVarGeneDict",
        "clinvarTotalNumVars",
        "clinvarNumP",
        "clinvarNumLP",
        "clinvarNumLB",
        "clinvarNumB",
        "clinvarTitle",
        "clinvarSignDesc",
        "clinvarCondition",
    )
    return {name: lookup[slot] for slot, name in enumerate(columnNames)}
def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Return the HGMD annotations for one variant as a labelled dict.

    The lookup itself is delegated to ``getHGMDUsingFlatFile``; this function
    only names the first five entries of its result.

    Args:
        varObj: Variant record forwarded unchanged to the lookup helper.
        hgmdHPOScoreDf: HGMD table forwarded unchanged to the lookup helper.

    Returns:
        dict with the keys ``hgmdVarFound``, ``hgmdGeneFound``,
        ``hgmdVarPhenIdList``, ``hgmdVarHPOIdList`` and ``hgmdVarHPOStrList``,
        mapped to entries 0-4 of the helper's result.
    """
    hgmdResult = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)

    # Output keys, listed in the same order as the helper's result slots.
    columnNames = (
        "hgmdVarFound",
        "hgmdGeneFound",
        "hgmdVarPhenIdList",
        "hgmdVarHPOIdList",
        "hgmdVarHPOStrList",
    )
    return {name: hgmdResult[slot] for slot, name in enumerate(columnNames)}
def getAnnotateInfoRows_3(
vepDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
def f1(row):
return getAnnotateInfoRow_3_1(row, genomeRef)
def f2(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_2(row, decipherSortedDf)
def f3(row):
if "conserve" not in moduleList:
return row
return getAnnotateInfoRow_3_3(row, gnomadMetricsGeneSortedDf)
def f4(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_4(row, omimGeneSortedDf)
def f5(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_5(row, clinvarGeneDf, clinvarAlleleDf)
def f6(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_6(
row, hgmdHPOScoreDf
)
)
Copiar
Copiado
Copiar
Copiado
varObj.clinVarVarFound = clinVarRet[0]
varObj.clinVarVarDict = clinVarRet[1]
varObj.clinVarGeneFound = clinVarRet[2]
varObj.clinVarGeneDict = clinVarRet[3]
varObj.clinvarTotalNumVars = clinVarRet[4]
varObj.clinvarNumP = clinVarRet[5]
varObj.clinvarNumLP = clinVarRet[6]
varObj.clinvarNumLB = clinVarRet[7]
varObj.clinvarNumB = clinVarRet[8]
varObj.clinvarTitle = clinVarRet[9]
varObj.clinvarSignDesc = (
row.clinvar_CLNSIG
) # clinVarRet[10] #CL: changed to clinvar.vcf.gz annotation
varObj.clinvarCondition = clinVarRet[11]
# print('clinVar res:')
"""
if debugFlag==1:
print('\tgeneFound::',varObj.clinVarGeneFound,'varFound:',varObj.clinVarVarFound)
print('\tnumVars:',varObj.clinvarTotalNumVars,'numPathologic:',varObj.clinvarNumP,'numBenign:',varObj.clinvarNumB)
print('\tsignDesc:', varObj.clinvarSignDesc)
"""
# get HGMD: 0.3s
if "curate" in moduleList:
# print('\nReading HGMD')
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
# hgmdVarFound,hgmdGeneFound,hgmdVarPhenIdList,hgmdVarHPOIdList,hgmdVarHPOStrList
varObj.hgmdVarFound = hgmdRet[0]
varObj.hgmdGeneFound = hgmdRet[1]
varObj.hgmdVarPhenIdList = hgmdRet[2]
varObj.hgmdVarHPOIdList = hgmdRet[3]
varObj.hgmdVarHPOStrList = hgmdRet[4]
# print('HGMD results:')
# print('\thgmdVarFound:',varObj.hgmdVarFound,'hgmdGeneFound:',varObj.hgmdGeneFound,
# 'hgmdVarPhenIdList:',varObj.hgmdVarPhenIdList,'hgmdVarHPOIdList:',
# varObj.hgmdVarHPOIdList,
# 'hgmdVarHPOStrList:',varObj.hgmdVarHPOStrList)
return {
"hg19Chrom": varObj.hg19Chrom,
"hg19Pos": varObj.hg19Pos,
"chrom": varObj.chrom,
"pos": varObj.pos,
"start": varObj.start,
"stop": varObj.stop,
"geneSymbol": varObj.geneSymbol,
"CADD_phred": varObj.CADD_phred,
"CADD_PHRED": varObj.CADD_PHRED,
"ref": varObj.ref,
"alt": varObj.alt,
"varId": varObj.varId,
"ZYG": varObj.zyg,
"HGVSc": varObj.HGVSc,
"HGVSp": varObj.HGVSp,
"Gene": varObj.geneEnsId,
"Existing_variation": varObj.rsId,
"GERPpp_RS": varObj.GERPpp_RS,
"Feature_type": varObj.featureType,
"gnomadAF": varObj.gnomadAF,
"gnomadAFg": varObj.gnomadAFg,
"CLIN_SIG": varObj.CLIN_SIG,
"LRT_Omega": varObj.LRT_Omega,
"LRT_score": varObj.LRT_score,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
# dbnsfp attributes
"GERPpp_NR": varObj.GERPpp_NR,
"DANN_score": varObj.DANN_score,
"FATHMM_pred": varObj.FATHMM_pred,
"FATHMM_score": varObj.FATHMM_score,
"GTEx_V8_gene": varObj.GTEx_V8_gene,
"GTEx_V8_tissue": varObj.GTEx_V8_tissue,
"Polyphen2_HDIV_score": varObj.Polyphen2_HDIV_score,
"Polyphen2_HVAR_score": varObj.Polyphen2_HVAR_score,
"REVEL_score": varObj.REVEL_score,
"SIFT_score": varObj.SIFT_score,
"clinvar_AlleleID": varObj.clinvar_AlleleID, # Clinvar allele ID from clinvar.vcf.gz
"clinvar_clnsig": varObj.clinvar_clnsig, # CL: Clinvar SIG from clinvar.vcf.gz
"clinvar_CLNREVSTAT": varObj.clinvar_CLNREVSTAT, # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
"clinvar_CLNSIGCONF": varObj.clinvar_CLNSIGCONF, # CL: Clinvar SIGCONF from clinvar.vcf.gz
"clin_code": varObj.clin_code, # CL: feature for ai
"fathmm_MKL_coding_score": varObj.fathmm_MKL_coding_score,
"LRT_score": varObj.LRT_score,
"LRT_Omega": varObj.LRT_Omega,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
"M_CAP_score": varObj.M_CAP_score,
"MutationAssessor_score": varObj.MutationAssessor_score,
"MutationTaster_score": varObj.MutationTaster_score,
"ESP6500_AA_AC": varObj.ESP6500_AA_AC,
"ESP6500_AA_AF": varObj.ESP6500_AA_AF,
"ESP6500_EA_AC": varObj.ESP6500_EA_AC,
"ESP6500_EA_AF": varObj.ESP6500_EA_AF,
# dbnsfp
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof, # O/E lof
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper, # O/E lof upper,
"IMPACT": varObj.IMPACT,
"Consequence": varObj.Consequence,
"omimVarFound": varObj.omimVarFound,
"omimGeneFound": varObj.omimGeneFound,
"omimDict": varObj.omimDict,
"omimGeneDict": varObj.omimGeneDict,
"omimAlleleDict": varObj.omimAlleleDict,
"phenoList": varObj.phenoList,
"phenoInhList": varObj.phenoInhList,
"phenoMimList": varObj.phenoMimList,
"clinVarVarFound": varObj.clinVarVarFound,
"clinVarVarDict": varObj.clinVarVarDict,
"clinVarGeneFound": varObj.clinVarGeneFound,
"clinVarGeneDict": varObj.clinVarGeneDict,
"clinvarTotalNumVars": varObj.clinvarTotalNumVars,
"clinvarNumP": varObj.clinvarNumP,
"clinvarNumLP": varObj.clinvarNumLP,
"clinvarNumLB": varObj.clinvarNumLB,
"clinvarNumB": varObj.clinvarNumB,
"clinvarTitle": varObj.clinvarTitle,
"clinvarSignDesc": varObj.clinvarSignDesc,
"clinvarCondition": varObj.clinvarCondition,
"hgmdVarFound": varObj.hgmdVarFound,
"hgmdGeneFound": varObj.hgmdGeneFound,
"hgmdVarPhenIdList": varObj.hgmdVarPhenIdList,
"hgmdVarHPOIdList": varObj.hgmdVarHPOIdList,
"hgmdVarHPOStrList": varObj.hgmdVarHPOStrList,
"varId_dash": varObj.varId_dash,
"dgvDictList": varObj.dgvDictList,
"dgvTypeList": varObj.dgvTypeList,
"dgvSubtypeList": varObj.dgvSubtypeList,
"dgvVarFound": varObj.dgvVarFound,
"decipherDictList": varObj.decipherDictList,
"decipherDeletionObsList": varObj.decipherDeletionObsList,
"decipherStudyList": varObj.decipherStudyList,
"decipherVarFound": varObj.decipherVarFound,
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof,
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper,
# symptom
"SymptomMatched": varObj.SymptomMatched,
"symptomScore": varObj.symptomScore,
"symptomName": varObj.symptomName,
"omimSymptomSimScore": varObj.omimSymptomSimScore,
"omimSymMatchFlag": varObj.omimSymMatchFlag,
"hgmdSymptomScore": varObj.hgmdSymptomScore,
"hgmdSymptomSimScore": varObj.hgmdSymptomSimScore,
"hgmdSymMatchFlag": varObj.hgmdSymMatchFlag,
"clinVarSymMatchFlag": varObj.clinVarSymMatchFlag,
"VARIANT_CLASS": varObj.VARIANT_CLASS,
"Feature": varObj.Feature,
"hom": varObj.hom,
"hgmd_rs": varObj.hgmd_rs,
"hgmd_id": varObj.hgmd_id, # CL added
"hgmd_symbol": varObj.hgmd_symbol, # CL added
"hgmd_PHEN": varObj.hgmd_PHEN, # CL added
"hgmd_CLASS": varObj.hgmd_CLASS, # CL added
"clin_dict": varObj.clin_dict,
"clin_PLP": varObj.clin_PLP,
"clin_PLP_perc": varObj.clin_PLP_perc,
"spliceAI": varObj.spliceAI,
"spliceAImax": varObj.spliceAImax,
Copiar
Copiado
Copiar
Copiado
"zyg": varObj.zyg,
annotateInfoDf = vepDf.apply(f1, axis=1, result_type='expand')
'geneEnsId': varObj.geneEnsId,
df = annotateInfoDf.apply(f2, axis=1, result_type='expand')
'rsId': varObj.rsId
annotateInfoDf[df.columns] = df
}
df = annotateInfoDf.apply(f3, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f4, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f5, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f6, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
return annotateInfoDf
Diferenças salvas
Texto original
Abrir arquivo
def getAnnotateInfoRow_2(
    row,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Build the annotation record for a single variant row.

    Args:
        row: One annotated-variant row. Field 0 is the variant id, written
            either as ``"1_1588250_T_A"`` or as ``"21_11039079_C/A"``;
            field 1 is the location, ``"1:10203-10204"`` or ``"1:10203"``.
            All other fields are read by name (``row.SYMBOL``,
            ``row.Feature``, ``row.clinvar_CLNSIGCONF``, ...).
        genomeRef: ``"hg38"`` stores chromosome/position in the hg38 fields of
            the variant object; any other value stores them in the hg19
            fields. No coordinate conversion is performed.
        clinvarGeneDf: Passed through to ``getClinVarUsingMarrvelFlatFile``.
        clinvarAlleleDf: Passed through to ``getClinVarUsingMarrvelFlatFile``.
        omimGeneSortedDf: Table indexed by gene symbol whose entries provide
            ``"allelicVariants"`` and ``"phenotypes"``.
        omimAlleleList: Not used by this function; kept so the signature stays
            compatible with existing callers.
        hgmdDf: Passed through to ``getHGMDUsingFlatFile``.
        moduleList: Names of the enabled modules. ``"conserve"`` enables the
            DECIPHER and gnomAD gene-metric lookups, ``"curate"`` enables the
            OMIM, ClinVar and HGMD lookups.
        decipherSortedDf: DECIPHER table addressed by ``(chrom, start, stop)``.
        gnomadMetricsGeneSortedDf: Table indexed by gene symbol with the
            columns ``mis_z``, ``pLI``, ``oe_lof`` and ``oe_lof_upper``.

    Returns:
        dict: Column name -> value for this variant. Attributes that are not
        assigned here (for example the symptom and DGV fields, or the fields
        of a disabled module) are read back from the ``Variant`` object as-is
        -- presumably its constructor defaults; confirm against ``Variant``.
    """
    # NOTE(review): the block nesting below was reconstructed from a source
    # whose line breaks and indentation had been lost -- confirm the extent of
    # the "conserve" / "curate" sections against the canonical file.
    varObj = Variant()
    transcriptId = row.Feature

    # Variant id: "<chrom>_<pos>_<ref>_<alt>" or "<chrom>_<pos>_<ref>/<alt>".
    idParts = row[0].split("_")
    chrom = idParts[0]
    pos = int(idParts[1])
    if "/" in row[0]:
        alleles = idParts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = idParts[2]
        alt = idParts[3]

    # Location: "<chrom>:<start>-<stop>", or "<chrom>:<pos>" when both agree.
    location = row[1].split(":")[1]
    if "-" in row[1]:
        bounds = location.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        start = stop = int(location)

    # Encode the non-numeric chromosomes as numbers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: hg19 and hg38 have separate databases, so the (inaccurate)
    # LiftOver conversion is no longer used; coordinates are stored unchanged.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG
    varObj.geneEnsId = row.Gene
    varObj.rsId = row.Existing_variation
    varObj.GERPpp_RS = row.GERPpp_RS
    varObj.featureType = row.Feature_type
    varObj.gnomadAF = row.gnomAD_AF
    varObj.gnomadAFg = row.gnomADg_AF
    varObj.CLIN_SIG = row.CLIN_SIG  # CL: useless but kept for now
    varObj.LRT_Omega = row.LRT_Omega
    varObj.LRT_score = row.LRT_score
    varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
    varObj.IMPACT = row.IMPACT
    varObj.Consequence = row.Consequence
    varObj.HGVSc = row.HGVSc
    varObj.HGVSp = row.HGVSp

    # dbnsfp attributes
    varObj.GERPpp_NR = row.GERPpp_NR
    varObj.DANN_score = row.DANN_score
    varObj.FATHMM_pred = row.FATHMM_pred
    varObj.FATHMM_score = row.FATHMM_score
    varObj.GTEx_V8_gene = row.GTEx_V8_gene
    varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
    varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
    varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
    varObj.REVEL_score = row.REVEL_score
    varObj.SIFT_score = row.SIFT_score
    varObj.clinvar_AlleleID = row.clinvar  # Clinvar allele ID from clinvar.vcf.gz
    varObj.clinvar_clnsig = row.clinvar_CLNSIG  # CL: Clinvar SIG from clinvar.vcf.gz
    # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
    varObj.clinvar_CLNREVSTAT = row.clinvar_CLNREVSTAT
    varObj.clinvar_CLNSIGCONF = row.clinvar_CLNSIGCONF  # CL: from clinvar.vcf.gz
    varObj.clin_code = row.clinvar_CLNSIG  # CL: feature name for ai
    varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
    varObj.M_CAP_score = row.M_CAP_score
    varObj.MutationAssessor_score = row.MutationAssessor_score
    varObj.MutationTaster_score = row.MutationTaster_score
    varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
    varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
    varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
    varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
    varObj.VARIANT_CLASS = row.VARIANT_CLASS
    varObj.Feature = row.Feature
    varObj.hom = row.gnomADg_controls_nhomalt
    varObj.hgmd_id = row.hgmd  # CL added
    varObj.hgmd_symbol = row.hgmd_GENE  # CL added
    varObj.hgmd_rs = row.hgmd_RANKSCORE
    varObj.hgmd_PHEN = row.hgmd_PHEN  # CL added
    varObj.hgmd_CLASS = row.hgmd_CLASS  # CL added

    if row.clinvar_CLNSIGCONF != "-":
        # Entries look like "Pathogenic(3)"; tally the submissions per label.
        clin_dict = {}
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            fields = entry.split("(")
            # Read the whole number up to ")" so multi-digit counts are not
            # truncated to their first digit.
            clin_dict[fields[0]] = int(fields[1].split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # No conflict breakdown: derive the fraction from the overall call.
        # NOTE(review): this reads row.clinvar_clnsig (lower case) while the
        # rest of the function reads row.clinvar_CLNSIG -- confirm the column.
        clnsig = row.clinvar_clnsig.lower()
        if "benign" in clnsig:
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in clnsig:
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        # Fields 1-4 of the "|"-separated prediction are numeric scores.
        scores = row.SpliceAI_pred.split("|")
        varObj.spliceAImax = max(
            float(scores[1]), float(scores[2]), float(scores[3]), float(scores[4])
        )
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    if "conserve" in moduleList:
        # get decipher: 0.6s
        decipherDeletionObsList = []
        decipherVarFound = 0
        deletionObs = "-"
        chromVal = int(varObj.chrom)
        startVal = int(varObj.start)
        stopVal = int(varObj.stop)
        # NOTE(review): if decipherSortedDf is a pandas DataFrame, `in` tests
        # its column labels, not its row index (the gnomAD lookup below uses
        # `.index`) -- confirm this membership test does what is intended.
        if (chromVal, startVal, stopVal) in decipherSortedDf:
            vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
            decipherVarFound = 1
            deletionObs = vals.iloc[0]["deletion.obs"]
        decipherDeletionObsList.append(deletionObs)
        varObj.decipherDictList = []
        varObj.decipherDeletionObsList = decipherDeletionObsList
        varObj.decipherStudyList = []
        varObj.decipherVarFound = decipherVarFound

        # get gnomad gene metrics from gnomad file: 3.1s
        if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
            val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
            varObj.gnomadGeneZscore = val["mis_z"]
            varObj.gnomadGenePLI = val["pLI"]
            varObj.gnomadGeneOELof = val["oe_lof"]  # O/E lof
            varObj.gnomadGeneOELofUpper = val["oe_lof_upper"]  # O/E lof upper
        else:
            varObj.gnomadGeneZscore = "-"
            varObj.gnomadGenePLI = "-"
            varObj.gnomadGeneOELof = "-"
            varObj.gnomadGeneOELofUpper = "-"

    if "curate" in moduleList:
        # get OMIM: 2s
        # Always work with a list of ids: passing the bare string to set()
        # would compare single characters instead of rs ids.
        inputSnpList = varObj.rsId.split(",")
        varFound = 0
        geneFound = 0
        omimGeneDict = {}
        phenoList = []
        phenoInhList = []
        phenoMimList = []
        if varObj.geneSymbol in omimGeneSortedDf.index:
            geneFound = 1
            omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
            snpList = [
                a["dbSnps"] for a in omimGeneDict["allelicVariants"] if "dbSnps" in a
            ]
            # The variant is known to OMIM when any input id is listed there.
            if set(inputSnpList).intersection(snpList):
                varFound = 1
            # get disease info from OMIM
            for a in omimGeneDict["phenotypes"]:
                phenoList.append(a["phenotype"])
                phenoInhList.append(a.get("phenotypeInheritance", "-"))
                phenoMimList.append(str(a.get("phenotypeMimNumber", "-")))
        varObj.omimVarFound = varFound
        varObj.omimGeneFound = geneFound
        varObj.omimDict = {}
        varObj.omimGeneDict = omimGeneDict
        varObj.omimAlleleDict = {}
        varObj.phenoList = phenoList
        varObj.phenoInhList = phenoInhList
        varObj.phenoMimList = phenoMimList

        # get clinvar: 0.1s
        clinVarRet = getClinVarUsingMarrvelFlatFile(
            varObj, clinvarAlleleDf, clinvarGeneDf
        )
        varObj.clinVarVarFound = clinVarRet[0]
        varObj.clinVarVarDict = clinVarRet[1]
        varObj.clinVarGeneFound = clinVarRet[2]
        varObj.clinVarGeneDict = clinVarRet[3]
        varObj.clinvarTotalNumVars = clinVarRet[4]
        varObj.clinvarNumP = clinVarRet[5]
        varObj.clinvarNumLP = clinVarRet[6]
        varObj.clinvarNumLB = clinVarRet[7]
        varObj.clinvarNumB = clinVarRet[8]
        varObj.clinvarTitle = clinVarRet[9]
        # CL: element 10 is replaced by the clinvar.vcf.gz annotation
        varObj.clinvarSignDesc = row.clinvar_CLNSIG
        varObj.clinvarCondition = clinVarRet[11]

    # get HGMD: 0.3s
    if "curate" in moduleList:
        hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
        varObj.hgmdVarFound = hgmdRet[0]
        varObj.hgmdGeneFound = hgmdRet[1]
        varObj.hgmdVarPhenIdList = hgmdRet[2]
        varObj.hgmdVarHPOIdList = hgmdRet[3]
        varObj.hgmdVarHPOStrList = hgmdRet[4]

    return {
        "hg19Chrom": varObj.hg19Chrom,
        "hg19Pos": varObj.hg19Pos,
        "chrom": varObj.chrom,
        "pos": varObj.pos,
        "start": varObj.start,
        "stop": varObj.stop,
        "geneSymbol": varObj.geneSymbol,
        "CADD_phred": varObj.CADD_phred,
        "CADD_PHRED": varObj.CADD_PHRED,
        "ref": varObj.ref,
        "alt": varObj.alt,
        "varId": varObj.varId,
        "ZYG": varObj.zyg,
        "HGVSc": varObj.HGVSc,
        "HGVSp": varObj.HGVSp,
        "Gene": varObj.geneEnsId,
        "Existing_variation": varObj.rsId,
        "GERPpp_RS": varObj.GERPpp_RS,
        "Feature_type": varObj.featureType,
        "gnomadAF": varObj.gnomadAF,
        "gnomadAFg": varObj.gnomadAFg,
        "CLIN_SIG": varObj.CLIN_SIG,
        "LRT_Omega": varObj.LRT_Omega,
        "LRT_score": varObj.LRT_score,
        "phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
        # dbnsfp attributes
        "GERPpp_NR": varObj.GERPpp_NR,
        "DANN_score": varObj.DANN_score,
        "FATHMM_pred": varObj.FATHMM_pred,
        "FATHMM_score": varObj.FATHMM_score,
        "GTEx_V8_gene": varObj.GTEx_V8_gene,
        "GTEx_V8_tissue": varObj.GTEx_V8_tissue,
        "Polyphen2_HDIV_score": varObj.Polyphen2_HDIV_score,
        "Polyphen2_HVAR_score": varObj.Polyphen2_HVAR_score,
        "REVEL_score": varObj.REVEL_score,
        "SIFT_score": varObj.SIFT_score,
        "clinvar_AlleleID": varObj.clinvar_AlleleID,  # from clinvar.vcf.gz
        "clinvar_clnsig": varObj.clinvar_clnsig,  # CL: from clinvar.vcf.gz
        "clinvar_CLNREVSTAT": varObj.clinvar_CLNREVSTAT,  # CL: for interface only
        "clinvar_CLNSIGCONF": varObj.clinvar_CLNSIGCONF,  # CL: from clinvar.vcf.gz
        "clin_code": varObj.clin_code,  # CL: feature for ai
        "fathmm_MKL_coding_score": varObj.fathmm_MKL_coding_score,
        "M_CAP_score": varObj.M_CAP_score,
        "MutationAssessor_score": varObj.MutationAssessor_score,
        "MutationTaster_score": varObj.MutationTaster_score,
        "ESP6500_AA_AC": varObj.ESP6500_AA_AC,
        "ESP6500_AA_AF": varObj.ESP6500_AA_AF,
        "ESP6500_EA_AC": varObj.ESP6500_EA_AC,
        "ESP6500_EA_AF": varObj.ESP6500_EA_AF,
        # gnomAD gene metrics
        "gnomadGeneZscore": varObj.gnomadGeneZscore,
        "gnomadGenePLI": varObj.gnomadGenePLI,
        "gnomadGeneOELof": varObj.gnomadGeneOELof,  # O/E lof
        "gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper,  # O/E lof upper
        "IMPACT": varObj.IMPACT,
        "Consequence": varObj.Consequence,
        "omimVarFound": varObj.omimVarFound,
        "omimGeneFound": varObj.omimGeneFound,
        "omimDict": varObj.omimDict,
        "omimGeneDict": varObj.omimGeneDict,
        "omimAlleleDict": varObj.omimAlleleDict,
        "phenoList": varObj.phenoList,
        "phenoInhList": varObj.phenoInhList,
        "phenoMimList": varObj.phenoMimList,
        "clinVarVarFound": varObj.clinVarVarFound,
        "clinVarVarDict": varObj.clinVarVarDict,
        "clinVarGeneFound": varObj.clinVarGeneFound,
        "clinVarGeneDict": varObj.clinVarGeneDict,
        "clinvarTotalNumVars": varObj.clinvarTotalNumVars,
        "clinvarNumP": varObj.clinvarNumP,
        "clinvarNumLP": varObj.clinvarNumLP,
        "clinvarNumLB": varObj.clinvarNumLB,
        "clinvarNumB": varObj.clinvarNumB,
        "clinvarTitle": varObj.clinvarTitle,
        "clinvarSignDesc": varObj.clinvarSignDesc,
        "clinvarCondition": varObj.clinvarCondition,
        "hgmdVarFound": varObj.hgmdVarFound,
        "hgmdGeneFound": varObj.hgmdGeneFound,
        "hgmdVarPhenIdList": varObj.hgmdVarPhenIdList,
        "hgmdVarHPOIdList": varObj.hgmdVarHPOIdList,
        "hgmdVarHPOStrList": varObj.hgmdVarHPOStrList,
        "varId_dash": varObj.varId_dash,
        "dgvDictList": varObj.dgvDictList,
        "dgvTypeList": varObj.dgvTypeList,
        "dgvSubtypeList": varObj.dgvSubtypeList,
        "dgvVarFound": varObj.dgvVarFound,
        "decipherDictList": varObj.decipherDictList,
        "decipherDeletionObsList": varObj.decipherDeletionObsList,
        "decipherStudyList": varObj.decipherStudyList,
        "decipherVarFound": varObj.decipherVarFound,
        # symptom
        "SymptomMatched": varObj.SymptomMatched,
        "symptomScore": varObj.symptomScore,
        "symptomName": varObj.symptomName,
        "omimSymptomSimScore": varObj.omimSymptomSimScore,
        "omimSymMatchFlag": varObj.omimSymMatchFlag,
        "hgmdSymptomScore": varObj.hgmdSymptomScore,
        "hgmdSymptomSimScore": varObj.hgmdSymptomSimScore,
        "hgmdSymMatchFlag": varObj.hgmdSymMatchFlag,
        "clinVarSymMatchFlag": varObj.clinVarSymMatchFlag,
        "VARIANT_CLASS": varObj.VARIANT_CLASS,
        "Feature": varObj.Feature,
        "hom": varObj.hom,
        "hgmd_rs": varObj.hgmd_rs,
        "hgmd_id": varObj.hgmd_id,  # CL added
        "hgmd_symbol": varObj.hgmd_symbol,  # CL added
        "hgmd_PHEN": varObj.hgmd_PHEN,  # CL added
        "hgmd_CLASS": varObj.hgmd_CLASS,  # CL added
        "clin_dict": varObj.clin_dict,
        "clin_PLP": varObj.clin_PLP,
        "clin_PLP_perc": varObj.clin_PLP_perc,
        "spliceAI": varObj.spliceAI,
        "spliceAImax": varObj.spliceAImax,
        "zyg": varObj.zyg,
        "geneEnsId": varObj.geneEnsId,
        "rsId": varObj.rsId,
    }
Texto alterado
Abrir arquivo
import re

from .utils_1 import Variant
from .utils_for_marrvel_flatfile import (
    getClinVarUsingMarrvelFlatFile,
    getHGMDUsingFlatFile,
    getAnnotateInfoRow_2,
)


def getAnnotateInfoRows_2(
    varDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate every row of ``varDf`` with ``getAnnotateInfoRow_2``.

    NOTE(JL): This is the old implementation and is not used. It is kept for
    tracing purposes; feel free to remove it.

    Returns:
        The frame produced by expanding each row's annotation dict into
        columns.
    """

    def f(row):
        return getAnnotateInfoRow_2(
            row,
            genomeRef,
            clinvarGeneDf,
            clinvarAlleleDf,
            omimGeneSortedDf,
            omimAlleleList,
            hgmdHPOScoreDf,
            moduleList,
            decipherSortedDf,
            gnomadMetricsGeneSortedDf,
        )

    return varDf.apply(f, axis=1, result_type="expand")


def getAnnotateInfoRow_3_1(row, genomeRef):
    """Parse one variant row into the base fields of a ``Variant``.

    Args:
        row: One annotated-variant row. Field 0 is the variant id, written
            either as ``"1_1588250_T_A"`` or as ``"21_11039079_C/A"``;
            field 1 is the location, ``"1:10203-10204"`` or ``"1:10203"``.
            All other fields are read by name.
        genomeRef: ``"hg38"`` stores chromosome/position in the hg38 fields;
            any other value stores them in the hg19 fields.

    Returns:
        dict: ``vars()`` of the populated ``Variant`` object, i.e. every
        instance attribute it holds after parsing.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # Variant id: "<chrom>_<pos>_<ref>_<alt>" or "<chrom>_<pos>_<ref>/<alt>".
    idParts = row[0].split("_")
    chrom = idParts[0]
    pos = int(idParts[1])
    if "/" in row[0]:
        alleles = idParts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = idParts[2]
        alt = idParts[3]

    # Location: "<chrom>:<start>-<stop>", or "<chrom>:<pos>" when both agree.
    location = row[1].split(":")[1]
    if "-" in row[1]:
        bounds = location.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        start = stop = int(location)

    # Encode the non-numeric chromosomes as numbers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: hg19 and hg38 have separate databases, so the (inaccurate)
    # LiftOver conversion is no longer used; coordinates are stored unchanged.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG
    varObj.geneEnsId = row.Gene
    varObj.rsId = row.Existing_variation
    varObj.GERPpp_RS = row.GERPpp_RS
    varObj.featureType = row.Feature_type
    varObj.gnomadAF = row.gnomAD_AF
    varObj.gnomadAFg = row.gnomADg_AF
    varObj.CLIN_SIG = row.CLIN_SIG  # CL: useless but kept for now
    varObj.LRT_Omega = row.LRT_Omega
    varObj.LRT_score = row.LRT_score
    varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
    varObj.IMPACT = row.IMPACT
    varObj.Consequence = row.Consequence
    varObj.HGVSc = row.HGVSc
    varObj.HGVSp = row.HGVSp

    # dbnsfp attributes
    varObj.GERPpp_NR = row.GERPpp_NR
    varObj.DANN_score = row.DANN_score
    varObj.FATHMM_pred = row.FATHMM_pred
    varObj.FATHMM_score = row.FATHMM_score
    varObj.GTEx_V8_gene = row.GTEx_V8_gene
    varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
    varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
    varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
    varObj.REVEL_score = row.REVEL_score
    varObj.SIFT_score = row.SIFT_score
    varObj.clinvar_AlleleID = row.clinvar  # Clinvar allele ID from clinvar.vcf.gz
    varObj.clinvar_clnsig = row.clinvar_CLNSIG  # CL: Clinvar SIG from clinvar.vcf.gz
    # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
    varObj.clinvar_CLNREVSTAT = row.clinvar_CLNREVSTAT
    varObj.clinvar_CLNSIGCONF = row.clinvar_CLNSIGCONF  # CL: from clinvar.vcf.gz
    varObj.clin_code = row.clinvar_CLNSIG  # CL: feature name for ai
    varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
    varObj.M_CAP_score = row.M_CAP_score
    varObj.MutationAssessor_score = row.MutationAssessor_score
    varObj.MutationTaster_score = row.MutationTaster_score
    varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
    varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
    varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
    varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
    varObj.VARIANT_CLASS = row.VARIANT_CLASS
    varObj.Feature = row.Feature
    varObj.hom = row.gnomADg_controls_nhomalt
    varObj.hgmd_id = row.hgmd  # CL added
    varObj.hgmd_symbol = row.hgmd_GENE  # CL added
    varObj.hgmd_rs = row.hgmd_RANKSCORE
    varObj.hgmd_PHEN = row.hgmd_PHEN  # CL added
    varObj.hgmd_CLASS = row.hgmd_CLASS  # CL added

    if row.clinvar_CLNSIGCONF != "-":
        # Entries look like "Pathogenic(3)"; tally the submissions per label.
        clin_dict = {}
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            fields = entry.split("(")
            # Read the whole number up to ")" so multi-digit counts are not
            # truncated to their first digit.
            clin_dict[fields[0]] = int(fields[1].split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # No conflict breakdown: derive the fraction from the overall call.
        # NOTE(review): this reads row.clinvar_clnsig (lower case) while the
        # rest of the function reads row.clinvar_CLNSIG -- confirm the column.
        clnsig = row.clinvar_clnsig.lower()
        if "benign" in clnsig:
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in clnsig:
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        # Fields 1-4 of the "|"-separated prediction are numeric scores.
        scores = row.SpliceAI_pred.split("|")
        varObj.spliceAImax = max(
            float(scores[1]), float(scores[2]), float(scores[3]), float(scores[4])
        )
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    return vars(varObj)


def getAnnotateInfoRow_3_2(
    varObj,
    decipherSortedDf,
):
    """Look up the variant's ``(chrom, start, stop)`` in the DECIPHER table.

    Returns:
        dict with ``decipherDictList`` and ``decipherStudyList`` (always
        empty lists), ``decipherDeletionObsList`` (one element: the
        ``"deletion.obs"`` value of the first match, or ``"-"``) and
        ``decipherVarFound`` (1 when matched, else 0).
    """
    # get decipher: 0.6s
    key = (int(varObj.chrom), int(varObj.start), int(varObj.stop))
    decipherVarFound = 0
    deletionObs = "-"
    # NOTE(review): if decipherSortedDf is a pandas DataFrame, `in` tests its
    # column labels, not its row index (the gnomAD lookup uses `.index`) --
    # confirm this membership test does what is intended.
    if key in decipherSortedDf:
        vals = decipherSortedDf.loc[key]
        decipherVarFound = 1
        deletionObs = vals.iloc[0]["deletion.obs"]
    return {
        "decipherDictList": [],
        "decipherDeletionObsList": [deletionObs],
        "decipherStudyList": [],
        "decipherVarFound": decipherVarFound,
    }


def getAnnotateInfoRow_3_3(
    varObj,
    gnomadMetricsGeneSortedDf,
):
    """Fetch the gnomAD gene-level metrics for the variant's gene symbol.

    Returns:
        dict with ``gnomadGeneZscore`` (``mis_z``), ``gnomadGenePLI``
        (``pLI``), ``gnomadGeneOELof`` (``oe_lof``) and
        ``gnomadGeneOELofUpper`` (``oe_lof_upper``); every value is ``"-"``
        when the gene is not in the table's index.
    """
    # get gnomad gene metrics from gnomad file: 3.1s
    if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
        val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
        return {
            "gnomadGeneZscore": val["mis_z"],
            "gnomadGenePLI": val["pLI"],
            "gnomadGeneOELof": val["oe_lof"],  # O/E lof
            "gnomadGeneOELofUpper": val["oe_lof_upper"],  # O/E lof upper
        }
    return {
        "gnomadGeneZscore": "-",
        "gnomadGenePLI": "-",
        "gnomadGeneOELof": "-",
        "gnomadGeneOELofUpper": "-",
    }


def getAnnotateInfoRow_3_4(
    varObj,
    omimGeneSortedDf,
):
    """Collect OMIM gene, allele and phenotype information for the variant.

    Returns:
        dict with ``omimVarFound`` / ``omimGeneFound`` flags (0 or 1),
        ``omimGeneDict`` (the OMIM entry of the gene, or ``{}``), the always
        empty ``omimDict`` / ``omimAlleleDict``, and the parallel lists
        ``phenoList``, ``phenoInhList`` and ``phenoMimList``.
    """
    # get OMIM: 2s
    # Always work with a list of ids: passing the bare string to set() would
    # compare single characters instead of rs ids.
    inputSnpList = varObj.rsId.split(",")
    varFound = 0
    geneFound = 0
    omimGeneDict = {}
    phenoList = []
    phenoInhList = []
    phenoMimList = []
    if varObj.geneSymbol in omimGeneSortedDf.index:
        geneFound = 1
        omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
        snpList = [
            a["dbSnps"] for a in omimGeneDict["allelicVariants"] if "dbSnps" in a
        ]
        # The variant is known to OMIM when any input id is listed there.
        if set(inputSnpList).intersection(snpList):
            varFound = 1
        # get disease info from OMIM
        for a in omimGeneDict["phenotypes"]:
            phenoList.append(a["phenotype"])
            phenoInhList.append(a.get("phenotypeInheritance", "-"))
            phenoMimList.append(str(a.get("phenotypeMimNumber", "-")))
    return {
        "omimVarFound": varFound,
        "omimGeneFound": geneFound,
        "omimDict": {},
        "omimGeneDict": omimGeneDict,
        "omimAlleleDict": {},
        "phenoList": phenoList,
        "phenoInhList": phenoInhList,
        "phenoMimList": phenoMimList,
    }


def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Run the ClinVar flat-file lookup and name its positional results.

    Element 10 of the helper's result is not used: ``clinvarSignDesc`` is
    taken from ``varObj.clinvar_clnsig`` instead (CL: changed to the
    clinvar.vcf.gz annotation).
    """
    clinVarRet = getClinVarUsingMarrvelFlatFile(
        varObj, clinvarAlleleDf, clinvarGeneDf
    )
    return {
        "clinVarVarFound": clinVarRet[0],
        "clinVarVarDict": clinVarRet[1],
        "clinVarGeneFound": clinVarRet[2],
        "clinVarGeneDict": clinVarRet[3],
        "clinvarTotalNumVars": clinVarRet[4],
        "clinvarNumP": clinVarRet[5],
        "clinvarNumLP": clinVarRet[6],
        "clinvarNumLB": clinVarRet[7],
        "clinvarNumB": clinVarRet[8],
        "clinvarTitle": clinVarRet[9],
        # Read directly instead of overwriting clinVarRet[10] in place, so the
        # helper's return value is left untouched.
        "clinvarSignDesc": varObj.clinvar_clnsig,
        "clinvarCondition": clinVarRet[11],
    }


def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Run the HGMD flat-file lookup and name its positional results."""
    hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
    return {
        "hgmdVarFound": hgmdRet[0],
        "hgmdGeneFound": hgmdRet[1],
        "hgmdVarPhenIdList": hgmdRet[2],
        "hgmdVarHPOIdList": hgmdRet[3],
        "hgmdVarHPOStrList": hgmdRet[4],
    }


def getAnnotateInfoRows_3(
    vepDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate every row of ``vepDf`` in stages.

    The rows are first parsed with ``getAnnotateInfoRow_3_1``; each enabled
    stage is then applied row-wise to the intermediate frame and its output
    columns are written back into that frame.

    Args:
        vepDf: Frame of annotated variants, one variant per row.
        genomeRef: Forwarded to ``getAnnotateInfoRow_3_1``.
        clinvarGeneDf, clinvarAlleleDf: Forwarded to the ClinVar stage.
        omimGeneSortedDf: Forwarded to the OMIM stage.
        omimAlleleList: Not used; kept for interface compatibility.
        hgmdHPOScoreDf: Forwarded to the HGMD stage.
        moduleList: Names of the enabled modules. ``"conserve"`` enables the
            DECIPHER and gnomAD stages, ``"curate"`` the OMIM, ClinVar and
            HGMD stages.
        decipherSortedDf: Forwarded to the DECIPHER stage.
        gnomadMetricsGeneSortedDf: Forwarded to the gnomAD stage.

    Returns:
        The annotated frame.
    """
    annotateInfoDf = vepDf.apply(
        lambda row: getAnnotateInfoRow_3_1(row, genomeRef),
        axis=1,
        result_type="expand",
    )

    # (module that enables the stage, row-wise stage function)
    # The DECIPHER stage is gated on "conserve": that is the section it lives
    # in inside getAnnotateInfoRow_2, which this staged pipeline mirrors.
    stages = (
        ("conserve", lambda row: getAnnotateInfoRow_3_2(row, decipherSortedDf)),
        (
            "conserve",
            lambda row: getAnnotateInfoRow_3_3(row, gnomadMetricsGeneSortedDf),
        ),
        ("curate", lambda row: getAnnotateInfoRow_3_4(row, omimGeneSortedDf)),
        (
            "curate",
            lambda row: getAnnotateInfoRow_3_5(row, clinvarGeneDf, clinvarAlleleDf),
        ),
        ("curate", lambda row: getAnnotateInfoRow_3_6(row, hgmdHPOScoreDf)),
    )
    for module, annotate in stages:
        # A disabled stage contributes nothing, so skip the row-wise pass
        # altogether instead of copying every row back onto itself.
        if module not in moduleList:
            continue
        df = annotateInfoDf.apply(annotate, axis=1, result_type="expand")
        annotateInfoDf[df.columns] = df
    return annotateInfoDf
Encontrar Diferença