Diff
checker
Testo
Testo
Immagini
Documenti
Excel
Cartelle
Legal
Enterprise
Applicazione per desktop
Prezzi
Accedi
Scarica Diffchecker Desktop
Confronta il testo
Trova la differenza tra due file di testo
Strumenti
Cronologia
Editor live
Comprimi invariate
Senza a capo
Layout
Diviso
Unificato
Livello di dettaglio
Intelligente
Parola
Carattere
Evidenziazione sintassi
Scegli sintassi
Ignora
Trasforma testo
Vai alla prima modifica
Modifica input
Diffchecker Desktop
Il modo più sicuro per usare Diffchecker. Ottieni l'app Diffchecker Desktop: i tuoi diff non lasciano mai il tuo computer!
Ottieni Desktop
Untitled diff
Creato
2 anni fa
Il diff non scade mai
Eliminare
Esporta
Condividere
Spiegare
386 rimozioni
Linee
Totale
Rimosso
Caratteri
Totale
Rimosso
Per continuare a utilizzare questa funzione, aggiorna a
Diff
checker
Pro
Visualizza prezzi
563 linee
Copia tutti
272 aggiunte
Linee
Totale
Aggiunto
Caratteri
Totale
Aggiunto
Per continuare a utilizzare questa funzione, aggiorna a
Diff
checker
Pro
Visualizza prezzi
464 linee
Copia tutti
Copia
Copiato
Copia
Copiato
import re
Copia
Copiato
Copia
Copiato
def getAnnotateInfoRow
_2(
from .utils_1 import Variant
row,
from .utils_for_marrvel_flatfile import (
genomeRef,
getClinVarUsingMarrvelFlatFile,
clinvarGeneDf,
getHGMDUsingFlatFile,
clinvarAlleleDf,
getAnnotateInfoRow_2,
omimGeneSortedDf,
)
omimAlleleList,
hgmdDf,
moduleList,
def getAnnotateInfoRow
s
_2(
decipherSortedDf,
varDf,
gnomadMetricsGeneSortedDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
):
Copia
Copiato
Copia
Copiato
# NOTE(JL): This is the old implementation and is no longer used.
# It is kept for tracing purposes only; feel free to remove it.
def f(row):
return getAnnotateInfoRow_2(
row,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
)
annotateInfoDf = varDf.apply(f, axis=1, result_type='expand')
return annotateInfoDf
Copia
Copiato
Copia
Copiato
# CL 03-14-2023: commented all printing lines
# print('type of row:', type(row))
def getAnnotateInfoRow_3_1(row, genomeRef):
    """Build a ``Variant`` from one annotated record and return its attributes.

    Args:
        row: One record. ``row[0]`` holds the variant identifier, written
            either as ``1_1588250_T_A`` or as ``21_11039079_C/A``; ``row[1]``
            holds the location, written as ``1:10203-10204`` or ``1:10203``.
            All other values are read by attribute name (``row.SYMBOL``,
            ``row.Feature``, ...). ``ZYG`` is optional.
        genomeRef: Genome build. ``"hg38"`` fills ``hg38Chrom``/``hg38Pos``;
            any other value fills ``hg19Chrom``/``hg19Pos``.

    Returns:
        ``vars(varObj)`` -- the attribute dictionary of the populated
        ``Variant`` instance.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # Two layouts exist for the first field: "1_1588250_T_A" and
    # "21_11039079_C/A". Chromosome and position are parsed the same way in
    # both; only the ref/alt part differs.
    idParts = row[0].split("_")
    chrom = idParts[0]
    pos = int(idParts[1])
    if row[0].find("/") != -1:
        alleles = idParts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = idParts[2]
        alt = idParts[3]

    # Start and stop come from the second field, e.g. "1:10203-10204".
    # Without a "-" the start and stop are the same position.
    hasRange = "-" in row[1]
    location = row[1].split(":")[1]
    if hasRange:
        bounds = location.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        start = int(location)
        stop = int(location)

    # Change chrom X, Y and MT to numbers; any name containing "GL" becomes 26.
    specialChroms = {"X": 23, "Y": 24, "MT": 25}
    if chrom in specialChroms:
        chrom = specialChroms[chrom]
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: there are separate databases for hg19 and hg38, so the
    # coordinates are stored as given and no LiftOver conversion is applied.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG

    # (Variant attribute, row field) pairs that are copied over unchanged.
    # The order is kept because it determines the key order of the result.
    copiedFields = (
        ("geneEnsId", "Gene"),
        ("rsId", "Existing_variation"),
        ("GERPpp_RS", "GERPpp_RS"),
        ("featureType", "Feature_type"),
        ("gnomadAF", "gnomAD_AF"),
        ("gnomadAFg", "gnomADg_AF"),
        ("CLIN_SIG", "CLIN_SIG"),  # CL: useless but kept for now
        ("LRT_Omega", "LRT_Omega"),
        ("LRT_score", "LRT_score"),
        ("phyloP100way_vertebrate", "phyloP100way_vertebrate"),
        ("IMPACT", "IMPACT"),
        ("Consequence", "Consequence"),
        ("HGVSc", "HGVSc"),
        ("HGVSp", "HGVSp"),
        # dbnsfp attributes
        ("GERPpp_NR", "GERPpp_NR"),
        ("DANN_score", "DANN_score"),
        ("FATHMM_pred", "FATHMM_pred"),
        ("FATHMM_score", "FATHMM_score"),
        ("GTEx_V8_gene", "GTEx_V8_gene"),
        ("GTEx_V8_tissue", "GTEx_V8_tissue"),
        ("Polyphen2_HDIV_score", "Polyphen2_HDIV_score"),
        ("Polyphen2_HVAR_score", "Polyphen2_HVAR_score"),
        ("REVEL_score", "REVEL_score"),
        ("SIFT_score", "SIFT_score"),
        ("clinvar_AlleleID", "clinvar"),  # allele ID from clinvar.vcf.gz
        ("clinvar_clnsig", "clinvar_CLNSIG"),  # CL: SIG from clinvar.vcf.gz
        ("clinvar_CLNREVSTAT", "clinvar_CLNREVSTAT"),  # CL: for interface only
        ("clinvar_CLNSIGCONF", "clinvar_CLNSIGCONF"),  # CL: from clinvar.vcf.gz
        ("clin_code", "clinvar_CLNSIG"),  # CL: feature name for ai
        ("fathmm_MKL_coding_score", "fathmm_MKL_coding_score"),
        ("M_CAP_score", "M_CAP_score"),
        ("MutationAssessor_score", "MutationAssessor_score"),
        ("MutationTaster_score", "MutationTaster_score"),
        ("ESP6500_AA_AC", "ESP6500_AA_AC"),
        ("ESP6500_AA_AF", "ESP6500_AA_AF"),
        ("ESP6500_EA_AC", "ESP6500_EA_AC"),
        ("ESP6500_EA_AF", "ESP6500_EA_AF"),
        ("VARIANT_CLASS", "VARIANT_CLASS"),
        ("Feature", "Feature"),
        ("hom", "gnomADg_controls_nhomalt"),
        ("hgmd_id", "hgmd"),  # CL added
        ("hgmd_symbol", "hgmd_GENE"),  # CL added
        ("hgmd_rs", "hgmd_RANKSCORE"),
        ("hgmd_PHEN", "hgmd_PHEN"),  # CL added
        ("hgmd_CLASS", "hgmd_CLASS"),  # CL added
    )
    for attrName, fieldName in copiedFields:
        setattr(varObj, attrName, getattr(row, fieldName))

    if row.clinvar_CLNSIGCONF != "-":
        # Entries look like "Pathogenic(3)" and are separated by "|_".
        clin_dict = {}
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            temp = entry.split("(")
            # Read the whole number before ")"; taking only its first
            # character would store a count of 12 as 1.
            clin_dict[temp[0]] = int(temp[1].split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # NOTE(review): this reads ``row.clinvar_clnsig`` while the rest of
        # the function reads ``row.clinvar_CLNSIG`` -- confirm that the
        # lower-case field really exists on the input.
        significance = row.clinvar_clnsig.lower()
        if "benign" in significance:
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in significance:
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        temp = row.SpliceAI_pred.split("|")
        # Largest of the four values at positions 1-4 of the "|" list.
        varObj.spliceAImax = max(float(temp[i]) for i in range(1, 5))
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    return vars(varObj)
# # get dgv: 1.3s
# # print('\nGetting DGV')
# dgvDictList = []
# typeList = []
# subtypeList = []
# dgvVarFound = 0
# dgvType = "-"
# dgvSubtype = "-"
# chromVal = int(varObj.chrom)
# posVal = int(varObj.pos)
# startVal = int(varObj.start)
# stopVal = int(varObj.stop)
# # CL 03-14-2023: changed column names to be compatible with hg38
# # vals=dgvDf[ ( dgvDf['hg19Chr'] == chromVal ) & ( dgvDf['hg19Start']<=startVal ) & (dgvDf['hg19Stop']>=stopVal) ]
# vals = dgvSortedDf.loc[chromVal].loc[:(startVal+1)].loc[:stopVal]
# numRows = len(vals.index)
# if numRows > 0:
# dgvVarFound = 1
# # print('\tnumrows:',numRows)
# # print('\t type of vals:', type(vals))
# # print('\tvals:', vals)
# dgvType = vals.iloc[0]["type"]
# dgvSubtype = vals.iloc[0]["subType"]
# # print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
# # print('\tdgvVarFound:',dgvVarFound,'dgvType:', dgvType, 'dgvsubtype:', dgvSubtype)
# typeList.append(dgvType)
# subtypeList.append(dgvSubtype)
# retList = [dgvDictList, typeList, subtypeList, dgvVarFound]
Copia
Copiato
Copia
Copiato
# varObj.dgvDictList = retList[0]
# varObj.dgvTypeList = retList[1]
# varObj.dgvSubtypeList = retList[2]
# varObj.dgvVarFound = retList[3]
Copia
Copiato
Copia
Copiato
# get decipher: 0.6s
def getAnnotateInfoRow_3_2(
decipherDictList = []
varObj,
decipherDeletionObsList = []
decipherSortedDf,
decipherStudyList = []
):
decipherVarFound = 0
# get decipher: 0.6s
deletionObs = "-"
decipherDictList = []
# get the variant object info from varObj
decipherDeletionObsList = []
chromVal = int(varObj.chrom)
decipherStudyList = []
posVal = int(varObj.pos)
decipherVarFound = 0
startVal = int(varObj.start)
deletionObs = "-"
stopVal = int(varObj.stop)
# get the variant object info from varObj
chromVal = int(varObj.chrom)
# CL 03-14-2023: changed column names to be compatible with hg38
posVal = int(varObj.pos)
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
startVal = int(varObj.start)
if (chromVal, startVal, stopVal) in decipherSortedDf:
stopVal = int(varObj.stop)
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
Copia
Copiato
Copia
Copiato
decipherVarFound = 1
# CL 03-14-2023: changed column names to be compatible with hg38
deletionObs = vals.iloc[0]["deletion.obs"]
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
decipherDeletionObsList.append(deletionObs)
if (chromVal, startVal, stopVal) in decipherSortedDf:
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
Copia
Copiato
Copia
Copiato
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
decipherVarFound = 1
# print('\tdecipherVarFound:',decipherVarFound,'decipherDeletionObs:', deletionObs)
deletionObs = vals.iloc[0]["deletion.obs"]
retList = [
decipherDeletionObsList
.append(deletionObs)
decipherDictList,
decipherDeletionObsList
,
decipherStudyList,
decipherVarFound,
]
Copia
Copiato
Copia
Copiato
#
[decipherDictList,
decipherDeletionObs
List,decipherStudyList, decipherVarFound]
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
varObj.
decipherDictList
= retList[0]
#
print('\tdecipherVarFound:',decipherVarFound,'
decipherDeletionObs
:', deletionObs)
varObj.
decipherDeletionObsList
= retList[1]
retList = [
varObj.decipherStudyList = retList[2]
decipherDictList
,
varObj.decipherVarFound = retList[3]
decipherDeletionObsList
,
decipherStudyList,
# get gnomad gene metrics from gnomad file: 3.1s
decipherVarFound,
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
]
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
Copia
Copiato
Copia
Copiato
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
varObj.gnomadGeneZscore =
retList[0]
return {
varObj.gnomadGenePLI =
retList[1]
"decipherDictList":
retList[0]
,
varObj.gnomadGeneOELof =
retList[2]
# O/E lof
"decipherDeletionObsList":
retList[1]
,
varObj.gnomadGeneOELofUpper =
retList[3]
# O/E lof upper
"decipherStudyList":
retList[2]
,
"decipherVarFound":
retList[3]
,
}
Copia
Copiato
Copia
Copiato
if "curate" in moduleList:
# get
OMIM: 2s
def getAnnotateInfoRow_3_3(
# print('\nGetting OMIM')
varObj,
# varObj.omimList=jsonDict['omim']
gnomadMetricsGeneSortedDf,
# retList=[varFound, geneFound, omimDict,
omimGeneDict
,
omimAlleleDict
]
):
inputSnpList
= []
# get gnomad gene metrics from gnomad file: 3.1s
if "," in
varObj.
rsId
:
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
inputSnpList
=
varObj.
rsId.split(",")
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get
the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
return {
"gnomadGeneZscore": retList[0],
"gnomadGenePLI": retList[1],
"gnomadGeneOELof": retList[2], # O/E lof
"gnomadGeneOELofUpper": retList[3], # O/E lof upper
}
def getAnnotateInfoRow_3_4(
varObj,
omimGeneSortedDf,
):
# get OMIM: 2s
inputSnpList = []
if "," in varObj.rsId:
inputSnpList = varObj.rsId.split(",")
else:
inputSnpList = varObj.rsId
varFound = 0
geneFound = 0
omimDict = {}
omimGeneDict
= {}
omimAlleleDict
= {}
phenoList = []
phenoInhList = []
phenoMimList
= []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if
varObj.
geneSymbol in omimGeneSortedDf.index
:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict
=
omimGeneSortedDf.loc[
varObj.
geneSymbol]
snpList = []
for a in omimGeneDict["allelicVariants"]:
if "dbSnps" in a:
snpList.append(a["dbSnps"])
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
Copia
Copiato
Copia
Copiato
inputSnpList = varObj.rsId
varFound = 0
# print('\tinputSnpList:', inputSnpList)
varFound = 0
# get disease info from OMIM
geneFound = 0
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
omimDict = {}
for a in omimGeneDict["
phenotypes
"]:
omimGeneDict = {}
# print('type:', type(a))
omimAlleleDict = {}
pheno = a["phenotype"]
phenoList = []
if "
phenotypeMimNumber
" in a:
phenoInhList = []
phenoMim = a["phenotypeMimNumber"]
phenoMimList = []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if varObj.geneSymbol in omimGeneSortedDf.index:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
snpList = []
for a in omimGeneDict["
allelicVariants
"]:
# print('a:', a)
# print('type:', type(a))
if "
dbSnps
" in a:
snpList.append(a["dbSnps"])
# print('\tsnpList:', snpList)
# print('\tlen snpList:', len(snpList))
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
Copia
Copiato
Copia
Copiato
varFound = 0
phenoMim = "-"
if "phenotypeInheritance" in a:
# get disease info from OMIM
phenoInh = a["phenotypeInheritance"]
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
else:
for a in omimGeneDict["phenotypes"]:
phenoInh = "-"
# print('type:', type(a))
phenoList.append(pheno)
pheno = a["phenotype"]
phenoInhList.append(phenoInh)
if "phenotypeMimNumber" in a:
phenoMimList.append(str(phenoMim))
phenoMim = a["phenotypeMimNumber"]
# print('phenotype:', pheno,phenoMim,phenoInh)
else:
phenoMim = "-"
if "phenotypeInheritance" in a:
phenoInh = a["phenotypeInheritance"]
else:
phenoInh = "-"
phenoList.append(pheno)
phenoInhList.append(phenoInh)
phenoMimList.append(str(phenoMim))
# print('phenotype:', pheno,phenoMim,phenoInh)
# print('\tvarFound:', varFound)
# print('\tphenoList:', phenoList)
# print('\tphenoInhList:', phenoInhList)
# print('\tphenoMimList:', phenoMimList)
omimRet = [
varFound,
geneFound,
omimDict,
omimGeneDict,
omimAlleleDict,
phenoList,
phenoInhList,
phenoMimList,
]
Copia
Copiato
Copia
Copiato
varObj.omimVarFound = omimRet[0]
omimRet = [
varObj.omimG
eneFound
= omimRet[1]
varFound,
varObj.
omimDict
= omimRet[2]
g
eneFound
,
varObj.
omimGeneDict
= omimRet[3]
omimDict
,
varObj.
omimAlleleDict
= omimRet[4]
omimGeneDict
,
varObj.
phenoList
= omimRet[5]
omimAlleleDict
,
varObj.
phenoInhList
= omimRet[6]
phenoList
,
varObj.
phenoMimList
= omimRet[7]
phenoInhList
,
# print('OMIM res:')
phenoMimList
,
# print('\tgeneFound:',varObj.omimGeneFound,'varFound:',varObj.omimVarFound )
]
Copia
Copiato
Copia
Copiato
# get clinvar: 0.1s
return {
# print('\nReading clinVar')
"omimVarFound": omimRet[0],
clinVarRet = getClinVarUsingMarrvelFlatFile(
"omimGeneFound": omimRet[1],
varObj, clinvarAlleleDf, clinvarGeneDf
"omimDict": omimRet[2],
"omimGeneDict": omimRet[3],
"omimAlleleDict": omimRet[4],
"phenoList": omimRet[5],
"phenoInhList": omimRet[6],
"phenoMimList": omimRet[7],
}
def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Return the ClinVar lookup result for one variant as a keyed dict.

    Calls ``getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf,
    clinvarGeneDf)`` and maps positions 0-11 of its result to the keys listed
    below. Position 10 is overwritten with ``varObj.clinvar_clnsig`` before
    the mapping is built.
    """
    # Key for each position of the helper's result, in positional order.
    resultKeys = (
        "clinVarVarFound",
        "clinVarVarDict",
        "clinVarGeneFound",
        "clinVarGeneDict",
        "clinvarTotalNumVars",
        "clinvarNumP",
        "clinvarNumLP",
        "clinvarNumLB",
        "clinvarNumB",
        "clinvarTitle",
        "clinvarSignDesc",
        "clinvarCondition",
    )
    lookup = getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf, clinvarGeneDf)
    # CL: the significance description comes from the clinvar.vcf.gz
    # annotation carried on varObj, not from the helper's own value.
    lookup[10] = varObj.clinvar_clnsig
    return {key: lookup[position] for position, key in enumerate(resultKeys)}
def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Return the HGMD lookup result for one variant as a keyed dict.

    Calls ``getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)`` and maps positions
    0-4 of its result to the keys listed below.
    """
    # Key for each position of the helper's result, in positional order.
    resultKeys = (
        "hgmdVarFound",
        "hgmdGeneFound",
        "hgmdVarPhenIdList",
        "hgmdVarHPOIdList",
        "hgmdVarHPOStrList",
    )
    lookup = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
    return {key: lookup[position] for position, key in enumerate(resultKeys)}
def getAnnotateInfoRows_3(
vepDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
def f1(row):
return getAnnotateInfoRow_3_1(row, genomeRef)
def f2(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_2(row, decipherSortedDf)
def f3(row):
if "conserve" not in moduleList:
return row
return getAnnotateInfoRow_3_3(row, gnomadMetricsGeneSortedDf)
def f4(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_4(row, omimGeneSortedDf)
def f5(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_5(row, clinvarGeneDf, clinvarAlleleDf)
def f6(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_6(
row, hgmdHPOScoreDf
)
)
Copia
Copiato
Copia
Copiato
varObj.clinVarVarFound = clinVarRet[0]
varObj.clinVarVarDict = clinVarRet[1]
varObj.clinVarGeneFound = clinVarRet[2]
varObj.clinVarGeneDict = clinVarRet[3]
varObj.clinvarTotalNumVars = clinVarRet[4]
varObj.clinvarNumP = clinVarRet[5]
varObj.clinvarNumLP = clinVarRet[6]
varObj.clinvarNumLB = clinVarRet[7]
varObj.clinvarNumB = clinVarRet[8]
varObj.clinvarTitle = clinVarRet[9]
varObj.clinvarSignDesc = (
row.clinvar_CLNSIG
) # clinVarRet[10] #CL: changed to clinvar.vcf.gz annotation
varObj.clinvarCondition = clinVarRet[11]
# print('clinVar res:')
"""
if debugFlag==1:
print('\tgeneFound::',varObj.clinVarGeneFound,'varFound:',varObj.clinVarVarFound)
print('\tnumVars:',varObj.clinvarTotalNumVars,'numPathologic:',varObj.clinvarNumP,'numBenign:',varObj.clinvarNumB)
print('\tsignDesc:', varObj.clinvarSignDesc)
"""
# get HGMD: 0.3s
if "curate" in moduleList:
# print('\nReading HGMD')
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
# hgmdVarFound,hgmdGeneFound,hgmdVarPhenIdList,hgmdVarHPOIdList,hgmdVarHPOStrList
varObj.hgmdVarFound = hgmdRet[0]
varObj.hgmdGeneFound = hgmdRet[1]
varObj.hgmdVarPhenIdList = hgmdRet[2]
varObj.hgmdVarHPOIdList = hgmdRet[3]
varObj.hgmdVarHPOStrList = hgmdRet[4]
# print('HGMD results:')
# print('\thgmdVarFound:',varObj.hgmdVarFound,'hgmdGeneFound:',varObj.hgmdGeneFound,
# 'hgmdVarPhenIdList:',varObj.hgmdVarPhenIdList,'hgmdVarHPOIdList:',
# varObj.hgmdVarHPOIdList,
# 'hgmdVarHPOStrList:',varObj.hgmdVarHPOStrList)
return {
"hg19Chrom": varObj.hg19Chrom,
"hg19Pos": varObj.hg19Pos,
"chrom": varObj.chrom,
"pos": varObj.pos,
"start": varObj.start,
"stop": varObj.stop,
"geneSymbol": varObj.geneSymbol,
"CADD_phred": varObj.CADD_phred,
"CADD_PHRED": varObj.CADD_PHRED,
"ref": varObj.ref,
"alt": varObj.alt,
"varId": varObj.varId,
"ZYG": varObj.zyg,
"HGVSc": varObj.HGVSc,
"HGVSp": varObj.HGVSp,
"Gene": varObj.geneEnsId,
"Existing_variation": varObj.rsId,
"GERPpp_RS": varObj.GERPpp_RS,
"Feature_type": varObj.featureType,
"gnomadAF": varObj.gnomadAF,
"gnomadAFg": varObj.gnomadAFg,
"CLIN_SIG": varObj.CLIN_SIG,
"LRT_Omega": varObj.LRT_Omega,
"LRT_score": varObj.LRT_score,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
# dbnsfp attributes
"GERPpp_NR": varObj.GERPpp_NR,
"DANN_score": varObj.DANN_score,
"FATHMM_pred": varObj.FATHMM_pred,
"FATHMM_score": varObj.FATHMM_score,
"GTEx_V8_gene": varObj.GTEx_V8_gene,
"GTEx_V8_tissue": varObj.GTEx_V8_tissue,
"Polyphen2_HDIV_score": varObj.Polyphen2_HDIV_score,
"Polyphen2_HVAR_score": varObj.Polyphen2_HVAR_score,
"REVEL_score": varObj.REVEL_score,
"SIFT_score": varObj.SIFT_score,
"clinvar_AlleleID": varObj.clinvar_AlleleID, # Clinvar allele ID from clinvar.vcf.gz
"clinvar_clnsig": varObj.clinvar_clnsig, # CL: Clinvar SIG from clinvar.vcf.gz
"clinvar_CLNREVSTAT": varObj.clinvar_CLNREVSTAT, # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
"clinvar_CLNSIGCONF": varObj.clinvar_CLNSIGCONF, # CL: Clinvar SIGCONF from clinvar.vcf.gz
"clin_code": varObj.clin_code, # CL: feature for ai
"fathmm_MKL_coding_score": varObj.fathmm_MKL_coding_score,
"LRT_score": varObj.LRT_score,
"LRT_Omega": varObj.LRT_Omega,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
"M_CAP_score": varObj.M_CAP_score,
"MutationAssessor_score": varObj.MutationAssessor_score,
"MutationTaster_score": varObj.MutationTaster_score,
"ESP6500_AA_AC": varObj.ESP6500_AA_AC,
"ESP6500_AA_AF": varObj.ESP6500_AA_AF,
"ESP6500_EA_AC": varObj.ESP6500_EA_AC,
"ESP6500_EA_AF": varObj.ESP6500_EA_AF,
# dbnsfp
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof, # O/E lof
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper, # O/E lof upper,
"IMPACT": varObj.IMPACT,
"Consequence": varObj.Consequence,
"omimVarFound": varObj.omimVarFound,
"omimGeneFound": varObj.omimGeneFound,
"omimDict": varObj.omimDict,
"omimGeneDict": varObj.omimGeneDict,
"omimAlleleDict": varObj.omimAlleleDict,
"phenoList": varObj.phenoList,
"phenoInhList": varObj.phenoInhList,
"phenoMimList": varObj.phenoMimList,
"clinVarVarFound": varObj.clinVarVarFound,
"clinVarVarDict": varObj.clinVarVarDict,
"clinVarGeneFound": varObj.clinVarGeneFound,
"clinVarGeneDict": varObj.clinVarGeneDict,
"clinvarTotalNumVars": varObj.clinvarTotalNumVars,
"clinvarNumP": varObj.clinvarNumP,
"clinvarNumLP": varObj.clinvarNumLP,
"clinvarNumLB": varObj.clinvarNumLB,
"clinvarNumB": varObj.clinvarNumB,
"clinvarTitle": varObj.clinvarTitle,
"clinvarSignDesc": varObj.clinvarSignDesc,
"clinvarCondition": varObj.clinvarCondition,
"hgmdVarFound": varObj.hgmdVarFound,
"hgmdGeneFound": varObj.hgmdGeneFound,
"hgmdVarPhenIdList": varObj.hgmdVarPhenIdList,
"hgmdVarHPOIdList": varObj.hgmdVarHPOIdList,
"hgmdVarHPOStrList": varObj.hgmdVarHPOStrList,
"varId_dash": varObj.varId_dash,
"dgvDictList": varObj.dgvDictList,
"dgvTypeList": varObj.dgvTypeList,
"dgvSubtypeList": varObj.dgvSubtypeList,
"dgvVarFound": varObj.dgvVarFound,
"decipherDictList": varObj.decipherDictList,
"decipherDeletionObsList": varObj.decipherDeletionObsList,
"decipherStudyList": varObj.decipherStudyList,
"decipherVarFound": varObj.decipherVarFound,
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof,
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper,
# symptom
"SymptomMatched": varObj.SymptomMatched,
"symptomScore": varObj.symptomScore,
"symptomName": varObj.symptomName,
"omimSymptomSimScore": varObj.omimSymptomSimScore,
"omimSymMatchFlag": varObj.omimSymMatchFlag,
"hgmdSymptomScore": varObj.hgmdSymptomScore,
"hgmdSymptomSimScore": varObj.hgmdSymptomSimScore,
"hgmdSymMatchFlag": varObj.hgmdSymMatchFlag,
"clinVarSymMatchFlag": varObj.clinVarSymMatchFlag,
"VARIANT_CLASS": varObj.VARIANT_CLASS,
"Feature": varObj.Feature,
"hom": varObj.hom,
"hgmd_rs": varObj.hgmd_rs,
"hgmd_id": varObj.hgmd_id, # CL added
"hgmd_symbol": varObj.hgmd_symbol, # CL added
"hgmd_PHEN": varObj.hgmd_PHEN, # CL added
"hgmd_CLASS": varObj.hgmd_CLASS, # CL added
"clin_dict": varObj.clin_dict,
"clin_PLP": varObj.clin_PLP,
"clin_PLP_perc": varObj.clin_PLP_perc,
"spliceAI": varObj.spliceAI,
"spliceAImax": varObj.spliceAImax,
Copia
Copiato
Copia
Copiato
"zyg": varObj.zyg,
annotateInfoDf = vepDf.apply(f1, axis=1, result_type='expand')
'geneEnsId': varObj.geneEnsId,
df = annotateInfoDf.apply(f2, axis=1, result_type='expand')
'rsId': varObj.rsId
annotateInfoDf[df.columns] = df
}
df = annotateInfoDf.apply(f3, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f4, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f5, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f6, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
return annotateInfoDf
Diff salvati
Testo originale
Apri file
# Output columns of getAnnotateInfoRow_2, in emission order. A bare string
# means the dict key and the Variant attribute share a name; a
# (key, attribute) pair renames the attribute on output.
_ANNOTATE_ROW_2_FIELDS = (
    "hg19Chrom",
    "hg19Pos",
    "chrom",
    "pos",
    "start",
    "stop",
    "geneSymbol",
    "CADD_phred",
    "CADD_PHRED",
    "ref",
    "alt",
    "varId",
    ("ZYG", "zyg"),
    "HGVSc",
    "HGVSp",
    ("Gene", "geneEnsId"),
    ("Existing_variation", "rsId"),
    "GERPpp_RS",
    ("Feature_type", "featureType"),
    "gnomadAF",
    "gnomadAFg",
    "CLIN_SIG",
    "LRT_Omega",
    "LRT_score",
    "phyloP100way_vertebrate",
    # dbnsfp attributes
    "GERPpp_NR",
    "DANN_score",
    "FATHMM_pred",
    "FATHMM_score",
    "GTEx_V8_gene",
    "GTEx_V8_tissue",
    "Polyphen2_HDIV_score",
    "Polyphen2_HVAR_score",
    "REVEL_score",
    "SIFT_score",
    "clinvar_AlleleID",  # Clinvar allele ID from clinvar.vcf.gz
    "clinvar_clnsig",  # CL: Clinvar SIG from clinvar.vcf.gz
    "clinvar_CLNREVSTAT",  # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
    "clinvar_CLNSIGCONF",  # CL: Clinvar SIGCONF from clinvar.vcf.gz
    "clin_code",  # CL: feature for ai
    "fathmm_MKL_coding_score",
    "M_CAP_score",
    "MutationAssessor_score",
    "MutationTaster_score",
    "ESP6500_AA_AC",
    "ESP6500_AA_AF",
    "ESP6500_EA_AC",
    "ESP6500_EA_AF",
    "gnomadGeneZscore",
    "gnomadGenePLI",
    "gnomadGeneOELof",  # O/E lof
    "gnomadGeneOELofUpper",  # O/E lof upper
    "IMPACT",
    "Consequence",
    "omimVarFound",
    "omimGeneFound",
    "omimDict",
    "omimGeneDict",
    "omimAlleleDict",
    "phenoList",
    "phenoInhList",
    "phenoMimList",
    "clinVarVarFound",
    "clinVarVarDict",
    "clinVarGeneFound",
    "clinVarGeneDict",
    "clinvarTotalNumVars",
    "clinvarNumP",
    "clinvarNumLP",
    "clinvarNumLB",
    "clinvarNumB",
    "clinvarTitle",
    "clinvarSignDesc",
    "clinvarCondition",
    "hgmdVarFound",
    "hgmdGeneFound",
    "hgmdVarPhenIdList",
    "hgmdVarHPOIdList",
    "hgmdVarHPOStrList",
    "varId_dash",
    "dgvDictList",
    "dgvTypeList",
    "dgvSubtypeList",
    "dgvVarFound",
    "decipherDictList",
    "decipherDeletionObsList",
    "decipherStudyList",
    "decipherVarFound",
    # symptom
    "SymptomMatched",
    "symptomScore",
    "symptomName",
    "omimSymptomSimScore",
    "omimSymMatchFlag",
    "hgmdSymptomScore",
    "hgmdSymptomSimScore",
    "hgmdSymMatchFlag",
    "clinVarSymMatchFlag",
    "VARIANT_CLASS",
    "Feature",
    "hom",
    "hgmd_rs",
    "hgmd_id",  # CL added
    "hgmd_symbol",  # CL added
    "hgmd_PHEN",  # CL added
    "hgmd_CLASS",  # CL added
    "clin_dict",
    "clin_PLP",
    "clin_PLP_perc",
    "spliceAI",
    "spliceAImax",
    "zyg",
    "geneEnsId",
    "rsId",
)


def getAnnotateInfoRow_2(
    row,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate one variant row and return the result as a flat dict.

    NOTE(review): this function was reformatted from a paste whose
    indentation had been lost; block nesting was reconstructed from syntax
    and should be checked against the upstream file.

    Args:
        row: Record exposing the annotation columns as attributes. Position 0
            holds the variant key, either ``1_1588250_T_A`` or
            ``21_11039079_C/A``; position 1 holds the location, either
            ``1:10203-10204`` or ``1:10203``.
        genomeRef: ``"hg38"`` stores the coordinates in the hg38 fields; any
            other value stores them in the hg19 fields.
        clinvarGeneDf: Forwarded to ``getClinVarUsingMarrvelFlatFile``.
        clinvarAlleleDf: Forwarded to ``getClinVarUsingMarrvelFlatFile``.
        omimGeneSortedDf: Table indexed by gene symbol whose rows expose
            ``"allelicVariants"`` and ``"phenotypes"``.
        omimAlleleList: Unused; kept for interface compatibility.
        hgmdDf: Forwarded to ``getHGMDUsingFlatFile``.
        moduleList: Container of enabled modules; ``"conserve"`` and
            ``"curate"`` are checked.
        decipherSortedDf: Table looked up by ``(chrom, start, stop)``.
        gnomadMetricsGeneSortedDf: Table indexed by gene symbol with columns
            ``mis_z``, ``pLI``, ``oe_lof`` and ``oe_lof_upper``.

    Returns:
        dict mapping output column names to values read from the populated
        ``Variant`` object.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # Two layouts exist for the first column: 1_1588250_T_A or 21_11039079_C/A.
    variant_key = row[0]
    key_parts = variant_key.split("_")
    chrom = key_parts[0]
    pos = int(key_parts[1])
    if variant_key.find("/") != -1:
        alleles = key_parts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = key_parts[2]
        alt = key_parts[3]

    # Start and stop come from the second column, e.g. '1:10203-10204'.
    location = row[1]
    span = location.split(":")[1]
    if "-" in location:
        bounds = span.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        # start and stop are the same
        start = int(span)
        stop = int(span)

    # Change chrom X, Y, MT and GL contigs to numbers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: separate databases exist for hg19 and hg38, so LiftOver
    # (which is inaccurate) is no longer used.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG
    varObj.geneEnsId = row.Gene
    varObj.rsId = row.Existing_variation
    varObj.GERPpp_RS = row.GERPpp_RS
    varObj.featureType = row.Feature_type
    varObj.gnomadAF = row.gnomAD_AF
    varObj.gnomadAFg = row.gnomADg_AF
    varObj.CLIN_SIG = row.CLIN_SIG  # CL: useless but kept for now
    varObj.LRT_Omega = row.LRT_Omega
    varObj.LRT_score = row.LRT_score
    varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
    varObj.IMPACT = row.IMPACT
    varObj.Consequence = row.Consequence
    varObj.HGVSc = row.HGVSc
    varObj.HGVSp = row.HGVSp
    # dbnsfp attributes
    varObj.GERPpp_NR = row.GERPpp_NR
    varObj.DANN_score = row.DANN_score
    varObj.FATHMM_pred = row.FATHMM_pred
    varObj.FATHMM_score = row.FATHMM_score
    varObj.GTEx_V8_gene = row.GTEx_V8_gene
    varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
    varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
    varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
    varObj.REVEL_score = row.REVEL_score
    varObj.SIFT_score = row.SIFT_score
    varObj.clinvar_AlleleID = row.clinvar  # Clinvar allele ID from clinvar.vcf.gz
    varObj.clinvar_clnsig = row.clinvar_CLNSIG  # CL: Clinvar SIG from clinvar.vcf.gz
    # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
    varObj.clinvar_CLNREVSTAT = row.clinvar_CLNREVSTAT
    varObj.clinvar_CLNSIGCONF = row.clinvar_CLNSIGCONF  # CL: from clinvar.vcf.gz
    varObj.clin_code = row.clinvar_CLNSIG  # CL: feature name for ai
    varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
    varObj.M_CAP_score = row.M_CAP_score
    varObj.MutationAssessor_score = row.MutationAssessor_score
    varObj.MutationTaster_score = row.MutationTaster_score
    varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
    varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
    varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
    varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
    varObj.VARIANT_CLASS = row.VARIANT_CLASS
    varObj.Feature = row.Feature
    varObj.hom = row.gnomADg_controls_nhomalt
    varObj.hgmd_id = row.hgmd  # CL added
    varObj.hgmd_symbol = row.hgmd_GENE  # CL added
    varObj.hgmd_rs = row.hgmd_RANKSCORE
    varObj.hgmd_PHEN = row.hgmd_PHEN  # CL added
    varObj.hgmd_CLASS = row.hgmd_CLASS  # CL added

    if row.clinvar_CLNSIGCONF != "-":
        clin_dict = {}
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            parts = entry.split("(")
            # Read the whole count before ")"; taking only the first
            # character would turn "Pathogenic(12)" into 1.
            clin_dict[parts[0]] = int(parts[1].partition(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        clnsig_lower = row.clinvar_clnsig.lower()
        if "benign" in clnsig_lower:
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in clnsig_lower:
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        splice_fields = row.SpliceAI_pred.split("|")
        # Fields 1..4 are compared; the maximum is kept.
        varObj.spliceAImax = max(float(splice_fields[i]) for i in range(1, 5))
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    if "conserve" in moduleList:
        # get decipher: 0.6s
        decipherDeletionObsList = []
        decipherVarFound = 0
        chromVal = int(varObj.chrom)
        startVal = int(varObj.start)
        stopVal = int(varObj.stop)
        # NOTE(review): `in` on a pandas DataFrame tests column labels, not
        # the index, while the gnomAD and OMIM lookups below use `.index`.
        # If decipherSortedDf is a DataFrame this branch can never run.
        # Left unchanged; confirm the intended container before editing.
        if (chromVal, startVal, stopVal) in decipherSortedDf:
            vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
            decipherVarFound = 1
            decipherDeletionObsList.append(vals.iloc[0]["deletion.obs"])
        varObj.decipherDictList = []
        varObj.decipherDeletionObsList = decipherDeletionObsList
        varObj.decipherStudyList = []
        varObj.decipherVarFound = decipherVarFound

        # get gnomad gene metrics from gnomad file: 3.1s
        if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
            val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
            varObj.gnomadGeneZscore = val["mis_z"]
            varObj.gnomadGenePLI = val["pLI"]
            varObj.gnomadGeneOELof = val["oe_lof"]  # O/E lof
            varObj.gnomadGeneOELofUpper = val["oe_lof_upper"]  # O/E lof upper
        else:
            varObj.gnomadGeneZscore = "-"
            varObj.gnomadGenePLI = "-"
            varObj.gnomadGeneOELof = "-"
            varObj.gnomadGeneOELofUpper = "-"

    if "curate" in moduleList:
        # get OMIM: 2s
        rs_ids = varObj.rsId
        # Always build a list of IDs. Passing a bare string to set() would
        # produce a set of characters that can never match a dbSNP ID.
        inputSnpList = rs_ids.split(",") if "," in rs_ids else [rs_ids]

        varFound = 0
        geneFound = 0
        omimGeneDict = {}
        phenoList = []
        phenoInhList = []
        phenoMimList = []
        if varObj.geneSymbol in omimGeneSortedDf.index:
            geneFound = 1
            omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
            snpList = [
                a["dbSnps"] for a in omimGeneDict["allelicVariants"] if "dbSnps" in a
            ]
            # check if an input snpID matches an OMIM one
            varFound = 1 if set(inputSnpList).intersection(snpList) else 0

            # get disease info from OMIM
            for a in omimGeneDict["phenotypes"]:
                pheno = a["phenotype"]
                phenoMim = (
                    a["phenotypeMimNumber"] if "phenotypeMimNumber" in a else "-"
                )
                phenoInh = (
                    a["phenotypeInheritance"] if "phenotypeInheritance" in a else "-"
                )
                phenoList.append(pheno)
                phenoInhList.append(phenoInh)
                phenoMimList.append(str(phenoMim))

        varObj.omimVarFound = varFound
        varObj.omimGeneFound = geneFound
        varObj.omimDict = {}
        varObj.omimGeneDict = omimGeneDict
        varObj.omimAlleleDict = {}
        varObj.phenoList = phenoList
        varObj.phenoInhList = phenoInhList
        varObj.phenoMimList = phenoMimList

        # get clinvar: 0.1s
        clinVarRet = getClinVarUsingMarrvelFlatFile(
            varObj, clinvarAlleleDf, clinvarGeneDf
        )
        varObj.clinVarVarFound = clinVarRet[0]
        varObj.clinVarVarDict = clinVarRet[1]
        varObj.clinVarGeneFound = clinVarRet[2]
        varObj.clinVarGeneDict = clinVarRet[3]
        varObj.clinvarTotalNumVars = clinVarRet[4]
        varObj.clinvarNumP = clinVarRet[5]
        varObj.clinvarNumLP = clinVarRet[6]
        varObj.clinvarNumLB = clinVarRet[7]
        varObj.clinvarNumB = clinVarRet[8]
        varObj.clinvarTitle = clinVarRet[9]
        # CL: element 10 is replaced by the clinvar.vcf.gz annotation
        varObj.clinvarSignDesc = row.clinvar_CLNSIG
        varObj.clinvarCondition = clinVarRet[11]

        # get HGMD: 0.3s
        hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
        varObj.hgmdVarFound = hgmdRet[0]
        varObj.hgmdGeneFound = hgmdRet[1]
        varObj.hgmdVarPhenIdList = hgmdRet[2]
        varObj.hgmdVarHPOIdList = hgmdRet[3]
        varObj.hgmdVarHPOStrList = hgmdRet[4]

    result = {}
    for field in _ANNOTATE_ROW_2_FIELDS:
        key, attr = field if isinstance(field, tuple) else (field, field)
        result[key] = getattr(varObj, attr)
    return result
Testo modificato
Apri file
import re

from .utils_1 import Variant
from .utils_for_marrvel_flatfile import (
    getClinVarUsingMarrvelFlatFile,
    getHGMDUsingFlatFile,
    getAnnotateInfoRow_2,
)

# NOTE(review): this module was reformatted from a paste whose indentation
# had been lost; block nesting was reconstructed from syntax and should be
# checked against the upstream file.


def getAnnotateInfoRows_2(
    varDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Apply ``getAnnotateInfoRow_2`` to every row of ``varDf``.

    NOTE(JL): This is the old implementation and is not used. It is kept
    for tracing purposes and may be removed.

    Returns:
        The frame produced by the row-wise apply with the returned dicts
        expanded into columns.
    """

    def f(row):
        return getAnnotateInfoRow_2(
            row,
            genomeRef,
            clinvarGeneDf,
            clinvarAlleleDf,
            omimGeneSortedDf,
            omimAlleleList,
            hgmdHPOScoreDf,
            moduleList,
            decipherSortedDf,
            gnomadMetricsGeneSortedDf,
        )

    return varDf.apply(f, axis=1, result_type="expand")


def getAnnotateInfoRow_3_1(row, genomeRef):
    """Parse one input row into a ``Variant`` and return its attributes.

    Args:
        row: Record exposing the annotation columns as attributes. Position 0
            holds the variant key, either ``1_1588250_T_A`` or
            ``21_11039079_C/A``; position 1 holds the location, either
            ``1:10203-10204`` or ``1:10203``.
        genomeRef: ``"hg38"`` stores the coordinates in the hg38 fields; any
            other value stores them in the hg19 fields.

    Returns:
        ``vars(varObj)``: the attribute dict of the populated ``Variant``.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # Two layouts exist for the first column: 1_1588250_T_A or 21_11039079_C/A.
    variant_key = row[0]
    key_parts = variant_key.split("_")
    chrom = key_parts[0]
    pos = int(key_parts[1])
    if variant_key.find("/") != -1:
        alleles = key_parts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        ref = key_parts[2]
        alt = key_parts[3]

    # Start and stop come from the second column, e.g. '1:10203-10204'.
    location = row[1]
    span = location.split(":")[1]
    if "-" in location:
        bounds = span.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        # start and stop are the same
        start = int(span)
        stop = int(span)

    # Change chrom X, Y, MT and GL contigs to numbers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: separate databases exist for hg19 and hg38, so LiftOver
    # (which is inaccurate) is no longer used.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    # Assignment order is kept as in the original because the function
    # returns vars(varObj), whose key order follows first assignment.
    varObj.geneSymbol = row.SYMBOL
    varObj.CADD_phred = row.CADD_phred
    varObj.CADD_PHRED = row.CADD_PHRED
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG
    varObj.geneEnsId = row.Gene
    varObj.rsId = row.Existing_variation
    varObj.GERPpp_RS = row.GERPpp_RS
    varObj.featureType = row.Feature_type
    varObj.gnomadAF = row.gnomAD_AF
    varObj.gnomadAFg = row.gnomADg_AF
    varObj.CLIN_SIG = row.CLIN_SIG  # CL: useless but kept for now
    varObj.LRT_Omega = row.LRT_Omega
    varObj.LRT_score = row.LRT_score
    varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
    varObj.IMPACT = row.IMPACT
    varObj.Consequence = row.Consequence
    varObj.HGVSc = row.HGVSc
    varObj.HGVSp = row.HGVSp
    # dbnsfp attributes
    varObj.GERPpp_NR = row.GERPpp_NR
    varObj.DANN_score = row.DANN_score
    varObj.FATHMM_pred = row.FATHMM_pred
    varObj.FATHMM_score = row.FATHMM_score
    varObj.GTEx_V8_gene = row.GTEx_V8_gene
    varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
    varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
    varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
    varObj.REVEL_score = row.REVEL_score
    varObj.SIFT_score = row.SIFT_score
    varObj.clinvar_AlleleID = row.clinvar  # Clinvar allele ID from clinvar.vcf.gz
    varObj.clinvar_clnsig = row.clinvar_CLNSIG  # CL: Clinvar SIG from clinvar.vcf.gz
    # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
    varObj.clinvar_CLNREVSTAT = row.clinvar_CLNREVSTAT
    varObj.clinvar_CLNSIGCONF = row.clinvar_CLNSIGCONF  # CL: from clinvar.vcf.gz
    varObj.clin_code = row.clinvar_CLNSIG  # CL: feature name for ai
    varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
    varObj.M_CAP_score = row.M_CAP_score
    varObj.MutationAssessor_score = row.MutationAssessor_score
    varObj.MutationTaster_score = row.MutationTaster_score
    varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
    varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
    varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
    varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
    varObj.VARIANT_CLASS = row.VARIANT_CLASS
    varObj.Feature = row.Feature
    varObj.hom = row.gnomADg_controls_nhomalt
    varObj.hgmd_id = row.hgmd  # CL added
    varObj.hgmd_symbol = row.hgmd_GENE  # CL added
    varObj.hgmd_rs = row.hgmd_RANKSCORE
    varObj.hgmd_PHEN = row.hgmd_PHEN  # CL added
    varObj.hgmd_CLASS = row.hgmd_CLASS  # CL added

    if row.clinvar_CLNSIGCONF != "-":
        clin_dict = {}
        for entry in row.clinvar_CLNSIGCONF.split("|_"):
            parts = entry.split("(")
            # Read the whole count before ")"; taking only the first
            # character would turn "Pathogenic(12)" into 1.
            clin_dict[parts[0]] = int(parts[1].partition(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        clnsig_lower = row.clinvar_clnsig.lower()
        if "benign" in clnsig_lower:
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in clnsig_lower:
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        splice_fields = row.SpliceAI_pred.split("|")
        # Fields 1..4 are compared; the maximum is kept.
        varObj.spliceAImax = max(float(splice_fields[i]) for i in range(1, 5))
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    return vars(varObj)


def getAnnotateInfoRow_3_2(
    varObj,
    decipherSortedDf,
):
    """Look up the variant's (chrom, start, stop) in the DECIPHER table.

    Args:
        varObj: Record exposing ``chrom``, ``start`` and ``stop``.
        decipherSortedDf: Table looked up by ``(chrom, start, stop)``.

    Returns:
        dict with ``decipherDictList``, ``decipherDeletionObsList``,
        ``decipherStudyList`` and ``decipherVarFound``.
    """
    # get decipher: 0.6s
    decipherDeletionObsList = []
    decipherVarFound = 0
    chromVal = int(varObj.chrom)
    startVal = int(varObj.start)
    stopVal = int(varObj.stop)
    # NOTE(review): `in` on a pandas DataFrame tests column labels, not the
    # index, while the gnomAD and OMIM lookups use `.index`. If
    # decipherSortedDf is a DataFrame this branch can never run. Left
    # unchanged; confirm the intended container before editing.
    if (chromVal, startVal, stopVal) in decipherSortedDf:
        vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
        decipherVarFound = 1
        decipherDeletionObsList.append(vals.iloc[0]["deletion.obs"])
    return {
        "decipherDictList": [],
        "decipherDeletionObsList": decipherDeletionObsList,
        "decipherStudyList": [],
        "decipherVarFound": decipherVarFound,
    }


def getAnnotateInfoRow_3_3(
    varObj,
    gnomadMetricsGeneSortedDf,
):
    """Fetch gnomAD gene metrics for the variant's gene symbol.

    Args:
        varObj: Record exposing ``geneSymbol``.
        gnomadMetricsGeneSortedDf: Table indexed by gene symbol with columns
            ``mis_z``, ``pLI``, ``oe_lof`` and ``oe_lof_upper``.

    Returns:
        dict with the four metrics, each ``"-"`` when the gene is absent.
    """
    # get gnomad gene metrics from gnomad file: 3.1s
    if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
        val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
        return {
            "gnomadGeneZscore": val["mis_z"],
            "gnomadGenePLI": val["pLI"],
            "gnomadGeneOELof": val["oe_lof"],  # O/E lof
            "gnomadGeneOELofUpper": val["oe_lof_upper"],  # O/E lof upper
        }
    return {
        "gnomadGeneZscore": "-",
        "gnomadGenePLI": "-",
        "gnomadGeneOELof": "-",
        "gnomadGeneOELofUpper": "-",
    }


def getAnnotateInfoRow_3_4(
    varObj,
    omimGeneSortedDf,
):
    """Collect OMIM gene, variant and phenotype information.

    Args:
        varObj: Record exposing ``rsId`` (comma-separated IDs or a single
            ID) and ``geneSymbol``.
        omimGeneSortedDf: Table indexed by gene symbol whose rows expose
            ``"allelicVariants"`` and ``"phenotypes"``.

    Returns:
        dict with the found flags, the OMIM gene record, and the parallel
        phenotype, inheritance and MIM-number lists.
    """
    # get OMIM: 2s
    rs_ids = varObj.rsId
    # Always build a list of IDs. Passing a bare string to set() would
    # produce a set of characters that can never match a dbSNP ID.
    inputSnpList = rs_ids.split(",") if "," in rs_ids else [rs_ids]

    varFound = 0
    geneFound = 0
    omimGeneDict = {}
    phenoList = []
    phenoInhList = []
    phenoMimList = []
    if varObj.geneSymbol in omimGeneSortedDf.index:
        geneFound = 1
        omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
        snpList = [
            a["dbSnps"] for a in omimGeneDict["allelicVariants"] if "dbSnps" in a
        ]
        # check if an input snpID matches an OMIM one
        varFound = 1 if set(inputSnpList).intersection(snpList) else 0

        # get disease info from OMIM
        for a in omimGeneDict["phenotypes"]:
            pheno = a["phenotype"]
            phenoMim = a["phenotypeMimNumber"] if "phenotypeMimNumber" in a else "-"
            phenoInh = (
                a["phenotypeInheritance"] if "phenotypeInheritance" in a else "-"
            )
            phenoList.append(pheno)
            phenoInhList.append(phenoInh)
            phenoMimList.append(str(phenoMim))

    return {
        "omimVarFound": varFound,
        "omimGeneFound": geneFound,
        "omimDict": {},
        "omimGeneDict": omimGeneDict,
        "omimAlleleDict": {},
        "phenoList": phenoList,
        "phenoInhList": phenoInhList,
        "phenoMimList": phenoMimList,
    }


def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Collect ClinVar information via ``getClinVarUsingMarrvelFlatFile``.

    Element 10 of the helper's result is not used; ``clinvarSignDesc`` is
    taken from ``varObj.clinvar_clnsig`` instead (CL: changed to the
    clinvar.vcf.gz annotation). The helper's return value is read without
    being modified.

    Returns:
        dict with the twelve ClinVar output columns.
    """
    clinVarRet = getClinVarUsingMarrvelFlatFile(
        varObj, clinvarAlleleDf, clinvarGeneDf
    )
    return {
        "clinVarVarFound": clinVarRet[0],
        "clinVarVarDict": clinVarRet[1],
        "clinVarGeneFound": clinVarRet[2],
        "clinVarGeneDict": clinVarRet[3],
        "clinvarTotalNumVars": clinVarRet[4],
        "clinvarNumP": clinVarRet[5],
        "clinvarNumLP": clinVarRet[6],
        "clinvarNumLB": clinVarRet[7],
        "clinvarNumB": clinVarRet[8],
        "clinvarTitle": clinVarRet[9],
        "clinvarSignDesc": varObj.clinvar_clnsig,
        "clinvarCondition": clinVarRet[11],
    }


def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Collect HGMD information via ``getHGMDUsingFlatFile``.

    Returns:
        dict with the first five elements of the helper's result under the
        ``hgmd*`` output column names.
    """
    hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
    return {
        "hgmdVarFound": hgmdRet[0],
        "hgmdGeneFound": hgmdRet[1],
        "hgmdVarPhenIdList": hgmdRet[2],
        "hgmdVarHPOIdList": hgmdRet[3],
        "hgmdVarHPOStrList": hgmdRet[4],
    }


def getAnnotateInfoRows_3(
    vepDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate every row of ``vepDf`` in six row-wise passes.

    The first pass parses each input row. Each later pass adds one group of
    columns and is gated on a module name in ``moduleList``; a disabled pass
    returns the row unchanged. ``omimAlleleList`` is unused and kept for
    interface compatibility.

    Returns:
        The annotated frame.
    """

    def gated(module, annotate):
        # Wrap an annotator so it is a pass-through when its module is off.
        def step(row):
            if module not in moduleList:
                return row
            return annotate(row)

        return step

    # NOTE(review): the DECIPHER step is gated on "curate" here, whereas
    # getAnnotateInfoRow_2 appears to run it under "conserve". Kept as
    # written; confirm which gate is intended.
    steps = (
        gated("curate", lambda row: getAnnotateInfoRow_3_2(row, decipherSortedDf)),
        gated(
            "conserve",
            lambda row: getAnnotateInfoRow_3_3(row, gnomadMetricsGeneSortedDf),
        ),
        gated("curate", lambda row: getAnnotateInfoRow_3_4(row, omimGeneSortedDf)),
        gated(
            "curate",
            lambda row: getAnnotateInfoRow_3_5(row, clinvarGeneDf, clinvarAlleleDf),
        ),
        gated("curate", lambda row: getAnnotateInfoRow_3_6(row, hgmdHPOScoreDf)),
    )

    annotateInfoDf = vepDf.apply(
        lambda row: getAnnotateInfoRow_3_1(row, genomeRef),
        axis=1,
        result_type="expand",
    )
    for step in steps:
        df = annotateInfoDf.apply(step, axis=1, result_type="expand")
        annotateInfoDf[df.columns] = df
    return annotateInfoDf
Trovare la differenza