Diff
checker
Text
Text
Bilder
Dokumente
Excel
Ordner
Legal
Enterprise
Desktop-App
Preise
Einloggen
Diffchecker Desktop herunterladen
Texte vergleichen
Finde den Unterschied zwischen zwei Textdateien
Werkzeuge
Verlauf
Live-Editor
Gleiches ausblenden
Zeilenumbruch aus
Ansicht
Zweispaltig
Einspaltig
Vergleichsgenauigkeit
Intelligent
Wort
Zeichen
Syntaxhervorhebung
Syntax auswählen
Ignorieren
Text umwandeln
Zur ersten Änderung
Eingabe bearbeiten
Diffchecker Desktop
Der sicherste Weg, Diffchecker zu nutzen. Hol dir die Desktop-App: Deine Diffs verlassen nie deinen Computer!
Desktop holen
Untitled diff
Erstellt
vor 2 Jahren
Diff läuft nie ab
Löschen
Exportieren
Teilen
Erklären
386 Entfernungen
Zeilen
Gesamt
Entfernt
Zeichen
Gesamt
Entfernt
Um diese Funktion weiterhin zu nutzen, aktualisiere auf
Diff
checker
Pro
Preise anzeigen
563 Zeilen
Kopieren
272 Hinzufügungen
Zeilen
Gesamt
Hinzugefügt
Zeichen
Gesamt
Hinzugefügt
Um diese Funktion weiterhin zu nutzen, aktualisiere auf
Diff
checker
Pro
Preise anzeigen
464 Zeilen
Kopieren
Kopieren
Kopiert
Kopieren
Kopiert
import re
Kopieren
Kopiert
Kopieren
Kopiert
def getAnnotateInfoRow
_2(
from .utils_1 import Variant
row,
from .utils_for_marrvel_flatfile import (
genomeRef,
getClinVarUsingMarrvelFlatFile,
clinvarGeneDf,
getHGMDUsingFlatFile,
clinvarAlleleDf,
getAnnotateInfoRow_2,
omimGeneSortedDf,
)
omimAlleleList,
hgmdDf,
moduleList,
def getAnnotateInfoRow
s
_2(
decipherSortedDf,
varDf,
gnomadMetricsGeneSortedDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
):
Kopieren
Kopiert
Kopieren
Kopiert
# NOTE(JL): This is the old implementation and is no longer used.
# It is kept for tracing purposes only; feel free to remove it.
def f(row):
return getAnnotateInfoRow_2(
row,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
)
annotateInfoDf = varDf.apply(f, axis=1, result_type='expand')
return annotateInfoDf
Kopieren
Kopiert
Kopieren
Kopiert
# CL 03-14-2023: commented all printing lines
# print('type of row:', type(row))
def getAnnotateInfoRow_3_1(row, genomeRef):
varObj = Variant()
varObj = Variant()
transcriptId = row.Feature
transcriptId = row.Feature
Kopieren
Kopiert
Kopieren
Kopiert
# s=row.Uploaded_variation.split('_') '1_10204_-/T' 1_1588250_T_A
####row[0]: 21_11039079_C/A
####s: ['21', '11039079', 'C/A']
# print('row[0]:', row[0])
# two ways of input of first column either 1_1588250_T_A OR 21_11039079_C/A, so use the option flag
optFlag = 0
optFlag = 0
if row[0].find("/") != -1:
if row[0].find("/") != -1:
optFlag = 1
optFlag = 1
if optFlag == 0:
if optFlag == 0:
s = row[0].split("_")
s = row[0].split("_")
# print('s:', s)
# print('s:', s)
chrom = s[0]
chrom = s[0]
pos = int(s[1])
pos = int(s[1])
ref = s[2]
ref = s[2]
alt = s[3]
alt = s[3]
elif optFlag == 1:
elif optFlag == 1:
s = row[0].split("_")
s = row[0].split("_")
# print('s:', s)
# print('s:', s)
chrom = s[0]
chrom = s[0]
pos = int(s[1])
pos = int(s[1])
s = s[2].split("/")
s = s[2].split("/")
ref = s[0]
ref = s[0]
alt = s[1]
alt = s[1]
# get the start and stop from second column like '1:10203-10204'
# get the start and stop from second column like '1:10203-10204'
if "-" in row[1]:
if "-" in row[1]:
s = row[1].split(":")
s = row[1].split(":")
tmp = s[1]
tmp = s[1]
s = tmp.split("-")
s = tmp.split("-")
# print('s:',s)
# print('s:',s)
start = int(s[0])
start = int(s[0])
stop = int(s[1])
stop = int(s[1])
else:
else:
# start and stop the same
# start and stop the same
s = row[1].split(":")
s = row[1].split(":")
start = int(s[1])
start = int(s[1])
stop = int(s[1])
stop = int(s[1])
# print('chrom:', chrom,'pos:',pos,'ref:',ref,'alt:',alt,'start:',start,'stop:',stop)
# print('chrom:', chrom,'pos:',pos,'ref:',ref,'alt:',alt,'start:',start,'stop:',stop)
# change chrom X and Y and MT to numbers
# change chrom X and Y and MT to numbers
if chrom == "X":
if chrom == "X":
chrom = 23
chrom = 23
elif chrom == "Y":
elif chrom == "Y":
chrom = 24
chrom = 24
elif chrom == "MT":
elif chrom == "MT":
chrom = 25
chrom = 25
elif re.search(r"GL", chrom):
elif re.search(r"GL", chrom):
chrom = 26
chrom = 26
chrom = int(chrom)
chrom = int(chrom)
# if it is hg38 get its hg19 coordinates
# if it is hg38 get its hg19 coordinates
# CL 03-14-2023: we have separate database for hg19 and hg38,
# CL 03-14-2023: we have separate database for hg19 and hg38,
# we don't need to use LiftOver which is inaccurate
# we don't need to use LiftOver which is inaccurate
# related codes commented and modified
# related codes commented and modified
if genomeRef == "hg38":
if genomeRef == "hg38":
varObj.hg38Chrom = chrom
varObj.hg38Chrom = chrom
varObj.hg38Pos = pos
varObj.hg38Pos = pos
varObj.chrom = chrom
varObj.chrom = chrom
varObj.pos = pos
varObj.pos = pos
varObj.start = start
varObj.start = start
varObj.stop = stop
varObj.stop = stop
Kopieren
Kopiert
Kopieren
Kopiert
"""
retList=gethg19LocFromHg38(chrom, pos)#called from the utils_1.py
# retList=[newChrom, newPos]
varObj.hg19Chrom=retList[0]
varObj.hg19Pos=retList[1]
varObj.chrom=retList[0]
varObj.pos=retList[1]
#get the start
retList=gethg19LocFromHg38(chrom, start)
varObj.start=int(retList[1])
#get the stop
retList=gethg19LocFromHg38(chrom, stop)
varObj.stop=int(retList[1])
"""
else:
else:
varObj.hg19Chrom = chrom
varObj.hg19Chrom = chrom
varObj.hg19Pos = pos
varObj.hg19Pos = pos
varObj.chrom = chrom
varObj.chrom = chrom
varObj.pos = pos
varObj.pos = pos
varObj.start = start
varObj.start = start
varObj.stop = stop
varObj.stop = stop
geneSymbol = row.SYMBOL
geneSymbol = row.SYMBOL
# print('gene:', geneSymbol)
# print('gene:', geneSymbol)
varObj.geneSymbol = geneSymbol
varObj.geneSymbol = geneSymbol
varObj.CADD_phred = row.CADD_phred
varObj.CADD_phred = row.CADD_phred
varObj.CADD_PHRED = row.CADD_PHRED
varObj.CADD_PHRED = row.CADD_PHRED
# assign
# assign
varObj.ref = ref
varObj.ref = ref
varObj.alt = alt
varObj.alt = alt
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
# print('varId dash:', varObj.varId_dash)
# print('varId dash:', varObj.varId_dash)
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varObj.varId = varId
varObj.varId = varId
if "ZYG" in row:
if "ZYG" in row:
varObj.zyg = row.ZYG
varObj.zyg = row.ZYG
varObj.geneEnsId = row.Gene
varObj.geneEnsId = row.Gene
varObj.rsId = row.Existing_variation
varObj.rsId = row.Existing_variation
varObj.GERPpp_RS = row.GERPpp_RS
varObj.GERPpp_RS = row.GERPpp_RS
varObj.featureType = row.Feature_type
varObj.featureType = row.Feature_type
varObj.gnomadAF = row.gnomAD_AF
varObj.gnomadAF = row.gnomAD_AF
varObj.gnomadAFg = row.gnomADg_AF
varObj.gnomadAFg = row.gnomADg_AF
varObj.CLIN_SIG = row.CLIN_SIG # CL: useless but kept for now
varObj.CLIN_SIG = row.CLIN_SIG # CL: useless but kept for now
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_score = row.LRT_score
varObj.LRT_score = row.LRT_score
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.IMPACT = row.IMPACT
varObj.IMPACT = row.IMPACT
varObj.Consequence = row.Consequence
varObj.Consequence = row.Consequence
varObj.HGVSc = row.HGVSc
varObj.HGVSc = row.HGVSc
varObj.HGVSp = row.HGVSp
varObj.HGVSp = row.HGVSp
# dbnsfp attributes
# dbnsfp attributes
varObj.GERPpp_NR = row.GERPpp_NR
varObj.GERPpp_NR = row.GERPpp_NR
varObj.DANN_score = row.DANN_score
varObj.DANN_score = row.DANN_score
varObj.FATHMM_pred = row.FATHMM_pred
varObj.FATHMM_pred = row.FATHMM_pred
varObj.FATHMM_score = row.FATHMM_score
varObj.FATHMM_score = row.FATHMM_score
varObj.GTEx_V8_gene = row.GTEx_V8_gene
varObj.GTEx_V8_gene = row.GTEx_V8_gene
varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
varObj.GTEx_V8_tissue = row.GTEx_V8_tissue
varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
varObj.Polyphen2_HDIV_score = row.Polyphen2_HDIV_score
varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
varObj.Polyphen2_HVAR_score = row.Polyphen2_HVAR_score
varObj.REVEL_score = row.REVEL_score
varObj.REVEL_score = row.REVEL_score
varObj.SIFT_score = row.SIFT_score
varObj.SIFT_score = row.SIFT_score
varObj.clinvar_AlleleID = row.clinvar # Clinvar allele ID from clinvar.vcf.gz
varObj.clinvar_AlleleID = row.clinvar # Clinvar allele ID from clinvar.vcf.gz
varObj.clinvar_clnsig = (
varObj.clinvar_clnsig = (
row.clinvar_CLNSIG
row.clinvar_CLNSIG
) # CL: Clinvar SIG from clinvar.vcf.gz
) # CL: Clinvar SIG from clinvar.vcf.gz
# varObj.clinvar_clnsig = row.clinvar_clnsig #CL: Clinvar SIG from VEP, deleted
# varObj.clinvar_clnsig = row.clinvar_clnsig #CL: Clinvar SIG from VEP, deleted
varObj.clinvar_CLNREVSTAT = (
varObj.clinvar_CLNREVSTAT = (
row.clinvar_CLNREVSTAT
row.clinvar_CLNREVSTAT
) # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
) # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
varObj.clinvar_CLNSIGCONF = (
varObj.clinvar_CLNSIGCONF = (
row.clinvar_CLNSIGCONF
row.clinvar_CLNSIGCONF
) # CL: Clinvar SIGCONF from clinvar.vcf.gz
) # CL: Clinvar SIGCONF from clinvar.vcf.gz
varObj.clin_code = row.clinvar_CLNSIG # CL: feature name for ai
varObj.clin_code = row.clinvar_CLNSIG # CL: feature name for ai
varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
varObj.fathmm_MKL_coding_score = row.fathmm_MKL_coding_score
varObj.LRT_score = row.LRT_score
varObj.LRT_score = row.LRT_score
varObj.LRT_Omega = row.LRT_Omega
varObj.LRT_Omega = row.LRT_Omega
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.phyloP100way_vertebrate = row.phyloP100way_vertebrate
varObj.M_CAP_score = row.M_CAP_score
varObj.M_CAP_score = row.M_CAP_score
varObj.MutationAssessor_score = row.MutationAssessor_score
varObj.MutationAssessor_score = row.MutationAssessor_score
varObj.MutationTaster_score = row.MutationTaster_score
varObj.MutationTaster_score = row.MutationTaster_score
varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
varObj.ESP6500_AA_AC = row.ESP6500_AA_AC
varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
varObj.ESP6500_AA_AF = row.ESP6500_AA_AF
varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
varObj.ESP6500_EA_AC = row.ESP6500_EA_AC
varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
varObj.ESP6500_EA_AF = row.ESP6500_EA_AF
varObj.VARIANT_CLASS = row.VARIANT_CLASS
varObj.VARIANT_CLASS = row.VARIANT_CLASS
varObj.Feature = row.Feature
varObj.Feature = row.Feature
varObj.hom = row.gnomADg_controls_nhomalt
varObj.hom = row.gnomADg_controls_nhomalt
varObj.hgmd_id = row.hgmd # CL added
varObj.hgmd_id = row.hgmd # CL added
varObj.hgmd_symbol = row.hgmd_GENE # CL added
varObj.hgmd_symbol = row.hgmd_GENE # CL added
varObj.hgmd_rs = row.hgmd_RANKSCORE
varObj.hgmd_rs = row.hgmd_RANKSCORE
varObj.hgmd_PHEN = row.hgmd_PHEN # CL added
varObj.hgmd_PHEN = row.hgmd_PHEN # CL added
varObj.hgmd_CLASS = row.hgmd_CLASS # CL added
varObj.hgmd_CLASS = row.hgmd_CLASS # CL added
if row.clinvar_CLNSIGCONF != "-":
if row.clinvar_CLNSIGCONF != "-":
clin_dict = dict()
clin_dict = dict()
for ro in row.clinvar_CLNSIGCONF.split("|_"):
for ro in row.clinvar_CLNSIGCONF.split("|_"):
temp = ro.split("(")
temp = ro.split("(")
clin_dict[temp[0]] = int(temp[1][0])
clin_dict[temp[0]] = int(temp[1][0])
PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
"Likely_pathogenic", 0
"Likely_pathogenic", 0
)
)
varObj.clin_dict = clin_dict
varObj.clin_dict = clin_dict
varObj.clin_PLP = PLP_sum
varObj.clin_PLP = PLP_sum
varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
else:
else:
if "benign" in row.clinvar_clnsig.lower():
if "benign" in row.clinvar_clnsig.lower():
varObj.clin_PLP_perc = 0
varObj.clin_PLP_perc = 0
elif "pathogenic" in row.clinvar_clnsig.lower():
elif "pathogenic" in row.clinvar_clnsig.lower():
varObj.clin_PLP_perc = 1
varObj.clin_PLP_perc = 1
else:
else:
varObj.clin_PLP_perc = "-"
varObj.clin_PLP_perc = "-"
varObj.clin_PLP = "-"
varObj.clin_PLP = "-"
varObj.clin_dict = "-"
varObj.clin_dict = "-"
if row.SpliceAI_pred != "-":
if row.SpliceAI_pred != "-":
varObj.spliceAI = row.SpliceAI_pred
varObj.spliceAI = row.SpliceAI_pred
temp = row.SpliceAI_pred.split("|")
temp = row.SpliceAI_pred.split("|")
varObj.spliceAImax = max(
varObj.spliceAImax = max(
float(temp[1]), float(temp[2]), float(temp[3]), float(temp[4])
float(temp[1]), float(temp[2]), float(temp[3]), float(temp[4])
)
)
else:
else:
varObj.spliceAI = "-"
varObj.spliceAI = "-"
varObj.spliceAImax = "-"
varObj.spliceAImax = "-"
Kopieren
Kopiert
Kopieren
Kopiert
if "conserve" in moduleList:
return vars(varObj)
# # get dgv: 1.3s
# # print('\nGetting DGV')
# dgvDictList = []
# typeList = []
# subtypeList = []
# dgvVarFound = 0
# dgvType = "-"
# dgvSubtype = "-"
# chromVal = int(varObj.chrom)
# posVal = int(varObj.pos)
# startVal = int(varObj.start)
# stopVal = int(varObj.stop)
# # CL 03-14-2023: changed column names to be compatible with hg38
# # vals=dgvDf[ ( dgvDf['hg19Chr'] == chromVal ) & ( dgvDf['hg19Start']<=startVal ) & (dgvDf['hg19Stop']>=stopVal) ]
# vals = dgvSortedDf.loc[chromVal].loc[:(startVal+1)].loc[:stopVal]
# numRows = len(vals.index)
# if numRows > 0:
# dgvVarFound = 1
# # print('\tnumrows:',numRows)
# # print('\t type of vals:', type(vals))
# # print('\tvals:', vals)
# dgvType = vals.iloc[0]["type"]
# dgvSubtype = vals.iloc[0]["subType"]
# # print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
# # print('\tdgvVarFound:',dgvVarFound,'dgvType:', dgvType, 'dgvsubtype:', dgvSubtype)
# typeList.append(dgvType)
# subtypeList.append(dgvSubtype)
# retList = [dgvDictList, typeList, subtypeList, dgvVarFound]
Kopieren
Kopiert
Kopieren
Kopiert
# varObj.dgvDictList = retList[0]
# varObj.dgvTypeList = retList[1]
# varObj.dgvSubtypeList = retList[2]
# varObj.dgvVarFound = retList[3]
Kopieren
Kopiert
Kopieren
Kopiert
# get decipher: 0.6s
def getAnnotateInfoRow_3_2(
decipherDictList = []
varObj,
decipherDeletionObsList = []
decipherSortedDf,
decipherStudyList = []
):
decipherVarFound = 0
# get decipher: 0.6s
deletionObs = "-"
decipherDictList = []
# get the variant object info from varObj
decipherDeletionObsList = []
chromVal = int(varObj.chrom)
decipherStudyList = []
posVal = int(varObj.pos)
decipherVarFound = 0
startVal = int(varObj.start)
deletionObs = "-"
stopVal = int(varObj.stop)
# get the variant object info from varObj
chromVal = int(varObj.chrom)
# CL 03-14-2023: changed column names to be compatible with hg38
posVal = int(varObj.pos)
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
startVal = int(varObj.start)
if (chromVal, startVal, stopVal) in decipherSortedDf:
stopVal = int(varObj.stop)
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
Kopieren
Kopiert
Kopieren
Kopiert
decipherVarFound = 1
# CL 03-14-2023: changed column names to be compatible with hg38
deletionObs = vals.iloc[0]["deletion.obs"]
# vals=decipherDf[ ( decipherDf['hg19Chr'] == chromVal ) & ( decipherDf['hg19Start']==startVal ) & (decipherDf['hg19Stop']==stopVal) ]
decipherDeletionObsList.append(deletionObs)
if (chromVal, startVal, stopVal) in decipherSortedDf:
vals = decipherSortedDf.loc[(chromVal, startVal, stopVal)]
Kopieren
Kopiert
Kopieren
Kopiert
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
decipherVarFound = 1
# print('\tdecipherVarFound:',decipherVarFound,'decipherDeletionObs:', deletionObs)
deletionObs = vals.iloc[0]["deletion.obs"]
retList = [
decipherDeletionObsList
.append(deletionObs)
decipherDictList,
decipherDeletionObsList
,
decipherStudyList,
decipherVarFound,
]
Kopieren
Kopiert
Kopieren
Kopiert
#
[decipherDictList,
decipherDeletionObs
List,decipherStudyList, decipherVarFound]
# print('\tchrom:', chromVal,'posVal:', posVal,'start:', startVal,'stopVal:', stopVal)
varObj.
decipherDictList
= retList[0]
#
print('\tdecipherVarFound:',decipherVarFound,'
decipherDeletionObs
:', deletionObs)
varObj.
decipherDeletionObsList
= retList[1]
retList = [
varObj.decipherStudyList = retList[2]
decipherDictList
,
varObj.decipherVarFound = retList[3]
decipherDeletionObsList
,
decipherStudyList,
# get gnomad gene metrics from gnomad file: 3.1s
decipherVarFound,
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
]
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
Kopieren
Kopiert
Kopieren
Kopiert
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
# [decipherDictList,decipherDeletionObsList,decipherStudyList, decipherVarFound]
varObj.gnomadGeneZscore =
retList[0]
return {
varObj.gnomadGenePLI =
retList[1]
"decipherDictList":
retList[0]
,
varObj.gnomadGeneOELof =
retList[2]
# O/E lof
"decipherDeletionObsList":
retList[1]
,
varObj.gnomadGeneOELofUpper =
retList[3]
# O/E lof upper
"decipherStudyList":
retList[2]
,
"decipherVarFound":
retList[3]
,
}
Kopieren
Kopiert
Kopieren
Kopiert
if "curate" in moduleList:
# get
OMIM: 2s
def getAnnotateInfoRow_3_3(
# print('\nGetting OMIM')
varObj,
# varObj.omimList=jsonDict['omim']
gnomadMetricsGeneSortedDf,
# retList=[varFound, geneFound, omimDict,
omimGeneDict
,
omimAlleleDict
]
):
inputSnpList
= []
# get gnomad gene metrics from gnomad file: 3.1s
if "," in
varObj.
rsId
:
if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index: # pLI, oe_lof, oe_lof_upper,mis_z
inputSnpList
=
varObj.
rsId.split(",")
val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
gnomadGeneZscore = val["mis_z"]
gnomadGenePLI = val["pLI"]
gnomadGeneOELof = val["oe_lof"]
gnomadGeneOELofUpper = val["oe_lof_upper"]
else:
# get
the values
gnomadGeneZscore = "-"
gnomadGenePLI = "-"
gnomadGeneOELof = "-"
gnomadGeneOELofUpper = "-"
retList = [gnomadGeneZscore, gnomadGenePLI, gnomadGeneOELof, gnomadGeneOELofUpper]
return {
"gnomadGeneZscore": retList[0],
"gnomadGenePLI": retList[1],
"gnomadGeneOELof": retList[2], # O/E lof
"gnomadGeneOELofUpper": retList[3], # O/E lof upper
}
def getAnnotateInfoRow_3_4(
varObj,
omimGeneSortedDf,
):
# get OMIM: 2s
inputSnpList = []
if "," in varObj.rsId:
inputSnpList = varObj.rsId.split(",")
else:
inputSnpList = varObj.rsId
varFound = 0
geneFound = 0
omimDict = {}
omimGeneDict
= {}
omimAlleleDict
= {}
phenoList = []
phenoInhList = []
phenoMimList
= []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if
varObj.
geneSymbol in omimGeneSortedDf.index
:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict
=
omimGeneSortedDf.loc[
varObj.
geneSymbol]
snpList = []
for a in omimGeneDict["allelicVariants"]:
if "dbSnps" in a:
snpList.append(a["dbSnps"])
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
Kopieren
Kopiert
Kopieren
Kopiert
inputSnpList = varObj.rsId
varFound = 0
# print('\tinputSnpList:', inputSnpList)
varFound = 0
# get disease info from OMIM
geneFound = 0
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
omimDict = {}
for a in omimGeneDict["
phenotypes
"]:
omimGeneDict = {}
# print('type:', type(a))
omimAlleleDict = {}
pheno = a["phenotype"]
phenoList = []
if "
phenotypeMimNumber
" in a:
phenoInhList = []
phenoMim = a["phenotypeMimNumber"]
phenoMimList = []
# check gene
# keys: dict_keys(['phenotypes', 'allelicVariants', 'mimNumber', 'status', 'title', 'description', 'geneEntrezId', 'geneSymbol'])
if varObj.geneSymbol in omimGeneSortedDf.index:
# print('\tgene:', varObj.geneSymbol, 'found')
geneFound = 1
omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
snpList = []
for a in omimGeneDict["
allelicVariants
"]:
# print('a:', a)
# print('type:', type(a))
if "
dbSnps
" in a:
snpList.append(a["dbSnps"])
# print('\tsnpList:', snpList)
# print('\tlen snpList:', len(snpList))
# check if input snpID matches the OMIM one
set1 = set(inputSnpList)
set2 = set(snpList)
if set1.intersection(set2):
varFound = 1
else:
else:
Kopieren
Kopiert
Kopieren
Kopiert
varFound = 0
phenoMim = "-"
if "phenotypeInheritance" in a:
# get disease info from OMIM
phenoInh = a["phenotypeInheritance"]
# print('\tphenotypes:', type(omimGeneDict['phenotypes']), ' len:', len(omimGeneDict['phenotypes']) )
else:
for a in omimGeneDict["phenotypes"]:
phenoInh = "-"
# print('type:', type(a))
phenoList.append(pheno)
pheno = a["phenotype"]
phenoInhList.append(phenoInh)
if "phenotypeMimNumber" in a:
phenoMimList.append(str(phenoMim))
phenoMim = a["phenotypeMimNumber"]
# print('phenotype:', pheno,phenoMim,phenoInh)
else:
phenoMim = "-"
if "phenotypeInheritance" in a:
phenoInh = a["phenotypeInheritance"]
else:
phenoInh = "-"
phenoList.append(pheno)
phenoInhList.append(phenoInh)
phenoMimList.append(str(phenoMim))
# print('phenotype:', pheno,phenoMim,phenoInh)
# print('\tvarFound:', varFound)
# print('\tphenoList:', phenoList)
# print('\tphenoInhList:', phenoInhList)
# print('\tphenoMimList:', phenoMimList)
omimRet = [
varFound,
geneFound,
omimDict,
omimGeneDict,
omimAlleleDict,
phenoList,
phenoInhList,
phenoMimList,
]
Kopieren
Kopiert
Kopieren
Kopiert
varObj.omimVarFound = omimRet[0]
omimRet = [
varObj.omimG
eneFound
= omimRet[1]
varFound,
varObj.
omimDict
= omimRet[2]
g
eneFound
,
varObj.
omimGeneDict
= omimRet[3]
omimDict
,
varObj.
omimAlleleDict
= omimRet[4]
omimGeneDict
,
varObj.
phenoList
= omimRet[5]
omimAlleleDict
,
varObj.
phenoInhList
= omimRet[6]
phenoList
,
varObj.
phenoMimList
= omimRet[7]
phenoInhList
,
# print('OMIM res:')
phenoMimList
,
# print('\tgeneFound:',varObj.omimGeneFound,'varFound:',varObj.omimVarFound )
]
Kopieren
Kopiert
Kopieren
Kopiert
# get clinvar: 0.1s
return {
# print('\nReading clinVar')
"omimVarFound": omimRet[0],
clinVarRet = getClinVarUsingMarrvelFlatFile(
"omimGeneFound": omimRet[1],
varObj, clinvarAlleleDf, clinvarGeneDf
"omimDict": omimRet[2],
"omimGeneDict": omimRet[3],
"omimAlleleDict": omimRet[4],
"phenoList": omimRet[5],
"phenoInhList": omimRet[6],
"phenoMimList": omimRet[7],
}
def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Return the ClinVar lookup result for one variant as a keyed dict.

    Calls ``getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf,
    clinvarGeneDf)`` (note: the allele table is passed before the gene
    table) and maps positions 0-11 of its result onto named keys.

    Position 10 (``clinvarSignDesc``) is overwritten with
    ``varObj.clinvar_clnsig`` before the mapping is built, so the value
    reported comes from the clinvar.vcf.gz annotation rather than from
    the flat-file lookup.

    Args:
        varObj: object exposing ``clinvar_clnsig`` plus whatever the
            lookup helper reads from it.
        clinvarGeneDf: forwarded to the lookup helper as third argument.
        clinvarAlleleDf: forwarded to the lookup helper as second argument.

    Returns:
        dict with the twelve ``clinVar*`` / ``clinvar*`` keys listed below.
    """
    # Key at index i is filled from position i of the lookup result.
    resultKeys = (
        "clinVarVarFound",
        "clinVarVarDict",
        "clinVarGeneFound",
        "clinVarGeneDict",
        "clinvarTotalNumVars",
        "clinvarNumP",
        "clinvarNumLP",
        "clinvarNumLB",
        "clinvarNumB",
        "clinvarTitle",
        "clinvarSignDesc",
        "clinvarCondition",
    )

    lookup = getClinVarUsingMarrvelFlatFile(
        varObj, clinvarAlleleDf, clinvarGeneDf
    )
    # CL: significance now comes from the clinvar.vcf.gz annotation
    lookup[10] = varObj.clinvar_clnsig

    return {key: lookup[idx] for idx, key in enumerate(resultKeys)}
def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Return the HGMD lookup result for one variant as a keyed dict.

    Calls ``getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)`` and maps
    positions 0-4 of its result onto named keys.

    Args:
        varObj: variant record forwarded to the lookup helper.
        hgmdHPOScoreDf: forwarded to the lookup helper as second argument.

    Returns:
        dict with keys ``hgmdVarFound``, ``hgmdGeneFound``,
        ``hgmdVarPhenIdList``, ``hgmdVarHPOIdList`` and
        ``hgmdVarHPOStrList``.
    """
    # Key at index i is filled from position i of the lookup result.
    resultKeys = (
        "hgmdVarFound",
        "hgmdGeneFound",
        "hgmdVarPhenIdList",
        "hgmdVarHPOIdList",
        "hgmdVarHPOStrList",
    )
    lookup = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
    return {key: lookup[idx] for idx, key in enumerate(resultKeys)}
def getAnnotateInfoRows_3(
vepDf,
genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
):
def f1(row):
return getAnnotateInfoRow_3_1(row, genomeRef)
def f2(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_2(row, decipherSortedDf)
def f3(row):
if "conserve" not in moduleList:
return row
return getAnnotateInfoRow_3_3(row, gnomadMetricsGeneSortedDf)
def f4(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_4(row, omimGeneSortedDf)
def f5(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_5(row, clinvarGeneDf, clinvarAlleleDf)
def f6(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_6(
row, hgmdHPOScoreDf
)
)
Kopieren
Kopiert
Kopieren
Kopiert
varObj.clinVarVarFound = clinVarRet[0]
varObj.clinVarVarDict = clinVarRet[1]
varObj.clinVarGeneFound = clinVarRet[2]
varObj.clinVarGeneDict = clinVarRet[3]
varObj.clinvarTotalNumVars = clinVarRet[4]
varObj.clinvarNumP = clinVarRet[5]
varObj.clinvarNumLP = clinVarRet[6]
varObj.clinvarNumLB = clinVarRet[7]
varObj.clinvarNumB = clinVarRet[8]
varObj.clinvarTitle = clinVarRet[9]
varObj.clinvarSignDesc = (
row.clinvar_CLNSIG
) # clinVarRet[10] #CL: changed to clinvar.vcf.gz annotation
varObj.clinvarCondition = clinVarRet[11]
# print('clinVar res:')
"""
if debugFlag==1:
print('\tgeneFound::',varObj.clinVarGeneFound,'varFound:',varObj.clinVarVarFound)
print('\tnumVars:',varObj.clinvarTotalNumVars,'numPathologic:',varObj.clinvarNumP,'numBenign:',varObj.clinvarNumB)
print('\tsignDesc:', varObj.clinvarSignDesc)
"""
# get HGMD: 0.3s
if "curate" in moduleList:
# print('\nReading HGMD')
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
# hgmdVarFound,hgmdGeneFound,hgmdVarPhenIdList,hgmdVarHPOIdList,hgmdVarHPOStrList
varObj.hgmdVarFound = hgmdRet[0]
varObj.hgmdGeneFound = hgmdRet[1]
varObj.hgmdVarPhenIdList = hgmdRet[2]
varObj.hgmdVarHPOIdList = hgmdRet[3]
varObj.hgmdVarHPOStrList = hgmdRet[4]
# print('HGMD results:')
# print('\thgmdVarFound:',varObj.hgmdVarFound,'hgmdGeneFound:',varObj.hgmdGeneFound,
# 'hgmdVarPhenIdList:',varObj.hgmdVarPhenIdList,'hgmdVarHPOIdList:',
# varObj.hgmdVarHPOIdList,
# 'hgmdVarHPOStrList:',varObj.hgmdVarHPOStrList)
return {
"hg19Chrom": varObj.hg19Chrom,
"hg19Pos": varObj.hg19Pos,
"chrom": varObj.chrom,
"pos": varObj.pos,
"start": varObj.start,
"stop": varObj.stop,
"geneSymbol": varObj.geneSymbol,
"CADD_phred": varObj.CADD_phred,
"CADD_PHRED": varObj.CADD_PHRED,
"ref": varObj.ref,
"alt": varObj.alt,
"varId": varObj.varId,
"ZYG": varObj.zyg,
"HGVSc": varObj.HGVSc,
"HGVSp": varObj.HGVSp,
"Gene": varObj.geneEnsId,
"Existing_variation": varObj.rsId,
"GERPpp_RS": varObj.GERPpp_RS,
"Feature_type": varObj.featureType,
"gnomadAF": varObj.gnomadAF,
"gnomadAFg": varObj.gnomadAFg,
"CLIN_SIG": varObj.CLIN_SIG,
"LRT_Omega": varObj.LRT_Omega,
"LRT_score": varObj.LRT_score,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
# dbnsfp attributes
"GERPpp_NR": varObj.GERPpp_NR,
"DANN_score": varObj.DANN_score,
"FATHMM_pred": varObj.FATHMM_pred,
"FATHMM_score": varObj.FATHMM_score,
"GTEx_V8_gene": varObj.GTEx_V8_gene,
"GTEx_V8_tissue": varObj.GTEx_V8_tissue,
"Polyphen2_HDIV_score": varObj.Polyphen2_HDIV_score,
"Polyphen2_HVAR_score": varObj.Polyphen2_HVAR_score,
"REVEL_score": varObj.REVEL_score,
"SIFT_score": varObj.SIFT_score,
"clinvar_AlleleID": varObj.clinvar_AlleleID, # Clinvar allele ID from clinvar.vcf.gz
"clinvar_clnsig": varObj.clinvar_clnsig, # CL: Clinvar SIG from clinvar.vcf.gz
"clinvar_CLNREVSTAT": varObj.clinvar_CLNREVSTAT, # CL: Clinvar STAT from clinvar.vcf.gz, for interface only
"clinvar_CLNSIGCONF": varObj.clinvar_CLNSIGCONF, # CL: Clinvar SIGCONF from clinvar.vcf.gz
"clin_code": varObj.clin_code, # CL: feature for ai
"fathmm_MKL_coding_score": varObj.fathmm_MKL_coding_score,
"LRT_score": varObj.LRT_score,
"LRT_Omega": varObj.LRT_Omega,
"phyloP100way_vertebrate": varObj.phyloP100way_vertebrate,
"M_CAP_score": varObj.M_CAP_score,
"MutationAssessor_score": varObj.MutationAssessor_score,
"MutationTaster_score": varObj.MutationTaster_score,
"ESP6500_AA_AC": varObj.ESP6500_AA_AC,
"ESP6500_AA_AF": varObj.ESP6500_AA_AF,
"ESP6500_EA_AC": varObj.ESP6500_EA_AC,
"ESP6500_EA_AF": varObj.ESP6500_EA_AF,
# dbnsfp
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof, # O/E lof
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper, # O/E lof upper,
"IMPACT": varObj.IMPACT,
"Consequence": varObj.Consequence,
"omimVarFound": varObj.omimVarFound,
"omimGeneFound": varObj.omimGeneFound,
"omimDict": varObj.omimDict,
"omimGeneDict": varObj.omimGeneDict,
"omimAlleleDict": varObj.omimAlleleDict,
"phenoList": varObj.phenoList,
"phenoInhList": varObj.phenoInhList,
"phenoMimList": varObj.phenoMimList,
"clinVarVarFound": varObj.clinVarVarFound,
"clinVarVarDict": varObj.clinVarVarDict,
"clinVarGeneFound": varObj.clinVarGeneFound,
"clinVarGeneDict": varObj.clinVarGeneDict,
"clinvarTotalNumVars": varObj.clinvarTotalNumVars,
"clinvarNumP": varObj.clinvarNumP,
"clinvarNumLP": varObj.clinvarNumLP,
"clinvarNumLB": varObj.clinvarNumLB,
"clinvarNumB": varObj.clinvarNumB,
"clinvarTitle": varObj.clinvarTitle,
"clinvarSignDesc": varObj.clinvarSignDesc,
"clinvarCondition": varObj.clinvarCondition,
"hgmdVarFound": varObj.hgmdVarFound,
"hgmdGeneFound": varObj.hgmdGeneFound,
"hgmdVarPhenIdList": varObj.hgmdVarPhenIdList,
"hgmdVarHPOIdList": varObj.hgmdVarHPOIdList,
"hgmdVarHPOStrList": varObj.hgmdVarHPOStrList,
"varId_dash": varObj.varId_dash,
"dgvDictList": varObj.dgvDictList,
"dgvTypeList": varObj.dgvTypeList,
"dgvSubtypeList": varObj.dgvSubtypeList,
"dgvVarFound": varObj.dgvVarFound,
"decipherDictList": varObj.decipherDictList,
"decipherDeletionObsList": varObj.decipherDeletionObsList,
"decipherStudyList": varObj.decipherStudyList,
"decipherVarFound": varObj.decipherVarFound,
"gnomadGeneZscore": varObj.gnomadGeneZscore,
"gnomadGenePLI": varObj.gnomadGenePLI,
"gnomadGeneOELof": varObj.gnomadGeneOELof,
"gnomadGeneOELofUpper": varObj.gnomadGeneOELofUpper,
# symptom
"SymptomMatched": varObj.SymptomMatched,
"symptomScore": varObj.symptomScore,
"symptomName": varObj.symptomName,
"omimSymptomSimScore": varObj.omimSymptomSimScore,
"omimSymMatchFlag": varObj.omimSymMatchFlag,
"hgmdSymptomScore": varObj.hgmdSymptomScore,
"hgmdSymptomSimScore": varObj.hgmdSymptomSimScore,
"hgmdSymMatchFlag": varObj.hgmdSymMatchFlag,
"clinVarSymMatchFlag": varObj.clinVarSymMatchFlag,
"VARIANT_CLASS": varObj.VARIANT_CLASS,
"Feature": varObj.Feature,
"hom": varObj.hom,
"hgmd_rs": varObj.hgmd_rs,
"hgmd_id": varObj.hgmd_id, # CL added
"hgmd_symbol": varObj.hgmd_symbol, # CL added
"hgmd_PHEN": varObj.hgmd_PHEN, # CL added
"hgmd_CLASS": varObj.hgmd_CLASS, # CL added
"clin_dict": varObj.clin_dict,
"clin_PLP": varObj.clin_PLP,
"clin_PLP_perc": varObj.clin_PLP_perc,
"spliceAI": varObj.spliceAI,
"spliceAImax": varObj.spliceAImax,
Kopieren
Kopiert
Kopieren
Kopiert
"zyg": varObj.zyg,
annotateInfoDf = vepDf.apply(f1, axis=1, result_type='expand')
'geneEnsId': varObj.geneEnsId,
df = annotateInfoDf.apply(f2, axis=1, result_type='expand')
'rsId': varObj.rsId
annotateInfoDf[df.columns] = df
}
df = annotateInfoDf.apply(f3, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f4, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f5, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
df = annotateInfoDf.apply(f6, axis=1, result_type='expand')
annotateInfoDf[df.columns] = df
return annotateInfoDf
Gespeicherte Diffs
Originaltext
Datei öffnen
# (Variant attribute, VEP column) pairs copied verbatim from the input row.
# Order matters: it is the order in which attributes are set on the Variant.
_ROW2_FIELDS_HEAD = (
    ("geneSymbol", "SYMBOL"),
    ("CADD_phred", "CADD_phred"),
    ("CADD_PHRED", "CADD_PHRED"),
)
_ROW2_FIELDS_TAIL = (
    ("geneEnsId", "Gene"),
    ("rsId", "Existing_variation"),
    ("GERPpp_RS", "GERPpp_RS"),
    ("featureType", "Feature_type"),
    ("gnomadAF", "gnomAD_AF"),
    ("gnomadAFg", "gnomADg_AF"),
    ("CLIN_SIG", "CLIN_SIG"),  # CL: useless but kept for now
    ("LRT_Omega", "LRT_Omega"),
    ("LRT_score", "LRT_score"),
    ("phyloP100way_vertebrate", "phyloP100way_vertebrate"),
    ("IMPACT", "IMPACT"),
    ("Consequence", "Consequence"),
    ("HGVSc", "HGVSc"),
    ("HGVSp", "HGVSp"),
    # dbnsfp attributes
    ("GERPpp_NR", "GERPpp_NR"),
    ("DANN_score", "DANN_score"),
    ("FATHMM_pred", "FATHMM_pred"),
    ("FATHMM_score", "FATHMM_score"),
    ("GTEx_V8_gene", "GTEx_V8_gene"),
    ("GTEx_V8_tissue", "GTEx_V8_tissue"),
    ("Polyphen2_HDIV_score", "Polyphen2_HDIV_score"),
    ("Polyphen2_HVAR_score", "Polyphen2_HVAR_score"),
    ("REVEL_score", "REVEL_score"),
    ("SIFT_score", "SIFT_score"),
    # The clinvar_* columns come from the clinvar.vcf.gz annotation.
    ("clinvar_AlleleID", "clinvar"),
    ("clinvar_clnsig", "clinvar_CLNSIG"),
    ("clinvar_CLNREVSTAT", "clinvar_CLNREVSTAT"),  # for interface only
    ("clinvar_CLNSIGCONF", "clinvar_CLNSIGCONF"),
    ("clin_code", "clinvar_CLNSIG"),  # CL: feature name for ai
    ("fathmm_MKL_coding_score", "fathmm_MKL_coding_score"),
    ("M_CAP_score", "M_CAP_score"),
    ("MutationAssessor_score", "MutationAssessor_score"),
    ("MutationTaster_score", "MutationTaster_score"),
    ("ESP6500_AA_AC", "ESP6500_AA_AC"),
    ("ESP6500_AA_AF", "ESP6500_AA_AF"),
    ("ESP6500_EA_AC", "ESP6500_EA_AC"),
    ("ESP6500_EA_AF", "ESP6500_EA_AF"),
    ("VARIANT_CLASS", "VARIANT_CLASS"),
    ("Feature", "Feature"),
    ("hom", "gnomADg_controls_nhomalt"),
    ("hgmd_id", "hgmd"),  # CL added
    ("hgmd_symbol", "hgmd_GENE"),  # CL added
    ("hgmd_rs", "hgmd_RANKSCORE"),
    ("hgmd_PHEN", "hgmd_PHEN"),  # CL added
    ("hgmd_CLASS", "hgmd_CLASS"),  # CL added
)

# Keys of the returned dict, in output order.  A key is read from the Variant
# attribute of the same name unless it is remapped in _ROW2_KEY_TO_ATTR.
_ROW2_OUTPUT_KEYS = (
    "hg19Chrom", "hg19Pos", "chrom", "pos", "start", "stop",
    "geneSymbol", "CADD_phred", "CADD_PHRED", "ref", "alt", "varId",
    "ZYG", "HGVSc", "HGVSp", "Gene", "Existing_variation", "GERPpp_RS",
    "Feature_type", "gnomadAF", "gnomadAFg", "CLIN_SIG",
    "LRT_Omega", "LRT_score", "phyloP100way_vertebrate",
    # dbnsfp attributes
    "GERPpp_NR", "DANN_score", "FATHMM_pred", "FATHMM_score",
    "GTEx_V8_gene", "GTEx_V8_tissue",
    "Polyphen2_HDIV_score", "Polyphen2_HVAR_score",
    "REVEL_score", "SIFT_score",
    "clinvar_AlleleID", "clinvar_clnsig", "clinvar_CLNREVSTAT",
    "clinvar_CLNSIGCONF", "clin_code", "fathmm_MKL_coding_score",
    "M_CAP_score", "MutationAssessor_score", "MutationTaster_score",
    "ESP6500_AA_AC", "ESP6500_AA_AF", "ESP6500_EA_AC", "ESP6500_EA_AF",
    "gnomadGeneZscore", "gnomadGenePLI",
    "gnomadGeneOELof", "gnomadGeneOELofUpper",
    "IMPACT", "Consequence",
    "omimVarFound", "omimGeneFound", "omimDict", "omimGeneDict",
    "omimAlleleDict", "phenoList", "phenoInhList", "phenoMimList",
    "clinVarVarFound", "clinVarVarDict", "clinVarGeneFound",
    "clinVarGeneDict", "clinvarTotalNumVars", "clinvarNumP", "clinvarNumLP",
    "clinvarNumLB", "clinvarNumB", "clinvarTitle", "clinvarSignDesc",
    "clinvarCondition",
    "hgmdVarFound", "hgmdGeneFound", "hgmdVarPhenIdList",
    "hgmdVarHPOIdList", "hgmdVarHPOStrList",
    "varId_dash",
    "dgvDictList", "dgvTypeList", "dgvSubtypeList", "dgvVarFound",
    "decipherDictList", "decipherDeletionObsList", "decipherStudyList",
    "decipherVarFound",
    # symptom
    "SymptomMatched", "symptomScore", "symptomName",
    "omimSymptomSimScore", "omimSymMatchFlag",
    "hgmdSymptomScore", "hgmdSymptomSimScore", "hgmdSymMatchFlag",
    "clinVarSymMatchFlag",
    "VARIANT_CLASS", "Feature", "hom",
    "hgmd_rs", "hgmd_id", "hgmd_symbol", "hgmd_PHEN", "hgmd_CLASS",
    "clin_dict", "clin_PLP", "clin_PLP_perc", "spliceAI", "spliceAImax",
    "zyg", "geneEnsId", "rsId",
)
_ROW2_KEY_TO_ATTR = {
    "ZYG": "zyg",
    "Gene": "geneEnsId",
    "Existing_variation": "rsId",
    "Feature_type": "featureType",
}

# (Variant attribute, index into the getClinVarUsingMarrvelFlatFile result).
# Index 10 is skipped on purpose: clinvarSignDesc is taken from the
# clinvar.vcf.gz annotation on the row instead.
_ROW2_CLINVAR_FIELDS = (
    ("clinVarVarFound", 0),
    ("clinVarVarDict", 1),
    ("clinVarGeneFound", 2),
    ("clinVarGeneDict", 3),
    ("clinvarTotalNumVars", 4),
    ("clinvarNumP", 5),
    ("clinvarNumLP", 6),
    ("clinvarNumLB", 7),
    ("clinvarNumB", 8),
    ("clinvarTitle", 9),
)


def getAnnotateInfoRow_2(
    row,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate one VEP row and return the annotation as a flat dict.

    The row is parsed into a ``Variant``; depending on ``moduleList`` the
    DECIPHER / gnomAD gene metrics ("conserve") and the OMIM / ClinVar / HGMD
    lookups ("curate") are added, and selected ``Variant`` attributes are
    returned keyed as listed in ``_ROW2_OUTPUT_KEYS``.

    NOTE(review): the indentation of this function was lost in the text it was
    recovered from.  DECIPHER is necessarily inside the "conserve" branch; the
    gnomAD section is assumed to be inside "conserve" and the ClinVar section
    inside "curate" -- confirm against the canonical source.

    Args:
        row: pandas Series for one variant.  Column 0 is the variant key
            ("1_1588250_T_A" or "21_11039079_C/A"), column 1 the location
            ("1:10203-10204" or "1:10203"); the remaining values are read by
            column name.
        genomeRef: "hg38" stores the coordinates as hg38Chrom/hg38Pos; any
            other value stores them as hg19Chrom/hg19Pos.
        clinvarGeneDf, clinvarAlleleDf: passed to
            ``getClinVarUsingMarrvelFlatFile``.
        omimGeneSortedDf: OMIM table indexed by gene symbol; each entry is
            read through its "allelicVariants" and "phenotypes" fields.
        omimAlleleList: unused; kept for interface compatibility.
        hgmdDf: passed to ``getHGMDUsingFlatFile``.
        moduleList: container of enabled module names.
        decipherSortedDf: DECIPHER table looked up by (chrom, start, stop).
        gnomadMetricsGeneSortedDf: gnomAD gene metrics indexed by gene symbol.

    Returns:
        dict mapping output column name to value.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # Positional access goes through .iloc: plain row[0] on a label-indexed
    # Series is deprecated positional indexing in pandas.
    variantKey = row.iloc[0]
    keyParts = variantKey.split("_")
    chrom = keyParts[0]
    pos = int(keyParts[1])
    if variantKey.find("/") != -1:
        # "21_11039079_C/A": alleles share the third field.
        alleles = keyParts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        # "1_1588250_T_A": alleles are the third and fourth fields.
        ref = keyParts[2]
        alt = keyParts[3]

    location = row.iloc[1]
    span = location.split(":")[1]
    if "-" in location:
        bounds = span.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        # Single position: start and stop are the same.
        start = int(span)
        stop = int(span)

    # Map non-numeric chromosome names onto integers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: separate databases exist for hg19 and hg38, so the
    # coordinates are stored as given and no LiftOver is performed.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    for attrName, column in _ROW2_FIELDS_HEAD:
        setattr(varObj, attrName, getattr(row, column))
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG
    for attrName, column in _ROW2_FIELDS_TAIL:
        setattr(varObj, attrName, getattr(row, column))

    if row.clinvar_CLNSIGCONF != "-":
        # Entries look like "Pathogenic(12)" and are separated by "|_".
        clin_dict = dict()
        for ro in row.clinvar_CLNSIGCONF.split("|_"):
            temp = ro.split("(")
            # Parse the whole count before ")", not only its first digit.
            clin_dict[temp[0]] = int(temp[1].split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # NOTE(review): this reads the row's `clinvar_clnsig` column, whereas
        # varObj.clinvar_clnsig above is filled from `clinvar_CLNSIG` --
        # confirm that both columns exist in the input.
        if "benign" in row.clinvar_clnsig.lower():
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in row.clinvar_clnsig.lower():
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        temp = row.SpliceAI_pred.split("|")
        # Fields 1..4 of the "|"-separated prediction are the numeric scores.
        varObj.spliceAImax = max(float(temp[i]) for i in range(1, 5))
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    if "conserve" in moduleList:
        # DECIPHER lookup by exact (chrom, start, stop).
        decipherDeletionObsList = []
        decipherVarFound = 0
        decipherKey = (int(varObj.chrom), int(varObj.start), int(varObj.stop))
        # NOTE(review): if decipherSortedDf is a pandas DataFrame, `in` tests
        # its column labels, not the row index (the gnomAD lookup below uses
        # `.index`) -- confirm this membership test is intended.
        if decipherKey in decipherSortedDf:
            vals = decipherSortedDf.loc[decipherKey]
            decipherVarFound = 1
            decipherDeletionObsList.append(vals.iloc[0]["deletion.obs"])
        varObj.decipherDictList = []
        varObj.decipherDeletionObsList = decipherDeletionObsList
        varObj.decipherStudyList = []
        varObj.decipherVarFound = decipherVarFound

        # gnomAD gene-level constraint metrics; "-" when the gene is absent.
        if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
            val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
            varObj.gnomadGeneZscore = val["mis_z"]
            varObj.gnomadGenePLI = val["pLI"]
            varObj.gnomadGeneOELof = val["oe_lof"]  # O/E lof
            varObj.gnomadGeneOELofUpper = val["oe_lof_upper"]  # O/E lof upper
        else:
            varObj.gnomadGeneZscore = "-"
            varObj.gnomadGenePLI = "-"
            varObj.gnomadGeneOELof = "-"
            varObj.gnomadGeneOELofUpper = "-"

    if "curate" in moduleList:
        # OMIM.  Always split the rsID field: a single ID must become a
        # one-element list, otherwise set() below would split it into
        # characters and it could never match an OMIM dbSNP id.
        inputSnpList = varObj.rsId.split(",")
        varFound = 0
        geneFound = 0
        omimGeneDict = {}
        phenoList = []
        phenoInhList = []
        phenoMimList = []
        if varObj.geneSymbol in omimGeneSortedDf.index:
            geneFound = 1
            omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
            snpList = [
                a["dbSnps"]
                for a in omimGeneDict["allelicVariants"]
                if "dbSnps" in a
            ]
            varFound = 1 if set(inputSnpList).intersection(snpList) else 0
            for a in omimGeneDict["phenotypes"]:
                phenoMim = (
                    a["phenotypeMimNumber"] if "phenotypeMimNumber" in a else "-"
                )
                phenoInh = (
                    a["phenotypeInheritance"]
                    if "phenotypeInheritance" in a
                    else "-"
                )
                phenoList.append(a["phenotype"])
                phenoInhList.append(phenoInh)
                phenoMimList.append(str(phenoMim))
        varObj.omimVarFound = varFound
        varObj.omimGeneFound = geneFound
        varObj.omimDict = {}
        varObj.omimGeneDict = omimGeneDict
        varObj.omimAlleleDict = {}
        varObj.phenoList = phenoList
        varObj.phenoInhList = phenoInhList
        varObj.phenoMimList = phenoMimList

        # ClinVar
        clinVarRet = getClinVarUsingMarrvelFlatFile(
            varObj, clinvarAlleleDf, clinvarGeneDf
        )
        for attrName, idx in _ROW2_CLINVAR_FIELDS:
            setattr(varObj, attrName, clinVarRet[idx])
        # CL: changed to the clinvar.vcf.gz annotation (was clinVarRet[10]).
        varObj.clinvarSignDesc = row.clinvar_CLNSIG
        varObj.clinvarCondition = clinVarRet[11]

        # HGMD
        hgmdRet = getHGMDUsingFlatFile(varObj, hgmdDf)
        varObj.hgmdVarFound = hgmdRet[0]
        varObj.hgmdGeneFound = hgmdRet[1]
        varObj.hgmdVarPhenIdList = hgmdRet[2]
        varObj.hgmdVarHPOIdList = hgmdRet[3]
        varObj.hgmdVarHPOStrList = hgmdRet[4]

    return {
        key: getattr(varObj, _ROW2_KEY_TO_ATTR.get(key, key))
        for key in _ROW2_OUTPUT_KEYS
    }
Bearbeitung
Datei öffnen
import re

from .utils_1 import Variant
from .utils_for_marrvel_flatfile import (
    getClinVarUsingMarrvelFlatFile,
    getHGMDUsingFlatFile,
    getAnnotateInfoRow_2,
)

# (Variant attribute, VEP column) pairs copied verbatim from the input row.
# Order matters: getAnnotateInfoRow_3_1 returns vars(varObj), so the order in
# which attributes are first set can determine the output column order.
_VEP_FIELDS_HEAD = (
    ("geneSymbol", "SYMBOL"),
    ("CADD_phred", "CADD_phred"),
    ("CADD_PHRED", "CADD_PHRED"),
)
_VEP_FIELDS_TAIL = (
    ("geneEnsId", "Gene"),
    ("rsId", "Existing_variation"),
    ("GERPpp_RS", "GERPpp_RS"),
    ("featureType", "Feature_type"),
    ("gnomadAF", "gnomAD_AF"),
    ("gnomadAFg", "gnomADg_AF"),
    ("CLIN_SIG", "CLIN_SIG"),  # CL: useless but kept for now
    ("LRT_Omega", "LRT_Omega"),
    ("LRT_score", "LRT_score"),
    ("phyloP100way_vertebrate", "phyloP100way_vertebrate"),
    ("IMPACT", "IMPACT"),
    ("Consequence", "Consequence"),
    ("HGVSc", "HGVSc"),
    ("HGVSp", "HGVSp"),
    # dbnsfp attributes
    ("GERPpp_NR", "GERPpp_NR"),
    ("DANN_score", "DANN_score"),
    ("FATHMM_pred", "FATHMM_pred"),
    ("FATHMM_score", "FATHMM_score"),
    ("GTEx_V8_gene", "GTEx_V8_gene"),
    ("GTEx_V8_tissue", "GTEx_V8_tissue"),
    ("Polyphen2_HDIV_score", "Polyphen2_HDIV_score"),
    ("Polyphen2_HVAR_score", "Polyphen2_HVAR_score"),
    ("REVEL_score", "REVEL_score"),
    ("SIFT_score", "SIFT_score"),
    # The clinvar_* columns come from the clinvar.vcf.gz annotation.
    ("clinvar_AlleleID", "clinvar"),
    ("clinvar_clnsig", "clinvar_CLNSIG"),
    ("clinvar_CLNREVSTAT", "clinvar_CLNREVSTAT"),  # for interface only
    ("clinvar_CLNSIGCONF", "clinvar_CLNSIGCONF"),
    ("clin_code", "clinvar_CLNSIG"),  # CL: feature name for ai
    ("fathmm_MKL_coding_score", "fathmm_MKL_coding_score"),
    ("M_CAP_score", "M_CAP_score"),
    ("MutationAssessor_score", "MutationAssessor_score"),
    ("MutationTaster_score", "MutationTaster_score"),
    ("ESP6500_AA_AC", "ESP6500_AA_AC"),
    ("ESP6500_AA_AF", "ESP6500_AA_AF"),
    ("ESP6500_EA_AC", "ESP6500_EA_AC"),
    ("ESP6500_EA_AF", "ESP6500_EA_AF"),
    ("VARIANT_CLASS", "VARIANT_CLASS"),
    ("Feature", "Feature"),
    ("hom", "gnomADg_controls_nhomalt"),
    ("hgmd_id", "hgmd"),  # CL added
    ("hgmd_symbol", "hgmd_GENE"),  # CL added
    ("hgmd_rs", "hgmd_RANKSCORE"),
    ("hgmd_PHEN", "hgmd_PHEN"),  # CL added
    ("hgmd_CLASS", "hgmd_CLASS"),  # CL added
)


def getAnnotateInfoRows_2(
    varDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Apply getAnnotateInfoRow_2 to every row of ``varDf``.

    NOTE(JL): this is the old implementation and is not used; it is kept for
    tracing purposes and may be removed.

    Returns:
        DataFrame with one column per key of the per-row dict.
    """

    def f(row):
        return getAnnotateInfoRow_2(
            row,
            genomeRef,
            clinvarGeneDf,
            clinvarAlleleDf,
            omimGeneSortedDf,
            omimAlleleList,
            hgmdHPOScoreDf,
            moduleList,
            decipherSortedDf,
            gnomadMetricsGeneSortedDf,
        )

    return varDf.apply(f, axis=1, result_type='expand')


def getAnnotateInfoRow_3_1(row, genomeRef):
    """Parse one VEP row into a ``Variant`` and return its attributes.

    Args:
        row: pandas Series for one variant.  Column 0 is the variant key
            ("1_1588250_T_A" or "21_11039079_C/A"), column 1 the location
            ("1:10203-10204" or "1:10203"); the remaining values are read by
            column name.
        genomeRef: "hg38" stores the coordinates as hg38Chrom/hg38Pos; any
            other value stores them as hg19Chrom/hg19Pos.

    Returns:
        ``vars(varObj)``: the instance attribute dict of the Variant.
    """
    varObj = Variant()
    transcriptId = row.Feature

    # Positional access goes through .iloc: plain row[0] on a label-indexed
    # Series is deprecated positional indexing in pandas.
    variantKey = row.iloc[0]
    keyParts = variantKey.split("_")
    chrom = keyParts[0]
    pos = int(keyParts[1])
    if variantKey.find("/") != -1:
        # "21_11039079_C/A": alleles share the third field.
        alleles = keyParts[2].split("/")
        ref = alleles[0]
        alt = alleles[1]
    else:
        # "1_1588250_T_A": alleles are the third and fourth fields.
        ref = keyParts[2]
        alt = keyParts[3]

    location = row.iloc[1]
    span = location.split(":")[1]
    if "-" in location:
        bounds = span.split("-")
        start = int(bounds[0])
        stop = int(bounds[1])
    else:
        # Single position: start and stop are the same.
        start = int(span)
        stop = int(span)

    # Map non-numeric chromosome names onto integers.
    if chrom == "X":
        chrom = 23
    elif chrom == "Y":
        chrom = 24
    elif chrom == "MT":
        chrom = 25
    elif re.search(r"GL", chrom):
        chrom = 26
    chrom = int(chrom)

    # CL 03-14-2023: separate databases exist for hg19 and hg38, so the
    # coordinates are stored as given and no LiftOver is performed.
    if genomeRef == "hg38":
        varObj.hg38Chrom = chrom
        varObj.hg38Pos = pos
    else:
        varObj.hg19Chrom = chrom
        varObj.hg19Pos = pos
    varObj.chrom = chrom
    varObj.pos = pos
    varObj.start = start
    varObj.stop = stop

    for attrName, column in _VEP_FIELDS_HEAD:
        setattr(varObj, attrName, getattr(row, column))
    varObj.ref = ref
    varObj.alt = alt
    varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
    varObj.varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
    if "ZYG" in row:
        varObj.zyg = row.ZYG
    for attrName, column in _VEP_FIELDS_TAIL:
        setattr(varObj, attrName, getattr(row, column))

    if row.clinvar_CLNSIGCONF != "-":
        # Entries look like "Pathogenic(12)" and are separated by "|_".
        clin_dict = dict()
        for ro in row.clinvar_CLNSIGCONF.split("|_"):
            temp = ro.split("(")
            # Parse the whole count before ")", not only its first digit.
            clin_dict[temp[0]] = int(temp[1].split(")")[0])
        PLP_sum = clin_dict.get("Pathogenic", 0) + clin_dict.get(
            "Likely_pathogenic", 0
        )
        varObj.clin_dict = clin_dict
        varObj.clin_PLP = PLP_sum
        varObj.clin_PLP_perc = PLP_sum / sum(clin_dict.values())
    else:
        # NOTE(review): this reads the row's `clinvar_clnsig` column, whereas
        # varObj.clinvar_clnsig above is filled from `clinvar_CLNSIG` --
        # confirm that both columns exist in the input.
        if "benign" in row.clinvar_clnsig.lower():
            varObj.clin_PLP_perc = 0
        elif "pathogenic" in row.clinvar_clnsig.lower():
            varObj.clin_PLP_perc = 1
        else:
            varObj.clin_PLP_perc = "-"
        varObj.clin_PLP = "-"
        varObj.clin_dict = "-"

    if row.SpliceAI_pred != "-":
        varObj.spliceAI = row.SpliceAI_pred
        temp = row.SpliceAI_pred.split("|")
        # Fields 1..4 of the "|"-separated prediction are the numeric scores.
        varObj.spliceAImax = max(float(temp[i]) for i in range(1, 5))
    else:
        varObj.spliceAI = "-"
        varObj.spliceAImax = "-"

    return vars(varObj)


def getAnnotateInfoRow_3_2(
    varObj,
    decipherSortedDf,
):
    """Look up the variant's (chrom, start, stop) in the DECIPHER table.

    Args:
        varObj: object exposing ``chrom``, ``start`` and ``stop``.
        decipherSortedDf: DECIPHER table addressed by (chrom, start, stop).

    Returns:
        dict with decipherDictList, decipherDeletionObsList,
        decipherStudyList and decipherVarFound.
    """
    decipherDeletionObsList = []
    decipherVarFound = 0
    decipherKey = (int(varObj.chrom), int(varObj.start), int(varObj.stop))
    # NOTE(review): if decipherSortedDf is a pandas DataFrame, `in` tests its
    # column labels, not the row index (the gnomAD lookup uses `.index`) --
    # confirm this membership test is intended.
    if decipherKey in decipherSortedDf:
        vals = decipherSortedDf.loc[decipherKey]
        decipherVarFound = 1
        decipherDeletionObsList.append(vals.iloc[0]["deletion.obs"])
    return {
        "decipherDictList": [],
        "decipherDeletionObsList": decipherDeletionObsList,
        "decipherStudyList": [],
        "decipherVarFound": decipherVarFound,
    }


def getAnnotateInfoRow_3_3(
    varObj,
    gnomadMetricsGeneSortedDf,
):
    """Fetch gnomAD gene-level metrics for ``varObj.geneSymbol``.

    Returns:
        dict with gnomadGeneZscore (mis_z), gnomadGenePLI (pLI),
        gnomadGeneOELof (oe_lof) and gnomadGeneOELofUpper (oe_lof_upper);
        every value is "-" when the gene is not in the table's index.
    """
    if varObj.geneSymbol in gnomadMetricsGeneSortedDf.index:
        val = gnomadMetricsGeneSortedDf.loc[varObj.geneSymbol]
        return {
            "gnomadGeneZscore": val["mis_z"],
            "gnomadGenePLI": val["pLI"],
            "gnomadGeneOELof": val["oe_lof"],  # O/E lof
            "gnomadGeneOELofUpper": val["oe_lof_upper"],  # O/E lof upper
        }
    return {
        "gnomadGeneZscore": "-",
        "gnomadGenePLI": "-",
        "gnomadGeneOELof": "-",
        "gnomadGeneOELofUpper": "-",
    }


def getAnnotateInfoRow_3_4(
    varObj,
    omimGeneSortedDf,
):
    """Collect OMIM gene, variant and phenotype information for a variant.

    Args:
        varObj: object exposing ``rsId`` (comma-separated IDs) and
            ``geneSymbol``.
        omimGeneSortedDf: OMIM table indexed by gene symbol; each entry is
            read through its "allelicVariants" and "phenotypes" fields.

    Returns:
        dict with omimVarFound, omimGeneFound, omimDict, omimGeneDict,
        omimAlleleDict, phenoList, phenoInhList and phenoMimList.
    """
    # Always split the rsID field: a single ID must become a one-element list,
    # otherwise set() below would split it into characters and it could never
    # match an OMIM dbSNP id.
    inputSnpList = varObj.rsId.split(",")
    varFound = 0
    geneFound = 0
    omimGeneDict = {}
    phenoList = []
    phenoInhList = []
    phenoMimList = []
    if varObj.geneSymbol in omimGeneSortedDf.index:
        geneFound = 1
        omimGeneDict = omimGeneSortedDf.loc[varObj.geneSymbol]
        snpList = [
            a["dbSnps"] for a in omimGeneDict["allelicVariants"] if "dbSnps" in a
        ]
        varFound = 1 if set(inputSnpList).intersection(snpList) else 0
        for a in omimGeneDict["phenotypes"]:
            phenoMim = (
                a["phenotypeMimNumber"] if "phenotypeMimNumber" in a else "-"
            )
            phenoInh = (
                a["phenotypeInheritance"] if "phenotypeInheritance" in a else "-"
            )
            phenoList.append(a["phenotype"])
            phenoInhList.append(phenoInh)
            phenoMimList.append(str(phenoMim))
    return {
        "omimVarFound": varFound,
        "omimGeneFound": geneFound,
        "omimDict": {},
        "omimGeneDict": omimGeneDict,
        "omimAlleleDict": {},
        "phenoList": phenoList,
        "phenoInhList": phenoInhList,
        "phenoMimList": phenoMimList,
    }


def getAnnotateInfoRow_3_5(
    varObj,
    clinvarGeneDf,
    clinvarAlleleDf,
):
    """Run the ClinVar flat-file lookup and name its positional results.

    ``clinvarSignDesc`` is taken from ``varObj.clinvar_clnsig`` (the
    clinvar.vcf.gz annotation) instead of element 10 of the lookup result.
    """
    clinVarRet = getClinVarUsingMarrvelFlatFile(
        varObj, clinvarAlleleDf, clinvarGeneDf
    )
    return {
        "clinVarVarFound": clinVarRet[0],
        "clinVarVarDict": clinVarRet[1],
        "clinVarGeneFound": clinVarRet[2],
        "clinVarGeneDict": clinVarRet[3],
        "clinvarTotalNumVars": clinVarRet[4],
        "clinvarNumP": clinVarRet[5],
        "clinvarNumLP": clinVarRet[6],
        "clinvarNumLB": clinVarRet[7],
        "clinvarNumB": clinVarRet[8],
        "clinvarTitle": clinVarRet[9],
        # Read directly instead of overwriting clinVarRet[10] in place.
        "clinvarSignDesc": varObj.clinvar_clnsig,
        "clinvarCondition": clinVarRet[11],
    }


def getAnnotateInfoRow_3_6(
    varObj,
    hgmdHPOScoreDf,
):
    """Run the HGMD flat-file lookup and name its positional results."""
    hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
    return {
        "hgmdVarFound": hgmdRet[0],
        "hgmdGeneFound": hgmdRet[1],
        "hgmdVarPhenIdList": hgmdRet[2],
        "hgmdVarHPOIdList": hgmdRet[3],
        "hgmdVarHPOStrList": hgmdRet[4],
    }


def getAnnotateInfoRows_3(
    vepDf,
    genomeRef,
    clinvarGeneDf,
    clinvarAlleleDf,
    omimGeneSortedDf,
    omimAlleleList,
    hgmdHPOScoreDf,
    moduleList,
    decipherSortedDf,
    gnomadMetricsGeneSortedDf,
):
    """Annotate every row of ``vepDf`` in stages.

    Stage 1 parses each VEP row (getAnnotateInfoRow_3_1).  Each later stage
    is applied row-wise to the growing result and its columns are written
    back; a stage whose module is not in ``moduleList`` returns the row
    unchanged.  ``omimAlleleList`` is unused and kept for interface
    compatibility.

    Returns:
        DataFrame with the parsed columns plus the columns of every enabled
        stage.
    """

    def makeStage(module, annotate, *resources):
        # Bind the stage's arguments now so each closure keeps its own values.
        def stage(row):
            if module not in moduleList:
                return row
            return annotate(row, *resources)

        return stage

    stages = (
        # NOTE(review): getAnnotateInfoRow_2 runs the DECIPHER lookup under
        # "conserve"; here it is gated on "curate" -- confirm which is meant.
        makeStage("curate", getAnnotateInfoRow_3_2, decipherSortedDf),
        makeStage("conserve", getAnnotateInfoRow_3_3, gnomadMetricsGeneSortedDf),
        makeStage("curate", getAnnotateInfoRow_3_4, omimGeneSortedDf),
        makeStage(
            "curate", getAnnotateInfoRow_3_5, clinvarGeneDf, clinvarAlleleDf
        ),
        makeStage("curate", getAnnotateInfoRow_3_6, hgmdHPOScoreDf),
    )

    annotateInfoDf = vepDf.apply(
        lambda row: getAnnotateInfoRow_3_1(row, genomeRef),
        axis=1,
        result_type='expand',
    )
    for stage in stages:
        df = annotateInfoDf.apply(stage, axis=1, result_type='expand')
        annotateInfoDf[df.columns] = df
    return annotateInfoDf
Unterschied finden