Skip to content
Snippets Groups Projects
Commit b3944989 authored by thierry's avatar thierry
Browse files

begininng of parseReportFile function to get information of validation in diann log file

parent a4126644
Branches main
No related tags found
No related merge requests found
......@@ -2,95 +2,143 @@ import re
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
prot_descr = {}
import argparse
import sys
import os
with open("/gorgone/pappso/moulon/database/uniprotkb_S_cerevisiae_S288C_20241125.fasta") as handle:
for record in SeqIO.parse(handle, "fasta"):
print(record.id)
record_id = re.sub("^[a-zA-Z]+\|", "", record.id)
print(record_id)
record_id = re.sub("\|[a-zA-Z0-9]*_[a-zA-Z0-9]*", "", record_id)
print(record_id)
prot_descr[record_id] = record.id + " " + record.description
with open("/gorgone/pappso/moulon/database/contaminants_standarts.fasta") as handle:
for record in SeqIO.parse(handle, "fasta"):
prot_descr[record.id] = record.id + " " + record.description
def parseReportLog(reportlogFile):
if os.path.exists(reportlogFile):
inputLog = open(reportlogFile, "r")
fastafile = []
inputfile =[]
for line in inputLog:
# print(line)
if "--fasta" in line:
print(line)
if match := re.search("(--f\ .*mzML|mzXML)", line):
matchpart = match.group(1).split("--f")
inputfile = [x.strip() for x in matchpart if x != ""]
print(inputfile)
uniprot = {}
uniprotfile = open("uniprot.csv", "r")
for line in uniprotfile:
if "record_id" not in line:
line = line.replace('"', "")
tabline = line.strip().split("\t")
id = f"(UniMod:{tabline[0]})"
uniprot[id] = {"id":tabline[0], "title":tabline[1], "full_name":tabline[2], "mono_isotopique":tabline[3]}
uniprotfile.close()
pep_ids = open("pep_ids.tsv", "w")
pep_ids.write("Modified.sequence\tSequence\tmods\n")
file = open("report.tsv", "r")
samples = {}
for line in file:
if "File.Name" not in line:
tabline = line.split("\t")
sample = tabline[1]
previous_match = 0
mods = []
mods_string = []
for match in re.finditer(r"(\(UniMod\:[0-9]*\))", tabline[13]):
#print(f"{match.group()} : {match.span()[0]} {tabline[13]} {tabline[13][match.span()[0]-1]}, {match.span()[1]-match.span()[0]}")
amino_acid = tabline[13][match.span()[0]-1]
amino_acid_number = match.span()[0]-previous_match
#print(match.group())
#print(f'<mod aa="{amino_acid}" pos="{amino_acid_number}" mod="{uniprot[match.group()]["mono_isotopique"]}" />')
if match.group() == "(UniMod:4)":
mono_isotopique=57.0214614868
elif match.group() == "(UniMod:35)":
mono_isotopique=15.9949102402
elif match.group() == "(UniMod:1)":
mono_isotopique=42.0105667114
mods.append({"amino_acid":amino_acid, "amino_acid_number":amino_acid_number, "mono_isotopique":mono_isotopique})
previous_match += match.span()[1]-match.span()[0]
mods_string.append(f"{amino_acid}{amino_acid_number}:{mono_isotopique}")
pep_ids.write(f'{tabline[13]}\t{tabline[14]}\t{" ".join(mods_string)}\n')
if sample in samples.keys():
samples[sample].append({"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods})
else:
samples[sample] = [{"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods}]
pep_ids.close()
outputfile = open("outputfile.xml", "w")
outputfile.write('<?xml version="1.0" encoding="utf-8" ?>\n')
outputfile.write('<peptide_result>\n')
outputfile.write('<filter evalue="0.01" />\n')
for sample in samples:
outputfile.write(f'<sample name="{sample}" file="{sample}">\n')
scanid = 0
modlines = ""
for peptiz in samples[sample]:
scanid = scanid+1
# print(f"scan {scanid} : {peptiz}")
current_pep = peptiz
prot = current_pep["proteins.Ids"].split(";")
pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
if len(current_pep['mods']) != 0:
for mod in current_pep['mods']:
pep_mass += float(mod["mono_isotopique"])
outputfile.write(f"<scan num=\"{scanid}\" z=\"{current_pep['charge']}\" mhObs=\"{pep_mass}\">\n")
for i in range(len(prot)):
if len(current_pep['mods']) == 0:
pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\"></psm>\n")
else:
outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\">\n")
for mod in current_pep['mods']:
outputfile.write(f'<mod aa="{mod["amino_acid"]}" pos="{mod["amino_acid_number"]}" mod="{mod["mono_isotopique"]}" />\n')
outputfile.write("</psm>\n")
outputfile.write("</scan>\n")
outputfile.write("</sample>\n")
outputfile.write("</peptide_result>\n")
outputfile.close()
# #Defined command line
# desc = "Process Diann result to peptide result. \
# The result can then be grouped by gp-grouping program."
# command = argparse.ArgumentParser(prog='gp-read-diann', \
# description=desc, usage='%(prog)s [options] files')
# command.add_argument('-q', '--Qvalue', default=0.01, type=float, \
# help='Minimum peptide qvalue threshold (default:0.01)')
# command.add.argument('-p', '--PG.Qvalue', default=0.01, type=float, \
# help='Minimum protein group qvalue threshold (default:0.01)')
# command.add_argument('-o', '--outfile', nargs="?", \
# type=argparse.FileType("w"), default=sys.stdout, \
# help='Save peptide result default:STDOUT')
# command.add_argument('-f', '--fasta', default="", type=str, \
# help='The Fasta file which is used for DIA-NN processing for multiple')
# command.add_argument('-c', '--contaminant_fasta', default="", type=str, \
# help='The Contaminant Fasta file which is used for DIA-NN processing')
# command.add_argument('files', metavar='files', nargs='+', \
# help='List of X!Tandem files to process')
# command.add_argument('-v', '--version', action='version', \
# version='%(prog)s ${GP_VERSION}')
#
#
#
# prot_descr = {}
#
# with open("/gorgone/pappso/moulon/database/uniprotkb_S_cerevisiae_S288C_20241125.fasta") as handle:
# for record in SeqIO.parse(handle, "fasta"):
# print(record.id)
# record_id = re.sub("^[a-zA-Z]+\|", "", record.id)
# print(record_id)
# record_id = re.sub("\|[a-zA-Z0-9]*_[a-zA-Z0-9]*", "", record_id)
# print(record_id)
# prot_descr[record_id] = record.id + " " + record.description
# with open("/gorgone/pappso/moulon/database/contaminants_standarts.fasta") as handle:
# for record in SeqIO.parse(handle, "fasta"):
# prot_descr[record.id] = record.id + " " + record.description
#
#
#
# uniprot = {}
# uniprotfile = open("uniprot.csv", "r")
# for line in uniprotfile:
# if "record_id" not in line:
# line = line.replace('"', "")
# tabline = line.strip().split("\t")
# id = f"(UniMod:{tabline[0]})"
# uniprot[id] = {"id":tabline[0], "title":tabline[1], "full_name":tabline[2], "mono_isotopique":tabline[3]}
# uniprotfile.close()
#
# pep_ids = open("pep_ids.tsv", "w")
# pep_ids.write("Modified.sequence\tSequence\tmods\n")
#
# file = open("report.tsv", "r")
# samples = {}
# for line in file:
# if "File.Name" not in line:
# tabline = line.split("\t")
# sample = tabline[1]
# previous_match = 0
# mods = []
# mods_string = []
# for match in re.finditer(r"(\(UniMod\:[0-9]*\))", tabline[13]):
# #print(f"{match.group()} : {match.span()[0]} {tabline[13]} {tabline[13][match.span()[0]-1]}, {match.span()[1]-match.span()[0]}")
# amino_acid = tabline[13][match.span()[0]-1]
# amino_acid_number = match.span()[0]-previous_match
# #print(match.group())
# #print(f'<mod aa="{amino_acid}" pos="{amino_acid_number}" mod="{uniprot[match.group()]["mono_isotopique"]}" />')
# if match.group() == "(UniMod:4)":
# mono_isotopique=57.0214614868
# elif match.group() == "(UniMod:35)":
# mono_isotopique=15.9949102402
# elif match.group() == "(UniMod:1)":
# mono_isotopique=42.0105667114
# mods.append({"amino_acid":amino_acid, "amino_acid_number":amino_acid_number, "mono_isotopique":mono_isotopique})
# previous_match += match.span()[1]-match.span()[0]
# mods_string.append(f"{amino_acid}{amino_acid_number}:{mono_isotopique}")
# pep_ids.write(f'{tabline[13]}\t{tabline[14]}\t{" ".join(mods_string)}\n')
# if sample in samples.keys():
# samples[sample].append({"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods})
# else:
# samples[sample] = [{"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods}]
#
# pep_ids.close()
#
# outputfile = open("outputfile.xml", "w")
# outputfile.write('<?xml version="1.0" encoding="utf-8" ?>\n')
# outputfile.write('<peptide_result>\n')
# outputfile.write('<filter evalue="0.01" />\n')
# for sample in samples:
# outputfile.write(f'<sample name="{sample}" file="{sample}">\n')
# scanid = 0
# modlines = ""
# for peptiz in samples[sample]:
# scanid = scanid+1
# # print(f"scan {scanid} : {peptiz}")
# current_pep = peptiz
# prot = current_pep["proteins.Ids"].split(";")
# pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
# if len(current_pep['mods']) != 0:
# for mod in current_pep['mods']:
# pep_mass += float(mod["mono_isotopique"])
# outputfile.write(f"<scan num=\"{scanid}\" z=\"{current_pep['charge']}\" mhObs=\"{pep_mass}\">\n")
# for i in range(len(prot)):
# if len(current_pep['mods']) == 0:
# pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
# outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\"></psm>\n")
# else:
# outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\">\n")
# for mod in current_pep['mods']:
# outputfile.write(f'<mod aa="{mod["amino_acid"]}" pos="{mod["amino_acid_number"]}" mod="{mod["mono_isotopique"]}" />\n')
# outputfile.write("</psm>\n")
# outputfile.write("</scan>\n")
# outputfile.write("</sample>\n")
# outputfile.write("</peptide_result>\n")
# outputfile.close()
parseReportLog("/gorgone/pappso/moulon/users/thierry/20240329_test_astral/diann/col-O_DIA_180SPD/report.log.txt")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment