begininng of parseReportFile function to get information of validation in diann log file

b3944989 · thierry · a4126644 · b3944989
Commit b3944989 authored 2 weeks ago by thierry
--- a/src/python/gp-read-diann.py
+++ b/src/python/gp-read-diann.py
@@ -2,95 +2,143 @@ import re
 from Bio import SeqIO
 from Bio.SeqUtils.ProtParam import ProteinAnalysis

-prot_descr = {}
+import argparse
+import sys
+import os

-with open("/gorgone/pappso/moulon/database/uniprotkb_S_cerevisiae_S288C_20241125.fasta") as handle:
-	for record in SeqIO.parse(handle, "fasta"):
-		print(record.id)
-		record_id = re.sub("^[a-zA-Z]+\|", "", record.id)
-		print(record_id)
-		record_id = re.sub("\|[a-zA-Z0-9]*_[a-zA-Z0-9]*", "", record_id)
-		print(record_id)
-		prot_descr[record_id] = record.id + " " + record.description
-with open("/gorgone/pappso/moulon/database/contaminants_standarts.fasta") as handle:
-    for record in SeqIO.parse(handle, "fasta"):
-        prot_descr[record.id] = record.id + " " + record.description
+def parseReportLog(reportlogFile):
+	if os.path.exists(reportlogFile):
+		inputLog = open(reportlogFile, "r")
+		fastafile = []
+		inputfile  =[]
+		for line in inputLog:
+			# print(line)
+			if "--fasta" in line:
+				print(line)
+				if match := re.search("(--f\ .*mzML|mzXML)", line):
+					matchpart = match.group(1).split("--f")
+					inputfile = [x.strip() for x in matchpart if x != ""]

+					print(inputfile)


-uniprot = {}
-uniprotfile = open("uniprot.csv", "r")
-for line in uniprotfile:
-	if "record_id" not in line:
-		line = line.replace('"', "")
-		tabline = line.strip().split("\t")
-		id = f"(UniMod:{tabline[0]})"
-		uniprot[id] = {"id":tabline[0], "title":tabline[1], "full_name":tabline[2], "mono_isotopique":tabline[3]}
-uniprotfile.close()

-pep_ids = open("pep_ids.tsv", "w")
-pep_ids.write("Modified.sequence\tSequence\tmods\n")

-file = open("report.tsv", "r")
-samples = {}
-for line in file:
-	if "File.Name" not in line:
-		tabline = line.split("\t")
-		sample = tabline[1]
-		previous_match = 0
-		mods = []
-		mods_string = []
-		for match in re.finditer(r"(\(UniMod\:[0-9]*\))", tabline[13]):
-			#print(f"{match.group()} : {match.span()[0]} {tabline[13]} {tabline[13][match.span()[0]-1]}, {match.span()[1]-match.span()[0]}")
-			amino_acid = tabline[13][match.span()[0]-1]
-			amino_acid_number = match.span()[0]-previous_match
-			#print(match.group())
-			#print(f'<mod aa="{amino_acid}" pos="{amino_acid_number}" mod="{uniprot[match.group()]["mono_isotopique"]}" />')
-			if match.group() == "(UniMod:4)":
-				mono_isotopique=57.0214614868
-			elif match.group() == "(UniMod:35)":
-				mono_isotopique=15.9949102402
-			elif match.group() == "(UniMod:1)":
-				mono_isotopique=42.0105667114
-			mods.append({"amino_acid":amino_acid, "amino_acid_number":amino_acid_number, "mono_isotopique":mono_isotopique})
-			previous_match += match.span()[1]-match.span()[0]
-			mods_string.append(f"{amino_acid}{amino_acid_number}:{mono_isotopique}")
-		pep_ids.write(f'{tabline[13]}\t{tabline[14]}\t{" ".join(mods_string)}\n')
-		if sample in samples.keys():
-			samples[sample].append({"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods})
-		else:
-			samples[sample] = [{"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods}]
-		
-pep_ids.close()

-outputfile = open("outputfile.xml", "w")		
-outputfile.write('<?xml version="1.0" encoding="utf-8" ?>\n')
-outputfile.write('<peptide_result>\n')
-outputfile.write('<filter evalue="0.01" />\n')
-for sample in samples:
-	outputfile.write(f'<sample name="{sample}" file="{sample}">\n')
-	scanid = 0
-	modlines = ""
-	for peptiz in samples[sample]:
-		scanid = scanid+1
-	#	print(f"scan {scanid} : {peptiz}")
-		current_pep = peptiz
-		prot = current_pep["proteins.Ids"].split(";")
-		pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
-		if len(current_pep['mods']) != 0:
-			for mod in current_pep['mods']:
-				pep_mass += float(mod["mono_isotopique"])
-		outputfile.write(f"<scan num=\"{scanid}\" z=\"{current_pep['charge']}\" mhObs=\"{pep_mass}\">\n")
-		for i in range(len(prot)):
-			if len(current_pep['mods']) == 0:
-				pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
-				outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\"></psm>\n")
-			else:
-				outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\">\n")
-				for mod in current_pep['mods']:
-					outputfile.write(f'<mod aa="{mod["amino_acid"]}" pos="{mod["amino_acid_number"]}" mod="{mod["mono_isotopique"]}" />\n')
-				outputfile.write("</psm>\n")
-		outputfile.write("</scan>\n")
-	outputfile.write("</sample>\n")
-outputfile.write("</peptide_result>\n")
-outputfile.close()
+# #Defined command line
+# desc = "Process Diann result to peptide result. \
+#     The result can then be grouped by gp-grouping program."
+# command = argparse.ArgumentParser(prog='gp-read-diann', \
+#     description=desc, usage='%(prog)s [options] files')
+# command.add_argument('-q', '--Qvalue', default=0.01, type=float, \
+#     help='Minimum peptide qvalue threshold (default:0.01)')
+# command.add.argument('-p', '--PG.Qvalue', default=0.01, type=float, \
+#     help='Minimum protein group qvalue threshold (default:0.01)')
+# command.add_argument('-o', '--outfile', nargs="?", \
+#     type=argparse.FileType("w"), default=sys.stdout, \
+#     help='Save peptide result default:STDOUT')
+# command.add_argument('-f', '--fasta', default="", type=str, \
+# 	help='The Fasta file which is used for DIA-NN processing for multiple')
+# command.add_argument('-c', '--contaminant_fasta', default="", type=str, \
+# 	help='The Contaminant Fasta file which is used for DIA-NN processing')
+# command.add_argument('files', metavar='files', nargs='+', \
+#     help='List of X!Tandem files to process')
+# command.add_argument('-v', '--version', action='version', \
+#     version='%(prog)s ${GP_VERSION}')
+#
+#
+#
+# prot_descr = {}
+#
+# with open("/gorgone/pappso/moulon/database/uniprotkb_S_cerevisiae_S288C_20241125.fasta") as handle:
+# 	for record in SeqIO.parse(handle, "fasta"):
+# 		print(record.id)
+# 		record_id = re.sub("^[a-zA-Z]+\|", "", record.id)
+# 		print(record_id)
+# 		record_id = re.sub("\|[a-zA-Z0-9]*_[a-zA-Z0-9]*", "", record_id)
+# 		print(record_id)
+# 		prot_descr[record_id] = record.id + " " + record.description
+# with open("/gorgone/pappso/moulon/database/contaminants_standarts.fasta") as handle:
+#     for record in SeqIO.parse(handle, "fasta"):
+#         prot_descr[record.id] = record.id + " " + record.description
+#
+#
+#
+# uniprot = {}
+# uniprotfile = open("uniprot.csv", "r")
+# for line in uniprotfile:
+# 	if "record_id" not in line:
+# 		line = line.replace('"', "")
+# 		tabline = line.strip().split("\t")
+# 		id = f"(UniMod:{tabline[0]})"
+# 		uniprot[id] = {"id":tabline[0], "title":tabline[1], "full_name":tabline[2], "mono_isotopique":tabline[3]}
+# uniprotfile.close()
+#
+# pep_ids = open("pep_ids.tsv", "w")
+# pep_ids.write("Modified.sequence\tSequence\tmods\n")
+#
+# file = open("report.tsv", "r")
+# samples = {}
+# for line in file:
+# 	if "File.Name" not in line:
+# 		tabline = line.split("\t")
+# 		sample = tabline[1]
+# 		previous_match = 0
+# 		mods = []
+# 		mods_string = []
+# 		for match in re.finditer(r"(\(UniMod\:[0-9]*\))", tabline[13]):
+# 			#print(f"{match.group()} : {match.span()[0]} {tabline[13]} {tabline[13][match.span()[0]-1]}, {match.span()[1]-match.span()[0]}")
+# 			amino_acid = tabline[13][match.span()[0]-1]
+# 			amino_acid_number = match.span()[0]-previous_match
+# 			#print(match.group())
+# 			#print(f'<mod aa="{amino_acid}" pos="{amino_acid_number}" mod="{uniprot[match.group()]["mono_isotopique"]}" />')
+# 			if match.group() == "(UniMod:4)":
+# 				mono_isotopique=57.0214614868
+# 			elif match.group() == "(UniMod:35)":
+# 				mono_isotopique=15.9949102402
+# 			elif match.group() == "(UniMod:1)":
+# 				mono_isotopique=42.0105667114
+# 			mods.append({"amino_acid":amino_acid, "amino_acid_number":amino_acid_number, "mono_isotopique":mono_isotopique})
+# 			previous_match += match.span()[1]-match.span()[0]
+# 			mods_string.append(f"{amino_acid}{amino_acid_number}:{mono_isotopique}")
+# 		pep_ids.write(f'{tabline[13]}\t{tabline[14]}\t{" ".join(mods_string)}\n')
+# 		if sample in samples.keys():
+# 			samples[sample].append({"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods})
+# 		else:
+# 			samples[sample] = [{"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods}]
+#
+# pep_ids.close()
+#
+# outputfile = open("outputfile.xml", "w")
+# outputfile.write('<?xml version="1.0" encoding="utf-8" ?>\n')
+# outputfile.write('<peptide_result>\n')
+# outputfile.write('<filter evalue="0.01" />\n')
+# for sample in samples:
+# 	outputfile.write(f'<sample name="{sample}" file="{sample}">\n')
+# 	scanid = 0
+# 	modlines = ""
+# 	for peptiz in samples[sample]:
+# 		scanid = scanid+1
+# 	#	print(f"scan {scanid} : {peptiz}")
+# 		current_pep = peptiz
+# 		prot = current_pep["proteins.Ids"].split(";")
+# 		pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
+# 		if len(current_pep['mods']) != 0:
+# 			for mod in current_pep['mods']:
+# 				pep_mass += float(mod["mono_isotopique"])
+# 		outputfile.write(f"<scan num=\"{scanid}\" z=\"{current_pep['charge']}\" mhObs=\"{pep_mass}\">\n")
+# 		for i in range(len(prot)):
+# 			if len(current_pep['mods']) == 0:
+# 				pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
+# 				outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\"></psm>\n")
+# 			else:
+# 				outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\">\n")
+# 				for mod in current_pep['mods']:
+# 					outputfile.write(f'<mod aa="{mod["amino_acid"]}" pos="{mod["amino_acid_number"]}" mod="{mod["mono_isotopique"]}" />\n')
+# 				outputfile.write("</psm>\n")
+# 		outputfile.write("</scan>\n")
+# 	outputfile.write("</sample>\n")
+# outputfile.write("</peptide_result>\n")
+# outputfile.close()
+
+parseReportLog("/gorgone/pappso/moulon/users/thierry/20240329_test_astral/diann/col-O_DIA_180SPD/report.log.txt")