From 53f7d94046b04a4e9509abbf57043ad91a3b8ff7 Mon Sep 17 00:00:00 2001 From: Xiaolong Cao Date: Wed, 16 Apr 2025 15:36:42 +0800 Subject: [PATCH 1/2] percolator support gzipped file --- psm_utils/io/percolator.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py index 045d09c..5a288b6 100644 --- a/psm_utils/io/percolator.py +++ b/psm_utils/io/percolator.py @@ -16,6 +16,7 @@ from __future__ import annotations import csv +import gzip import logging import re from pathlib import Path @@ -118,8 +119,12 @@ def __iter__(self) -> Iterable[PSM]: @staticmethod def _read_header(filename): - with open(filename, "rt") as f: - fieldnames = f.readline().strip().lower().split("\t") + if str(filename).endswith(".gz"): + with gzip.open(filename, "rt") as f: + fieldnames = f.readline().strip().lower().split("\t") + else: + with open(filename, "rt") as f: + fieldnames = f.readline().strip().lower().split("\t") return fieldnames @staticmethod @@ -367,7 +372,12 @@ def _parse_existing_file( ) -> Tuple[List[str], Optional[int]]: """Parse existing Percolator Tab file to determine fieldnames and last ScanNr.""" # Get fieldnames - with open(filename, "rt") as open_file: + if str(filename).endswith(".gz"): + open_func = gzip.open + else: + open_func = open + + with open_func(filename, "rt") as open_file: for line in open_file: fieldnames = line.strip().split("\t") break @@ -382,7 +392,7 @@ def _parse_existing_file( # Get last ScanNr last_scannr = None - with open(filename, "rt") as open_file: + with open_func(filename, "rt") as open_file: # Read last line open_file.seek(0) last_line = None @@ -409,7 +419,11 @@ def _parse_existing_file( class _PercolatorTabIO: def __init__(self, *args, protein_separator="|||", **kwargs) -> None: """File reader and writer for Percolator Tab files with fixed Proteins tab.""" - self._open_file = open(*args, **kwargs) + filename = args[0] + if str(filename).endswith(".gz"): + self._open_file = gzip.open(*args, **kwargs) + else: + self._open_file = open(*args, **kwargs) self.protein_separator = protein_separator def __enter__(self, *args, **kwargs) -> _PercolatorTabIO: From 1e6e1522bce7d32d51699951103f671bbaa7097b Mon Sep 17 00:00:00 2001 From: Xiaolong Cao Date: Wed, 16 Apr 2025 16:43:36 +0800 Subject: [PATCH 2/2] handle percolator pin file generated by comet --- psm_utils/io/percolator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py index 5a288b6..e70e064 100644 --- a/psm_utils/io/percolator.py +++ b/psm_utils/io/percolator.py @@ -149,9 +149,15 @@ def _infer_charge_columns(fieldnames): @staticmethod def _parse_peptidoform(percolator_peptide, charge): """Parse Percolator TSV peptide notation to Peptidoform.""" - # Remove leading and trailing amino acids + # Remove leading and trailing amino acids (e.g., R.PEPTIDE.S -> PEPTIDE) match = re.match(r"^(?:[A-Z-])?\.(.+)\.(?:[A-Z-])?$", percolator_peptide) peptidoform = match[1] if match else percolator_peptide + # Handle Comet's n-terminal modification format: n[42.0106]PEPTIDE + peptidoform = re.sub(r'^n\[([+-]?[\w\.]*?)\]', r'[\1]-', peptidoform) + + # Ensure positive values inside square brackets have a '+' sign + peptidoform = re.sub(r'\[(\d+[\.]*\d*)]', r'[+\1]', peptidoform) + if charge: peptidoform += f"/{charge}" return Peptidoform(peptidoform)