Skip to content

percolator support gzipped file #121

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 26 additions & 6 deletions psm_utils/io/percolator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from __future__ import annotations

import csv
import gzip
import logging
import re
from pathlib import Path
Expand Down Expand Up @@ -118,8 +119,12 @@ def __iter__(self) -> Iterable[PSM]:

@staticmethod
def _read_header(filename):
    """Return the column names found on the first line of a tab-separated file.

    The first line is stripped, lower-cased, and split on tab characters.
    Files whose name ends in ``.gz`` are read through ``gzip.open``; all other
    files are read with the builtin ``open``. Both are opened in text mode.

    Parameters
    ----------
    filename
        Path to the file; converted with ``str()`` to inspect its extension.

    Returns
    -------
    list of str
        Lower-cased field names from the header line.

    """
    # Select the opener once, so that a gzipped file is never decoded as plain
    # text before the gzip-aware code path gets a chance to run.
    open_func = gzip.open if str(filename).endswith(".gz") else open
    with open_func(filename, "rt") as f:
        return f.readline().strip().lower().split("\t")

@staticmethod
Expand All @@ -144,9 +149,15 @@ def _infer_charge_columns(fieldnames):
@staticmethod
def _parse_peptidoform(percolator_peptide, charge):
    """Convert a Percolator TSV peptide string into a Peptidoform.

    Flanking residues are stripped, a Comet-style ``n[...]`` prefix is rewritten
    to ``[...]-``, unsigned numeric values in square brackets get a leading
    ``+``, and ``/<charge>`` is appended when ``charge`` is truthy.
    """
    # Strip the flanking amino acids when present (e.g., R.PEPTIDE.S -> PEPTIDE)
    flanked = re.match(r"^(?:[A-Z-])?\.(.+)\.(?:[A-Z-])?$", percolator_peptide)
    sequence = percolator_peptide if not flanked else flanked[1]

    # Comet writes n-terminal modifications as n[42.0106]PEPTIDE
    sequence = re.sub(r'^n\[([+-]?[\w\.]*?)\]', r'[\1]-', sequence)

    # Unsigned numbers inside square brackets receive an explicit '+' sign
    sequence = re.sub(r'\[(\d+[\.]*\d*)]', r'[+\1]', sequence)

    if not charge:
        return Peptidoform(sequence)
    return Peptidoform(f"{sequence}/{charge}")
Expand Down Expand Up @@ -367,7 +378,12 @@ def _parse_existing_file(
) -> Tuple[List[str], Optional[int]]:
"""Parse existing Percolator Tab file to determine fieldnames and last ScanNr."""
# Get fieldnames
with open(filename, "rt") as open_file:
if str(filename).endswith(".gz"):
open_func = gzip.open
else:
open_func = open

with open_func(filename, "rt") as open_file:
for line in open_file:
fieldnames = line.strip().split("\t")
break
Expand All @@ -382,7 +398,7 @@ def _parse_existing_file(

# Get last ScanNr
last_scannr = None
with open(filename, "rt") as open_file:
with open_func(filename, "rt") as open_file:
# Read last line
open_file.seek(0)
last_line = None
Expand All @@ -409,7 +425,11 @@ def _parse_existing_file(
class _PercolatorTabIO:
def __init__(self, *args, protein_separator="|||", **kwargs) -> None:
    """File reader and writer for Percolator Tab files with fixed Proteins tab.

    The file is opened exactly once: with ``gzip.open`` when the first
    positional argument ends in ``.gz``, otherwise with the builtin ``open``.
    Remaining positional and keyword arguments are forwarded to the opener.

    Parameters
    ----------
    *args
        Arguments for the opener; the first one must be the file path.
    protein_separator
        Stored on the instance as ``protein_separator``.
    **kwargs
        Keyword arguments for the opener.

    """
    filename = args[0]
    if str(filename).endswith(".gz"):
        if len(args) > 1:
            mode = args[1]
            extra = args[2:]
        else:
            mode = kwargs.pop("mode", "r")
            extra = ()
        # The builtin open treats a mode without "b" as text, whereas gzip.open
        # treats a mode without "t" as binary; make gzip follow open here.
        if "b" not in mode and "t" not in mode:
            mode += "t"
        self._open_file = gzip.open(filename, mode, *extra, **kwargs)
    else:
        self._open_file = open(*args, **kwargs)
    self.protein_separator = protein_separator

def __enter__(self, *args, **kwargs) -> _PercolatorTabIO:
Expand Down