Source code for pdb_io

"""Read and write CSV files and parse protein structures

Author: Adriaan Lategan
"""

import csv
import gzip
import pathlib
import warnings
from dataclasses import dataclass, field
from typing import Iterator, Generator, Iterable

from Bio import PDB
from Bio.PDB import Structure, Chain, Residue


[docs]def custom_format(message: Warning | str, category: type[Warning], filename: str, lineno: int, *args, **kwargs ) -> str: """ Overwrite the format of Warnings Returns ------- str """ return f'{filename}:{lineno}: {category.__name__}: {message}\n'
# Replace the warnings module's formatter with custom_format for every
# warning formatted after this module has been imported.
warnings.formatwarning = custom_format
@dataclass
class PolypeptideEntry:
    """
    Object representing a contiguous polymer of amino acids, with methods to
    extract amino acid sequences from it

    Attributes
    ----------
    pdb_id : str
        pdb entity ID of the protein structure
    model : int
        pdb model number
    chain : PDB.Chain.Chain
        protein chain entity object
    polypeptide : PDB.Polypeptide.Polypeptide
        a contiguous chain of amino acids from the protein chain entry
    """
    pdb_id: str
    model: int
    chain: PDB.Chain.Chain
    polypeptide: PDB.Polypeptide.Polypeptide

    @property
    def sequence(self) -> str:
        """
        Returns
        -------
        str
            amino acid sequence of the polypeptide instance, exactly as
            returned by ``polypeptide.get_sequence()``
            (NOTE(review): Biopython presumably returns a ``Seq`` object
            here rather than a plain ``str`` -- confirm)
        """
        return self.polypeptide.get_sequence()

    def find_motif(self,
                   motif: str
                   ) -> Generator[list[PDB.Residue.Residue], None, None]:
        """find each instance of the sequence motif in the polypeptide

        Matches are reported left to right and do not overlap: the search
        for the next match resumes at the end of the previous one.

        Parameters
        ----------
        motif : str
            amino acid sequence to search for. An empty motif matches
            nothing.

        Yields
        ------
        list[PDB.Residue.Residue]
            residues matching the motif, one list per match
        """
        # An empty motif is "found" at every position without ever advancing
        # the search offset, which would make this generator yield forever.
        if not motif:
            return
        # The sequence is rebuilt on every property access, so read it once.
        sequence = self.sequence
        motif_length = len(motif)
        start = sequence.find(motif)
        while start != -1:
            end = start + motif_length
            yield self.polypeptide[start:end]
            start = sequence.find(motif, end)
class PdbReader:
    """Parser for protein structure files stored in a single directory"""

    def __init__(self,
                 pdb_directory: str,
                 pdb_type: str = 'cif',
                 gzipped: bool = True
                 ):
        """Object for parsing protein structures in pdb or cif format from a
        specific directory

        Parameters
        ----------
        pdb_directory : str
            a directory containing protein structure files
        pdb_type : str
            the format of the protein structure files. Default "cif"
        gzipped : bool
            true if the protein structure files are compressed with gzip,
            false if uncompressed

        Attributes
        ----------
        directory_queries
        pdb_directory : pathlib.Path
            path to the directory containing protein structure files
        text_handler : Callable
            method for opening text stream
        parser : PDB.MMCIFParser or PDB.PDBParser

        Raises
        ------
        FileNotFoundError
            if ``pdb_directory`` does not exist
        ValueError
            if ``pdb_type`` is neither "cif" nor "pdb"
        """
        self.pdb_directory = pathlib.Path(pdb_directory)
        if not self.pdb_directory.exists():
            error = f"The directory {self.pdb_directory} does not exist."
            raise FileNotFoundError(error)
        self.text_handler = gzip.open if gzipped else open
        if pdb_type == 'cif':
            self.parser = PDB.MMCIFParser(QUIET=True, auth_chains=False)
        elif pdb_type == 'pdb':
            self.parser = PDB.PDBParser(QUIET=True)
        else:
            error = f'Invalid structure format {pdb_type}. The structure ' \
                    f'format should either be "cif" or "pdb"'
            raise ValueError(error)

    def read_file(self,
                  protein_id: str,
                  file_name: str
                  ) -> PDB.Structure.Structure | None:
        """
        Parse a protein structure file and return a biopython PDB structure

        A missing file, or an EOFError, TypeError or ValueError raised while
        parsing, is reported as a warning and results in ``None``. Other
        exceptions, OSError included, are not handled here and propagate to
        the caller.

        Parameters
        ----------
        protein_id : str
            pdb entity ID of the protein structure
        file_name : str
            name of the protein structure file, relative to the reader's
            directory

        Returns
        -------
        PDB.Structure.Structure or None
            Biopython structure entity, or None if the file is missing or
            could not be parsed
        """
        file_path = self.pdb_directory / file_name
        structure = None
        if not file_path.exists():
            warnings.warn(f"The pdb file {file_path} does not exist.")
            return structure
        with self.text_handler(file_path, "rt") as file:
            try:
                structure = self.parser.get_structure(protein_id, file)
            except (EOFError, TypeError, ValueError) as error:
                warnings.warn(f'Encountered error "{error}" while parsing '
                              f'file "{file_path}"')
        return structure

    @property
    def directory_queries(self):
        """read each file in the directory and get the protein chain IDs and
        each protein structure file

        Every regular file in the directory is parsed. The protein ID is the
        part of the file name before its first ".", or the whole name when
        it contains no ".". Files that cannot be parsed, or that raise an
        OSError, are skipped with a warning.

        Returns
        -------
        PdbQueries
            the path to each protein structure file and the IDs of the
            polypeptide chains in that protein
        """
        queries = PdbQueries()
        for file in self.pdb_directory.iterdir():
            try:
                if not file.is_file():
                    continue
                path = file.name
                # partition() tolerates names without a "." where
                # str.index() would raise an unhandled ValueError and abort
                # the whole directory scan.
                protein_id = path.partition(".")[0]
                structure = self.read_file(protein_id, path)
                if not structure:
                    continue
                for chain in structure.get_chains():
                    queries.add_query(protein_id, path, chain.id)
            except OSError as error:
                warnings.warn(f"Encountered OSError {error} for file {file}")
        return queries
@dataclass(slots=True)
class PdbFileQuery:
    """
    Object representing a specific protein structure file and the protein
    chain entities that should be parsed from it

    Iterating over a query iterates over its chain IDs.

    Attributes
    ----------
    protein_id: str
        pdb entity ID of the protein structure
    file_path: str
        path to the protein structure file
    chain_ids: list[str]
        list of protein chain entities to parse
    """
    protein_id: str
    file_path: str
    chain_ids: list[str]

    def __iter__(self):
        return iter(self.chain_ids)

    def add_chain(self, chain_id: str | list[str]):
        """
        Add one chain ID, or several chain IDs, to the query

        Parameters
        ----------
        chain_id : str or list[str]
            a single chain ID, or a list of chain IDs
        """
        # A str must be appended as one item: extending a list with a str
        # adds each character separately, so "AA" would become "A", "A".
        if isinstance(chain_id, str):
            self.chain_ids.append(chain_id)
        else:
            self.chain_ids.extend(chain_id)

    def get_structure(self,
                      pdb_reader: PdbReader
                      ) -> PDB.Structure.Structure | None:
        """
        Parse a protein structure file and return a biopython PDB structure

        Parameters
        ----------
        pdb_reader : PdbReader
            protein structure file parser

        Returns
        -------
        PDB.Structure.Structure or None
            biopython structure entity, or None if the reader returned no
            structure
        """
        return pdb_reader.read_file(self.protein_id, self.file_path)

    def get_polypeptides(self,
                         pdb_reader: PdbReader,
                         builder: PDB.Polypeptide.PPBuilder |
                         PDB.Polypeptide.CaPPBuilder
                         ) -> Generator[PolypeptideEntry, None, None]:
        """
        Identify contiguous chains of amino acids in the protein chain entity

        Every model of the structure is searched for every chain ID of this
        query. A chain that is absent from a model is reported with a warning
        and skipped.

        Parameters
        ----------
        pdb_reader : PdbReader
            protein structure file parser
        builder : PDB.Polypeptide.PPBuilder or PDB.Polypeptide.CaPPBuilder
            polypeptide constructor

        Yields
        ------
        PolypeptideEntry
            contiguous chains of amino acids from the protein chain entry
        """
        structure = self.get_structure(pdb_reader)
        if not structure:
            return
        for model in structure:
            for chain_id in self.chain_ids:
                if chain_id not in model:
                    warnings.warn(f'Model {model.id} of structure '
                                  f'{structure.id} '
                                  f'does not contain Chain {chain_id}.'
                                  )
                    continue
                chain = model[chain_id]
                for polypeptide in builder.build_peptides(chain):
                    yield PolypeptideEntry(structure.id,
                                           model.id,
                                           chain,
                                           polypeptide
                                           )
@dataclass(slots=True)
class PdbQueries:
    """
    List with unique protein structure file query entries

    Supports ``protein_id in queries``, ``queries[protein_id]`` and iteration
    over the stored PdbFileQuery objects. Lookups are linear scans of
    ``query_list``.

    Attributes
    ----------
    query_list : list[PdbFileQuery]
    """
    query_list: list = field(default_factory=list)

    def add_query(self, protein_id: str, path_string: str, chain: str):
        """
        Creates a new PdbFileQuery object, or appends a protein chain entity
        ID to an existing query

        A chain that is already listed for the protein is not added again.

        Parameters
        ----------
        protein_id : str
            pdb entity ID of the protein structure
        path_string : str
            name of the protein structure file
        chain : str
            pdb polypeptide instance ID

        Returns
        -------
        None
        """
        # One pass over the list replaces a membership test followed by up
        # to two separate lookups of the same entry.
        for query in self.query_list:
            if query.protein_id == protein_id:
                if chain not in query:
                    # Passed as a list so that a multi-character chain ID
                    # is always stored as a single item.
                    query.add_chain([chain])
                return
        self.query_list.append(PdbFileQuery(protein_id, path_string, [chain]))

    def __contains__(self, item):
        return any(query.protein_id == item for query in self)

    def __getitem__(self, item: str) -> PdbFileQuery:
        for query in self:
            if query.protein_id == item:
                return query
        # Carry the missing ID so the error is self-explanatory.
        raise KeyError(item)

    def __iter__(self) -> Iterator[PdbFileQuery]:
        return iter(self.query_list)
class PdbQueryCsv:
    """Reader for csv files that list protein structures to parse"""

    def __init__(self, chain_list: str, has_header: bool = True) -> None:
        """Object for parsing csv files listing protein entity IDs, paths to
        protein structure files, and protein chain IDs

        Parameters
        ----------
        chain_list : str
            path to a csv file listing protein structures to parse
        has_header: bool
            flag to indicate whether the csv file has column headings.
            Default: True

        Attributes
        ----------
        read
        path : pathlib.Path
            path to a csv file listing protein structures to parse
        has_header
            flag to indicate whether the csv file has column headings.

        Raises
        ------
        FileNotFoundError
            if the csv file does not exist
        """
        self.path = pathlib.Path(chain_list)
        if not self.path.exists():
            raise FileNotFoundError(f"Chain list file at {self.path} not "
                                    f"found.")
        self.has_header = has_header

    @property
    def read(self) -> PdbQueries:
        """Read a csv file listing PDB_IDs, paths, and chains

        Each row must hold exactly three comma-separated values: protein ID,
        structure file name and chain ID. When ``has_header`` is true the
        first line is discarded. Blank lines are skipped.

        Returns
        -------
        PdbQueries

        Raises
        ------
        ValueError
            if a non-blank row does not contain exactly three values
        """
        queries = PdbQueries()
        with open(self.path, 'r') as file:
            for line_number, line in enumerate(file, start=1):
                if self.has_header and line_number == 1:
                    continue
                row = line.rstrip("\n")
                # Blank lines (typically a trailing one) carry no query and
                # would otherwise fail the three-way unpacking below.
                if not row.strip():
                    continue
                columns = row.split(',')
                if len(columns) != 3:
                    raise ValueError(
                        f"{self.path}:{line_number}: expected 3 "
                        f"comma-separated values (protein ID, file name, "
                        f"chain ID), found {len(columns)}."
                    )
                protein_id, path, chain = columns
                queries.add_query(protein_id, path, chain)
        return queries
class CsvWriter:
    """Writer for csv files with a fixed set of columns

    Can be used as a context manager, in which case the output file is
    closed when the ``with`` block exits.
    """

    def __init__(self, path_string: str, fields: Iterable[str]):
        """Object for writing values to defined fields in a csv file

        The output file is opened (and truncated) immediately.

        Parameters
        ----------
        path_string : str
            path of the output file to write
        fields : Iterable[str]
            column names of csv file

        Attributes
        ---------
        output_handle : IO
            text stream for writing output file
        fields : Iterable[str]
            column names of csv file
        """
        output_path = pathlib.Path(path_string)
        self.output_handle = open(output_path, 'w')
        self.fields = fields
        # csv.writer quotes values containing commas, quotes or newlines,
        # which a plain ",".join() would turn into malformed rows. Rows are
        # terminated with "\n", as before.
        self._writer = csv.writer(self.output_handle, lineterminator='\n')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the file handle; exceptions are not suppressed.
        self.close()

    def write_headings(self):
        """Write the column names as one row of the output file"""
        self._writer.writerow(self.fields)

    def write_line(self, field_values: Iterable[str]):
        """add a row to the csv format table and write it to the output file

        Parameters
        ----------
        field_values : Iterable[str]
            list containing a value for each column; non-string values are
            written using their string representation
        """
        self._writer.writerow(field_values)

    def close(self):
        """
        Close the TextIO stream
        """
        self.output_handle.close()