Source code for aaindex._aaindex_matrix

################################################################################
################         AAindex Matrix Base Class             #################
################################################################################

#importing required modules and dependencies
import json
import os
import sys
import copy
import re
from typing import Dict, Iterator, List, Optional, Union


class Map(dict):
    """A dict subclass that enables attribute-style (dot notation) access to keys.

    Each AAindex record returned by __getitem__ is wrapped in this class so
    fields can be read as record.description as well as record['description'].
    Only the top-level keys get dot access: nested dict values are stored
    unchanged (they are not converted to Map).

    Every entry is mirrored into the instance __dict__ so that vars(map)
    reflects the mapping; lookups of keys that were never mirrored still work
    through the __getattr__ fallback.

    References:
        https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        #dict.__init__ does not route through __setitem__, so mirror the
        #initial contents (dict args, pair iterables and kwargs) explicitly
        self.__dict__.update(self)

    def __getattr__(self, attr):
        #only reached when normal attribute lookup fails
        try:
            return self[attr]
        except KeyError:
            raise AttributeError(
                f"'Map' object has no attribute '{attr}'"
            ) from None

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        self.__dict__[key] = value

    def __delattr__(self, item):
        self.__delitem__(item)

    def __delitem__(self, key):
        super().__delitem__(key)
        #keys added via update()/setdefault() bypass __setitem__ and were
        #never mirrored, so the mirror entry may legitimately be absent
        self.__dict__.pop(key, None)

    def __repr__(self) -> str:
        return f"Map({dict.__repr__(self)})"
class _AAIndexMatrix:
    """Base class for AAindex2 and AAindex3 matrix database parsers.

    Provides shared parsing, lookup, search, and protocol methods for the
    matrix databases. Subclasses call super().__init__(filename) with the
    appropriate base filename so the correct data file is loaded.

    Attributes:
        aaindex_module_path: Absolute path to the directory of the module that
            defines the (sub)class.
        data_dir: Subdirectory name containing raw and cached data files.
        aaindex_filename: Base filename for this database (no extension).
        aaindex_json: Parsed database keyed by accession number.
        last_updated: Date string of the last published database update.
    """

    #single-character field codes that may start a line of the raw file
    _FIELD_CODES = ("H", "D", "R", "A", "*", "T", "J", "C", "M")

    def __init__(self, filename: str) -> None:
        #resolve the package directory for data file lookups
        self.aaindex_module_path = os.path.dirname(
            os.path.abspath(sys.modules[self.__module__].__file__)
        )
        self.data_dir = "data"
        self.aaindex_filename = filename

        #load from cached JSON if available, otherwise parse the raw file
        json_path = os.path.join(
            self.aaindex_module_path, self.data_dir, self.aaindex_filename + ".json"
        )
        if os.path.isfile(json_path):
            with open(json_path) as aai_json:
                self.aaindex_json = json.load(aai_json)
        else:
            self.aaindex_json = self.parse_aaindex()

        #date as shown on https://www.genome.jp/aaindex/
        self.last_updated = "February 13, 2017"

    def parse_aaindex(self) -> Dict:
        """Parse the raw AAindex database file into a nested dict and cache as JSON.

        Each record is keyed by its accession number (the H field) and stores
        metadata alongside a matrix dict rebuilt from the M block. Records are
        terminated by a line starting with "//". Blank lines are ignored. The
        result is written to a .json file in the data directory for fast
        subsequent loads.

        Returns:
            dict: Parsed database keyed by accession number.

        Raises:
            OSError: If the raw database file cannot be opened.
            ValueError: If a duplicate accession number is encountered.
        """
        raw_path = os.path.join(
            self.aaindex_module_path, self.data_dir, self.aaindex_filename
        )
        try:
            with open(raw_path) as f:
                lines = f.readlines()
        except OSError as e:
            raise OSError(
                f"Error opening {self.aaindex_filename} file, "
                f"check it is present at: {raw_path}."
            ) from e

        aaindex_json: Dict = {}
        fields = self._empty_fields()
        current_entry: str = "H"  # first non-space line in any block is always H

        for line in lines:
            #a blank line carries no field code; treating its newline as one
            #would raise KeyError, so skip it
            if line[0] in "\r\n":
                continue

            if line.startswith("//"):
                name, record = self._build_record(fields)
                if name in aaindex_json:
                    raise ValueError(f"Duplicate accession number found: {name}.")
                aaindex_json[name] = record
                fields = self._empty_fields()
                continue

            #a leading space marks a continuation of the current field
            if line[0] != " ":
                current_entry = line[0]
            fields[current_entry].append(line[1:].strip())

        #cache parsed database as JSON for fast subsequent loads
        json_out_path = os.path.join(
            self.aaindex_module_path, self.data_dir, self.aaindex_filename + ".json"
        )
        with open(json_out_path, "w") as output_f:
            json.dump(aaindex_json, output_f, indent=4, sort_keys=True)

        return aaindex_json

    @classmethod
    def _empty_fields(cls) -> Dict[str, List[str]]:
        """Return a fresh field-code -> collected-lines mapping for one record."""
        return {code: [] for code in cls._FIELD_CODES}

    @classmethod
    def _build_record(cls, fields: Dict[str, List[str]]):
        """Turn the collected field lines of one record into (name, record dict)."""

        def joined(code: str) -> str:
            #join continuation lines and normalise double quotes to single
            return " ".join(fields[code]).replace('"', "'")

        name = joined("H")
        references = f"{joined('A')} '{joined('T')}' {joined('J')}"
        pmid = joined("R").replace("PMID:", "").replace("LIT:", "").strip()
        matrix, row_order, col_order = cls._parse_matrix(fields["M"])

        record = {
            "description": joined("D"),
            "references": references,
            "pmid": pmid,
            "correlation_coefficients": cls._parse_correlations(joined("C")),
            "notes": joined("*"),
            "matrix": matrix,
            "row_order": row_order,
            "col_order": col_order,
        }
        return name, record

    @staticmethod
    def _parse_correlations(corr_str: str) -> Dict[str, str]:
        """Pair up alternating accession/coefficient tokens of the C field."""
        tokens = corr_str.split()
        #zip drops a trailing unpaired token instead of raising IndexError
        return dict(zip(tokens[::2], tokens[1::2]))

    @staticmethod
    def _parse_matrix(m_lines: List[str]):
        """Parse the M block into (matrix dict, row_order, col_order)."""
        row_order: List[str] = []
        col_order: List[str] = []
        matrix_rows: List[List[Optional[float]]] = []

        for m_line in m_lines:
            stripped = m_line.strip()
            if stripped.startswith("rows"):
                #format: rows = <AA_STRING>, cols = <AA_STRING>
                parts = stripped.replace(",", "").split()
                row_order = list(parts[2])
                col_order = list(parts[5])
                continue

            row_vals: List[Optional[float]] = []
            for token in stripped.split():
                if token in ("NA", "-"):
                    row_vals.append(None)
                    continue
                try:
                    row_vals.append(float(token))
                except ValueError:
                    #any other non-numeric token is treated as missing
                    row_vals.append(None)
            if row_vals:
                matrix_rows.append(row_vals)

        #every parsed value is written to both [row][col] and [col][row]
        #NOTE(review): for a non-triangular (full) M block the later row
        #overwrites the mirrored value written earlier - confirm that is the
        #intended handling of non-symmetric matrices
        matrix: Dict[str, Dict[str, Optional[float]]] = {}
        for row_aa, row_vals in zip(row_order, matrix_rows):
            matrix.setdefault(row_aa, {})
            for col_aa, val in zip(col_order, row_vals):
                matrix[row_aa][col_aa] = val
                matrix.setdefault(col_aa, {})
                matrix[col_aa][row_aa] = val

        return matrix, row_order, col_order

    def get(self, record_code: str, aa1: str, aa2: str) -> Optional[float]:
        """Return the pairwise matrix score for two amino acids from a given record.

        Returns None when the stored value is missing (NA in the source data)
        or when either amino acid letter is not present in this record's
        matrix.

        Args:
            record_code: AAindex accession number.
            aa1: Single-letter code for the first amino acid.
            aa2: Single-letter code for the second amino acid.

        Returns:
            Pairwise score as float, or None if data is not available.

        Raises:
            TypeError: If record_code, aa1 or aa2 are not strings.
            ValueError: If record_code is not found in the database.
        """
        record = self[record_code]
        try:
            aa1 = aa1.strip().upper()
            aa2 = aa2.strip().upper()
        except AttributeError:
            raise TypeError(
                "aa1 and aa2 must be single-letter string amino acid codes."
            ) from None

        matrix = record.matrix
        if aa1 in matrix and aa2 in matrix[aa1]:
            return matrix[aa1][aa2]
        return None

    def values(self, record_code: str) -> Dict:
        """Return the full matrix dict for a given record.

        Shortcut to avoid accessing the whole record when only the matrix is
        needed.

        Args:
            record_code: AAindex accession number.

        Returns:
            Nested dict of pairwise scores keyed by single-letter amino acid codes.

        Raises:
            TypeError: If record_code is not a string.
            ValueError: If record_code is not found in the database.
        """
        return self[record_code].matrix

    def search(self, description: Union[str, List[str]]) -> Dict:
        """Search records by keyword(s) present in their description field.

        Args:
            description: Keyword string or list of keyword strings. Matching
                is a case-insensitive substring test; a record is returned if
                it matches any of the keywords.

        Returns:
            Dict of matching records keyed by accession number. Returns an
            empty dict if no records match.

        Raises:
            TypeError: If description is not a str or list.
        """
        if not isinstance(description, (list, str)):
            raise TypeError(
                f"description must be a list or str, got {type(description)}."
            )
        keywords = description if isinstance(description, list) else [description]

        all_indices: Dict = {}
        for keyword in keywords:
            needle = keyword.lower()
            for index, value in self.aaindex_json.items():
                if needle in value["description"].lower():
                    all_indices[index] = value
        return all_indices

    def amino_acids(self) -> List[str]:
        """Return the sorted amino acid letters of the first record.

        Derived from the row_order field of the first record in the database.

        Returns:
            Sorted list of single-letter amino acid codes, or an empty list
            when the database holds no records.
        """
        #a default avoids leaking StopIteration from an empty database
        first_code = next(iter(self.aaindex_json), None)
        if first_code is None:
            return []
        return sorted(self.aaindex_json[first_code]["row_order"])

    def record_codes(self) -> List[str]:
        """Return sorted list of all accession numbers in the database.

        Returns:
            Sorted list of accession number strings.
        """
        return sorted(self.aaindex_json)

    def num_records(self) -> int:
        """Return the total number of records in the database.

        Returns:
            Number of records as int.
        """
        return len(self.aaindex_json)

    def record_names(self) -> List[str]:
        """Return a list of description strings for all records.

        Returns:
            List of description strings in database insertion order.
        """
        return [v["description"] for v in self.aaindex_json.values()]

    def __getitem__(self, record_code: str) -> "Map":
        """Return a record by accession number wrapped in a Map (dot-notation dict).

        Args:
            record_code: AAindex accession number (case-insensitive,
                leading/trailing whitespace is stripped).

        Returns:
            Record data as a Map, accessible via dict or dot notation.

        Raises:
            TypeError: If record_code is not a string.
            ValueError: If record_code is not found in the database.
        """
        try:
            record_code = record_code.strip().upper()
        except AttributeError:
            raise TypeError(
                f"record_code must be a string, got {type(record_code)}."
            ) from None
        if record_code not in self.aaindex_json:
            raise ValueError(
                f"Record ({record_code}) not found in {self.__class__.__name__}."
            )
        return Map(self.aaindex_json[record_code])

    def __len__(self) -> int:
        """Return total number of records in the database."""
        return len(self.aaindex_json)

    def __contains__(self, record_code: object) -> bool:
        """Return True if record_code exists in the database.

        String codes are also tried stripped and upper-cased, matching the
        normalisation applied by __getitem__.
        """
        if record_code in self.aaindex_json:
            return True
        if isinstance(record_code, str):
            return record_code.strip().upper() in self.aaindex_json
        return False

    def __iter__(self) -> Iterator[str]:
        """Iterate over all accession numbers in the database."""
        return iter(self.aaindex_json)

    def __repr__(self) -> str:
        """Return a canonical string representation of this instance."""
        return (
            f"{self.__class__.__name__}("
            f"records={len(self.aaindex_json)}, "
            f"last_updated='{self.last_updated}')"
        )

    def __sizeof__(self) -> int:
        """Return the on-disk size of the raw AAindex data file in bytes."""
        return os.path.getsize(
            os.path.join(self.aaindex_module_path, self.data_dir, self.aaindex_filename)
        )

    ######################          Getters & Setters          ######################

    @property
    def data_dir(self) -> str:
        return self._data_dir

    @data_dir.setter
    def data_dir(self, value: str) -> None:
        self._data_dir = value

    @property
    def aaindex_filename(self) -> str:
        return self._aaindex_filename

    @aaindex_filename.setter
    def aaindex_filename(self, value: str) -> None:
        self._aaindex_filename = value

    @property
    def last_updated(self) -> str:
        return self._last_updated

    @last_updated.setter
    def last_updated(self, value: str) -> None:
        self._last_updated = value