################################################################################
################ AAindex1 #################
################################################################################
#importing required modules and dependencies
import json
import os
import sys
import copy
import re
import csv
from typing import Dict, Iterator, List, Union
from ._aaindex_matrix import Map
#public names exported by a wildcard import of this module: the parser class
#and the ready-made module-level instance
__all__: List[str] = ['AAIndex1', 'aaindex1']
#AAindex1 database parser class
class AAIndex1():
    """Python parser for AAindex1: Amino Acid Index Database.

    The AAindex is a database of numerical indices representing various
    physicochemical and biochemical properties of amino acids. This class
    stores the amino acid index of 20 numerical values for the 20 amino
    acids — AAindex1 (http://www.genome.jp/aaindex/).

    On construction the database is loaded from the cached JSON file in the
    data directory when that file exists; otherwise the raw AAindex1 file is
    parsed and the JSON cache is written.

    Attributes:
        aaindex_module_path: Absolute path to the aaindex package directory.
        data_dir: Subdirectory name containing raw and cached data files.
        aaindex_filename: Base filename for this database (no extension).
        aaindex_json: Parsed database keyed by accession number.
        categories: Dict mapping each record code to its category.
        last_updated: Date string of the last published database update.
    """

    #single-character field codes that can start a line of a raw record
    _FIELD_CODES = ("H", "D", "R", "A", "*", "T", "J", "C", "I")

    def __init__(self) -> None:
        #resolve the package directory for data file lookups
        self.aaindex_module_path = os.path.dirname(
            os.path.abspath(sys.modules[self.__module__].__file__)
        )
        self.data_dir = "data"
        self.aaindex_filename = "aaindex1"

        #categories must be loaded first, parse_aaindex() reads them
        self.categories = self.get_all_categories()

        #load from cached JSON if available, otherwise parse the raw file
        json_path = os.path.join(
            self.aaindex_module_path, self.data_dir, f"{self.aaindex_filename}.json"
        )
        if os.path.isfile(json_path):
            with open(json_path) as aai_json:
                self.aaindex_json = json.load(aai_json)
        else:
            self.aaindex_json = self.parse_aaindex()

        #date as shown on https://www.genome.jp/aaindex/
        self.last_updated = "February 13, 2017"

        #cache amino acid list once at init to avoid re-sorting on every call;
        #an empty database yields an empty list instead of raising StopIteration
        first_record = next(iter(self.aaindex_json.values()), None)
        self._amino_acids_cache: List[str] = (
            sorted(first_record["values"]) if first_record else []
        )

    @staticmethod
    def _parse_value(token: str) -> Union[float, int]:
        """Convert one amino acid value token to a float, or 0 if it is not numeric (e.g. 'NA')."""
        try:
            return float(token)
        except ValueError:
            return 0

    @staticmethod
    def _build_record(fields: Dict[str, List[str]]) -> tuple:
        """Assemble one database record from the collected field lines of a raw entry.

        Args:
            fields: Dict mapping each field code to the list of stripped
                lines collected for it.

        Returns:
            Tuple of (accession number, record dict). The record dict holds
            the description, references, pmid, correlation_coefficients,
            notes and values keys.

        Raises:
            IndexError: If the I field has fewer than three lines.
        """
        def joined(code: str) -> str:
            #merge continuation lines and normalise double quotes to single quotes
            return " ".join(fields[code]).replace('"', "'")

        #append author, title and journal name to reference
        authors = " ".join(fields["A"])
        title = " ".join(fields["T"])
        journal = " ".join(fields["J"])
        references = f"{authors} '{title}' {journal}".replace('"', "'")

        #correlation field is a flat "code value code value ..." sequence;
        #a trailing unpaired token is ignored
        tokens = joined("C").split()
        correlation_coefficients = dict(zip(tokens[0::2], tokens[1::2]))

        #I field: a header of "X/Y" name pairs followed by two value rows, the
        #first row belonging to the names before '/', the second to those after
        aa_rows = fields["I"]
        name_pairs = [pair.split("/") for pair in aa_rows[0].split()]
        values: Dict = {}
        for pair, first, second in zip(name_pairs, aa_rows[1].split(), aa_rows[2].split()):
            values[pair[0]] = AAIndex1._parse_value(first)
            values[pair[1]] = AAIndex1._parse_value(second)

        record = {
            "description": joined("D"),
            "references": references,
            "pmid": joined("R").replace("PMID:", ""),
            "correlation_coefficients": correlation_coefficients,
            "notes": joined("*"),
            "values": values,
        }
        return joined("H"), record

    @staticmethod
    def _category_map(category_lines: List[str]) -> Dict:
        """Map each record code to its category from tab-separated lines.

        Only rows with at least two columns are used; the category is the
        first space-delimited word of the second column.
        """
        aaindex_category: Dict = {}
        for row in csv.reader(category_lines, delimiter="\t"):
            if len(row) >= 2:
                aaindex_category[row[0]] = row[1].strip().split(" ", 1)[0]
        return aaindex_category

    def parse_aaindex(self) -> Dict:
        """Parse the raw AAindex1 database file into a nested dict and cache as JSON.

        Each record is keyed by its accession number and stores metadata, amino
        acid values, and category. Non-numeric amino acid values are stored as
        0 and a '-' gap placeholder with value 0 is added to every record. The
        result is written to a .json file in the data directory for fast
        subsequent loads.

        Returns:
            Parsed database keyed by accession number.

        Raises:
            OSError: If the raw database file cannot be opened.
            ValueError: If a duplicate accession number is encountered.
            KeyError: If a record has no entry in ``self.categories`` or a
                line starts with an unknown field code.
        """
        #open AAi file for reading and parsing
        tmp_filepath = os.path.join(self.aaindex_module_path, self.data_dir, self.aaindex_filename)
        try:
            with open(tmp_filepath) as f:
                lines = f.readlines()
        except OSError as e:
            raise OSError(f"Error opening AAindex1 file, check file is in filepath: {tmp_filepath}.") from e

        aaindex_json: Dict = {}
        current_dict: Dict[str, List[str]] = {code: [] for code in self._FIELD_CODES}
        current_entry: str = "H"  # first non-space line in any block is always H

        #iterate through each line, parsing records delimited by '//'
        for line in lines:
            #blank lines carry no field code, skip them rather than treating
            #the newline character as one
            if not line.strip():
                continue
            if line.startswith("//"):
                name, record = self._build_record(current_dict)
                #guard against duplicate accession numbers
                if name in aaindex_json:
                    raise ValueError(f"Duplicate AAi Record found: {name}.")
                #add category and '-' gap placeholder
                record['category'] = self.categories[name]
                record['values']['-'] = 0
                aaindex_json[name] = record
                current_dict = {code: [] for code in self._FIELD_CODES}
                continue
            #a line starting with a space continues the previous field
            if line[0] != " ":
                current_entry = line[0]
            current_dict[current_entry].append(line[1:].strip())

        #cache parsed database as JSON for fast subsequent loads
        json_out_path = os.path.join(
            self.aaindex_module_path, self.data_dir, f"{self.aaindex_filename}.json"
        )
        with open(json_out_path, 'w') as output_f:
            json.dump(aaindex_json, output_f, indent=4, sort_keys=True)
        return aaindex_json

    def parse_categories(self, aaindex_category_file: str = 'aaindex_to_category.txt') -> Dict:
        """Parse category file mapping each AAi record to its category.

        Category file and parsing code inspired from https://github.com/harmslab/hops.
        As a side effect the non-comment lines of the input are written to
        ``aaindex_categories.txt`` in the data directory.

        Args:
            aaindex_category_file: Filename or full path of the category mapping
                file. Defaults to ``aaindex_to_category.txt`` in the data directory.

        Returns:
            Dict mapping each record code to its category string.

        Raises:
            OSError: If the category file cannot be opened.
        """
        #if input parameter is a full path, use it directly, else read from default 'data' dir
        if os.path.isfile(aaindex_category_file):
            category_filepath = aaindex_category_file
        else:
            category_filepath = os.path.join(
                self.aaindex_module_path, self.data_dir, aaindex_category_file
            )
        try:
            with open(category_filepath) as f:
                category_lines = f.readlines()
        except OSError as e:
            raise OSError(f"Error opening AAindex1 category file: {category_filepath}.") from e

        #strip '#' metadata lines and write the remainder to the parsed output file
        data_lines = [line for line in category_lines if not line.startswith('#')]
        category_output_file = "aaindex_categories.txt"
        output_path = os.path.join(self.aaindex_module_path, self.data_dir, category_output_file)
        with open(output_path, "w") as f_out:
            f_out.writelines(data_lines)

        #map the lines just written, no need to read the output file back
        return self._category_map(data_lines)

    def get_all_categories(self, category_file: str = "aaindex_categories.txt") -> Dict:
        """Return dict mapping every record code to its category.

        Reads from the parsed ``aaindex_categories.txt`` file produced by
        :meth:`parse_categories`. If the file does not yet exist, it is
        generated first.

        Args:
            category_file: Filename of the pre-parsed categories file
                inside the data directory.

        Returns:
            Dict mapping each record code to its category string.

        Raises:
            OSError: If the categories file cannot be opened.
        """
        cat_filepath = os.path.join(self.aaindex_module_path, self.data_dir, category_file)
        #if parsed categories file doesn't exist in 'data' then call function to get it
        if not os.path.isfile(cat_filepath):
            self.parse_categories()
        #read categories file and its content
        try:
            with open(cat_filepath) as f_in:
                category_lines = f_in.readlines()
        except OSError as e:
            raise OSError(f"Error opening AAindex1 category file: {cat_filepath}.") from e
        return self._category_map(category_lines)

    def search(self, description: Union[str, List[str]]) -> Dict:
        """Search records by keyword(s) present in their description field.

        Args:
            description: Keyword string or list of keyword strings.
                Matching is case-insensitive.

        Returns:
            Dict of matching records keyed by accession number.
            Returns an empty dict if no records match.

        Raises:
            TypeError: If description is not a str or list.
        """
        if not isinstance(description, (list, str)):
            raise TypeError(f"Input description parameter must be a list or str, got {type(description)}.")
        #convert description parameter to list to make iterable
        keywords = description if isinstance(description, list) else [description]

        all_indices: Dict = {}
        #keyword loop stays outermost so results are ordered by keyword first
        for keyword in keywords:
            needle = keyword.lower()
            for index, record in self.aaindex_json.items():
                if needle in record['description'].lower():
                    all_indices[index] = record
        return all_indices

    def amino_acids(self) -> List[str]:
        """Return sorted list of amino acid single-letter codes.

        Includes the ``-`` placeholder for absent/gap amino acids.

        Returns:
            Sorted list of amino acid codes including ``-``.
        """
        #return a copy so callers cannot mutate the cache built in __init__
        return list(self._amino_acids_cache)

    def record_codes(self) -> List[str]:
        """Return sorted list of all accession numbers in the database.

        Returns:
            Sorted list of accession number strings.
        """
        return sorted(self.aaindex_json)

    def num_records(self) -> int:
        """Return the total number of records in the database.

        Returns:
            Number of records as int.
        """
        return len(self.aaindex_json)

    def record_names(self) -> List[str]:
        """Return a list of description strings for all records.

        Returns:
            List of description strings in database insertion order.
        """
        return [record['description'] for record in self.aaindex_json.values()]

    def values(self, record_code: str) -> Dict:
        """Return the amino acid values dict for a given record.

        Shortcut to avoid accessing the full record when only
        the values are needed.

        Args:
            record_code: AAindex accession number.

        Returns:
            Dict of amino acid values for the specified record.

        Raises:
            TypeError: If record_code is not a string.
            ValueError: If record_code is not found in the database.
        """
        return self[record_code]['values']

    def get_record_by_category(self, category: str) -> Dict:
        """Return all records belonging to a given category.

        Args:
            category: Category name to filter records by (case-insensitive).

        Returns:
            Dict of matching records keyed by accession number.

        Raises:
            TypeError: If category is not a string.
        """
        if not isinstance(category, str):
            raise TypeError(f"Input category parameter must be a str, got {type(category)}.")
        #filter records by matching category field, records without one never match
        wanted = category.lower()
        return {
            code: record for code, record in self.aaindex_json.items()
            if record.get('category', '').lower() == wanted
        }

    def __getitem__(self, record_code: str) -> "Map":
        """Return a record by accession number wrapped in a Map.

        Map is imported from ``._aaindex_matrix``; presumably a dict with
        dot-notation access (not verifiable from this file).

        Args:
            record_code: AAindex accession number (case-insensitive,
                leading/trailing whitespace is stripped).

        Returns:
            Record data as a Map.

        Raises:
            TypeError: If record_code is not a string.
            ValueError: If record_code is not found in the database.
        """
        try:
            record_code = record_code.strip().upper()
        except AttributeError:
            #suppress the AttributeError context, it is an implementation detail
            raise TypeError(
                f"Input parameter {record_code} is not of correct datatype string, got {type(record_code)}."
            ) from None
        if record_code not in self.aaindex_json:
            raise ValueError(f"Record Index ({record_code}) not found in AAindex1.")
        return Map(self.aaindex_json[record_code])

    def __sizeof__(self) -> int:
        """Return the on-disk size of the raw AAindex data file in bytes."""
        return os.path.getsize(
            os.path.join(self.aaindex_module_path, self.data_dir, self.aaindex_filename)
        )

    def __len__(self) -> int:
        """Return total number of records in the database."""
        return len(self.aaindex_json)

    def __contains__(self, record_code: object) -> bool:
        """Return True if record_code exists in the database.

        String codes are normalised the same way as in item lookup
        (whitespace stripped, upper-cased) so that ``code in db`` agrees
        with ``db[code]``.
        """
        if isinstance(record_code, str):
            record_code = record_code.strip().upper()
        return record_code in self.aaindex_json

    def __iter__(self) -> Iterator[str]:
        """Iterate over all record codes in the database."""
        return iter(self.aaindex_json)

    def __repr__(self) -> str:
        """Return a canonical string representation of this instance."""
        return f"AAIndex1(records={len(self.aaindex_json)}, last_updated='{self.last_updated}')"

    ###################### Getters & Setters ######################

    @property
    def categories(self) -> Dict:
        """Dict mapping each record code to its category."""
        return self._categories

    @categories.setter
    def categories(self, value: Dict) -> None:
        self._categories = value

    @property
    def data_dir(self) -> str:
        """Subdirectory name containing raw and cached data files."""
        return self._data_dir

    @data_dir.setter
    def data_dir(self, value: str) -> None:
        self._data_dir = value

    @property
    def aaindex_filename(self) -> str:
        """Base filename for this database (no extension)."""
        return self._aaindex_filename

    @aaindex_filename.setter
    def aaindex_filename(self, value: str) -> None:
        self._aaindex_filename = value

    @property
    def last_updated(self) -> str:
        """Date string of the last published database update."""
        return self._last_updated

    @last_updated.setter
    def last_updated(self, value: str) -> None:
        self._last_updated = value
#create a shared module-level instance of the AAIndex1 class; constructing it
#loads the cached JSON database or parses the raw data file, so this file
#access happens when the module is imported
aaindex1 = AAIndex1()