# Source code for indra.preassembler.grounding_mapper.standardize

# Public names of this module, i.e., what a star-import of it exports.
__all__ = ['standardize_agent_name', 'standardize_db_refs',
           'name_from_grounding']

import logging
from indra.databases import uniprot_client, hgnc_client, mesh_client, \
    chebi_client, go_client, efo_client, hp_client, doid_client

# Module-level logger, named after this module; used for debug messages
# about inconsistent groundings.
logger = logging.getLogger(__name__)


def standardize_db_refs(db_refs):
    """Return a standardized db refs dict for a given db refs dict.

    The given dict is updated in place and the same object is returned.
    Groups of related name spaces are handled one after the other:
    disease name spaces (EFO, HP, DOID) are mapped to MESH, MESH is mapped
    to one further name space, gene/protein name spaces (UPPRO, UP, HGNC)
    are reconciled, chemical name spaces (PUBCHEM, CHEBI, HMDB) are
    reconciled, and finally MESH and GO are mapped to each other.

    Parameters
    ----------
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.

    Returns
    -------
    dict
        The db_refs dict with standardized entries.
    """
    # First we normalize out EFO, HP and DOID and map them to MESH. The
    # first of these name spaces that yields a MESH ID wins.
    for db_ns in ('EFO', 'HP', 'DOID'):
        if db_ns in db_refs:
            mesh_id = mesh_client.get_mesh_id_from_db_id(db_ns,
                                                         db_refs[db_ns])
            if mesh_id:
                db_refs['MESH'] = mesh_id
                break

    # Next we normalize out MESH to other name spaces
    mesh_id = db_refs.get('MESH')
    # TODO: in principle we could also do a reverse mapping to MESH IDs from
    # other name spaces
    if mesh_id:
        db_mapping = mesh_client.get_db_mapping(mesh_id)
        if db_mapping:
            db_ns, db_id = db_mapping
            # An existing entry in the target name space is never overwritten
            if db_ns not in db_refs:
                db_refs[db_ns] = db_id

    # Next we look at gene/protein name spaces
    up_id = db_refs.get('UP')
    up_pro = db_refs.get('UPPRO')
    hgnc_id = db_refs.get('HGNC')
    # If we have a feature without its protein, we get it
    if up_pro and not up_id:
        up_id_mapped = uniprot_client.get_feature_of(up_pro)
        if up_id_mapped:
            db_refs['UP'] = up_id_mapped
            up_id = up_id_mapped
    # If we have a UP ID and no HGNC ID, we try to get a gene name,
    # and if possible, a HGNC ID from that
    if up_id and not hgnc_id:
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # Otherwise, if we have an HGNC ID (with or without a UP ID), we try
    # to get the UP ID mapped from it
    elif hgnc_id:
        # Now get the Uniprot ID for the gene
        mapped_up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if mapped_up_id:
            # If we find an inconsistency, we explain it in a debug
            # message and fall back on the mapped ID
            if up_id and up_id != mapped_up_id:
                # We handle a special case here in which mapped_up_id is
                # actually a list of UP IDs that we skip and just keep
                # the original up_id
                if ', ' not in mapped_up_id:
                    # If we got a proper single protein mapping, we use
                    # the mapped_up_id to standardize to.
                    logger.debug('Inconsistent groundings UP:%s not equal to '
                                 'UP:%s mapped from HGNC:%s, standardizing to '
                                 'UP:%s', up_id, mapped_up_id, hgnc_id,
                                 mapped_up_id)
                    db_refs['UP'] = mapped_up_id
            # If there is no conflict, we can update the UP entry
            else:
                db_refs['UP'] = mapped_up_id

    # Now we normalize between chemical name spaces
    pc_id = db_refs.get('PUBCHEM')
    chebi_id = db_refs.get('CHEBI')
    hmdb_id = db_refs.get('HMDB')
    mapped_chebi_id = None
    mapped_pc_id = None
    hmdb_mapped_chebi_id = None
    # If we have original PUBCHEM and CHEBI IDs, we always keep those:
    if pc_id:
        mapped_chebi_id = chebi_client.get_chebi_id_from_pubchem(pc_id)
        # Make sure the mapped ID carries the CHEBI: prefix
        if mapped_chebi_id and not mapped_chebi_id.startswith('CHEBI:'):
            mapped_chebi_id = 'CHEBI:%s' % mapped_chebi_id
    if chebi_id:
        mapped_pc_id = chebi_client.get_pubchem_id(chebi_id)
    if hmdb_id:
        hmdb_mapped_chebi_id = chebi_client.get_chebi_id_from_hmdb(hmdb_id)
        if hmdb_mapped_chebi_id and \
                not hmdb_mapped_chebi_id.startswith('CHEBI:'):
            hmdb_mapped_chebi_id = 'CHEBI:%s' % hmdb_mapped_chebi_id
    # We always keep originals if both are present but emit debug messages
    # if there are inconsistencies. Note that this is a single if/elif
    # chain, so at most one of the actions below is taken per call.
    if pc_id and chebi_id and mapped_pc_id and pc_id != mapped_pc_id:
        logger.debug('Inconsistent groundings PUBCHEM:%s not equal to '
                     'PUBCHEM:%s mapped from %s, standardizing to '
                     'PUBCHEM:%s.', pc_id, mapped_pc_id, chebi_id, pc_id)
    elif pc_id and chebi_id and mapped_chebi_id and \
            chebi_id != mapped_chebi_id:
        logger.debug('Inconsistent groundings %s not equal to '
                     '%s mapped from PUBCHEM:%s, standardizing to '
                     '%s.', chebi_id, mapped_chebi_id, pc_id, chebi_id)
    # If we have PC and not CHEBI but can map to CHEBI, we do that
    elif pc_id and not chebi_id and mapped_chebi_id:
        db_refs['CHEBI'] = mapped_chebi_id
    elif hmdb_id and chebi_id and hmdb_mapped_chebi_id and \
            hmdb_mapped_chebi_id != chebi_id:
        logger.debug('Inconsistent groundings %s not equal to '
                     '%s mapped from %s, standardizing to '
                     '%s.', chebi_id, hmdb_mapped_chebi_id, hmdb_id,
                     chebi_id)
    # If we have HMDB and not CHEBI but can map to CHEBI, we do that
    elif hmdb_id and not chebi_id and hmdb_mapped_chebi_id:
        db_refs['CHEBI'] = hmdb_mapped_chebi_id
    # If we have CHEBI and not PC but can map to PC, we do that
    elif chebi_id and not pc_id and mapped_pc_id:
        db_refs['PUBCHEM'] = mapped_pc_id

    # Finally, we standardize between MESH and GO
    go_id = db_refs.get('GO')
    if mesh_id and not go_id:
        mapped_go_id = mesh_client.get_go_id(mesh_id)
        if mapped_go_id:
            db_refs['GO'] = mapped_go_id
    elif go_id and not mesh_id:
        mapped_mesh_id = mesh_client.get_mesh_id_from_go_id(go_id)
        if mapped_mesh_id:
            db_refs['MESH'] = mapped_mesh_id
    # Otherwise there is no useful mapping that we can add and no
    # further conflict to resolve.
    return db_refs


def standardize_agent_name(agent, standardize_refs=True):
    """Standardize the name of an Agent based on grounding information.

    If an agent contains a FamPlex grounding, the FamPlex ID is used as a
    name. Otherwise if it contains a Uniprot ID, an attempt is made to find
    the associated HGNC gene name. If one can be found it is used as the
    agent name and, if standardize_refs is set, the associated HGNC ID is
    added as an entry to the db_refs. Similarly, CHEBI, MESH and GO IDs are
    used in this order of priority to assign a standardized name to the
    Agent. If no relevant IDs are found, the name is not changed.

    Parameters
    ----------
    agent : indra.statements.Agent
        An INDRA Agent whose name attribute should be standardized based
        on grounding information. If None, nothing is done.
    standardize_refs : Optional[bool]
        If True, this function assumes that the Agent's db_refs need to
        be standardized, e.g., HGNC mapped to UP, and replaces them with
        the output of standardize_db_refs. Default: True

    Returns
    -------
    bool
        True if a new name was set, False otherwise.
    """
    # We return immediately for None Agents
    if agent is None:
        return False

    if standardize_refs:
        agent.db_refs = standardize_db_refs(agent.db_refs)

    # We next look for prioritized grounding, if missing, we return
    db_ns, db_id = agent.get_grounding()

    # If there's no grounding then we can't do more to standardize the
    # name and return
    if not db_ns or not db_id:
        return False

    # If there is grounding available, we can try to get the standardized
    # name and in the rare case that we don't get it, we don't set it.
    standard_name = name_from_grounding(db_ns, db_id)

    # Handle special case with UPPRO, if we can't get a feature name
    # we fall back on regular gene/protein naming
    if not standard_name and db_ns == 'UPPRO':
        db_ns, db_id = agent.get_grounding(ns_order=['HGNC', 'UP'])
        if not db_ns or not db_id:
            return False
        standard_name = name_from_grounding(db_ns, db_id)

    if not standard_name:
        return False

    agent.name = standard_name
    return True


def name_from_grounding(db_ns, db_id):
    """Return a standardized name given a name space and an ID.

    Parameters
    ----------
    db_ns : str
        The name space in which the ID is defined.
    db_id : str
        The ID within the name space.

    Returns
    -------
    str or None
        The standardized name corresponding to the grounding or None if
        not available. For name spaces that are not handled here, None
        is returned.
    """
    # FamPlex IDs are used directly as names
    if db_ns == 'FPLX':
        return db_id
    if db_ns == 'HGNC':
        return hgnc_client.get_hgnc_name(db_id)
    if db_ns == 'UP':
        return uniprot_client.get_gene_name(db_id, web_fallback=False)
    if db_ns == 'CHEBI':
        return chebi_client.get_chebi_name_from_id(db_id)
    if db_ns == 'MESH':
        return mesh_client.get_mesh_name(db_id, False)
    if db_ns == 'GO':
        return go_client.get_go_label(db_id)
    if db_ns == 'HP':
        return hp_client.get_hp_name_from_hp_id(db_id)
    if db_ns == 'EFO':
        return efo_client.get_efo_name_from_efo_id(db_id)
    if db_ns == 'DOID':
        return doid_client.get_doid_name_from_doid_id(db_id)
    if db_ns == 'UPPRO':
        # A feature is only usable if it was found and carries a name
        feat = uniprot_client.get_feature_by_id(db_id)
        if feat and feat.name:
            return feat.name
    return None