# Source code for indra.preassembler.grounding_mapper.standardize

# Names exported by a star-import of this module.
__all__ = ['standardize_agent_name', 'standardize_db_refs',
           'name_from_grounding']

import logging
# Database clients used below to map IDs between name spaces and to look
# up standard names for IDs.
from indra.databases import uniprot_client, hgnc_client, mesh_client, \
    chebi_client, go_client, efo_client, hp_client, doid_client

# Module-level logger named after this module; used for debug messages
# about inconsistent groundings.
logger = logging.getLogger(__name__)


def _with_chebi_prefix(chebi_id):
    """Return chebi_id with a 'CHEBI:' prefix; falsy values pass through."""
    if chebi_id and not chebi_id.startswith('CHEBI:'):
        return 'CHEBI:%s' % chebi_id
    return chebi_id


def standardize_db_refs(db_refs):
    """Return a standardized db refs dict for a given db refs dict.

    The given dict is updated in place and the same object is returned.
    The following steps are applied in order:

    1. EFO, HP and DOID entries are mapped to MESH (first successful
       mapping wins and overwrites any existing MESH entry).
    2. A MESH entry is mapped to another name space if a mapping exists
       and that name space is not yet present.
    3. Gene/protein name spaces (UPPRO, UP, HGNC) are reconciled.
    4. Chemical name spaces (PUBCHEM, CHEBI, HMDB) are reconciled.
    5. MESH and GO are mapped to each other if only one is present.

    Parameters
    ----------
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.
        This dict is modified in place.

    Returns
    -------
    dict
        The db_refs dict with standardized entries (the same object that
        was passed in).
    """
    # First we normalize out EFO, HP and DOID and map them to MESH. Only
    # the first name space (in this order) that yields a MESH ID is used.
    for db_ns in ('EFO', 'HP', 'DOID'):
        if db_ns in db_refs:
            mesh_id = mesh_client.get_mesh_id_from_db_id(db_ns, db_refs[db_ns])
            if mesh_id:
                db_refs['MESH'] = mesh_id
                break

    # Next we normalize out MESH to other name spaces
    mesh_id = db_refs.get('MESH')
    # TODO: in principle we could also do a reverse mapping to MESH IDs from
    # other name spaces
    if mesh_id:
        db_mapping = mesh_client.get_db_mapping(mesh_id)
        if db_mapping:
            db_ns, db_id = db_mapping
            # An existing entry in the target name space is never overwritten
            if db_ns not in db_refs:
                db_refs[db_ns] = db_id

    # Next we look at gene/protein name spaces
    up_id = db_refs.get('UP')
    up_pro = db_refs.get('UPPRO')
    hgnc_id = db_refs.get('HGNC')

    # If we have a feature without its protein, we get it
    if up_pro and not up_id:
        up_id_mapped = uniprot_client.get_feature_of(up_pro)
        if up_id_mapped:
            db_refs['UP'] = up_id_mapped
            up_id = up_id_mapped

    # If we have a UP ID and no HGNC ID, we try to get a HGNC ID from it
    if up_id and not hgnc_id:
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # Otherwise, whenever we have an HGNC ID (with or without a UP ID), we
    # try to get the UP ID mapped from it
    elif hgnc_id:
        # Now get the Uniprot ID for the gene
        mapped_up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if mapped_up_id:
            # If we find an inconsistency, we explain it in a debug
            # message and fall back on the mapped ID
            if up_id and up_id != mapped_up_id:
                # We handle a special case here in which mapped_up_id is
                # actually a list of UP IDs that we skip and just keep
                # the original up_id
                if ', ' not in mapped_up_id:
                    # If we got a proper single protein mapping, we use
                    # the mapped_up_id to standardize to.
                    logger.debug('Inconsistent groundings UP:%s not equal to '
                                 'UP:%s mapped from HGNC:%s, standardizing to '
                                 'UP:%s', up_id, mapped_up_id, hgnc_id,
                                 mapped_up_id)
                    db_refs['UP'] = mapped_up_id
            # If there is no conflict, we can update the UP entry
            # NOTE(review): with no original UP ID, a mapped value that
            # contains ', ' is stored as is here -- confirm this is intended.
            else:
                db_refs['UP'] = mapped_up_id

    # Now we normalize between chemical name spaces
    pc_id = db_refs.get('PUBCHEM')
    chebi_id = db_refs.get('CHEBI')
    hmdb_id = db_refs.get('HMDB')
    mapped_chebi_id = None
    mapped_pc_id = None
    hmdb_mapped_chebi_id = None
    # Collect the cross-mappings that are available for the IDs we have;
    # mapped CHEBI IDs are normalized to carry the 'CHEBI:' prefix
    if pc_id:
        mapped_chebi_id = _with_chebi_prefix(
            chebi_client.get_chebi_id_from_pubchem(pc_id))
    if chebi_id:
        mapped_pc_id = chebi_client.get_pubchem_id(chebi_id)
    if hmdb_id:
        hmdb_mapped_chebi_id = _with_chebi_prefix(
            chebi_client.get_chebi_id_from_hmdb(hmdb_id))

    # The branches below form a single if/elif chain, so at most one of
    # them takes effect per call.
    # We always keep originals if both are present but display debug
    # messages if there are inconsistencies
    if pc_id and chebi_id and mapped_pc_id and pc_id != mapped_pc_id:
        logger.debug('Inconsistent groundings PUBCHEM:%s not equal to '
                     'PUBCHEM:%s mapped from %s, standardizing to '
                     'PUBCHEM:%s.', pc_id, mapped_pc_id, chebi_id, pc_id)
    elif pc_id and chebi_id and mapped_chebi_id and \
            chebi_id != mapped_chebi_id:
        logger.debug('Inconsistent groundings %s not equal to '
                     '%s mapped from PUBCHEM:%s, standardizing to '
                     '%s.', chebi_id, mapped_chebi_id, pc_id, chebi_id)
    # If we have PC and not CHEBI but can map to CHEBI, we do that
    elif pc_id and not chebi_id and mapped_chebi_id:
        db_refs['CHEBI'] = mapped_chebi_id
    # Same treatment for HMDB vs. CHEBI: keep the original CHEBI on
    # conflict, fill in CHEBI if it is missing
    elif hmdb_id and chebi_id and hmdb_mapped_chebi_id and \
            hmdb_mapped_chebi_id != chebi_id:
        logger.debug('Inconsistent groundings %s not equal to '
                     '%s mapped from %s, standardizing to '
                     '%s.', chebi_id, hmdb_mapped_chebi_id, hmdb_id, chebi_id)
    elif hmdb_id and not chebi_id and hmdb_mapped_chebi_id:
        db_refs['CHEBI'] = hmdb_mapped_chebi_id
    # If we have CHEBI and not PC but can map to PC, we do that
    elif chebi_id and not pc_id and mapped_pc_id:
        db_refs['PUBCHEM'] = mapped_pc_id

    # Finally, we standardize between MESH and GO. Note that mesh_id is the
    # value read after the disease name space mapping above.
    go_id = db_refs.get('GO')
    if mesh_id and not go_id:
        mapped_go_id = mesh_client.get_go_id(mesh_id)
        if mapped_go_id:
            db_refs['GO'] = mapped_go_id
    elif go_id and not mesh_id:
        mapped_mesh_id = mesh_client.get_mesh_id_from_go_id(go_id)
        if mapped_mesh_id:
            db_refs['MESH'] = mapped_mesh_id

    # Otherwise there is no useful mapping that we can add and no
    # further conflict to resolve.
    return db_refs


def standardize_agent_name(agent, standardize_refs=True):
    """Standardize the name of an Agent based on grounding information.

    The grounding used for naming is the one returned by the Agent's
    ``get_grounding()`` method, so the priority between name spaces is
    determined there. The intended behavior is that if an agent contains a
    FamPlex grounding, the FamPlex ID is used as a name. Otherwise if it
    contains a Uniprot ID, an attempt is made to find the associated HGNC
    gene name. If one can be found it is used as the agent name and the
    associated HGNC ID is added as an entry to the db_refs. Similarly,
    CHEBI, MESH and GO IDs are used in this order of priority to assign a
    standardized name to the Agent. If no relevant IDs are found, the name
    is not changed.

    If the prioritized grounding is a UPPRO ID for which no feature name is
    available, the function falls back on the Agent's HGNC or UP grounding.

    Parameters
    ----------
    agent : indra.statements.Agent
        An INDRA Agent whose name attribute should be standardized based
        on grounding information. May be None, in which case nothing is
        done.
    standardize_refs : Optional[bool]
        If True, this function assumes that the Agent's db_refs need to
        be standardized, e.g., HGNC mapped to UP, and replaces the Agent's
        db_refs with the result of standardize_db_refs.
        Default: True

    Returns
    -------
    bool
        True if a new name was set, False otherwise.
    """
    # We return immediately for None Agents
    if agent is None:
        return False

    if standardize_refs:
        agent.db_refs = standardize_db_refs(agent.db_refs)

    # We next look for prioritized grounding, if missing, we return
    db_ns, db_id = agent.get_grounding()

    # If there's no grounding then we can't do more to standardize the
    # name and return
    if not db_ns or not db_id:
        return False

    # If there is grounding available, we can try to get the standardized name
    # and in the rare case that we don't get it, we don't set it.
    standard_name = name_from_grounding(db_ns, db_id)
    # Handle special case with UPPRO, if we can't get a feature name
    # we fall back on regular gene/protein naming
    if not standard_name and db_ns == 'UPPRO':
        db_ns, db_id = agent.get_grounding(ns_order=['HGNC', 'UP'])
        if not db_ns or not db_id:
            return False
        standard_name = name_from_grounding(db_ns, db_id)
    if not standard_name:
        return False

    agent.name = standard_name
    return True


def name_from_grounding(db_ns, db_id):
    """Return a standardized name given a name space and an ID.

    The name is looked up via the client for the given name space. The
    supported name spaces are FPLX, HGNC, UP, CHEBI, MESH, GO, HP, EFO,
    DOID and UPPRO; for any other name space None is returned.

    Parameters
    ----------
    db_ns : str
        The name space in which the ID is defined.
    db_id : str
        The ID within the name space.

    Returns
    -------
    str or None
        The standardized name corresponding to the grounding or None if
        not available.
    """
    # FamPlex IDs serve as names themselves, no lookup is needed
    if db_ns == 'FPLX':
        return db_id
    if db_ns == 'HGNC':
        return hgnc_client.get_hgnc_name(db_id)
    if db_ns == 'UP':
        # The gene name is looked up with the web fallback turned off
        return uniprot_client.get_gene_name(db_id, web_fallback=False)
    if db_ns == 'CHEBI':
        return chebi_client.get_chebi_name_from_id(db_id)
    if db_ns == 'MESH':
        # NOTE(review): the positional False presumably turns off a web
        # lookup, like web_fallback=False above -- confirm in mesh_client
        return mesh_client.get_mesh_name(db_id, False)
    if db_ns == 'GO':
        return go_client.get_go_label(db_id)
    if db_ns == 'HP':
        return hp_client.get_hp_name_from_hp_id(db_id)
    if db_ns == 'EFO':
        return efo_client.get_efo_name_from_efo_id(db_id)
    if db_ns == 'DOID':
        return doid_client.get_doid_name_from_doid_id(db_id)
    if db_ns == 'UPPRO':
        # A feature may be missing or may have no name, in which case we
        # fall through and return None
        feat = uniprot_client.get_feature_by_id(db_id)
        if feat and feat.name:
            return feat.name
    return None