Source code for indra.sources.lincs_drug.processor

from __future__ import absolute_import, print_function, unicode_literals

__all__ = ['LincsProcessor']

import re

from indra.statements import Agent, Inhibition, Evidence
from indra.databases.lincs_client import LincsClient
from indra.databases import uniprot_client, hgnc_client, chebi_client


[docs]class LincsProcessor(object):
    """Processor for the HMS LINCS drug target dataset.

    Parameters
    ----------
    lincs_data : list[dict]
        A list of dicts with keys set by the header of the csv, and values from
        the data in the csv.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of indra statements extracted from the CSV file.
    """

    def __init__(self, lincs_data):
        self._data = lincs_data
        self._lc = LincsClient()

        # Process all the lines (skipping the header)
        self.statements = []
        for line in self._data:
            self._process_line(line)
        return

    def _process_line(self, line):
        drug = self._extract_drug(line)
        prot = self._extract_protein(line)
        if prot is None:
            return
        evidence = self._make_evidence(line)
        self.statements.append(Inhibition(drug, prot, evidence=evidence))

    def _extract_drug(self, line):
        drug_name = line['Small Molecule Name']
        lincs_id = line['Small Molecule HMS LINCS ID']
        refs = self._lc.get_small_molecule_refs(lincs_id)
        if 'PUBCHEM' in refs:
            chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
            if chebi_id:
                refs['CHEBI'] = chebi_id

        return Agent(drug_name, db_refs=refs)

    def _extract_protein(self, line):
        # Extract key information from the lines.
        prot_name = line['Protein Name']
        prot_id = line['Protein HMS LINCS ID']

        # Get available db-refs.
        db_refs = {}
        if prot_id:
            db_refs.update(self._lc.get_protein_refs(prot_id))
            # Since the resource only gives us an UP ID (not HGNC), we
            # try to get that and standardize the name to the gene name
            up_id = db_refs.get('UP')
            if up_id:
                hgnc_id = uniprot_client.get_hgnc_id(up_id)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
                    prot_name = hgnc_client.get_hgnc_name(hgnc_id)
                else:
                    gene_name = uniprot_client.get_gene_name(up_id)
                    if gene_name:
                        prot_name = gene_name
        # In some cases lines are missing protein information in which
        # case we return None
        else:
            return None

        # Create the agent.
        return Agent(prot_name, db_refs=db_refs)

    def _make_evidence(self, line):
        ev_list = []
        key_refs = line['Key References'].split(';')
        generic_notes = {
            'is_nominal': line['Is Nominal'],
            'effective_concentration': line['Effective Concentration']
            }
        patt = re.compile('(?:pmid|pubmed\s+id):\s+(\d+)', re.IGNORECASE)
        for ref in key_refs:
            # Only extracting pmids, but there is generally more info available.
            m = patt.search(ref)
            if m is None:
                pmid = None
            else:
                pmid = m.groups()[0]
            annotations = {'reference': ref}
            annotations.update(generic_notes)
            ev = Evidence('lincs_drug', pmid=pmid, annotations=annotations,
                          epistemics={'direct': True})
            ev_list.append(ev)
        return ev_list