# Source code for indra.sources.cwms.rdf_processor

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import logging
import rdflib
from indra.statements import Influence, Agent, Evidence

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# SPARQL namespace declarations for the TRIPS `role` and `lf` vocabularies;
# prepended to the queries built in this module.
prefixes = (
    '\n'
    'PREFIX role: <http://www.cs.rochester.edu/research/trips/role#>\n'
    'PREFIX lf: <http://www.cs.rochester.edu/research/trips/LF#>\n'
)


class CWMSRDFProcessor(object):
    """This processor extracts INDRA statements from CWMS RDF output.

    Parameters
    ----------
    text: str
        The source sentence as text
    rdf_filename: str
        Path to a file containing the RDF/XML output returned by CWMS for
        that sentence. The file is read and parsed on construction.

    Attributes
    ----------
    text: str
        The source sentence; agent names are sliced out of it by character
        offset.
    graph: rdflib.Graph
        The RDF graph parsed from `rdf_filename`.
    statements: list[indra.statements.Statement]
        A list of INDRA statements that were extracted by the processor.
    """
    def __init__(self, text, rdf_filename):
        self.text = text

        # Read in the RDF graph
        g = rdflib.Graph()
        with open(rdf_filename, 'rb') as f:
            # Pass the argument separately so the message is only formatted
            # if the record is actually emitted.
            logger.info('Started loading graph from %s', rdf_filename)
            g.parse(f, format='application/rdf+xml')
            logger.info('Finished loading graph')
        self.graph = g

        # Extract statements
        self.statements = []
        self.extract_statements()

    def extract_statement_from_query_result(self, res):
        """Adds a statement based on one element of a rdflib SPARQL query.

        The four offsets in `res` are used to slice the subject and object
        text out of `self.text`; an Influence statement between the two
        resulting Agents is appended to `self.statements`.

        Parameters
        ----------
        res: rdflib.query.ResultRow
            Element of rdflib SPARQL query result, holding in order the
            agent start, agent end, affected start and affected end offsets.
        """
        agent_start, agent_end, affected_start, affected_end = res

        # Convert from rdflib literals to python integers so we can use
        # them to index strings, then take the matching text span with the
        # surrounding whitespace removed
        agent = self.text[int(agent_start):int(agent_end)].strip()
        affected = self.text[int(affected_start):int(affected_end)].strip()

        # Make an Agent object for both the subject and the object
        subj = Agent(agent, db_refs={'TEXT': agent})
        obj = Agent(affected, db_refs={'TEXT': affected})

        # Add the statement to the list of statements
        self.statements.append(Influence(subj=subj, obj=obj))

    def extract_statements(self):
        """Extracts INDRA statements from the RDF graph via SPARQL queries.

        Two queries are run, in this order: one for events that have an
        AGENT and an AFFECTED, and one for events that have an AGENT and a
        RESULT. Every match is turned into a statement and appended to
        `self.statements`.
        """
        # Query body shared by both searches; the %s slot takes the role
        # linking the event to its second participant. The query selects
        # the start and ending text indices of both participants.
        query_body = """
        SELECT
            ?agent_start
            ?agent_end
            ?affected_start
            ?affected_end
        WHERE {
            ?rel role:AGENT ?agent .
            ?rel %s ?affected .
            ?agent lf:start ?agent_start .
            ?agent lf:end ?agent_end .
            ?affected lf:start ?affected_start .
            ?affected lf:end ?affected_end .
        }
        """
        for role in ('role:AFFECTED', 'role:RESULT'):
            # Only the body is interpolated, so the prefix declarations are
            # never treated as a format string.
            query = prefixes + (query_body % role)
            for res in self.graph.query(query):
                # Make a statement for each query match
                self.extract_statement_from_query_result(res)