# -*- coding: utf-8 -*-
"""High level API functions for the PyBEL processor."""
import zlib
import json
import pybel
import logging
import requests
from functools import lru_cache
from .processor import PybelProcessor
logger = logging.getLogger(__name__)

# Value substituted into the first slot of the ``branch`` URL template below.
version = 'v1.0.0'
# URL template for files in the selventa-knowledge GitHub repository: the
# first slot takes ``version`` and the second slot takes the file name.
branch = ('https://github.com/cthoyt/selventa-knowledge/raw/'
          '{}/selventa_knowledge/{}')
# Fully resolved URLs of the gzipped node-link JSON corpus files.
large_corpus_url = branch.format(version, 'large_corpus.bel.nodelink.json.gz')
small_corpus_url = branch.format(version, 'small_corpus.bel.nodelink.json.gz')
def process_small_corpus():
    """Return PybelProcessor with statements from Selventa Small Corpus.

    The corpus is fetched from ``small_corpus_url`` and handled as a
    ``graph_jsongz_url`` network by `process_pybel_network`.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        its statements attribute.
    """
    return process_pybel_network('graph_jsongz_url', small_corpus_url)
def process_large_corpus():
    """Return PybelProcessor with statements from Selventa Large Corpus.

    The corpus is fetched from ``large_corpus_url`` and handled as a
    ``graph_jsongz_url`` network by `process_pybel_network`.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        its statements attribute.
    """
    return process_pybel_network('graph_jsongz_url', large_corpus_url)
def process_pybel_network(network_type, network_file, **kwargs):
    """Return PybelProcessor by processing a given network file.

    Parameters
    ----------
    network_type : str
        The type of network that network_file is. The options are:
        belscript, json, cbn_jgif, graph_pickle, and graph_jsongz_url.
        This argument is required; it has no default value here.
    network_file : str
        Path to the network file/URL to process. For the graph_jsongz_url
        type only, an empty or None value falls back to the Selventa Large
        Corpus URL.
    **kwargs
        Forwarded to process_belscript when network_type is belscript.
        Ignored for every other network type.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.

    Raises
    ------
    ValueError
        If network_type is not one of the supported options.
    requests.HTTPError
        If the graph_jsongz_url download returns an error status.
    """
    if network_type == 'belscript':
        return process_belscript(network_file, **kwargs)
    if network_type == 'json':
        return process_json_file(network_file)
    if network_type == 'cbn_jgif':
        return process_cbn_jgif_file(network_file)
    if network_type == 'graph_jsongz_url':
        url = network_file or large_corpus_url
        # Lazy logging arguments: the message is only formatted if emitted
        logger.info('Loading %s', url)
        res = requests.get(url)
        res.raise_for_status()
        # Adding 32 to the window size makes zlib auto-detect a gzip or
        # zlib header on the downloaded payload
        contentb = zlib.decompress(res.content, zlib.MAX_WBITS | 32)
        graph = pybel.from_nodelink_jsons(contentb.decode('utf-8'))
        return process_pybel_graph(graph)
    if network_type == 'graph_pickle':
        # NOTE(review): this presumably unpickles network_file, and
        # unpickling can execute arbitrary code -- only pass trusted files.
        graph = pybel.from_pickle(network_file)
        return process_pybel_graph(graph)
    raise ValueError('Unknown network type: %s' % network_type)
def process_pybel_neighborhood(entity_names, network_type='graph_jsongz_url',
                               network_file=None, **kwargs):
    """Return PybelProcessor around neighborhood of given genes in a network.

    The network is first processed in full via `process_pybel_network`; the
    resulting Statements are then narrowed down to those in which at least
    one Agent carries one of the given names.

    Parameters
    ----------
    entity_names : list[str]
        A list of entity names (e.g., gene names) used as the filter. A
        Statement is kept if any of its non-None Agents has a name that
        appears in this list.
    network_type : Optional[str]
        The type of network that network_file is. The options are:
        belscript, json, cbn_jgif, graph_pickle, and graph_jsongz_url.
        Default: graph_jsongz_url
    network_file : Optional[str]
        Path to the network file/URL to process. If not given, by default, the
        Selventa Large Corpus is used via a URL pointing to a gzipped PyBEL
        Graph JSON file.
    **kwargs
        Passed through unchanged to `process_pybel_network`.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object whose bp.statements has been replaced by
        the filtered list of INDRA Statements.
    """
    bp = process_pybel_network(network_type, network_file, **kwargs)
    # A set gives constant-time membership checks for every agent name
    names_of_interest = set(entity_names)
    bp.statements = [
        stmt for stmt in bp.statements
        if any(agent is not None and agent.name in names_of_interest
               for agent in stmt.agent_list())
    ]
    return bp
@lru_cache(maxsize=100)
def process_pybel_graph(graph):
    """Return a PybelProcessor by processing a PyBEL graph.

    Results are memoized by `functools.lru_cache` (up to 100 entries), keyed
    on the graph argument, so the same processor object is returned when the
    function is called again with an equal, hashable graph.

    Parameters
    ----------
    graph : pybel.struct.BELGraph
        A PyBEL graph to process

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    processor = PybelProcessor(graph)
    processor.get_statements()
    failures = processor.annot_manager.failures
    if failures:
        # Total number of entries across all values of the failures mapping
        n_missing = sum(map(len, failures.values()))
        logger.warning('missing %d annotation pairs', n_missing)
    return processor
def process_belscript(file_name, **kwargs):
    """Return a PybelProcessor by processing a BEL script file.

    Keyword arguments are handed on to pybel.from_bel_script; see the PyBEL
    I/O documentation for the options it accepts. Unless the caller sets
    them explicitly, two options are given values chosen here:
    `citation_clearing` is set to False and `no_identifier_validation`
    is set to True.

    Parameters
    ----------
    file_name : str
        The path to a BEL script file.
    **kwargs
        Extra options forwarded to pybel.from_bel_script.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    # setdefault leaves any caller-supplied value untouched
    kwargs.setdefault('citation_clearing', False)
    kwargs.setdefault('no_identifier_validation', True)
    return process_pybel_graph(pybel.from_bel_script(file_name, **kwargs))
def process_json_file(file_name):
    """Return a PybelProcessor by processing a Node-Link JSON file.

    The file is loaded with pybel.from_nodelink_file with version checking
    turned off (``check_version=False``). For more information on this
    format, see:
    http://pybel.readthedocs.io/en/latest/io.html#node-link-json

    Parameters
    ----------
    file_name : str
        The path to a Node-Link JSON file.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    return process_pybel_graph(
        pybel.from_nodelink_file(file_name, check_version=False))
def process_cbn_jgif_file(file_name):
    """Return a PybelProcessor by processing a CBN JGIF JSON file.

    Parameters
    ----------
    file_name : str
        The path to a CBN JGIF JSON file.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    # Open in binary mode so that the json module detects the UTF-8/16/32
    # encoding itself rather than decoding with the platform's locale
    # default, which can fail on non-ASCII content.
    with open(file_name, 'rb') as jgf:
        jgif_dict = json.load(jgf)
    # The file is closed before the (potentially long) graph processing
    return process_pybel_graph(pybel.from_cbn_jgif(jgif_dict))
def process_belrdf(rdf_str, print_output=True):
    """Deprecated: Return a BelRdfProcessor for a BEL/RDF string.

    The string is parsed as N-Triples into an rdflib Graph, and the
    extraction methods of a newly constructed BelRdfProcessor are then run
    on it to produce INDRA Statements. A deprecation warning is logged on
    every call.

    Parameters
    ----------
    rdf_str : str
        A BEL/RDF string to be processed. This will usually come from reading
        a .rdf file.
    print_output : Optional[bool]
        If True, print statistics of what has been extracted from the given
        BEL/RDF network. Default: True

    Returns
    -------
    bp : BelRdfProcessor or None
        A BelRdfProcessor object which contains INDRA Statements in
        its statements attribute, or None if the string could not be
        parsed.
    """
    # Imported locally so that rdflib is only needed when this deprecated
    # entry point is actually called
    import rdflib
    from rdflib.plugins.parsers.ntriples import ParseError
    from .rdf_processor import BelRdfProcessor

    logger.warning('The BEL/RDF format is deprecated and the results of '
                   'this function are not guaranteed to be correct. '
                   'Running this function requires rdflib==4.2.1, which is '
                   'older than the rdflib dependency installed by default.')

    rdf_graph = rdflib.Graph()
    try:
        rdf_graph.parse(data=rdf_str, format='nt')
    except ParseError as err:
        logger.error('Could not parse rdf: %s' % err)
        return None

    # Build INDRA statements from RDF; the call order is kept as-is
    processor = BelRdfProcessor(rdf_graph)
    processor.get_complexes()
    processor.get_activating_subs()
    processor.get_modifications()
    processor.get_activating_mods()
    processor.get_transcription()
    processor.get_activation()
    processor.get_conversions()

    if not print_output:
        return processor

    # Print some output about the process
    processor.print_statement_coverage()
    processor.print_statements()
    return processor