Source code for nidm.query

'''
query: part of the nidm-api
general functions to work with query
data structures for nidm-queries

'''

import os
import re
import stat
import uuid
import json
import numpy
import rdflib
import shutil
import tempfile
import sys
if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO
from git import Repo
from glob import glob
from pandas import DataFrame
from nidm.utils import load_json, get_query_template, has_internet_connectivity, \
find_directories, set_permissions

class Queries(object):
    '''Queries
    Container for the query data structures served by the nidm-api.

    On creation, the query directory is obtained with
    get_query_directory, the queries found there are read with
    validate_queries, and a lookup keyed on each query "uid" is
    built with make_lookup.

    Parameters
    ==========
    components: str, list or tuple
        one or more component (folder) names handed on to
        validate_queries. A single string is treated as a
        one-item list.

    Attributes
    ==========
    store: str
        value returned by get_query_directory
    queries: list
        value returned by validate_queries
    query_dict: dict
        lookup of the entries in "queries", keyed by "uid"

    '''

    # The default is a tuple rather than a list so that no mutable object
    # is shared between calls.
    def __init__(self,components=("experiment","results","workflow")):
        self.store = get_query_directory()
        if isinstance(components,str):
            components = [components]
        else:
            components = list(components)
        self.queries = validate_queries(self.store,components=components)
        self.query_dict = make_lookup(self.queries,key_field="uid")
def generate_query_template(output_dir=None,template_path=None,fields=None):
    '''generate_query_template
    Load a query template, give it a fresh uid, optionally fill it
    with user supplied fields and optionally save it.

    Parameters
    ==========
    output_dir: str
        full path to output directory for json data structure.
        if none specified, will not save the data structure
    template_path: str
        path to json file to use as a template. Only should be
        specified if the user has reason to use a custom template.
        When not given, the value returned by get_query_template
        is loaded instead.
    fields: dict (optional)
        a dictionary with fields that correspond to template keys.
        Keys that are not present in the template are ignored. The
        value of the "sparql" key is passed through format_sparql
        and fills both the "parameters" and "sparql" template keys.

    Returns
    =======
    template: json (dict)
        A python dictionary (json) that can be filled with new
        query information

    '''
    # Fall back to the standard template only when no custom one was given,
    # so that a custom template_path is actually the one that gets loaded.
    if template_path is None:
        template_path = get_query_template()
    template = load_json(template_path)

    # Each template is given a uid
    template["uid"] = str(uuid.uuid4())

    # the user has provided data to fill template
    #TODO: template validation
    if fields is not None:
        # items() exists on both Python 2 and 3 (iteritems() does not)
        for key,value in fields.items():
            if key not in template:
                continue
            # Parameters are generated from sparql query based on "SELECT" line
            if key == "sparql":
                template["parameters"],template["sparql"] = format_sparql(value)
            else:
                template[key] = value

    # the user wants to save to file
    if output_dir is not None:
        save_query_template(template,output_dir)
    return template
def save_query_template(template,output_dir):
    '''save_query_template
    Write a query template to <output_dir>/<uid>.json

    Parameters
    ==========
    template: dict
        the query data structure to save. It must have a "uid" key,
        which is used to generate the file name (a missing "uid"
        raises KeyError).
    output_dir: string path
        full path to output directory for json data structure.

    Returns
    =======
    success: boolean
        True if save was successful, False if the file could not be
        written or the template could not be serialized to json

    '''
    filepath = os.path.join(output_dir,"%s.json" %(template["uid"]))
    try:
        # Text mode: json.dump writes str, which a file opened "wb" rejects
        # on Python 3. The with block guarantees the handle is closed.
        with open(filepath,"w") as filey:
            json.dump(template,filey)
    except (IOError,OSError,TypeError,ValueError):
        # IOError/OSError: path not writable; TypeError/ValueError: the
        # template holds something json cannot serialize.
        return False
    return True
def do_query(ttl_file,query,rdf_format="turtle",serialize_format="csv",output_df=True):
    '''do_query
    Parse an rdf file into an rdflib graph, run a query against it
    and return the serialized result.

    Parameters
    ==========
    ttl_file: str
        the file handed to rdflib.Graph.parse
    query: str
        the query handed to rdflib.Graph.query
    rdf_format: str
        format used to parse ttl_file (default "turtle")
    serialize_format: str
        format used to serialize the query result (default "csv")
    output_df: boolean
        if True (default) the serialized result is read as csv into
        a pandas DataFrame. This only makes sense when
        serialize_format is "csv".

    Returns
    =======
    result: pandas.DataFrame if output_df is set, otherwise the raw
        serialized result as returned by rdflib

    '''
    g = rdflib.Graph()
    g.parse(ttl_file,format=rdf_format)
    result = g.query(query).serialize(format=serialize_format)
    if not output_df:
        return result

    # Imported here because the module itself only imports DataFrame.
    from pandas import read_csv

    # The serializer may hand back bytes, and a text buffer needs text.
    if isinstance(result,bytes):
        result = result.decode("utf-8")
    # DataFrame.from_csv no longer exists in pandas; read_csv with
    # index_col=0 and parse_dates=True is its documented replacement.
    return read_csv(StringIO(result),sep=",",index_col=0,parse_dates=True)
def make_lookup(query_list,key_field):
    '''make_lookup
    index a list of queries by one of their fields, so that a query
    can be found quickly (e.g. by its uid)

    Parameters
    ==========
    query_list: list
        the queries (dict objects) to index
    key_field: str
        name of the field whose value becomes the lookup key. Every
        query must have it, otherwise a KeyError is raised.

    Returns
    =======
    query_dict: dict
        maps the value of "key_field" to the query it came from. When
        two queries share a value, the later one in query_list wins.

    '''
    return {entry[key_field]: entry for entry in query_list}
def validate_queries(query_dir,queries=None,components=("sparql",)):
    '''validate_queries
    returns the query data structures found in a nidm-query repo.
    No validation is performed yet (see the TODO notes below).

    Parameters
    ==========
    query_dir: str
        full path to a nidm-query repo
    queries: list
        when provided, it is returned unchanged and query_dir is not
        searched
    components: str, list or tuple
        folders to include corresponding to nidm query language
        (currently only option is sparql). A single string is treated
        as a one-item list. Names other than "sparql" are ignored, and
        if none remain every folder found in query_dir is searched.

    Returns
    =======
    queries: json
        dict (json) with all read in queries available
        from nidm-query, provided by API

    '''
    if isinstance(components,str):
        components = [components]
    component_folders = ["%s/%s" %(query_dir,folder)
                         for folder in components
                         if folder in ("sparql",)]

    #TODO: validation should include testing sparql,
    # as well as if fields possible to return are
    # possible given the query. It would be more ideal
    # to remove these "hard coded" options and have them
    # derived directly from the query at runtime

    # "is None" rather than "== None": an identity test cannot be
    # hijacked by an argument that overrides equality.
    if queries is None:
        query_folders = find_directories(query_dir)
        if component_folders:
            query_folders = [q for q in query_folders if q in component_folders]
        query_paths = find_queries(query_folders)
        queries = read_queries(query_paths)

    #TODO: need to decide how to validate :)
    return queries
def get_query_directory(tmpdir=None):
    '''get_query_directory
    Download queries repo to tmp directory

    Parameters
    ==========
    tmpdir: str
        path to directory to download queries to. When not given, a
        new temporary directory is created with tempfile.mkdtemp

    Returns
    =======
    tmpdir: str
        the directory that was used. The queries are only downloaded
        into it when has_internet_connectivity reports a connection.

    '''
    if tmpdir is None:
        tmpdir = tempfile.mkdtemp()

    # Check for internet connection
    if has_internet_connectivity():
        # print with a single parenthesized argument behaves the same on
        # Python 2 and Python 3
        print("Updating queries at %s" %(tmpdir))
        download_queries(tmpdir)

    return tmpdir
def find_queries(query_folders,search_pattern="*.json"):
    '''find_queries
    collect the query files (by default json files) that sit directly
    inside one or more folders

    Parameters
    ==========
    query_folders: list or str
        one or more full paths to directories with json objects.
        A single string is treated as a one-item list.
    search_pattern: str
        glob pattern applied inside each folder
        default is "*.json"

    Returns
    =======
    queries: list
        full paths of the matching files, grouped in the order the
        folders were given

    '''
    if isinstance(query_folders,str):
        query_folders = [query_folders]

    matches = []
    for folder in query_folders:
        matches.extend(glob("%s/%s" %(folder,search_pattern)))
    return matches
def read_queries(query_paths):
    '''read_queries
    Read in a list of query (json) objects.

    Parameters
    ==========
    query_paths: list
        a list of full paths to query objects to read. Paths that do
        not end in ".json" are skipped with a printed message.

    Returns
    =======
    queries: list
        one dict per query that was read. For each query:
        "sparql" (stored in the file as a list of lines) is joined
        into a single newline separated string,
        "type" is set to the name of the folder the file was found in,
        "uid" is set to the file name without its extension.

    '''
    queries = []
    for query_path in query_paths:
        # Strip only the final extension; replacing every ".json" in the
        # name would mangle a file such as "a.json.json".
        uid,ext = os.path.splitext(os.path.basename(query_path))
        if ext != ".json":
            print("Skipping file %s, extension is not .json" %(query_path))
            continue

        # The with block closes the handle even if the json is malformed
        with open(query_path,"rb") as filey:
            tmp = json.load(filey)

        # sparql should be joined into single string
        tmp["sparql"] = "\n".join(tmp["sparql"])
        # Name of the containing folder ("" when the path has no folder)
        tmp["type"] = os.path.basename(os.path.dirname(query_path))
        tmp["uid"] = uid
        queries.append(tmp)
    return queries

# Currently hard coded for query repo, if we have more
# data types can be changed to a variable
def download_queries(destination,repo_url="https://github.com/incf-nidash/nidm-query.git"):
    '''download_queries
    Download queries repo to a destination

    Parameters
    ==========
    destination: str
        the full path to download the repo to
    repo_url: str
        url of the git repository to clone. Defaults to the
        nidm-query repository.

    Returns
    =======
    repo: the object returned by git.Repo.clone_from

    '''
    return Repo.clone_from(repo_url,destination)
def format_sparql(sparql_text):
    '''format_sparql
    break sparql text into its lines and collect the parameter
    options named on any line that mentions "select"

    Parameters
    ==========
    sparql_text: str
        the sparql query as a single string

    Returns
    =======
    params: list
        sorted, de-duplicated tokens that start with "?" or "#" and
        appear on a line containing "select" (in any letter case)
    lines: list
        the text split on newlines, with carriage returns stripped
        from both ends of every line

    '''
    token_pattern = re.compile(r"([#?]\w+)\b")
    lines = [raw.strip("\r") for raw in sparql_text.split("\n")]

    found = []
    for line in lines:
        # only lines that mention select contribute parameters
        if "select" in line.lower():
            found.extend(token_pattern.findall(line))

    params = numpy.unique(found).tolist()
    return params,lines