Viewing file: parser.py (11.44 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
"""
SPARQL Lexer, Parser and Function-Mapper
By Shawn Brown <http://shawnbrown.com/contact>

TO DO:
  swap current parser functions for Michelp's pyparsing setup
  add mapping for FILTER/constraints
  typed literals
  integer, double or boolean abbreviations
  language tags (e.g., @fr)
  nested OPTIONALs ???
  blank node and RDF collection syntax ???
  GRAPH statements ???

CURRENTLY SUPPORTED:
  Simple SELECT queries
  Predicate-object and object list shorthand
    (e.g., ?x foaf:name ?name ; foaf:mbox ?mbox ; vcard:TITLE ?title)
  Multi-line/triple-quoted literals
  BASE, PREFIX, SELECT, WHERE, UNION, OPTIONAL, multiple UNIONs and multiple
    OPTIONALs (but not nested OPTIONALs)

USAGE:
  #from sparql_lpm import doSPARQL
  from rdflib.sparql.parser import doSPARQL
  ...load graph...
  ...define SPARQL query as string...
  result = doSPARQL(queryStr, sparqlGr)
"""
import base64 import re from rdflib.URIRef import URIRef from rdflib.sparql.graphPattern import GraphPattern
def _escape(text):
    """Return *text* base64-encoded as a single-line native string.

    The output contains only the base64 alphabet (letters, digits, ``+``,
    ``/`` and ``=``), so it holds no whitespace, quotes or line breaks.
    ``_unescape()`` reverses the transformation.

    ``base64.b64encode`` is used instead of the deprecated
    ``base64.encodestring`` alias (removed in Python 3.9); it never inserts
    line breaks, so no newline stripping is needed.

    text -- the string to encode; unicode text is encoded as UTF-8 first.
    """
    if isinstance(text, type(u"")):
        # base64 operates on bytes, so encode text before handing it over
        text = text.encode("utf-8")
    encoded = base64.b64encode(text)
    if not isinstance(encoded, str):
        # b64encode returned bytes; callers concatenate the result with str
        encoded = encoded.decode("ascii")
    return encoded


def _unescape(text):
    """Return the original string for a value produced by ``_escape()``.

    text -- base64 text (without surrounding quotes or angle brackets).

    The result is a native string: when ``base64.b64decode`` hands back
    bytes that are not ``str``, they are decoded as UTF-8.
    """
    decoded = base64.b64decode(text)
    if not isinstance(decoded, str):
        decoded = decoded.decode("utf-8")
    return decoded
def _escapeLiterals(query):
    """Replace every quoted literal in *query* with its escaped form.

    A literal is recognised when it is delimited by triple double quotes,
    triple single quotes, or a single double/single quote, and is followed by
    optional whitespace and one of ``.``, ``,``, ``;`` or ``}``.  Each match
    is rewritten as the ``_escape()``-encoded content wrapped in single
    quotes, followed by the original trailing whitespace and punctuation.

    Returns the rewritten query string.
    """
    # NOTE: inside a character class "\1" is a numeric character escape, not
    # a backreference, so "[^\1]" simply excludes the character \x01.
    literal = r"(\"\"\"|'''|[\"'])([^\1]*?[^\\]?)\1"
    # whitespace and punctuation that must follow a literal
    trailer = r"(\s*[.,;\}])"

    def encode(match):
        body = match.group(2)
        tail = match.group(3)
        return "'" + _escape(body) + "'" + tail

    return re.sub(literal + trailer, encode, query)
def _resolveShorthand(query):
    """Expand predicate-object lists and object lists into full triples.

    Covers the ";" and "," abbreviations (2.8 Other Syntactic Forms).  Each
    rule is applied repeatedly until it no longer matches, so lists with
    several entries are unrolled one entry per pass.  The predicate-object
    rule is exhausted before the object-list rule is tried.

    Returns the query with the shorthand replaced by " . "-separated
    statements.
    """
    # shared replacement: keep the first statement, then repeat the common
    # leading term(s) in front of the list entry that followed ";" or ","
    expansion = r"\1\2\3 . \2\4"
    shorthand_rules = (
        # 2.8.1 Predicate-Object Lists
        r"(\{.*?)([^ ]+ )([^ ]+ [^ ]+)\s?; ([^ ]+ [^ ]+\s?[,;\.\}])",
        # 2.8.2 Object Lists
        r"(\{.*?)([^ ]+ [^ ]+ )([^ ]+\s?), ([^ ]+\s?[,\.\}])",
    )
    for rule in shorthand_rules:
        matcher = re.compile(rule)
        while matcher.search(query):
            query = matcher.sub(expansion, query)
    # TO DO: look at adding all that other crazy stuff!!!
    return query
def _resolvePrefixes(query):
    """Resolve prefixed IRIs and remove the PREFIX statements.

    Steps, in order:
      1. collect (prefix, namespace) pairs from the PREFIX statements and add
         the built-in rdf:, rdfs:, xsd: and fn: namespaces; the colon-only
         prefix, if declared, is added last,
      2. strip all PREFIX statements from the query,
      3. replace the content of every ``<...>`` IRI with its ``_escape()``
         form,
      4. replace every ``prefix:local`` name with ``<...>`` holding the
         escaped namespace + local part.

    Returns the rewritten query string.
    """
    # The optional space before "<" mirrors the removal pattern below, so a
    # statement that gets removed is always registered as well.
    prefixes = re.findall(r"PREFIX ([\w\d]+:)[ ]?<([^<>]+)>", query)
    prefixes.extend([
        ("rdf:", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
        ("rdfs:", "http://www.w3.org/2000/01/rdf-schema#"),
        ("xsd:", "http://www.w3.org/2001/XMLSchema#"),
        ("fn:", "http://www.w3.org/2004/07/xpath-functions")])

    # colon-only PREFIX
    default = re.search(r"PREFIX :[ ]?<([^<>]+)>", query)
    if default is not None:
        prefixes.append((":", default.group(1)))

    # remove PREFIX statements
    query = re.sub(r"PREFIX [\w\d]*:[ ]?<[^<>]+>[ ]?", "", query)

    # escape full IRIs
    def escape_iri(match):
        return "<" + _escape(match.group(1)) + ">"

    query = re.sub(r"<([^<>]+)>", escape_iri, query)

    # resolve prefixed IRIs (escaped too)
    for prefix, namespace in prefixes:
        def expand(match, namespace=namespace):
            return "<" + _escape(namespace + match.group(1)) + ">"

        # The lookbehind stops a short prefix from matching the tail of a
        # longer one (e.g. "s:" inside "rdfs:label"); re.escape keeps the
        # prefix text literal.
        pattern = r"(?<![\w\d])" + re.escape(prefix) + r"([^ .\}]+)"
        query = re.sub(pattern, expand, query)
    return query
def _resolveBase(query):
    """Resolve relative IRIs against the BASE IRI and drop the BASE statement.

    An IRI is treated as relative when the text between ``<`` and ``>``
    contains no colon and no space; the BASE IRI is prepended to it as-is.
    When the query has no BASE statement it is returned unchanged.
    """
    base_stmt = re.compile(r"BASE <([^<>]+)>\s?")
    found = base_stmt.search(query)
    if found is None:
        return query

    base_iri = found.group(1)

    def absolutize(match):
        return "<" + base_iri + match.group(1) + ">"

    # resolve relative IRIs first, then remove the BASE statement itself
    resolved = re.sub(r"<([^<>: ]+)>", absolutize, query)
    return base_stmt.sub("", resolved)
def _parseSelect(query):
    """Return the SELECTed variables as a tuple of strings, or None.

    Only a ``SELECT`` keyword followed by one or more variables (``?name`` or
    ``$name``), each preceded by exactly one space, is recognised.  The
    variables keep their leading ``?``/``$``.  None is returned when no such
    clause is found.
    """
    variable = "[?$][\\w\\d]+"  # SELECT variable pattern
    clause = re.search("SELECT(?: " + variable + ")+", query)
    if clause is None:
        return None
    return tuple(re.findall(variable, clause.group(0)))
class _StackManager:
    """Manage the token stack for _parser().

    Wraps an iterable of tokens and exposes a one-token cursor:
    ``token()`` returns the current token and ``next()`` advances to the
    following non-blank token.  Once the tokens are exhausted the current
    token is None.
    """

    def __init__(self, tokenList):
        """Position the cursor on the first token of *tokenList*.

        The first token is taken as-is (a leading blank token is not
        skipped).  An empty *tokenList* leaves the current token at None
        instead of leaking StopIteration to the caller.
        """
        self.stack = iter(tokenList)
        # built-in next() with a default works for any iterator and never
        # raises StopIteration
        self.current = next(self.stack, None)

    def next(self):
        """Advance to the next non-blank token; None when none are left."""
        for candidate in self.stack:
            if candidate != "":
                self.current = candidate
                return
        self.current = None

    def token(self):
        """Return the current token (None once the stack is exhausted)."""
        return self.current
#
# The classes below, the _listTypes dictionary and the _makeList() function
# serve two purposes while the WHERE statement of a SPARQL query is parsed:
# testing whether a token is a recognized keyword, and creating the "typed"
# list that holds the statements nested under that keyword.
#
class Where(list):
    """List of statements/groups found in a WHERE group."""


class Union(list):
    """List of statements/groups found in a UNION group."""


class Optional(list):
    """List of statements/groups found in an OPTIONAL group."""


# keyword -> zero-argument factory producing a fresh, empty typed list
_listTypes = {
    "WHERE": Where,
    "UNION": Union,
    "OPTIONAL": Optional,
}


def _makeList(keyword):
    """Return a new empty list of the type registered for *keyword*.

    Returns None when *keyword* is not a recognized keyword.
    """
    factory = _listTypes.get(keyword)
    if factory is None:
        return None
    return factory()
def _parser(stack, listType="WHERE"):
    """Simple recursive descent SPARQL parser.

    Consumes tokens from *stack* (an object offering ``token()`` and
    ``next()``) and returns a typed list, created with ``_makeList(listType)``,
    whose items are statement strings and nested typed lists.

      * a recognized keyword token selects the type of the next ``{`` group,
      * ``{`` starts a nested group, parsed recursively,
      * ``}`` ends the current group; it is left on the stack for the caller
        to step over,
      * ``.`` separates statements and is skipped,
      * any other run of tokens up to ``.``, ``{``, ``}`` or the end of input
        is joined with single spaces into one statement.
    """
    group = _makeList(listType)
    pendingType = listType  # type given to the next "{ ... }" group
    statementEnd = (None, ".", "{", "}")

    while True:
        tok = stack.token()
        if tok is None:
            break
        if tok == "}":
            return group

        if _makeList(tok) is not None:
            # keyword: remember it for the group that follows
            pendingType = tok
        elif tok == "{":
            stack.next()  # step into the group
            group.append(_parser(stack, pendingType))
            pendingType = listType  # reset for later groups
        elif tok != ".":
            words = []
            while tok not in statementEnd:
                words.append(tok)
                stack.next()
                tok = stack.token()
            group.append(" ".join(words).strip())
            # the terminating token is still current; handle it next round
            continue
        stack.next()
    return group
def _parseWhere(query):
    """Parse a sequence of WHERE-clause tokens into a nested typed list.

    *query* is handed to ``_StackManager`` as the token sequence; the
    resulting stack is parsed by ``_parser()`` with its default list type.
    """
    return _parser(_StackManager(query))
def _findStatements(stmntType, stmntList):
    """Recurse over a nested list and return a flat list of statements.

    Used by _getStatements().  Every string item of *stmntList* is collected;
    items whose type is exactly the typed list for *stmntType* are descended
    into, and items of any other type are ignored.

    Both byte strings and unicode strings count as statements, so statements
    built from a unicode query are not silently dropped.
    """
    wanted = type(_makeList(stmntType))
    textTypes = (str, type(u""))
    found = []
    for item in stmntList:
        if isinstance(item, textTypes):
            found.append(item)
        elif type(item) is wanted:
            # exact type match on purpose: the typed lists are sibling
            # subclasses of list and must not be mixed up
            found.extend(_findStatements(stmntType, item))
    return found
def _getStatements(stmntType, stmntList):
    """Get the statements of the given type from the given list.

    For every item of *stmntList* whose type is exactly the typed list for
    *stmntType*, the flat statement list produced by ``_findStatements()`` is
    collected.  Returns a list of those lists (one per matching item).
    """
    wanted = type(_makeList(stmntType))
    return [_findStatements(stmntType, group)
            for group in stmntList
            if type(group) == wanted]
def _buildGraphPattern(triples):
    """Turn a list of statement strings into a GraphPattern.

    Each statement is split on single spaces; its first three parts are taken
    as subject, predicate and object.  Parts wrapped in ``<...>`` are
    unescaped and turned into URIRef objects; an object wrapped in single or
    double quotes is unescaped to a plain string; anything else (e.g. a
    variable) is passed through unchanged.

    Raises IndexError when a statement has fewer than three parts.
    """
    def isIRI(term):
        # startswith/endswith are safe on an empty term
        return term.startswith("<") and term.endswith(">")

    def isLit(term):
        return ((term.startswith("'") and term.endswith("'"))
                or (term.startswith('"') and term.endswith('"')))

    def toIRI(term):
        return URIRef(_unescape(term[1:-1]))

    # Build a real list rather than mutating the result of map(), which is
    # not indexable where map() is lazy.
    built = []
    for statement in triples:
        parts = statement.split(" ")
        sub = parts[0]
        pred = parts[1]
        obj = parts[2]
        # unescape and define objects for IRIs and literals
        if isIRI(sub):
            sub = toIRI(sub)
        if isIRI(pred):
            pred = toIRI(pred)
        if isIRI(obj):
            obj = toIRI(obj)
        elif isLit(obj):
            obj = _unescape(obj[1:-1])
        built.append((sub, pred, obj))
    return GraphPattern(built)
def _buildQueryArgs(query):
    """Lex and parse a SPARQL query string into arguments for a graph query.

    Returns a dict with three keys:
      "select"   -- tuple of SELECT variables (or None), from _parseSelect()
      "where"    -- list of graph patterns built from the WHERE group followed
                    by those built from its UNION groups
      "optional" -- list of graph patterns built from the OPTIONAL groups

    Both pattern collections are real lists, independent of whether map()
    is lazy on the running interpreter.
    """
    # query lexer
    query = _escapeLiterals(query)  # are unescaped in _buildGraphPattern()
    query = re.sub(r"\s+", " ", query).strip()  # normalize whitespace
    query = _resolveShorthand(query)  # resolve pred-obj and obj lists
    query = _resolveBase(query)  # resolve relative IRIs
    query = _resolvePrefixes(query)  # resolve prefixes
    # normalize punctuation: exactly one space on each side
    query = re.sub(r"\s*([.;,\{\}])\s*", r" \1 ", query)

    # keep only what lies between the first "{" and the last "}"
    body = query[query.find("{") + 1:query.rfind("}")].strip()
    tokens = body.split(" ")  # split into token stack

    # query parser
    select = _parseSelect(query)  # tuple of select variables
    parsed = _parseWhere(tokens)  # nested list of typed lists

    # map parsed object to lists of RDFLib graphPattern objects
    whereGroups = _getStatements("WHERE", [parsed])  # parsed as nested list
    whereGroups.extend(_getStatements("UNION", parsed))
    where = [_buildGraphPattern(group) for group in whereGroups]

    optionalGroups = _getStatements("OPTIONAL", parsed)
    optional = [_buildGraphPattern(group) for group in optionalGroups]

    return {"select": select, "where": where, "optional": optional}
def doSPARQL(query, sparqlGr):
    """Run a SPARQL query string against a SPARQL graph.

    query    -- the SPARQL query as a string
    sparqlGr -- graph object whose ``query(select, where, optional)`` method
                is called with the parsed query parts

    Returns whatever ``sparqlGr.query()`` returns.
    """
    args = _buildQueryArgs(query)
    select = args["select"]
    where = args["where"]
    optional = args["optional"]
    return sparqlGr.query(select, where, optional)
# Demo / smoke-test script: when the module is run directly, each sample query
# below is pushed through _buildQueryArgs() and the resulting "select",
# "where" and "optional" values are printed.  No graph is queried here; the
# final print only shows the call a user would make.
# NOTE: this section uses Python 2 print statements.
if __name__ == "__main__":
    testCases = [
        # basic
        """ SELECT ?name WHERE { ?a <http://xmlns.com/foaf/0.1/name> ?name } """,
        # simple prefix
        """ PREFIX foaf: <http://xmlns.com/foaf/0.1/> SELECT ?name WHERE { ?a foaf:name ?name } """,
        # base statement
        """ BASE <http://xmlns.com/foaf/0.1/> SELECT ?name WHERE { ?a <name> ?name } """,
        # prefix and colon-only prefix
        """ PREFIX : <http://xmlns.com/foaf/0.1/> PREFIX vcard: <http://www.w3.org/2001/vcard-rdf/3.0#> SELECT ?name ?title WHERE { ?a :name ?name . ?a vcard:TITLE ?title } """,
        # predicate-object list notation
        """ PREFIX foaf: <http://xmlns.com/foaf/0.1/> SELECT ?name ?mbox WHERE { ?x foaf:name ?name ; foaf:mbox ?mbox . } """,
        # object list notation
        """ PREFIX foaf: <http://xmlns.com/foaf/0.1/> SELECT ?x WHERE { ?x foaf:nick "Alice" , "Alice_" . } """,
        # escaped literals
        """ PREFIX tag: <http://xmlns.com/foaf/0.1/> PREFIX vcard: <http://www.w3.org/2001/vcard-rdf/3.0#> SELECT ?name WHERE { ?a tag:name ?name ; vcard:TITLE "escape test vcard:TITLE " ; <tag://test/escaping> "This is a ''' Test \"\"\"" ; <tag://test/escaping> ?d } """,
        # key word as variable
        """ PREFIX foaf: <http://xmlns.com/foaf/0.1/> SELECT ?PREFIX ?WHERE WHERE { ?x foaf:name ?PREFIX ; foaf:mbox ?WHERE . } """,
        # key word as prefix
        """ PREFIX WHERE: <http://xmlns.com/foaf/0.1/> SELECT ?name ?mbox WHERE { ?x WHERE:name ?name ; WHERE:mbox ?mbox . } """,
        # some test cases from grammar.py
        "SELECT ?title WHERE { <http://example.org/book/book1> <http://purl.org/dc/elements/1.1/title> ?title . }",
        # single OPTIONAL group
        """PREFIX foaf: <http://xmlns.com/foaf/0.1/> SELECT ?name ?mbox WHERE { ?person foaf:name ?name . OPTIONAL { ?person foaf:mbox ?mbox} }""",
        # OPTIONAL group holding two statements
        """PREFIX foaf: <http://xmlns.com/foaf/0.1/> SELECT ?name ?name2 WHERE { ?person foaf:name ?name . OPTIONAL { ?person foaf:knows ?p2 . ?p2 foaf:name ?name2 . } }""",
        # nested group plus two OPTIONAL groups; relies on the built-in rdf: prefix
        """PREFIX foaf: <http://xmlns.com/foaf/0.1/> #PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> SELECT ?name ?mbox WHERE { { ?person rdf:type foaf:Person } . OPTIONAL { ?person foaf:name ?name } . OPTIONAL {?person foaf:mbox ?mbox} . }""" ]
    # plain-text content-type header, emitted before any other output
    print "Content-type: text/plain\n\n"
    for query in testCases:
        print "\n-----\n"
        # echo the query in interactive-session style ("... " continuation lines)
        print '>>> query = """' + query.replace("\n", "\n... ") + '"""'
        print ">>> result = doSPARQL(query, sparqlGr)\n"
        # only the lexing/parsing stage runs; no graph is involved
        result = _buildQueryArgs(query);
        print "select = ", result["select"], "\n"
        print "where = ", result["where"], "\n"
        print "optional = ", result["optional"], "\n"
        # literal text only -- shows the call that would execute the query
        print "result = sparqlGr.query(select, where, optional)"
|