diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d80da8..1d88666 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,10 @@ (including `None`) are no longer allowed, hence missing base IRIs in the JSON-LD context are now handled outside the function call. - Add unittests + - **BREAKING**: The `IdentifierIssuer` class was moved to `identifier_issuer.py`. It's now available + at `pyld.identifier_issuer`. + - **BREAKING**: The classes `URDNA2015` and `URGNA2012` were moved to `canon.py`. They are now available + at `pyld.canon`. ## 2.0.4 - 2024-02-16 diff --git a/lib/pyld/canon.py b/lib/pyld/canon.py new file mode 100644 index 0000000..e76c758 --- /dev/null +++ b/lib/pyld/canon.py @@ -0,0 +1,554 @@ + +import hashlib +from pyld.nquads import parse_nquads, to_nquad +from pyld.identifier_issuer import IdentifierIssuer +import copy + + +class URDNA2015(object): + """ + URDNA2015 implements the URDNA2015 RDF Dataset Normalization Algorithm. + """ + + def __init__(self): + self.blank_node_info = {} + self.hash_to_blank_nodes = {} + self.canonical_issuer = IdentifierIssuer('_:c14n') + self.quads = [] + self.POSITIONS = {'subject': 's', 'object': 'o', 'name': 'g'} + + # 4.4) Normalization Algorithm + def main(self, dataset, options): + # handle invalid output format + if 'format' in options: + if (options['format'] != 'application/n-quads' and + options['format'] != 'application/nquads'): + raise UnknownFormatError( + 'Unknown output format.', options['format']) + + # 1) Create the normalization state. 
+ + # 2) For every quad in input dataset: + for graph_name, triples in dataset.items(): + if graph_name == '@default': + graph_name = None + for triple in triples: + quad = triple + if graph_name is not None: + if graph_name.startswith('_:'): + quad['name'] = {'type': 'blank node'} + else: + quad['name'] = {'type': 'IRI'} + quad['name']['value'] = graph_name + self.quads.append(quad) + + # 2.1) For each blank node that occurs in the quad, add a + # reference to the quad using the blank node identifier in the + # blank node to quads map, creating a new entry if necessary. + for key, component in quad.items(): + if key == 'predicate' or component['type'] != 'blank node': + continue + id_ = component['value'] + self.blank_node_info.setdefault( + id_, {'quads': []})['quads'].append(quad) + + # 3) Create a list of non-normalized blank node identifiers and + # populate it using the keys from the blank node to quads map. + non_normalized = set(self.blank_node_info.keys()) + + # 4) Initialize simple, a boolean flag, to true. + simple = True + + # 5) While simple is true, issue canonical identifiers for blank nodes: + while simple: + # 5.1) Set simple to false. + simple = False + + # 5.2) Clear hash to blank nodes map. + self.hash_to_blank_nodes = {} + + # 5.3) For each blank node identifier identifier in non-normalized + # identifiers: + for id_ in non_normalized: + # 5.3.1) Create a hash, hash, according to the Hash First + # Degree Quads algorithm. + hash = self.hash_first_degree_quads(id_) + + # 5.3.2) Add hash and identifier to hash to blank nodes map, + # creating a new entry if necessary. + self.hash_to_blank_nodes.setdefault(hash, []).append(id_) + + # 5.4) For each hash to identifier list mapping in hash to blank + # nodes map, lexicographically-sorted by hash: + for hash, id_list in sorted(self.hash_to_blank_nodes.items()): + # 5.4.1) If the length of identifier list is greater than 1, + # continue to the next mapping. 
+ if len(id_list) > 1: + continue + + # 5.4.2) Use the Issue Identifier algorithm, passing canonical + # issuer and the single blank node identifier in identifier + # list, identifier, to issue a canonical replacement identifier + # for identifier. + # TODO: consider changing `get_id` to `issue` + id_ = id_list[0] + self.canonical_issuer.get_id(id_) + + # 5.4.3) Remove identifier from non-normalized identifiers. + non_normalized.remove(id_) + + # 5.4.4) Remove hash from the hash to blank nodes map. + del self.hash_to_blank_nodes[hash] + + # 5.4.5) Set simple to true. + simple = True + + # 6) For each hash to identifier list mapping in hash to blank nodes + # map, lexicographically-sorted by hash: + for hash, id_list in sorted(self.hash_to_blank_nodes.items()): + # 6.1) Create hash path list where each item will be a result of + # running the Hash N-Degree Quads algorithm. + hash_path_list = [] + + # 6.2) For each blank node identifier identifier in identifier + # list: + for id_ in id_list: + # 6.2.1) If a canonical identifier has already been issued for + # identifier, continue to the next identifier. + if self.canonical_issuer.has_id(id_): + continue + + # 6.2.2) Create temporary issuer, an identifier issuer + # initialized with the prefix _:b. + issuer = IdentifierIssuer('_:b') + + # 6.2.3) Use the Issue Identifier algorithm, passing temporary + # issuer and identifier, to issue a new temporary blank node + # identifier for identifier. + issuer.get_id(id_) + + # 6.2.4) Run the Hash N-Degree Quads algorithm, passing + # temporary issuer, and append the result to the hash path + # list. 
+ hash_path_list.append(self.hash_n_degree_quads(id_, issuer)) + + # 6.3) For each result in the hash path list, + # lexicographically-sorted by the hash in result: + for result in sorted(hash_path_list, key=lambda r: r['hash']): + # 6.3.1) For each blank node identifier, existing identifier, + # that was issued a temporary identifier by identifier issuer + # in result, issue a canonical identifier, in the same order, + # using the Issue Identifier algorithm, passing canonical + # issuer and existing identifier. + for existing in result['issuer'].order: + self.canonical_issuer.get_id(existing) + + # Note: At this point all blank nodes in the set of RDF quads have been + # assigned canonical identifiers, which have been stored in the + # canonical issuer. Here each quad is updated by assigning each of its + # blank nodes its new identifier. + + # 7) For each quad, quad, in input dataset: + normalized = [] + for quad in self.quads: + # 7.1) Create a copy, quad copy, of quad and replace any existing + # blank node identifiers using the canonical identifiers previously + # issued by canonical issuer. Note: We optimize away the copy here. + for key, component in quad.items(): + if key == 'predicate': + continue + if(component['type'] == 'blank node' and not + component['value'].startswith( + self.canonical_issuer.prefix)): + component['value'] = self.canonical_issuer.get_id( + component['value']) + + # 7.2) Add quad copy to the normalized dataset. + normalized.append(to_nquad(quad)) + + # sort normalized output + normalized.sort() + + # 8) Return the normalized dataset. + if (options.get('format') == 'application/n-quads' or + options.get('format') == 'application/nquads'): + return ''.join(normalized) + return parse_nquads(''.join(normalized)) + + # 4.6) Hash First Degree Quads + def hash_first_degree_quads(self, id_): + # return cached hash + info = self.blank_node_info[id_] + if 'hash' in info: + return info['hash'] + + # 1) Initialize nquads to an empty list. 
It will be used to store quads + # in N-Quads format. + nquads = [] + + # 2) Get the list of quads quads associated with the reference blank + # node identifier in the blank node to quads map. + quads = info['quads'] + + # 3) For each quad quad in quads: + for quad in quads: + # 3.1) Serialize the quad in N-Quads format with the following + # special rule: + + # 3.1.1) If any component in quad is an blank node, then serialize + # it using a special identifier as follows: + copy = {} + for key, component in quad.items(): + if key == 'predicate': + copy[key] = component + continue + # 3.1.2) If the blank node's existing blank node identifier + # matches the reference blank node identifier then use the + # blank node identifier _:a, otherwise, use the blank node + # identifier _:z. + copy[key] = self.modify_first_degree_component( + id_, component, key) + nquads.append(to_nquad(copy)) + + # 4) Sort nquads in lexicographical order. + nquads.sort() + + # 5) Return the hash that results from passing the sorted, joined + # nquads through the hash algorithm. + info['hash'] = self.hash_nquads(nquads) + return info['hash'] + + # helper for modifying component during Hash First Degree Quads + def modify_first_degree_component(self, id_, component, key): + if component['type'] != 'blank node': + return component + component = copy.deepcopy(component) + component['value'] = '_:a' if component['value'] == id_ else '_:z' + return component + + # 4.7) Hash Related Blank Node + def hash_related_blank_node(self, related, quad, issuer, position): + # 1) Set the identifier to use for related, preferring first the + # canonical identifier for related if issued, second the identifier + # issued by issuer if issued, and last, if necessary, the result of + # the Hash First Degree Quads algorithm, passing related. 
+ if self.canonical_issuer.has_id(related): + id_ = self.canonical_issuer.get_id(related) + elif issuer.has_id(related): + id_ = issuer.get_id(related) + else: + id_ = self.hash_first_degree_quads(related) + + # 2) Initialize a string input to the value of position. + # Note: We use a hash object instead. + md = self.create_hash() + md.update(position.encode('utf8')) + + # 3) If position is not g, append <, the value of the predicate in + # quad, and > to input. + if position != 'g': + md.update(self.get_related_predicate(quad).encode('utf8')) + + # 4) Append identifier to input. + md.update(id_.encode('utf8')) + + # 5) Return the hash that results from passing input through the hash + # algorithm. + return md.hexdigest() + + # helper for getting a related predicate + def get_related_predicate(self, quad): + return '<' + quad['predicate']['value'] + '>' + + # 4.8) Hash N-Degree Quads + def hash_n_degree_quads(self, id_, issuer): + # 1) Create a hash to related blank nodes map for storing hashes that + # identify related blank nodes. + # Note: 2) and 3) handled within `createHashToRelated` + hash_to_related = self.create_hash_to_related(id_, issuer) + + # 4) Create an empty string, data to hash. + # Note: We create a hash object instead. + md = self.create_hash() + + # 5) For each related hash to blank node list mapping in hash to + # related blank nodes map, sorted lexicographically by related hash: + for hash, blank_nodes in sorted(hash_to_related.items()): + # 5.1) Append the related hash to the data to hash. + md.update(hash.encode('utf8')) + + # 5.2) Create a string chosen path. + chosen_path = '' + + # 5.3) Create an unset chosen issuer variable. + chosen_issuer = None + + # 5.4) For each permutation of blank node list: + for permutation in permutations(blank_nodes): + # 5.4.1) Create a copy of issuer, issuer copy. + issuer_copy = copy.deepcopy(issuer) + + # 5.4.2) Create a string path. 
+ path = '' + + # 5.4.3) Create a recursion list, to store blank node + # identifiers that must be recursively processed by this + # algorithm. + recursion_list = [] + + # 5.4.4) For each related in permutation: + skip_to_next_permutation = False + for related in permutation: + # 5.4.4.1) If a canonical identifier has been issued for + # related, append it to path. + if(self.canonical_issuer.has_id(related)): + path += self.canonical_issuer.get_id(related) + # 5.4.4.2) Otherwise: + else: + # 5.4.4.2.1) If issuer copy has not issued an + # identifier for related, append related to recursion + # list. + if not issuer_copy.has_id(related): + recursion_list.append(related) + + # 5.4.4.2.2) Use the Issue Identifier algorithm, + # passing issuer copy and related and append the result + # to path. + path += issuer_copy.get_id(related) + + # 5.4.4.3) If chosen path is not empty and the length of + # path is greater than or equal to the length of chosen + # path and path is lexicographically greater than chosen + # path, then skip to the next permutation. + if(len(chosen_path) != 0 and + len(path) >= len(chosen_path) and + path > chosen_path): + skip_to_next_permutation = True + break + + if skip_to_next_permutation: + continue + + # 5.4.5) For each related in recursion list: + for related in recursion_list: + # 5.4.5.1) Set result to the result of recursively + # executing the Hash N-Degree Quads algorithm, passing + # related for identifier and issuer copy for path + # identifier issuer. + result = self.hash_n_degree_quads(related, issuer_copy) + + # 5.4.5.2) Use the Issue Identifier algorithm, passing + # issuer copy and related and append the result to path. + path += issuer_copy.get_id(related) + + # 5.4.5.3) Append <, the hash in result, and > to path. + path += '<' + result['hash'] + '>' + + # 5.4.5.4) Set issuer copy to the identifier issuer in + # result. 
+ issuer_copy = result['issuer'] + + # 5.4.5.5) If chosen path is not empty and the length of + # path is greater than or equal to the length of chosen + # path and path is lexicographically greater than chosen + # path, then skip to the next permutation. + if(len(chosen_path) != 0 and + len(path) >= len(chosen_path) and + path > chosen_path): + skip_to_next_permutation = True + break + + if skip_to_next_permutation: + continue + + # 5.4.6) If chosen path is empty or path is lexicographically + # less than chosen path, set chosen path to path and chosen + # issuer to issuer copy. + if len(chosen_path) == 0 or path < chosen_path: + chosen_path = path + chosen_issuer = issuer_copy + + # 5.5) Append chosen path to data to hash. + md.update(chosen_path.encode('utf8')) + + # 5.6) Replace issuer, by reference, with chosen issuer. + issuer = chosen_issuer + + # 6) Return issuer and the hash that results from passing data to hash + # through the hash algorithm. + return {'hash': md.hexdigest(), 'issuer': issuer} + + # helper for creating hash to related blank nodes map + def create_hash_to_related(self, id_, issuer): + # 1) Create a hash to related blank nodes map for storing hashes that + # identify related blank nodes. + hash_to_related = {} + + # 2) Get a reference, quads, to the list of quads in the blank node to + # quads map for the key identifier. 
+ quads = self.blank_node_info[id_]['quads'] + + # 3) For each quad in quads: + for quad in quads: + # 3.1) For each component in quad, if component is the subject, + # object, and graph name and it is a blank node that is not + # identified by identifier: + for key, component in quad.items(): + if(key != 'predicate' and + component['type'] == 'blank node' and + component['value'] != id_): + # 3.1.1) Set hash to the result of the Hash Related Blank + # Node algorithm, passing the blank node identifier for + # component as related, quad, path identifier issuer as + # issuer, and position as either s, o, or g based on + # whether component is a subject, object, graph name, + # respectively. + related = component['value'] + position = self.POSITIONS[key] + hash = self.hash_related_blank_node( + related, quad, issuer, position) + + # 3.1.2) Add a mapping of hash to the blank node identifier + # for component to hash to related blank nodes map, adding + # an entry as necessary. + hash_to_related.setdefault(hash, []).append(related) + + return hash_to_related + + # helper to create appropriate hash object + def create_hash(self): + return hashlib.sha256() + + # helper to hash a list of nquads + def hash_nquads(self, nquads): + md = self.create_hash() + for nquad in nquads: + md.update(nquad.encode('utf8')) + return md.hexdigest() + + +class URGNA2012(URDNA2015): + """ + URGNA2012 implements the URGNA2012 RDF Graph Normalization Algorithm. 
+ """ + + def __init__(self): + URDNA2015.__init__(self) + + # helper for modifying component during Hash First Degree Quads + def modify_first_degree_component(self, id_, component, key): + if component['type'] != 'blank node': + return component + component = copy.deepcopy(component) + if key == 'name': + component['value'] = '_:g' + else: + component['value'] = '_:a' if component['value'] == id_ else '_:z' + return component + + # helper for getting a related predicate + def get_related_predicate(self, quad): + return quad['predicate']['value'] + + # helper for creating hash to related blank nodes map + def create_hash_to_related(self, id_, issuer): + # 1) Create a hash to related blank nodes map for storing hashes that + # identify related blank nodes. + hash_to_related = {} + + # 2) Get a reference, quads, to the list of quads in the blank node to + # quads map for the key identifier. + quads = self.blank_node_info[id_]['quads'] + + # 3) For each quad in quads: + for quad in quads: + # 3.1) If the quad's subject is a blank node that does not match + # identifier, set hash to the result of the Hash Related Blank Node + # algorithm, passing the blank node identifier for subject as + # related, quad, path identifier issuer as issuer, and p as + # position. + if(quad['subject']['type'] == 'blank node' and + quad['subject']['value'] != id_): + related = quad['subject']['value'] + position = 'p' + # 3.2) Otherwise, if quad's object is a blank node that does + # not match identifier, to the result of the Hash Related Blank + # Node algorithm, passing the blank node identifier for object + # as related, quad, path identifier issuer as issuer, and r + # as position. + elif(quad['object']['type'] == 'blank node' and + quad['object']['value'] != id_): + related = quad['object']['value'] + position = 'r' + # 3.3) Otherwise, continue to the next quad. 
+ else: + continue + + # 3.4) Add a mapping of hash to the blank node identifier for the + # component that matched (subject or object) to hash to related + # blank nodes map, adding an entry as necessary. + hash = self.hash_related_blank_node( + related, quad, issuer, position) + hash_to_related.setdefault(hash, []).append(related) + + return hash_to_related + + # helper to create appropriate hash object + def create_hash(self): + return hashlib.sha1() + + +def permutations(elements): + """ + Generates all of the possible permutations for the given list of elements. + + :param elements: the list of elements to permutate. + """ + # begin with sorted elements + elements.sort() + # initialize directional info for permutation algorithm + left = {} + for v in elements: + left[v] = True + + length = len(elements) + last = length - 1 + while True: + yield elements + + # Calculate the next permutation using the Steinhaus-Johnson-Trotter + # permutation algorithm. + + # get largest mobile element k + # (mobile: element is greater than the one it is looking at) + k, pos = None, 0 + for i in range(length): + e = elements[i] + is_left = left[e] + if((k is None or e > k) and + ((is_left and i > 0 and e > elements[i - 1]) or + (not is_left and i < last and e > elements[i + 1]))): + k, pos = e, i + + # no more permutations + if k is None: + return + + # swap k and the element it is looking at + swap = pos - 1 if left[k] else pos + 1 + elements[pos], elements[swap] = elements[swap], k + + # reverse the direction of all elements larger than k + for i in range(length): + if elements[i] > k: + left[elements[i]] = not left[elements[i]] + + +class UnknownFormatError(ValueError): + """ + Base class for unknown format errors. 
+ """ + + def __init__(self, message, format): + Exception.__init__(self, message) + self.format = format \ No newline at end of file diff --git a/lib/pyld/identifier_issuer.py b/lib/pyld/identifier_issuer.py new file mode 100644 index 0000000..a05d40b --- /dev/null +++ b/lib/pyld/identifier_issuer.py @@ -0,0 +1,52 @@ +class IdentifierIssuer(object): + """ + An IdentifierIssuer issues unique identifiers, keeping track of any + previously issued identifiers. + """ + + def __init__(self, prefix): + """ + Initializes a new IdentifierIssuer. + + :param prefix: the prefix to use (''). + """ + self.prefix = prefix + self.counter = 0 + self.existing = {} + self.order = [] + + """ + Gets the new identifier for the given old identifier, where if no old + identifier is given a new identifier will be generated. + + :param [old]: the old identifier to get the new identifier for. + + :return: the new identifier. + """ + def get_id(self, old=None): + # return existing old identifier + if old and old in self.existing: + return self.existing[old] + + # get next identifier + id_ = self.prefix + str(self.counter) + self.counter += 1 + + # save mapping + if old is not None: + self.existing[old] = id_ + self.order.append(old) + + return id_ + + def has_id(self, old): + """ + Returns True if the given old identifier has already been assigned a + new identifier. + + :param old: the old identifier to check. + + :return: True if the old identifier has been assigned a new identifier, + False if not. 
+ """ + return old in self.existing diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py index 5fd3f35..9948fdd 100644 --- a/lib/pyld/jsonld.py +++ b/lib/pyld/jsonld.py @@ -15,13 +15,16 @@ """ import copy -import hashlib import json import re import sys from urllib.parse import urlparse import warnings import uuid + +from pyld.canon import URDNA2015, URGNA2012, UnknownFormatError +from pyld.nquads import ParserError, parse_nquads, to_nquad, to_nquads +from pyld.identifier_issuer import IdentifierIssuer from .context_resolver import ContextResolver from c14n.Canonicalize import canonicalize from cachetools import LRUCache @@ -925,7 +928,14 @@ def normalize(self, input_, options): # do normalization if options['algorithm'] == 'URDNA2015': - return URDNA2015().main(dataset, options) + try: + return URDNA2015().main(dataset, options) + except UnknownFormatError as cause: + raise JsonLdError( + str(cause), + 'jsonld.UnknownFormat', + {'format': cause.format}) from cause + # assume URGNA2012 return URGNA2012().main(dataset, options) @@ -1321,112 +1331,14 @@ def parse_nquads(input_): :return: an RDF dataset. """ - # define partial regexes - iri = '(?:<([^:]+:[^>]*)>)' - bnode = '(_:(?:[A-Za-z][A-Za-z0-9]*))' - plain = '"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"' - datatype = '(?:\\^\\^' + iri + ')' - language = '(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*))' - literal = '(?:' + plain + '(?:' + datatype + '|' + language + ')?)' - ws = '[ \\t]+' - wso = '[ \\t]*' - eoln = r'(?:\r\n)|(?:\n)|(?:\r)' - empty = r'^' + wso + '$' - - # define quad part regexes - subject = '(?:' + iri + '|' + bnode + ')' + ws - property = iri + ws - object = '(?:' + iri + '|' + bnode + '|' + literal + ')' + wso - graph = '(?:\\.|(?:(?:' + iri + '|' + bnode + ')' + wso + '\\.))' - - # Note: Notice that the graph position does not include literals - # even though they are specified as a possible value in the - # N-Quads note (http://sw.deri.org/2008/07/n-quads/). 
This is - # intentional, as literals in that position are not supported by the - # RDF data model or the JSON-LD data model. - # See: https://github.com/digitalbazaar/pyld/pull/19 - - # full quad regex - quad = r'^' + wso + subject + property + object + graph + wso + '$' - - # build RDF dataset - dataset = {} - - # split N-Quad input into lines - lines = re.split(eoln, input_) - line_number = 0 - for line in lines: - line_number += 1 - - # skip empty lines - if re.search(empty, line) is not None: - continue - - # parse quad - match = re.search(quad, line) - if match is None: - raise JsonLdError( - 'Error while parsing N-Quads invalid quad.', - 'jsonld.ParseError', {'line': line_number}) - match = match.groups() - - # create RDF triple - triple = {'subject': {}, 'predicate': {}, 'object': {}} - - # get subject - if match[0] is not None: - triple['subject'] = {'type': 'IRI', 'value': match[0]} - else: - triple['subject'] = {'type': 'blank node', 'value': match[1]} - - # get predicate - triple['predicate'] = {'type': 'IRI', 'value': match[2]} - - # get object - if match[3] is not None: - triple['object'] = {'type': 'IRI', 'value': match[3]} - elif match[4] is not None: - triple['object'] = {'type': 'blank node', 'value': match[4]} - else: - triple['object'] = {'type': 'literal'} - unescaped = ( - match[5] - .replace('\\"', '\"') - .replace('\\t', '\t') - .replace('\\n', '\n') - .replace('\\r', '\r') - .replace('\\\\', '\\')) - if match[6] is not None: - triple['object']['datatype'] = match[6] - elif match[7] is not None: - triple['object']['datatype'] = RDF_LANGSTRING - triple['object']['language'] = match[7] - else: - triple['object']['datatype'] = XSD_STRING - triple['object']['value'] = unescaped - - # get graph name ('@default' is used for the default graph) - name = '@default' - if match[8] is not None: - name = match[8] - elif match[9] is not None: - name = match[9] - - # initialize graph in dataset - if name not in dataset: - dataset[name] = [triple] - # add 
triple if unique to its graph - else: - unique = True - triples = dataset[name] - for t in dataset[name]: - if JsonLdProcessor._compare_rdf_triples(t, triple): - unique = False - break - if unique: - triples.append(triple) - - return dataset + try: + result = parse_nquads(input_) + return result + except ParserError as cause: + raise JsonLdError( + str(cause), + 'jsonld.ParseError', + {'line': cause.line_number}) from cause @staticmethod def to_nquads(dataset): @@ -1437,80 +1349,11 @@ def to_nquads(dataset): :return: the N-Quads string. """ - quads = [] - for graph_name, triples in dataset.items(): - for triple in triples: - if graph_name == '@default': - graph_name = None - quads.append(JsonLdProcessor.to_nquad(triple, graph_name)) - quads.sort() - return ''.join(quads) + return to_nquads(dataset) @staticmethod def to_nquad(triple, graph_name=None): - """ - Converts an RDF triple and graph name to an N-Quad string (a single - quad). - - :param triple: the RDF triple or quad to convert (a triple or quad - may be passed, if a triple is passed then `graph_name` should be - given to specify the name of the graph the triple is in, `None` - for the default graph). - :param graph_name: the name of the graph containing the triple, None - for the default graph. - - :return: the N-Quad string. 
- """ - s = triple['subject'] - p = triple['predicate'] - o = triple['object'] - g = triple.get('name', {'value': graph_name})['value'] - - quad = '' - - # subject is an IRI - if s['type'] == 'IRI': - quad += '<' + s['value'] + '>' - else: - quad += s['value'] - quad += ' ' - - # property is an IRI - if p['type'] == 'IRI': - quad += '<' + p['value'] + '>' - else: - quad += p['value'] - quad += ' ' - - # object is IRI, bnode, or literal - if o['type'] == 'IRI': - quad += '<' + o['value'] + '>' - elif(o['type'] == 'blank node'): - quad += o['value'] - else: - escaped = ( - o['value'] - .replace('\\', '\\\\') - .replace('\t', '\\t') - .replace('\n', '\\n') - .replace('\r', '\\r') - .replace('\"', '\\"')) - quad += '"' + escaped + '"' - if o['datatype'] == RDF_LANGSTRING: - if o['language']: - quad += '@' + o['language'] - elif o['datatype'] != XSD_STRING: - quad += '^^<' + o['datatype'] + '>' - - # graph - if g is not None: - if not g.startswith('_:'): - quad += ' <' + g + '>' - else: - quad += ' ' + g - - quad += ' .\n' - return quad + return to_nquad(triple, graph_name) @staticmethod def arrayify(value): @@ -1524,28 +1367,6 @@ def arrayify(value): """ return value if _is_array(value) else [value] - @staticmethod - def _compare_rdf_triples(t1, t2): - """ - Compares two RDF triples for equality. - - :param t1: the first triple. - :param t2: the second triple. - - :return: True if the triples are the same, False if not. - """ - for attr in ['subject', 'predicate', 'object']: - if(t1[attr]['type'] != t2[attr]['type'] or - t1[attr]['value'] != t2[attr]['value']): - return False - - if t1['object'].get('language') != t2['object'].get('language'): - return False - if t1['object'].get('datatype') != t2['object'].get('datatype'): - return False - - return True - def _compact(self, active_ctx, active_property, element, options): """ Recursively compacts an element using the given active context. 
All @@ -5423,7 +5244,6 @@ def _clone_active_context(self, active_ctx): child['@vocab'] = active_ctx['@vocab'] return child - class JsonLdError(Exception): """ Base class for JSON-LD errors. @@ -5444,601 +5264,6 @@ def __str__(self): rval += '\nDetails: ' + repr(self.details) return rval - -class IdentifierIssuer(object): - """ - An IdentifierIssuer issues unique identifiers, keeping track of any - previously issued identifiers. - """ - - def __init__(self, prefix): - """ - Initializes a new IdentifierIssuer. - - :param prefix: the prefix to use (''). - """ - self.prefix = prefix - self.counter = 0 - self.existing = {} - self.order = [] - - """ - Gets the new identifier for the given old identifier, where if no old - identifier is given a new identifier will be generated. - - :param [old]: the old identifier to get the new identifier for. - - :return: the new identifier. - """ - def get_id(self, old=None): - # return existing old identifier - if old and old in self.existing: - return self.existing[old] - - # get next identifier - id_ = self.prefix + str(self.counter) - self.counter += 1 - - # save mapping - if old is not None: - self.existing[old] = id_ - self.order.append(old) - - return id_ - - def has_id(self, old): - """ - Returns True if the given old identifier has already been assigned a - new identifier. - - :param old: the old identifier to check. - - :return: True if the old identifier has been assigned a new identifier, - False if not. - """ - return old in self.existing - - -class URDNA2015(object): - """ - URDNA2015 implements the URDNA2015 RDF Dataset Normalization Algorithm. 
- """ - - def __init__(self): - self.blank_node_info = {} - self.hash_to_blank_nodes = {} - self.canonical_issuer = IdentifierIssuer('_:c14n') - self.quads = [] - self.POSITIONS = {'subject': 's', 'object': 'o', 'name': 'g'} - - # 4.4) Normalization Algorithm - def main(self, dataset, options): - # handle invalid output format - if 'format' in options: - if (options['format'] != 'application/n-quads' and - options['format'] != 'application/nquads'): - raise JsonLdError( - 'Unknown output format.', - 'jsonld.UnknownFormat', {'format': options['format']}) - - # 1) Create the normalization state. - - # 2) For every quad in input dataset: - for graph_name, triples in dataset.items(): - if graph_name == '@default': - graph_name = None - for triple in triples: - quad = triple - if graph_name is not None: - if graph_name.startswith('_:'): - quad['name'] = {'type': 'blank node'} - else: - quad['name'] = {'type': 'IRI'} - quad['name']['value'] = graph_name - self.quads.append(quad) - - # 2.1) For each blank node that occurs in the quad, add a - # reference to the quad using the blank node identifier in the - # blank node to quads map, creating a new entry if necessary. - for key, component in quad.items(): - if key == 'predicate' or component['type'] != 'blank node': - continue - id_ = component['value'] - self.blank_node_info.setdefault( - id_, {'quads': []})['quads'].append(quad) - - # 3) Create a list of non-normalized blank node identifiers and - # populate it using the keys from the blank node to quads map. - non_normalized = set(self.blank_node_info.keys()) - - # 4) Initialize simple, a boolean flag, to true. - simple = True - - # 5) While simple is true, issue canonical identifiers for blank nodes: - while simple: - # 5.1) Set simple to false. - simple = False - - # 5.2) Clear hash to blank nodes map. 
- self.hash_to_blank_nodes = {} - - # 5.3) For each blank node identifier identifier in non-normalized - # identifiers: - for id_ in non_normalized: - # 5.3.1) Create a hash, hash, according to the Hash First - # Degree Quads algorithm. - hash = self.hash_first_degree_quads(id_) - - # 5.3.2) Add hash and identifier to hash to blank nodes map, - # creating a new entry if necessary. - self.hash_to_blank_nodes.setdefault(hash, []).append(id_) - - # 5.4) For each hash to identifier list mapping in hash to blank - # nodes map, lexicographically-sorted by hash: - for hash, id_list in sorted(self.hash_to_blank_nodes.items()): - # 5.4.1) If the length of identifier list is greater than 1, - # continue to the next mapping. - if len(id_list) > 1: - continue - - # 5.4.2) Use the Issue Identifier algorithm, passing canonical - # issuer and the single blank node identifier in identifier - # list, identifier, to issue a canonical replacement identifier - # for identifier. - # TODO: consider changing `get_id` to `issue` - id_ = id_list[0] - self.canonical_issuer.get_id(id_) - - # 5.4.3) Remove identifier from non-normalized identifiers. - non_normalized.remove(id_) - - # 5.4.4) Remove hash from the hash to blank nodes map. - del self.hash_to_blank_nodes[hash] - - # 5.4.5) Set simple to true. - simple = True - - # 6) For each hash to identifier list mapping in hash to blank nodes - # map, lexicographically-sorted by hash: - for hash, id_list in sorted(self.hash_to_blank_nodes.items()): - # 6.1) Create hash path list where each item will be a result of - # running the Hash N-Degree Quads algorithm. - hash_path_list = [] - - # 6.2) For each blank node identifier identifier in identifier - # list: - for id_ in id_list: - # 6.2.1) If a canonical identifier has already been issued for - # identifier, continue to the next identifier. - if self.canonical_issuer.has_id(id_): - continue - - # 6.2.2) Create temporary issuer, an identifier issuer - # initialized with the prefix _:b. 
- issuer = IdentifierIssuer('_:b') - - # 6.2.3) Use the Issue Identifier algorithm, passing temporary - # issuer and identifier, to issue a new temporary blank node - # identifier for identifier. - issuer.get_id(id_) - - # 6.2.4) Run the Hash N-Degree Quads algorithm, passing - # temporary issuer, and append the result to the hash path - # list. - hash_path_list.append(self.hash_n_degree_quads(id_, issuer)) - - # 6.3) For each result in the hash path list, - # lexicographically-sorted by the hash in result: - for result in sorted(hash_path_list, key=lambda r: r['hash']): - # 6.3.1) For each blank node identifier, existing identifier, - # that was issued a temporary identifier by identifier issuer - # in result, issue a canonical identifier, in the same order, - # using the Issue Identifier algorithm, passing canonical - # issuer and existing identifier. - for existing in result['issuer'].order: - self.canonical_issuer.get_id(existing) - - # Note: At this point all blank nodes in the set of RDF quads have been - # assigned canonical identifiers, which have been stored in the - # canonical issuer. Here each quad is updated by assigning each of its - # blank nodes its new identifier. - - # 7) For each quad, quad, in input dataset: - normalized = [] - for quad in self.quads: - # 7.1) Create a copy, quad copy, of quad and replace any existing - # blank node identifiers using the canonical identifiers previously - # issued by canonical issuer. Note: We optimize away the copy here. - for key, component in quad.items(): - if key == 'predicate': - continue - if(component['type'] == 'blank node' and not - component['value'].startswith( - self.canonical_issuer.prefix)): - component['value'] = self.canonical_issuer.get_id( - component['value']) - - # 7.2) Add quad copy to the normalized dataset. - normalized.append(JsonLdProcessor.to_nquad(quad)) - - # sort normalized output - normalized.sort() - - # 8) Return the normalized dataset. 
- if (options.get('format') == 'application/n-quads' or - options.get('format') == 'application/nquads'): - return ''.join(normalized) - return JsonLdProcessor.parse_nquads(''.join(normalized)) - - # 4.6) Hash First Degree Quads - def hash_first_degree_quads(self, id_): - # return cached hash - info = self.blank_node_info[id_] - if 'hash' in info: - return info['hash'] - - # 1) Initialize nquads to an empty list. It will be used to store quads - # in N-Quads format. - nquads = [] - - # 2) Get the list of quads quads associated with the reference blank - # node identifier in the blank node to quads map. - quads = info['quads'] - - # 3) For each quad quad in quads: - for quad in quads: - # 3.1) Serialize the quad in N-Quads format with the following - # special rule: - - # 3.1.1) If any component in quad is an blank node, then serialize - # it using a special identifier as follows: - copy = {} - for key, component in quad.items(): - if key == 'predicate': - copy[key] = component - continue - # 3.1.2) If the blank node's existing blank node identifier - # matches the reference blank node identifier then use the - # blank node identifier _:a, otherwise, use the blank node - # identifier _:z. - copy[key] = self.modify_first_degree_component( - id_, component, key) - nquads.append(JsonLdProcessor.to_nquad(copy)) - - # 4) Sort nquads in lexicographical order. - nquads.sort() - - # 5) Return the hash that results from passing the sorted, joined - # nquads through the hash algorithm. 
- info['hash'] = self.hash_nquads(nquads) - return info['hash'] - - # helper for modifying component during Hash First Degree Quads - def modify_first_degree_component(self, id_, component, key): - if component['type'] != 'blank node': - return component - component = copy.deepcopy(component) - component['value'] = '_:a' if component['value'] == id_ else '_:z' - return component - - # 4.7) Hash Related Blank Node - def hash_related_blank_node(self, related, quad, issuer, position): - # 1) Set the identifier to use for related, preferring first the - # canonical identifier for related if issued, second the identifier - # issued by issuer if issued, and last, if necessary, the result of - # the Hash First Degree Quads algorithm, passing related. - if self.canonical_issuer.has_id(related): - id_ = self.canonical_issuer.get_id(related) - elif issuer.has_id(related): - id_ = issuer.get_id(related) - else: - id_ = self.hash_first_degree_quads(related) - - # 2) Initialize a string input to the value of position. - # Note: We use a hash object instead. - md = self.create_hash() - md.update(position.encode('utf8')) - - # 3) If position is not g, append <, the value of the predicate in - # quad, and > to input. - if position != 'g': - md.update(self.get_related_predicate(quad).encode('utf8')) - - # 4) Append identifier to input. - md.update(id_.encode('utf8')) - - # 5) Return the hash that results from passing input through the hash - # algorithm. - return md.hexdigest() - - # helper for getting a related predicate - def get_related_predicate(self, quad): - return '<' + quad['predicate']['value'] + '>' - - # 4.8) Hash N-Degree Quads - def hash_n_degree_quads(self, id_, issuer): - # 1) Create a hash to related blank nodes map for storing hashes that - # identify related blank nodes. - # Note: 2) and 3) handled within `createHashToRelated` - hash_to_related = self.create_hash_to_related(id_, issuer) - - # 4) Create an empty string, data to hash. 
- # Note: We create a hash object instead. - md = self.create_hash() - - # 5) For each related hash to blank node list mapping in hash to - # related blank nodes map, sorted lexicographically by related hash: - for hash, blank_nodes in sorted(hash_to_related.items()): - # 5.1) Append the related hash to the data to hash. - md.update(hash.encode('utf8')) - - # 5.2) Create a string chosen path. - chosen_path = '' - - # 5.3) Create an unset chosen issuer variable. - chosen_issuer = None - - # 5.4) For each permutation of blank node list: - for permutation in permutations(blank_nodes): - # 5.4.1) Create a copy of issuer, issuer copy. - issuer_copy = copy.deepcopy(issuer) - - # 5.4.2) Create a string path. - path = '' - - # 5.4.3) Create a recursion list, to store blank node - # identifiers that must be recursively processed by this - # algorithm. - recursion_list = [] - - # 5.4.4) For each related in permutation: - skip_to_next_permutation = False - for related in permutation: - # 5.4.4.1) If a canonical identifier has been issued for - # related, append it to path. - if(self.canonical_issuer.has_id(related)): - path += self.canonical_issuer.get_id(related) - # 5.4.4.2) Otherwise: - else: - # 5.4.4.2.1) If issuer copy has not issued an - # identifier for related, append related to recursion - # list. - if not issuer_copy.has_id(related): - recursion_list.append(related) - - # 5.4.4.2.2) Use the Issue Identifier algorithm, - # passing issuer copy and related and append the result - # to path. - path += issuer_copy.get_id(related) - - # 5.4.4.3) If chosen path is not empty and the length of - # path is greater than or equal to the length of chosen - # path and path is lexicographically greater than chosen - # path, then skip to the next permutation. 
- if(len(chosen_path) != 0 and - len(path) >= len(chosen_path) and - path > chosen_path): - skip_to_next_permutation = True - break - - if skip_to_next_permutation: - continue - - # 5.4.5) For each related in recursion list: - for related in recursion_list: - # 5.4.5.1) Set result to the result of recursively - # executing the Hash N-Degree Quads algorithm, passing - # related for identifier and issuer copy for path - # identifier issuer. - result = self.hash_n_degree_quads(related, issuer_copy) - - # 5.4.5.2) Use the Issue Identifier algorithm, passing - # issuer copy and related and append the result to path. - path += issuer_copy.get_id(related) - - # 5.4.5.3) Append <, the hash in result, and > to path. - path += '<' + result['hash'] + '>' - - # 5.4.5.4) Set issuer copy to the identifier issuer in - # result. - issuer_copy = result['issuer'] - - # 5.4.5.5) If chosen path is not empty and the length of - # path is greater than or equal to the length of chosen - # path and path is lexicographically greater than chosen - # path, then skip to the next permutation. - if(len(chosen_path) != 0 and - len(path) >= len(chosen_path) and - path > chosen_path): - skip_to_next_permutation = True - break - - if skip_to_next_permutation: - continue - - # 5.4.6) If chosen path is empty or path is lexicographically - # less than chosen path, set chosen path to path and chosen - # issuer to issuer copy. - if len(chosen_path) == 0 or path < chosen_path: - chosen_path = path - chosen_issuer = issuer_copy - - # 5.5) Append chosen path to data to hash. - md.update(chosen_path.encode('utf8')) - - # 5.6) Replace issuer, by reference, with chosen issuer. - issuer = chosen_issuer - - # 6) Return issuer and the hash that results from passing data to hash - # through the hash algorithm. 
- return {'hash': md.hexdigest(), 'issuer': issuer} - - # helper for creating hash to related blank nodes map - def create_hash_to_related(self, id_, issuer): - # 1) Create a hash to related blank nodes map for storing hashes that - # identify related blank nodes. - hash_to_related = {} - - # 2) Get a reference, quads, to the list of quads in the blank node to - # quads map for the key identifier. - quads = self.blank_node_info[id_]['quads'] - - # 3) For each quad in quads: - for quad in quads: - # 3.1) For each component in quad, if component is the subject, - # object, and graph name and it is a blank node that is not - # identified by identifier: - for key, component in quad.items(): - if(key != 'predicate' and - component['type'] == 'blank node' and - component['value'] != id_): - # 3.1.1) Set hash to the result of the Hash Related Blank - # Node algorithm, passing the blank node identifier for - # component as related, quad, path identifier issuer as - # issuer, and position as either s, o, or g based on - # whether component is a subject, object, graph name, - # respectively. - related = component['value'] - position = self.POSITIONS[key] - hash = self.hash_related_blank_node( - related, quad, issuer, position) - - # 3.1.2) Add a mapping of hash to the blank node identifier - # for component to hash to related blank nodes map, adding - # an entry as necessary. - hash_to_related.setdefault(hash, []).append(related) - - return hash_to_related - - # helper to create appropriate hash object - def create_hash(self): - return hashlib.sha256() - - # helper to hash a list of nquads - def hash_nquads(self, nquads): - md = self.create_hash() - for nquad in nquads: - md.update(nquad.encode('utf8')) - return md.hexdigest() - - -class URGNA2012(URDNA2015): - """ - URGNA2012 implements the URGNA2012 RDF Graph Normalization Algorithm. 
- """ - - def __init__(self): - URDNA2015.__init__(self) - - # helper for modifying component during Hash First Degree Quads - def modify_first_degree_component(self, id_, component, key): - if component['type'] != 'blank node': - return component - component = copy.deepcopy(component) - if key == 'name': - component['value'] = '_:g' - else: - component['value'] = '_:a' if component['value'] == id_ else '_:z' - return component - - # helper for getting a related predicate - def get_related_predicate(self, quad): - return quad['predicate']['value'] - - # helper for creating hash to related blank nodes map - def create_hash_to_related(self, id_, issuer): - # 1) Create a hash to related blank nodes map for storing hashes that - # identify related blank nodes. - hash_to_related = {} - - # 2) Get a reference, quads, to the list of quads in the blank node to - # quads map for the key identifier. - quads = self.blank_node_info[id_]['quads'] - - # 3) For each quad in quads: - for quad in quads: - # 3.1) If the quad's subject is a blank node that does not match - # identifier, set hash to the result of the Hash Related Blank Node - # algorithm, passing the blank node identifier for subject as - # related, quad, path identifier issuer as issuer, and p as - # position. - if(quad['subject']['type'] == 'blank node' and - quad['subject']['value'] != id_): - related = quad['subject']['value'] - position = 'p' - # 3.2) Otherwise, if quad's object is a blank node that does - # not match identifier, to the result of the Hash Related Blank - # Node algorithm, passing the blank node identifier for object - # as related, quad, path identifier issuer as issuer, and r - # as position. - elif(quad['object']['type'] == 'blank node' and - quad['object']['value'] != id_): - related = quad['object']['value'] - position = 'r' - # 3.3) Otherwise, continue to the next quad. 
- else: - continue - - # 3.4) Add a mapping of hash to the blank node identifier for the - # component that matched (subject or object) to hash to related - # blank nodes map, adding an entry as necessary. - hash = self.hash_related_blank_node( - related, quad, issuer, position) - hash_to_related.setdefault(hash, []).append(related) - - return hash_to_related - - # helper to create appropriate hash object - def create_hash(self): - return hashlib.sha1() - - -def permutations(elements): - """ - Generates all of the possible permutations for the given list of elements. - - :param elements: the list of elements to permutate. - """ - # begin with sorted elements - elements.sort() - # initialize directional info for permutation algorithm - left = {} - for v in elements: - left[v] = True - - length = len(elements) - last = length - 1 - while True: - yield elements - - # Calculate the next permutation using the Steinhaus-Johnson-Trotter - # permutation algorithm. - - # get largest mobile element k - # (mobile: element is greater than the one it is looking at) - k, pos = None, 0 - for i in range(length): - e = elements[i] - is_left = left[e] - if((k is None or e > k) and - ((is_left and i > 0 and e > elements[i - 1]) or - (not is_left and i < last and e > elements[i + 1]))): - k, pos = e, i - - # no more permutations - if k is None: - return - - # swap k and the element it is looking at - swap = pos - 1 if left[k] else pos + 1 - elements[pos], elements[swap] = elements[swap], k - - # reverse the direction of all elements larger than k - for i in range(length): - if elements[i] > k: - left[elements[i]] = not left[elements[i]] - - def _is_keyword(v): """ Returns whether or not the given value is a keyword. 
diff --git a/lib/pyld/nquads.py b/lib/pyld/nquads.py
new file mode 100644
index 0000000..12988c8
--- /dev/null
+++ b/lib/pyld/nquads.py
@@ -0,0 +1,247 @@
+
+import re
+
+XSD_STRING = 'http://www.w3.org/2001/XMLSchema#string'
+RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+RDF_LANGSTRING = RDF + 'langString'
+
+# Escape sequences decoded when parsing N-Quads literals, keyed by the
+# character that follows the backslash.
+_UNESCAPES = {'"': '"', 't': '\t', 'n': '\n', 'r': '\r', '\\': '\\'}
+_ESCAPE_RE = re.compile(r'\\(["tnr\\])')
+
+
+def parse_nquads(input_):
+    """
+    Parses RDF in the form of N-Quads.
+
+    :param input_: the N-Quads input to parse.
+
+    :return: an RDF dataset.
+
+    :raises ParserError: if a non-empty line is not a valid quad; the
+      1-based number of the offending line is stored in `line_number`.
+    """
+    # define partial regexes
+    iri = '(?:<([^:]+:[^>]*)>)'
+    bnode = '(_:(?:[A-Za-z][A-Za-z0-9]*))'
+    plain = '"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"'
+    datatype = '(?:\\^\\^' + iri + ')'
+    language = '(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*))'
+    literal = '(?:' + plain + '(?:' + datatype + '|' + language + ')?)'
+    ws = '[ \\t]+'
+    wso = '[ \\t]*'
+    eoln = r'(?:\r\n)|(?:\n)|(?:\r)'
+    empty = r'^' + wso + '$'
+
+    # define quad part regexes
+    # (named `predicate`/`obj` so the builtins `property`/`object` are
+    # not shadowed)
+    subject = '(?:' + iri + '|' + bnode + ')' + ws
+    predicate = iri + ws
+    obj = '(?:' + iri + '|' + bnode + '|' + literal + ')' + wso
+    graph = '(?:\\.|(?:(?:' + iri + '|' + bnode + ')' + wso + '\\.))'
+
+    # Note: Notice that the graph position does not include literals
+    # even though they are specified as a possible value in the
+    # N-Quads note (http://sw.deri.org/2008/07/n-quads/). This is
+    # intentional, as literals in that position are not supported by the
+    # RDF data model or the JSON-LD data model.
+    # See: https://github.com/digitalbazaar/pyld/pull/19
+
+    # full quad regex
+    quad = r'^' + wso + subject + predicate + obj + graph + wso + '$'
+
+    # compile once: both patterns are applied to every input line
+    empty_re = re.compile(empty)
+    quad_re = re.compile(quad)
+
+    # build RDF dataset
+    dataset = {}
+
+    # split N-Quad input into lines
+    lines = re.split(eoln, input_)
+    for line_number, line in enumerate(lines, 1):
+        # skip empty lines
+        if empty_re.search(line) is not None:
+            continue
+
+        # parse quad
+        match = quad_re.search(line)
+        if match is None:
+            raise ParserError(
+                'Error while parsing N-Quads invalid quad.',
+                line_number=line_number)
+        match = match.groups()
+
+        # create RDF triple
+        triple = {'subject': {}, 'predicate': {}, 'object': {}}
+
+        # get subject
+        if match[0] is not None:
+            triple['subject'] = {'type': 'IRI', 'value': match[0]}
+        else:
+            triple['subject'] = {'type': 'blank node', 'value': match[1]}
+
+        # get predicate
+        triple['predicate'] = {'type': 'IRI', 'value': match[2]}
+
+        # get object
+        if match[3] is not None:
+            triple['object'] = {'type': 'IRI', 'value': match[3]}
+        elif match[4] is not None:
+            triple['object'] = {'type': 'blank node', 'value': match[4]}
+        else:
+            triple['object'] = {'type': 'literal'}
+            # decode every escape in one left-to-right pass; chained
+            # str.replace() calls would re-read the second half of an
+            # escaped backslash as the start of another escape (`\\n`
+            # is a backslash followed by `n`, not a newline)
+            unescaped = _ESCAPE_RE.sub(
+                lambda m: _UNESCAPES[m.group(1)], match[5])
+            if match[6] is not None:
+                triple['object']['datatype'] = match[6]
+            elif match[7] is not None:
+                triple['object']['datatype'] = RDF_LANGSTRING
+                triple['object']['language'] = match[7]
+            else:
+                triple['object']['datatype'] = XSD_STRING
+            triple['object']['value'] = unescaped
+
+        # get graph name ('@default' is used for the default graph)
+        name = '@default'
+        if match[8] is not None:
+            name = match[8]
+        elif match[9] is not None:
+            name = match[9]
+
+        # add triple if unique to its graph (creating the graph on first
+        # use)
+        triples = dataset.setdefault(name, [])
+        if not any(_compare_rdf_triples(t, triple) for t in triples):
+            triples.append(triple)
+
+    return dataset
+
+
+def to_nquads(dataset):
+    """
+    Converts an RDF dataset to N-Quads.
+
+    :param dataset: the RDF dataset to convert.
+
+    :return: the N-Quads string.
+    """
+    quads = []
+    for graph_name, triples in dataset.items():
+        # the default graph is serialized without a graph name
+        if graph_name == '@default':
+            graph_name = None
+        for triple in triples:
+            quads.append(to_nquad(triple, graph_name))
+    quads.sort()
+    return ''.join(quads)
+
+
+def to_nquad(triple, graph_name=None):
+    """
+    Converts an RDF triple and graph name to an N-Quad string (a single
+    quad).
+
+    :param triple: the RDF triple or quad to convert (a triple or quad
+      may be passed, if a triple is passed then `graph_name` should be
+      given to specify the name of the graph the triple is in, `None`
+      for the default graph).
+    :param graph_name: the name of the graph containing the triple, None
+      for the default graph.
+
+    :return: the N-Quad string.
+    """
+    s = triple['subject']
+    p = triple['predicate']
+    o = triple['object']
+    g = triple.get('name', {'value': graph_name})['value']
+
+    quad = ''
+
+    # subject is an IRI
+    if s['type'] == 'IRI':
+        quad += '<' + s['value'] + '>'
+    else:
+        quad += s['value']
+    quad += ' '
+
+    # property is an IRI
+    if p['type'] == 'IRI':
+        quad += '<' + p['value'] + '>'
+    else:
+        quad += p['value']
+    quad += ' '
+
+    # object is IRI, bnode, or literal
+    if o['type'] == 'IRI':
+        quad += '<' + o['value'] + '>'
+    elif o['type'] == 'blank node':
+        quad += o['value']
+    else:
+        # the backslash must be escaped first so the backslashes added by
+        # the later replacements are not escaped a second time
+        escaped = (
+            o['value']
+            .replace('\\', '\\\\')
+            .replace('\t', '\\t')
+            .replace('\n', '\\n')
+            .replace('\r', '\\r')
+            .replace('\"', '\\"'))
+        quad += '"' + escaped + '"'
+        if o['datatype'] == RDF_LANGSTRING:
+            if o['language']:
+                quad += '@' + o['language']
+        elif o['datatype'] != XSD_STRING:
+            quad += '^^<' + o['datatype'] + '>'
+
+    # graph
+    if g is not None:
+        if not g.startswith('_:'):
+            quad += ' <' + g + '>'
+        else:
+            quad += ' ' + g
+
+    quad += ' .\n'
+    return quad
+
+
+def _compare_rdf_triples(t1, t2):
+    """
+    Compares two RDF triples for equality.
+
+    :param t1: the first triple.
+    :param t2: the second triple.
+
+    :return: True if the triples are the same, False if not.
+    """
+    for attr in ['subject', 'predicate', 'object']:
+        if(t1[attr]['type'] != t2[attr]['type'] or
+                t1[attr]['value'] != t2[attr]['value']):
+            return False
+
+    if t1['object'].get('language') != t2['object'].get('language'):
+        return False
+    if t1['object'].get('datatype') != t2['object'].get('datatype'):
+        return False
+
+    return True
+
+
+class ParserError(ValueError):
+    """
+    Base class for parsing errors.
+
+    :param message: the human-readable error message.
+    :param line_number: the 1-based input line the error was found on,
+      or None when not known.
+    """
+
+    def __init__(self, message, line_number=None):
+        super().__init__(message)
+        self.line_number = line_number