diff --git a/irods/experimental/client/http/README.md b/irods/experimental/client/http/README.md new file mode 100644 index 000000000..9e7f7acf9 --- /dev/null +++ b/irods/experimental/client/http/README.md @@ -0,0 +1,61 @@ +``` +(py3) userXY@HOSTNAME:~/python-irodsclient/irods/experimental/client/http$ bash iter_or_page.sh page +--- +[row(COLL_NAME='/'), + row(COLL_NAME='/tempZone'), + row(COLL_NAME='/tempZone/home')] +--- +[row(COLL_NAME='/tempZone/home/alice'), + row(COLL_NAME="/tempZone/home/alice/a'b"), + row(COLL_NAME='/tempZone/home/public')] +--- +[row(COLL_NAME='/tempZone/home/public/rods'), + row(COLL_NAME='/tempZone/home/public/thing'), + row(COLL_NAME='/tempZone/home/rods')] +--- +[row(COLL_NAME='/tempZone/home/rods/c_files'), + row(COLL_NAME='/tempZone/home/rods/hello'), + row(COLL_NAME='/tempZone/trash')] +--- +[row(COLL_NAME='/tempZone/trash/home'), + row(COLL_NAME='/tempZone/trash/home/alice'), + row(COLL_NAME='/tempZone/trash/home/public')] +--- +[row(COLL_NAME='/tempZone/trash/home/rods')] +--- +(py3) userXY@HOSTNAME:~/python-irodsclient/irods/experimental/client/http$ bash iter_or_page.sh iter +--- +row(COLL_NAME='/') +--- +row(COLL_NAME='/tempZone') +--- +row(COLL_NAME='/tempZone/home') +--- +row(COLL_NAME='/tempZone/home/alice') +--- +row(COLL_NAME="/tempZone/home/alice/a'b") +--- +row(COLL_NAME='/tempZone/home/public') +--- +row(COLL_NAME='/tempZone/home/public/rods') +--- +row(COLL_NAME='/tempZone/home/public/thing') +--- +row(COLL_NAME='/tempZone/home/rods') +--- +row(COLL_NAME='/tempZone/home/rods/c_files') +--- +row(COLL_NAME='/tempZone/home/rods/hello') +--- +row(COLL_NAME='/tempZone/trash') +--- +row(COLL_NAME='/tempZone/trash/home') +--- +row(COLL_NAME='/tempZone/trash/home/alice') +--- +row(COLL_NAME='/tempZone/trash/home/public') +--- +row(COLL_NAME='/tempZone/trash/home/rods') + +>>> +``` diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py new file mode 100644 index 000000000..7f1e33e8c 
--- /dev/null
+++ b/irods/experimental/client/http/__init__.py
@@ -0,0 +1,230 @@
+import collections
+import enum
+import functools
+import itertools
+import json
+import logging
+import requests
+import sys
+from .iterator_functions import *
+
+logger = logging.getLogger(__name__)
+MAX_INT32 = 2**31-1
+DEFAULT_PAGE_SIZE = 512
+
+# -----
+
+# Abstractions that let us either page through a general query items at a time,
+# or treat it like a Pythonic generator aka stateful iterator.
+# (See the README.md in this directory.)
+
+# TODO: The README is temporary. Make some better docs.
+
+class _pageable:
+    def __init__(self, callable_):
+        """callable_ is a function-like object called without parameters.
+        It pages once through the set of query results and should be
+        stateful in terms of maintaining current offset within the query.
+        """
+        self.callable_ = callable_
+    def next_page(self):
+        page = list(self.callable_())
+        return page
+
+class _iterable(_pageable):
+    """Adapts a pageable interface to return one query row at a time. An
+    empty [] returned from next_page signals the end of query results.
+    """
+    @functools.wraps(_pageable.__init__)
+    def __init__(self,*_):
+        super().__init__(*_)
+        self.__P = None
+        self.index = 0
+    # Allow iter() on instances.
+    def __iter__(self): return self
+    def __next__(self):
+        """Called implicitly by any iteration over the _iterable instance.
+        Returns one query row.
+        """
+        if self.__P is None or self.index >= len(self.__P):
+            self.__P = self.next_page()
+            self.index = 0
+            if 0 == len(self.__P):
+                raise StopIteration
+        element = self.__P[self.index]
+        self.index += 1
+        return element
+
+# -----
+
+class HTTP_operation_error(RuntimeError):
+    pass
+
+def _normalized_columns(columns):
+    if not isinstance(columns,(list,tuple)):
+        columns = filter(None, (_.strip() for _ in columns.split(',')))
+
+    # de-duplicate
+    columns = collections.OrderedDict((col,None) for col in columns)
+
+    col_names = tuple(columns.keys())
+    cls = collections.namedtuple('row', col_names)
+    return cls, ",".join(col_names)
+
+class DataObject:
+    class column:
+        class enum(enum.Enum):
+            DATA_ID = 401
+            DATA_COLL_ID = 402
+            DATA_NAME = 403
+            DATA_REPL_NUM = 404
+            # TODO: complete this list
+        names = [k for k in enum.__members__.keys()]
+
+class Collection:
+    class column:
+        class enum(enum.Enum):
+            COLL_ID = 500
+            COLL_NAME = 501
+            # TODO: complete this list
+        names = [k for k in enum.__members__.keys()]
+
+    # for heavyweight style of getter only!
+    def __init__(self, mgr, id_):
+        self.id = id_
+        self.mgr = mgr
+
+    @property
+    def name(self):
+        return self.mgr.value_by_column_name( self.id, 'COLL_NAME' )
+
+# -----------------
+# Manager/heavyweight approach to a catalog object "getter":
+#
+# This is an approximation of the old PRC approach
+# for getting an instance of a collection by its main
+# identifying data, the logical pathname.
+#
+# We most likely will not be doing things this way.
+# (See Session.data_object_replicas() method below.)
+
+class Manager:
+    def __init__(self, session):
+        self.sess = session
+
+    def value_by_column_name(self, id_, column_name:str):
+        first_row = one(self.sess.genquery1(columns = [column_name],
+                        condition = "COLL_ID = '{}'", args = [id_]))
+        return getattr(first_row, column_name)
+
+class CollManager(Manager):
+
+    def name_from_id(self, id_):
+        return one(self.sess.genquery1(columns = ['COLL_NAME'],
+                   condition = "COLL_ID = '{}'", args = [id_])).COLL_NAME
+
+    def get(self, collname):
+        r = self.sess.genquery1( columns = 'COLL_ID',
+                                 condition = "COLL_NAME = '{}'", args = [collname] )
+        return Collection(self, int(one(r).COLL_ID))
+
+# -----------------
+
+class Session:
+
+    url_base_template = 'http://{self.host}:{self.port}/irods-http/{self.version}'
+
+    # Convenient object properties.
+
+    @property
+    def url_base(self):
+        return self.url_base_template.format(**locals())
+
+    def url(self, endpoint_name):
+        return self.url_base + "/" + endpoint_name.strip("/")
+
+    @property
+    def auth_header(self):
+        return {'Authorization': 'Bearer ' + self.bearer_token}
+
+    # Low-level basis for implementing an endpoint via HTTP 'GET'.
+
+    def http_get(self, endpoint_name, **param_key_value_pairs):
+        r = requests.get( self.url(endpoint_name),
+                          headers = self.auth_header,
+                          params = param_key_value_pairs )
+        if not r.ok:
+            raise HTTP_operation_error("Failed in GET.")
+        return r.content.decode()
+
+    # -----------------
+    # Thin/lightweight approach to catalog object "getter":
+    #
+    def data_object(self, logical_path, *,
+                    query_options=(('offset',0),('count',DEFAULT_PAGE_SIZE))):
+        coll,data = logical_path.rsplit('/',1)
+        # TODO: embedded quotes in object names will not work here.
+        return self.genquery1(DataObject.column.names + Collection.column.names,
+                              "COLL_NAME = '{}' and DATA_NAME = '{}'".format(coll,data),
+                              extra_query_options=dict(query_options))
+
+    # Each endpoint can have its own method definition.
+
+    def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()):
+        """Return a generator-style iterator over all row results.
+        Example:
+            for row in session.genquery1( 'COLL_NAME' ):
+                print(row.COLL_NAME)
+
+        By default, one HTTP call to the server returns a single "row", which is not
+        terribly efficient. We can override the "count" option with an arbitrary
+        positive integer, effectively increasing the paging size for the query:
+
+            session.genquery1(columns, extra_query_options=dict(count=512)).
+
+        Since this function's result (a row-wise iterator) is page-size agnostic, its
+        usage is not altered, whereas the efficiency for large queries will greatly
+        improve due to the 512-fold decrease in the number of API calls.
+        """
+        condition = condition.format(*args)
+        row_class, columns = _normalized_columns(columns)
+        where = '' if condition == '' else ' WHERE '
+
+        # d's default argument (being mutable) is evaluated only once, when get_r is
+        # defined, so the same dict object persists beyond the genquery1 call frame in
+        # which it originated and is shared across multiple calls to get_r.
+        # This is leveraged to increment the query offset at the end of each get_r call
+        # by the length of the rows array retrieved.
+
+        def get_r(local_ = locals(), d = dict(extra_query_options)):
+            if 'offset' not in d:
+                d['offset'] = 0
+            d['offset'] = int(d['offset'])
+            result = self.http_get('/query',
+                                   op = "execute_genquery",
+                                   query = "SELECT {columns}{where}{condition}".format(**local_),
+                                   **d)
+            json_result = json.loads(result)
+            errcode = json_result['irods_response']['error_code']
+            if errcode != 0:
+                logger.warning('irods error code of [%s] in genquery1',errcode)
+            rows = [row_class(*i) for i in json_result['rows']]
+            d['offset'] += len(rows)
+            return rows
+
+        return _iterable(get_r)
+
+    def __init__(self, username, password, *,
+                 host = 'localhost',
+                 port = 9000,
+                 version = '0.9.5'):
+
+        self.username = username
+        self.password = password
+        (self.host, self.port, self.version) = (host, port, version)
+        url = self.url_base + '/authenticate'
+        r = requests.post(url, auth = (self.username, self.password))
+        if not r.ok:
+            raise HTTP_operation_error("Failed to connect: url = '%s', status code = %s"
+                                       % (url, r.status_code))
+        self.bearer_token = r.text
diff --git a/irods/experimental/client/http/iter_or_page.sh b/irods/experimental/client/http/iter_or_page.sh
new file mode 100644
index 000000000..83e5b2aac
--- /dev/null
+++ b/irods/experimental/client/http/iter_or_page.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+if [ $# -gt 0 ]; then
+    arg=$1
+else
+    echo >&2 "usage: $0 [page|iter]"; exit 1
+fi
+
+python -c "
+import pprint
+from irods.experimental.client.http import *
+s = Session('rods','rods')
+i = s.genquery1('COLL_NAME', condition='', args=(), extra_query_options=dict(count=3))
+import sys
+if sys.argv[1] == 'page':
+    while True:
+        print('---')
+        p = i.next_page()
+        if not p:
+            break
+        pprint.pprint(p)
+elif sys.argv[1] == 'iter':
+    for j in i:
+        print('---')
+        pprint.pprint(j)
+" ${arg}
diff --git a/irods/experimental/client/http/iterator_functions.py b/irods/experimental/client/http/iterator_functions.py
new file mode 100644
index 000000000..c42404500
--- /dev/null
+++ b/irods/experimental/client/http/iterator_functions.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+import itertools
+import sys
+import typing
+
+class too_many_results(Exception): pass
+class too_few_results(Exception): pass
+
+__all__ = ['first_n','one','too_many_results','too_few_results']
+
+def first_n(iterable: typing.Iterable, n: int):
+    return list(itertools.islice(iterable,n))
+
+def one(iterable: typing.Iterable):
+    i = first_n(iterable,2)
+    if i[1:]:
+        raise too_many_results
+    if not i:
+        raise too_few_results
+    return i[0]
+
+def test_one():
+    assert(
+        one(iter(range(10,11))) == 10
+    )
+
+def test_first_n():
+    assert(
+        first_n(iter(range(10,15)),2) == [10,11]
+    )
+
+if __name__=='__main__':
+    test_one()
+    test_first_n()
diff --git a/irods/prc_http_client_demo.py b/irods/prc_http_client_demo.py
new file mode 100644
index 000000000..258f7eb7a
--- /dev/null
+++ b/irods/prc_http_client_demo.py
@@ -0,0 +1,39 @@
+import pprint
+
+from irods.experimental.client.http import *
+from irods.experimental.client.http.iterator_functions import *
+
+s = Session('rods','rods',host='prec3431')
+c = CollManager(s).get("/tempZone/home/rods")
+
+print ("Got a collection {c.name}, id = {c.id}".format(**locals()))
+
+# Query collections by explicit column list.
+result = s.genquery1(['COLL_ID', 'COLL_NAME'],   # columns
+                     "COLL_NAME like '%'",       # condition
+                     extra_query_options=dict(count='512'))
+print("Result of collection query:\n"
+      "---------------------------\n")
+
+result = list(result)
+pprint.pprint(result)
+print('Length of result was:',len(result))
+
+# For a query of all data objects (note lack of condition argument), list full paths.
+for row in s.genquery1('COLL_NAME,DATA_NAME',
+                       extra_query_options=dict(count='512')):
+    print('path = {COLL_NAME}/{DATA_NAME}'.format(**row._asdict()))
+
+# Fetch the data object requested.
+data_path = "/tempZone/home/alice/new_alice.dat" + +print ('-- fetch first replica --') + +data_obj = first_n(s.data_object(data_path),n=1) +print(data_obj) + +print ('-- fetch all replicas without paging --') + +MAX_REPLICAS = 2**31-1 +data_obj_replicas = list(s.data_object(data_path, query_options=dict(count=MAX_REPLICAS))) +pprint.pprint(data_obj_replicas) diff --git a/setup.py b/setup.py index d280ced1c..bb3ee8826 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ 'six>=1.10.0', 'PrettyTable>=0.7.2', 'defusedxml', + 'requests', # - the new syntax: #'futures; python_version == "2.7"' ],