From f406e17cdf35f3ad3db09aeb9c82a9154d9ecb1b Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Tue, 18 Jul 2023 13:13:39 -0400 Subject: [PATCH 01/12] [_465] introduce irods.client.http.Session and supporting classes irods.client.http.Session is a new way of connecting and reflects one possible future of iRODS clients, ie communicating via HTTP instead of the traditional iRODS protocol. --- irods/client/experimental/http/__init__.py | 116 +++++++++++++++++++++ irods/prc_http_client_demo.py | 26 +++++ setup.py | 1 + 3 files changed, 143 insertions(+) create mode 100644 irods/client/experimental/http/__init__.py create mode 100644 irods/prc_http_client_demo.py diff --git a/irods/client/experimental/http/__init__.py b/irods/client/experimental/http/__init__.py new file mode 100644 index 000000000..9a613923e --- /dev/null +++ b/irods/client/experimental/http/__init__.py @@ -0,0 +1,116 @@ +import collections +import json +import logging +import requests +import sys + +def _normalized_columns(columns): + if not isinstance(columns,(list,tuple)): + columns = filter(None, (_.strip() for _ in columns.split(','))) + + # de-duplicate + columns = collections.OrderedDict((col,None) for col in columns) + + col_names = tuple(columns.keys()) + cls = collections.namedtuple('row', col_names) + return cls, ",".join(col_names) + +logger = logging.getLogger(__name__) + +class HTTP_operation_error(RuntimeError): + pass + +class Collection: + + def __init__(self, mgr, id_): + self.id = id_ + self.mgr = mgr + + @property + def name(self): + return self.mgr.value_by_column_name( self.id, 'COLL_NAME' ) + +# ----------------- + +class Manager: + def __init__(self, session): + sess = self.sess = session + + def value_by_column_name(self, id_, column_name:str): + first_row = self.sess.genquery1(columns = [column_name], + condition = "COLL_ID = '{}'", args = [id_])[0] + return getattr(first_row, column_name) + +class CollManager(Manager): + + def name_from_id(self, id_): + return 
self.sess.genquery1(columns = ['COLL_NAME'], + condition = "COLL_ID = '{}'", args = [id_])[0].COLL_NAME + + def get(self, collname): + jr = self.sess.genquery1( columns = 'COLL_ID', + condition = "COLL_NAME = '{}'", args = [collname] ) + return Collection(self, int(jr[0].COLL_ID)) + +# ----------------- + +class Session: + + url_base_template = 'http://{self.host}:{self.port}/irods-http/{self.version}' + + # Convenient object properties. + + @property + def url_base(self): + return self.url_base_template.format(**locals()) + + def url(self, endpoint_name): + return self.url_base + "/" + endpoint_name.strip("/") + + @property + def auth_header(self): + return {'Authorization': 'Bearer ' + self.bearer_token} + + # Low-level basis for implementing an endpoint via HTTP 'GET'. + + def http_get(self, endpoint_name, **param_key_value_pairs): + r = requests.get( self.url(endpoint_name), + headers = self.auth_header, + params = param_key_value_pairs ) + if not r.ok: + raise HTTP_operation_error("Failed in GET.") + return r.content.decode() + + # Each endpoint can have its own method definition. 
+ + def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()): + ## maybe require Python3.8 so we can have format strings, for example - + # query_text = f"SELECT {columns} where {condition.format(*args)}" + condition = condition.format(*args) + cls, columns = _normalized_columns(columns) + where = '' if condition == '' else ' WHERE ' + r = self.http_get( '/query', + op = "execute_genquery", + query = "SELECT {columns}{where}{condition}".format(**locals()), + **dict(extra_query_options)) + J = json.loads(r) + errcode = J['irods_response']['error_code'] + if errcode != 0: + logger.warn('irods error code of [%s] in genquery1',errcode) + return [cls(*i) for i in J['rows']] + + def __init__(self, username, password, *, + host = 'localhost', + port = 9000, + version = '0.9.5'): + + self.username = username + self.password = password + (self.host, self.port, self.version) = (host, port, version) + url = self.url_base + '/authenticate' + r = requests.post(url, auth = (self.username, self.password)) + if not r.ok: + raise HTTP_operation_error("Failed to connect: url = '%s', status code = %s", + url, r.status_code) + self.bearer_token = r.text + diff --git a/irods/prc_http_client_demo.py b/irods/prc_http_client_demo.py new file mode 100644 index 000000000..a095c2343 --- /dev/null +++ b/irods/prc_http_client_demo.py @@ -0,0 +1,26 @@ +import pprint + +from irods.client.experimental.http import * + +s = Session('rods','rods',host='prec3431') +c = CollManager(s).get("/tempZone/home/rods") + +print ("Got a collection {c.name}, id = {c.id}".format(**locals())) + +# TODO: a *_generator or *_pager method which iterates or pages through results + +# collections + +result = s.genquery1(['COLL_ID', 'COLL_NAME'], # columns + "COLL_NAME like '%'", # condition + extra_query_options=dict(count='512')) + +pprint.pprint(result) +print('len=',len(result)) + +# data objects, list full paths + +for row in s.genquery1('COLL_NAME,DATA_NAME', # note 1 - we can also parse 
the from a string + # note 2 - argument is optional + extra_query_options=dict(count='512')): + print('path = {row.COLL_NAME}/{row.DATA_NAME}'.format(**locals())) diff --git a/setup.py b/setup.py index d280ced1c..bb3ee8826 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ 'six>=1.10.0', 'PrettyTable>=0.7.2', 'defusedxml', + 'requests', # - the new syntax: #'futures; python_version == "2.7"' ], From f7c5910847b9f83128f98ca6552c5b979d348543 Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Wed, 19 Jul 2023 16:54:20 -0400 Subject: [PATCH 02/12] more lightweight objects & getters. (See Session.data_object_replicas). Also, swap the client and experimental namespaces. And rework the demo slightly. --- .../client}/http/__init__.py | 51 ++++++++++++++++--- irods/prc_http_client_demo.py | 23 +++++---- 2 files changed, 58 insertions(+), 16 deletions(-) rename irods/{client/experimental => experimental/client}/http/__init__.py (72%) diff --git a/irods/client/experimental/http/__init__.py b/irods/experimental/client/http/__init__.py similarity index 72% rename from irods/client/experimental/http/__init__.py rename to irods/experimental/client/http/__init__.py index 9a613923e..71d99cc0e 100644 --- a/irods/client/experimental/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -1,9 +1,15 @@ import collections +import enum import json import logging import requests import sys +logger = logging.getLogger(__name__) + +class HTTP_operation_error(RuntimeError): + pass + def _normalized_columns(columns): if not isinstance(columns,(list,tuple)): columns = filter(None, (_.strip() for _ in columns.split(','))) @@ -15,13 +21,25 @@ def _normalized_columns(columns): cls = collections.namedtuple('row', col_names) return cls, ",".join(col_names) -logger = logging.getLogger(__name__) - -class HTTP_operation_error(RuntimeError): - pass +class DataObject: + class column: + class enum(enum.Enum): + DATA_ID = 401 + DATA_COLL_ID = 402 + DATA_NAME = 403 + DATA_REPL_NUM = 404 + # 
TODO: complete this list + names = [k for k in enum.__members__.keys()] class Collection: - + class column: + class enum(enum.Enum): + COLL_ID = 500 + COLL_NAME = 501 + # TODO: complete this list + names = [k for k in enum.__members__.keys()] + + # for heavyweight style of getter only! def __init__(self, mgr, id_): self.id = id_ self.mgr = mgr @@ -31,6 +49,14 @@ def name(self): return self.mgr.value_by_column_name( self.id, 'COLL_NAME' ) # ----------------- +# Manager/heavyweight approach to a catalog object "getter": +# +# This is an approximation of the old PRC approach +# for getting an instance of a collection by its main +# identifying data, the logical pathname. +# +# We most likely will not be doing things this way. +# (See Session.data_object_replicas() method below.) class Manager: def __init__(self, session): @@ -81,11 +107,24 @@ def http_get(self, endpoint_name, **param_key_value_pairs): raise HTTP_operation_error("Failed in GET.") return r.content.decode() + # ----------------- + # Thin/lightweight approach to catalog object "getter": + # + def data_object_replicas(self, logical_path): + coll,data = logical_path.rsplit('/',1) + # TODO: embedded quotes in object names will not work here. + return self.genquery1(DataObject.column.names + Collection.column.names, + "COLL_NAME = '{}' and DATA_NAME = '{}'".format(coll,data), + extra_query_options={'count':500}) + # Each endpoint can have its own method definition. 
def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()): - ## maybe require Python3.8 so we can have format strings, for example - + + # TODO/discuss: + # Should we require Python3.8 so we can have format strings, e.g.: # query_text = f"SELECT {columns} where {condition.format(*args)}" + condition = condition.format(*args) cls, columns = _normalized_columns(columns) where = '' if condition == '' else ' WHERE ' diff --git a/irods/prc_http_client_demo.py b/irods/prc_http_client_demo.py index a095c2343..d3ad4f013 100644 --- a/irods/prc_http_client_demo.py +++ b/irods/prc_http_client_demo.py @@ -1,6 +1,6 @@ import pprint -from irods.client.experimental.http import * +from irods.experimental.client.http import * s = Session('rods','rods',host='prec3431') c = CollManager(s).get("/tempZone/home/rods") @@ -9,18 +9,21 @@ # TODO: a *_generator or *_pager method which iterates or pages through results -# collections - +# Query collections by explicit column list. result = s.genquery1(['COLL_ID', 'COLL_NAME'], # columns "COLL_NAME like '%'", # condition extra_query_options=dict(count='512')) - +print("Result of collection query:\n" + "---------------------------\n") pprint.pprint(result) -print('len=',len(result)) +print('Length of result was:',len(result)) -# data objects, list full paths - -for row in s.genquery1('COLL_NAME,DATA_NAME', # note 1 - we can also parse the from a string - # note 2 - argument is optional +# For a query of all data objects (note lack of condition argument), list full paths. +for row in s.genquery1('COLL_NAME,DATA_NAME', extra_query_options=dict(count='512')): - print('path = {row.COLL_NAME}/{row.DATA_NAME}'.format(**locals())) + print('path = {COLL_NAME}/{DATA_NAME}'.format(**row._asdict())) + +# Fetch all columns for the data object requested. 
+data_path = "/tempZone/home/alice/new_alice.dat" +x = s.data_object_replicas(data_path) +print("'{}' has {} replicas we can access".format(data_path, len(x))) From 91bf56315c2cf9a477116d05a365e94de3399d44 Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Thu, 20 Jul 2023 02:31:00 -0400 Subject: [PATCH 03/12] allow genquery1 to be both paged and iterated by row --- irods/experimental/client/http/README.md | 61 +++++++++++++++ irods/experimental/client/http/__init__.py | 78 ++++++++++++++++--- .../experimental/client/http/iter_or_page.sh | 26 +++++++ 3 files changed, 155 insertions(+), 10 deletions(-) create mode 100644 irods/experimental/client/http/README.md create mode 100644 irods/experimental/client/http/iter_or_page.sh diff --git a/irods/experimental/client/http/README.md b/irods/experimental/client/http/README.md new file mode 100644 index 000000000..9e7f7acf9 --- /dev/null +++ b/irods/experimental/client/http/README.md @@ -0,0 +1,61 @@ +``` +(py3) userXY@HOSTNAME:~/python-irodsclient/irods/experimental/client/http$ bash iter_or_page.sh page +--- +[row(COLL_NAME='/'), + row(COLL_NAME='/tempZone'), + row(COLL_NAME='/tempZone/home')] +--- +[row(COLL_NAME='/tempZone/home/alice'), + row(COLL_NAME="/tempZone/home/alice/a'b"), + row(COLL_NAME='/tempZone/home/public')] +--- +[row(COLL_NAME='/tempZone/home/public/rods'), + row(COLL_NAME='/tempZone/home/public/thing'), + row(COLL_NAME='/tempZone/home/rods')] +--- +[row(COLL_NAME='/tempZone/home/rods/c_files'), + row(COLL_NAME='/tempZone/home/rods/hello'), + row(COLL_NAME='/tempZone/trash')] +--- +[row(COLL_NAME='/tempZone/trash/home'), + row(COLL_NAME='/tempZone/trash/home/alice'), + row(COLL_NAME='/tempZone/trash/home/public')] +--- +[row(COLL_NAME='/tempZone/trash/home/rods')] +--- +(py3) userXY@HOSTNAME:~/python-irodsclient/irods/experimental/client/http$ bash iter_or_page.sh iter +--- +row(COLL_NAME='/') +--- +row(COLL_NAME='/tempZone') +--- +row(COLL_NAME='/tempZone/home') +--- +row(COLL_NAME='/tempZone/home/alice') 
+--- +row(COLL_NAME="/tempZone/home/alice/a'b") +--- +row(COLL_NAME='/tempZone/home/public') +--- +row(COLL_NAME='/tempZone/home/public/rods') +--- +row(COLL_NAME='/tempZone/home/public/thing') +--- +row(COLL_NAME='/tempZone/home/rods') +--- +row(COLL_NAME='/tempZone/home/rods/c_files') +--- +row(COLL_NAME='/tempZone/home/rods/hello') +--- +row(COLL_NAME='/tempZone/trash') +--- +row(COLL_NAME='/tempZone/trash/home') +--- +row(COLL_NAME='/tempZone/trash/home/alice') +--- +row(COLL_NAME='/tempZone/trash/home/public') +--- +row(COLL_NAME='/tempZone/trash/home/rods') + +>>> +``` diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py index 71d99cc0e..19b94a583 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -1,5 +1,6 @@ import collections import enum +import itertools import json import logging import requests @@ -7,6 +8,39 @@ logger = logging.getLogger(__name__) +# ----- + +# Abstractions that let us either page through a general query items at a time, +# or treat it like a Pythonic generator aka stateful iterator. +# (See the README.md in this directory.) + +# TODO: The README is temporary. Make some better docs. + +class _pageable: + def __init__(self, callable_): + self.callable_ = callable_ + def next_page(self): + page = list(self.callable_()) + return page + +class _iterable(_pageable): + def __init__(self,*_): + super().__init__(*_) + self.__P = None + self.index = 0 + def __iter__(self): return self + def __next__(self): + if self.__P is None or self.index >= len(self.__P): + self.__P = self.next_page() + self.index = 0 + if 0 == len(self.__P): + raise StopIteration + element = self.__P[self.index] + self.index += 1 + return element + +# ----- + class HTTP_operation_error(RuntimeError): pass @@ -119,7 +153,7 @@ def data_object_replicas(self, logical_path): # Each endpoint can have its own method definition. 
- def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()): + def genquery1(self, columns, condition='', *, args=(), extra_query_options = (('offset',0),)): # TODO/discuss: # Should we require Python3.8 so we can have format strings, e.g.: @@ -128,15 +162,39 @@ def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()) condition = condition.format(*args) cls, columns = _normalized_columns(columns) where = '' if condition == '' else ' WHERE ' - r = self.http_get( '/query', - op = "execute_genquery", - query = "SELECT {columns}{where}{condition}".format(**locals()), - **dict(extra_query_options)) - J = json.loads(r) - errcode = J['irods_response']['error_code'] - if errcode != 0: - logger.warn('irods error code of [%s] in genquery1',errcode) - return [cls(*i) for i in J['rows']] + + extra_query_options_d = dict(extra_query_options) + + # --- For the time being, genquery1 returns variable types depending on offset parameter. + # + # If *NO* offset is given (ie extra_query_options parameter is forced to {} or {'count':C}, + # we return a result than can be either paged or row-iterated. (Again, see README) + + # But if an offset is given, we just return what the API hands us, which seems to be + # one result (count=1) by default. 
+ + def get_r(local_ = locals(), d = extra_query_options_d.copy()): + if 'offset' not in d: + d['offset'] = 0 + r = self.http_get( '/query', + op = "execute_genquery", + query = "SELECT {columns}{where}{condition}".format(**local_), + **d) + + d['offset'] += d.get('count',512) + + J = json.loads(r) + errcode = J['irods_response']['error_code'] + if errcode != 0: + logger.warn('irods error code of [%s] in genquery1',errcode) + return [cls(*i) for i in J['rows']] + return r + + if 'offset' in extra_query_options_d: + return get_r() + else: + return _iterable(get_r) + #return (get_r) def __init__(self, username, password, *, host = 'localhost', diff --git a/irods/experimental/client/http/iter_or_page.sh b/irods/experimental/client/http/iter_or_page.sh new file mode 100644 index 000000000..83e5b2aac --- /dev/null +++ b/irods/experimental/client/http/iter_or_page.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [ $# -gt 0 ]; then + arg=${1:=iter} +else + echo >&2 "usage: $0 [page|iter]"; exit 1 +fi + +python -c " +import pprint +from irods.experimental.client.http import * +s = Session('rods','rods') +i = s.genquery1('COLL_NAME', condition='', args=(), extra_query_options=dict(count=3)) +import sys +if sys.argv[1] == 'page': + while True: + print('---') + p = i.next_page() + if not p: + break + pprint.pprint(p) +elif sys.argv[1] == 'iter': + for j in i: + print('---') + pprint.pprint(j) + " ${arg} From 92217c3bd7a7c4045907c3c496de87c8d351e2af Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Thu, 20 Jul 2023 03:07:50 -0400 Subject: [PATCH 04/12] [_465] fix minor things about demo and module regularize use of int and string for offset, count data_object gets one replica by default can pass genquery options to session.data_object --- irods/experimental/client/http/__init__.py | 8 +++++--- irods/prc_http_client_demo.py | 19 ++++++++++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/irods/experimental/client/http/__init__.py 
b/irods/experimental/client/http/__init__.py index 19b94a583..331c53ff7 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -144,12 +144,13 @@ def http_get(self, endpoint_name, **param_key_value_pairs): # ----------------- # Thin/lightweight approach to catalog object "getter": # - def data_object_replicas(self, logical_path): + def data_object(self, logical_path, *, + query_options=(('offset',0),('count',1))): coll,data = logical_path.rsplit('/',1) # TODO: embedded quotes in object names will not work here. return self.genquery1(DataObject.column.names + Collection.column.names, "COLL_NAME = '{}' and DATA_NAME = '{}'".format(coll,data), - extra_query_options={'count':500}) + extra_query_options=dict(query_options)) # Each endpoint can have its own method definition. @@ -176,12 +177,13 @@ def genquery1(self, columns, condition='', *, args=(), extra_query_options = ((' def get_r(local_ = locals(), d = extra_query_options_d.copy()): if 'offset' not in d: d['offset'] = 0 + d['offset'] = int(d['offset']) r = self.http_get( '/query', op = "execute_genquery", query = "SELECT {columns}{where}{condition}".format(**local_), **d) - d['offset'] += d.get('count',512) + d['offset'] += int(d.get('count','512')) J = json.loads(r) errcode = J['irods_response']['error_code'] diff --git a/irods/prc_http_client_demo.py b/irods/prc_http_client_demo.py index d3ad4f013..05a74fbd5 100644 --- a/irods/prc_http_client_demo.py +++ b/irods/prc_http_client_demo.py @@ -15,15 +15,28 @@ extra_query_options=dict(count='512')) print("Result of collection query:\n" "---------------------------\n") + +result = list(result) pprint.pprint(result) print('Length of result was:',len(result)) +#exit()#dwm + # For a query of all data objects (note lack of condition argument), list full paths. 
for row in s.genquery1('COLL_NAME,DATA_NAME', extra_query_options=dict(count='512')): print('path = {COLL_NAME}/{DATA_NAME}'.format(**row._asdict())) -# Fetch all columns for the data object requested. +# Fetch the data object requested. data_path = "/tempZone/home/alice/new_alice.dat" -x = s.data_object_replicas(data_path) -print("'{}' has {} replicas we can access".format(data_path, len(x))) + +print ('-- fetch first replica --') + +data_obj = s.data_object(data_path) +print(data_obj) + +print ('-- fetch all replicas --') + +MAX_REPLICAS = 2**31-1 +data_obj_replicas = list(s.data_object(data_path, query_options=dict(count=MAX_REPLICAS))) +pprint.pprint(data_obj_replicas) From dd4acc8326aeddd94ee7cc8e5132f1438681e3ac Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Thu, 20 Jul 2023 16:23:08 -0400 Subject: [PATCH 05/12] correct the genquery code offset no longer used to indicate return type. len of page properly used to update offset in each call to get_r to get initial page, we can call next_page() init on returned iterator. other offsets of pages should be done Pythonically ie with itertools.islice --- irods/experimental/client/http/__init__.py | 27 +++++++--------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py index 331c53ff7..5fd038780 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -1,5 +1,6 @@ import collections import enum +import functools import itertools import json import logging @@ -154,7 +155,7 @@ def data_object(self, logical_path, *, # Each endpoint can have its own method definition. 
- def genquery1(self, columns, condition='', *, args=(), extra_query_options = (('offset',0),)): + def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()): # TODO/discuss: # Should we require Python3.8 so we can have format strings, e.g.: @@ -166,14 +167,6 @@ def genquery1(self, columns, condition='', *, args=(), extra_query_options = ((' extra_query_options_d = dict(extra_query_options) - # --- For the time being, genquery1 returns variable types depending on offset parameter. - # - # If *NO* offset is given (ie extra_query_options parameter is forced to {} or {'count':C}, - # we return a result than can be either paged or row-iterated. (Again, see README) - - # But if an offset is given, we just return what the API hands us, which seems to be - # one result (count=1) by default. - def get_r(local_ = locals(), d = extra_query_options_d.copy()): if 'offset' not in d: d['offset'] = 0 @@ -183,20 +176,16 @@ def get_r(local_ = locals(), d = extra_query_options_d.copy()): query = "SELECT {columns}{where}{condition}".format(**local_), **d) - d['offset'] += int(d.get('count','512')) - J = json.loads(r) errcode = J['irods_response']['error_code'] if errcode != 0: logger.warn('irods error code of [%s] in genquery1',errcode) - return [cls(*i) for i in J['rows']] - return r - - if 'offset' in extra_query_options_d: - return get_r() - else: - return _iterable(get_r) - #return (get_r) + + rows = [cls(*i) for i in J['rows']] + d['offset'] += len(rows) + return rows + + return _iterable(get_r) def __init__(self, username, password, *, host = 'localhost', From 807697a58cb46a1ab42abe808a07055d991654ff Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Thu, 20 Jul 2023 17:39:21 -0400 Subject: [PATCH 06/12] tidy; improve variable names --- irods/experimental/client/http/__init__.py | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py index 
5fd038780..2e692af68 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -162,26 +162,28 @@ def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()) # query_text = f"SELECT {columns} where {condition.format(*args)}" condition = condition.format(*args) - cls, columns = _normalized_columns(columns) + row_class, columns = _normalized_columns(columns) where = '' if condition == '' else ' WHERE ' - extra_query_options_d = dict(extra_query_options) + # d's default argument (being mutable) gets memoized in the context of the + # current closure, which persists beyond the genquery1 call frame in which it + # originated and across multiple calls to get_r. + # This can be leveraged to increment the query offset at the end of each get_r call + # by the length of the rows array retrieved. - def get_r(local_ = locals(), d = extra_query_options_d.copy()): + def get_r(local_ = locals(), d = dict(extra_query_options)): if 'offset' not in d: d['offset'] = 0 d['offset'] = int(d['offset']) - r = self.http_get( '/query', - op = "execute_genquery", - query = "SELECT {columns}{where}{condition}".format(**local_), - **d) - - J = json.loads(r) - errcode = J['irods_response']['error_code'] + result = self.http_get('/query', + op = "execute_genquery", + query = "SELECT {columns}{where}{condition}".format(**local_), + **d) + json_result = json.loads(result) + errcode = json_result['irods_response']['error_code'] if errcode != 0: logger.warn('irods error code of [%s] in genquery1',errcode) - - rows = [cls(*i) for i in J['rows']] + rows = [row_class(*i) for i in json_result['rows']] d['offset'] += len(rows) return rows From ec7ca36ab298dbc04e496e50cbaf52aef59dcb9c Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Sun, 23 Jul 2023 07:54:13 -0400 Subject: [PATCH 07/12] docstrings for _pageable and _iterable interfaces --- irods/experimental/client/http/__init__.py | 14 +++++++++++++- 1 file changed, 13 
insertions(+), 1 deletion(-) diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py index 2e692af68..42838ca56 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -17,20 +17,32 @@ # TODO: The README is temporary. Make some better docs. -class _pageable: +class _pageable: def __init__(self, callable_): + """callable_ is a function-like object called without parameters. + It pages once through the set of query results and should be + stateful in terms of maintaining current offset within the query. + """ self.callable_ = callable_ def next_page(self): page = list(self.callable_()) return page class _iterable(_pageable): + """Adapts a pageable interface to return one query row at a time. An + empty [] returned from next_page signals the end of query results. + """ + @functools.wraps(_pageable.__init__) def __init__(self,*_): super().__init__(*_) self.__P = None self.index = 0 + # Allow iter() on instances. def __iter__(self): return self def __next__(self): + """Called implicitly by any iteration over the _iterable instance. + Returns one query row. 
+ """ if self.__P is None or self.index >= len(self.__P): self.__P = self.next_page() self.index = 0 From 1b729376f2123a9cd1343802107def63d9f0823f Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Sun, 23 Jul 2023 09:56:50 -0400 Subject: [PATCH 08/12] add iterator functions --- irods/experimental/client/http/__init__.py | 4 --- .../client/http/iterator_functions.py | 31 +++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 irods/experimental/client/http/iterator_functions.py diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py index 42838ca56..eb9c3dfdf 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -169,10 +169,6 @@ def data_object(self, logical_path, *, def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()): - # TODO/discuss: - # Should we require Python3.8 so we can have format strings, e.g.: - # query_text = f"SELECT {columns} where {condition.format(*args)}" - condition = condition.format(*args) row_class, columns = _normalized_columns(columns) where = '' if condition == '' else ' WHERE ' diff --git a/irods/experimental/client/http/iterator_functions.py b/irods/experimental/client/http/iterator_functions.py new file mode 100644 index 000000000..bf0950845 --- /dev/null +++ b/irods/experimental/client/http/iterator_functions.py @@ -0,0 +1,31 @@ +#/usr/bin/env python3 +import itertools +import sys +import typing + +class too_many_results(Exception): pass +class too_few_results(Exception): pass + +def first_n(iterable: typing.Iterable, n: int): + return list(itertools.islice(iterable,n)) + +def one(iterable: typing.Iterable): + i = first_n(iterable,2) + if i[1:]: + raise too_many_results + if not i: + raise too_few_results + return i[0] + +def test_one(): + assert( + one(iter(range(10,10+i))) == 10 + ) + +def test_first_n(): + assert( + first_n(iter(range(10,10+i)),2) == [10,11] + ) + +if 
__name__=='__main__': + test_one() From 92b915ab1a577a111071e88c6c292ba0219e972f Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Sun, 23 Jul 2023 10:23:23 -0400 Subject: [PATCH 09/12] use iterator_functions in lightweight getter for data_object --- irods/experimental/client/http/__init__.py | 20 ++++++++++--------- .../client/http/iterator_functions.py | 2 ++ irods/prc_http_client_demo.py | 7 +++---- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py index eb9c3dfdf..f913d4579 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -6,8 +6,11 @@ import logging import requests import sys +from .iterator_functions import * logger = logging.getLogger(__name__) +MAX_INT32 = 2**31-1 +DEFAULT_PAGE_SIZE = 512 # ----- @@ -110,20 +113,20 @@ def __init__(self, session): sess = self.sess = session def value_by_column_name(self, id_, column_name:str): - first_row = self.sess.genquery1(columns = [column_name], - condition = "COLL_ID = '{}'", args = [id_])[0] + first_row = one(self.sess.genquery1(columns = [column_name], + condition = "COLL_ID = '{}'", args = [id_])) return getattr(first_row, column_name) class CollManager(Manager): def name_from_id(self, id_): - return self.sess.genquery1(columns = ['COLL_NAME'], - condition = "COLL_ID = '{}'", args = [id_])[0].COLL_NAME + return one(self.sess.genquery1(columns = ['COLL_NAME'], + condition = "COLL_ID = '{}'", args = [id_])).COLL_NAME def get(self, collname): - jr = self.sess.genquery1( columns = 'COLL_ID', - condition = "COLL_NAME = '{}'", args = [collname] ) - return Collection(self, int(jr[0].COLL_ID)) + r = self.sess.genquery1( columns = 'COLL_ID', + condition = "COLL_NAME = '{}'", args = [collname] ) + return Collection(self, int(one(r).COLL_ID)) # ----------------- @@ -158,7 +161,7 @@ def http_get(self, endpoint_name, **param_key_value_pairs): # Thin/lightweight approach to 
catalog object "getter": # def data_object(self, logical_path, *, - query_options=(('offset',0),('count',1))): + query_options=(('offset',0),('count',DEFAULT_PAGE_SIZE))): coll,data = logical_path.rsplit('/',1) # TODO: embedded quotes in object names will not work here. return self.genquery1(DataObject.column.names + Collection.column.names, @@ -211,4 +214,3 @@ def __init__(self, username, password, *, raise HTTP_operation_error("Failed to connect: url = '%s', status code = %s", url, r.status_code) self.bearer_token = r.text - diff --git a/irods/experimental/client/http/iterator_functions.py b/irods/experimental/client/http/iterator_functions.py index bf0950845..c42404500 100644 --- a/irods/experimental/client/http/iterator_functions.py +++ b/irods/experimental/client/http/iterator_functions.py @@ -6,6 +6,8 @@ class too_many_results(Exception): pass class too_few_results(Exception): pass +__all__ = ['first_n','one','too_many_results','too_few_results'] + def first_n(iterable: typing.Iterable, n: int): return list(itertools.islice(iterable,n)) diff --git a/irods/prc_http_client_demo.py b/irods/prc_http_client_demo.py index 05a74fbd5..c5c08be8e 100644 --- a/irods/prc_http_client_demo.py +++ b/irods/prc_http_client_demo.py @@ -1,6 +1,7 @@ import pprint from irods.experimental.client.http import * +from irods.experimental.client.http.iterator_functions import * s = Session('rods','rods',host='prec3431') c = CollManager(s).get("/tempZone/home/rods") @@ -20,8 +21,6 @@ pprint.pprint(result) print('Length of result was:',len(result)) -#exit()#dwm - # For a query of all data objects (note lack of condition argument), list full paths. 
for row in s.genquery1('COLL_NAME,DATA_NAME', extra_query_options=dict(count='512')): @@ -32,10 +31,10 @@ print ('-- fetch first replica --') -data_obj = s.data_object(data_path) +data_obj = first_n(s.data_object(data_path),n=1) print(data_obj) -print ('-- fetch all replicas --') +print ('-- fetch all replicas without paging --') MAX_REPLICAS = 2**31-1 data_obj_replicas = list(s.data_object(data_path, query_options=dict(count=MAX_REPLICAS))) From 4f7692b15b9aab6a7783861332964b81ec352ef5 Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Sun, 23 Jul 2023 10:30:15 -0400 Subject: [PATCH 10/12] delete TODO comment, have now imp'd paging --- irods/prc_http_client_demo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/irods/prc_http_client_demo.py b/irods/prc_http_client_demo.py index c5c08be8e..258f7eb7a 100644 --- a/irods/prc_http_client_demo.py +++ b/irods/prc_http_client_demo.py @@ -8,8 +8,6 @@ print ("Got a collection {c.name}, id = {c.id}".format(**locals())) -# TODO: a *_generator or *_pager method which iterates or pages through results - # Query collections by explicit column list. result = s.genquery1(['COLL_ID', 'COLL_NAME'], # columns "COLL_NAME like '%'", # condition From ddf6b3eb9fb1e3c818d8b3e50002273f4c15ab75 Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Sun, 23 Jul 2023 10:56:30 -0400 Subject: [PATCH 11/12] document genquery1()'s pagesize-agnostic, rowwise iterative behavior --- irods/experimental/client/http/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py index f913d4579..85a724c6d 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -171,7 +171,20 @@ def data_object(self, logical_path, *, # Each endpoint can have its own method definition. def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()): + """Return a generator-style iterator over all row results. 
+ Example: + for row in session.genquery1( 'COLL_NAME' ): + print(row.COLL_NAME) + By default, one HTTP call to the server returns a single "row", which is not + terribly efficient. We can override the "count" option with an arbitrary + positive integer: + + session.genquery1(columns, extra_query_options=dict(count=512)). + + and since this function is agnostic to pagesize and simply returns a row-wise + iterator, its row-wise iterative behavior will not change. + """ condition = condition.format(*args) row_class, columns = _normalized_columns(columns) where = '' if condition == '' else ' WHERE ' From 5385e79eaef3dd0753689d1915b82cf3750cbfbe Mon Sep 17 00:00:00 2001 From: d-w-moore Date: Tue, 25 Jul 2023 08:35:57 -0400 Subject: [PATCH 12/12] rephrase the doc --- irods/experimental/client/http/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/irods/experimental/client/http/__init__.py b/irods/experimental/client/http/__init__.py index 85a724c6d..7f1e33e8c 100644 --- a/irods/experimental/client/http/__init__.py +++ b/irods/experimental/client/http/__init__.py @@ -178,12 +178,13 @@ def genquery1(self, columns, condition='', *, args=(), extra_query_options = ()) By default, one HTTP call to the server returns a single "row", which is not terribly efficient. We can override the "count" option with an arbitrary - positive integer: + positive integer, effectively increasing the paging size for the query: session.genquery1(columns, extra_query_options=dict(count=512)). - and since this function is agnostic to pagesize and simply returns a row-wise - iterator, its row-wise iterative behavior will not change. + Since this function's result (a row-wise iterator) is page-size agnostic, its + usage is not altered, whereas the efficiency for large queries will greatly + improve due to the 512-fold decrease in the number of API calls. """ condition = condition.format(*args) row_class, columns = _normalized_columns(columns)