22
33import argparse
44import hashlib
5+ import json
56import logging
67import os
78import time
89from datetime import datetime
910from io import StringIO
10- from typing import Final
11+ from typing import Final , Optional , TypedDict
1112
1213import Bio
1314import requests
8182
# The HLA data lives in a GitHub repository; all of its releases (and their
# corresponding tags) are listed at
# https://github.com/ANHIG/IMGTHLA/releases
# Both the repository owner and the repository name can be overridden through
# environment variables.
REPO_OWNER: Final[str] = os.getenv("EASYHLA_REPO_OWNER", "ANHIG")
REPO_NAME: Final[str] = os.getenv("EASYHLA_REPO_NAME", "IMGTHLA")
8893HLA_ALLELES_FILENAME : Final [str ] = os .environ .get (
8994 "EASYHLA_REPO_ALLELES_FILENAME" ,
@@ -95,21 +100,109 @@ class RetrieveAllelesError(Exception):
95100 pass
96101
97102
class RetrieveCommitHashError(Exception):
    """
    Raised when the commit hash of a tag cannot be retrieved from GitHub.
    """
105+
106+
def get_alleles_file(
    tag: str,
    repo_owner: str = REPO_OWNER,
    repo_name: str = REPO_NAME,
    alleles_filename: str = HLA_ALLELES_FILENAME,
) -> str:
    """
    Retrieve the HLA alleles file from the specified tag.

    The file is requested from the GitHub REST API "repository contents"
    endpoint using the raw media type, so the response body is the file's
    text itself.

    :param tag: the Git ref (normally a release tag) to read the file from.
    :param repo_owner: owner of the GitHub repository.
    :param repo_name: name of the GitHub repository.
    :param alleles_filename: path of the alleles file inside the repository.
    :return: the contents of the alleles file.
    :raises RetrieveAllelesError: if the request fails at the network level
        (connection error, timeout, ...) or GitHub answers with a status
        other than 200.
    """
    url: str = (
        f"https://api.github.com/repos/{repo_owner}/{repo_name}/"
        f"contents/{alleles_filename}"
    )
    try:
        response: requests.Response = requests.get(
            url,
            # Passing the ref as a parameter lets requests URL-encode it.
            params={"ref": tag},
            headers={
                "Accept": "application/vnd.github.raw+json",
                "X-GitHub-Api-Version": "2022-11-28",
            },
            # (connect, read) timeouts in seconds; without a timeout a stalled
            # connection would block forever and never reach the caller's
            # retry logic.
            timeout=(10, 120),
        )
    except requests.RequestException as err:
        # Surface transport failures as the module's own error type so that
        # callers retrying on RetrieveAllelesError also retry on these.
        raise RetrieveAllelesError(
            f"Request for {url} (ref {tag}) failed: {err}"
        ) from err

    if response.status_code != requests.codes.ok:
        raise RetrieveAllelesError(
            f"Request for {url} (ref {tag}) returned HTTP status "
            f"{response.status_code}."
        )
    return response.text
111130
112131
class CommitInfo(TypedDict):
    """
    The "commit" entry of a tag as described by `TagInfo`.
    """

    # Commit hash the tag points to.
    sha: str
    # API URL of that commit.
    url: str
135+
136+
class TagInfo(TypedDict):
    """
    One element of the JSON list returned by the GitHub "tags" endpoint.
    """

    # Name of the tag.
    name: str
    # The commit the tag points to.
    commit: CommitInfo
    # Download URLs for archives of the repository at this tag.
    zipball_url: str
    tarball_url: str
    # GitHub's node identifier for the tag.
    node_id: str
143+
144+
def get_commit_hash(
    tag_name: str,
    repo_owner: str = REPO_OWNER,
    repo_name: str = REPO_NAME,
) -> Optional[str]:
    """
    Retrieve the commit hash of the specified tag.

    The GitHub "tags" endpoint is paginated, so every page is examined (by
    following the "next" link of each response) until the tag is found or the
    pages run out.

    :param tag_name: name of the tag to look up.
    :param repo_owner: owner of the GitHub repository.
    :param repo_name: name of the GitHub repository.
    :return: the hash of the commit the tag points to, or None if the
        repository has no tag with that name.
    :raises RetrieveCommitHashError: if a request fails at the network level
        (connection error, timeout, ...) or GitHub answers with a status
        other than 200.
    """
    url: Optional[str] = (
        f"https://api.github.com/repos/{repo_owner}/{repo_name}/tags"
    )
    # Ask for the largest page size to keep the number of requests low.
    params: Optional[dict[str, str]] = {"per_page": "100"}

    while url is not None:
        try:
            response: requests.Response = requests.get(
                url,
                params=params,
                headers={
                    "Accept": "application/vnd.github+json",
                    "X-GitHub-Api-Version": "2022-11-28",
                },
                # (connect, read) timeouts in seconds, so that a stalled
                # connection cannot block forever.
                timeout=(10, 60),
            )
        except requests.RequestException as err:
            # Surface transport failures as the module's own error type so
            # that callers retrying on RetrieveCommitHashError also retry on
            # these.
            raise RetrieveCommitHashError(
                f"Request for {url} failed: {err}"
            ) from err

        if response.status_code != requests.codes.ok:
            raise RetrieveCommitHashError(
                f"Request for {url} returned HTTP status "
                f"{response.status_code}."
            )

        tags: list[TagInfo] = json.loads(response.text)
        for tag in tags:
            if tag["name"] == tag_name:
                return tag["commit"]["sha"]

        # The "next" link (parsed from the Link header) already carries its
        # own query string, so the initial parameters must not be re-sent.
        url = response.links.get("next", {}).get("url")
        params = None

    return None
170+
171+
def get_from_git(
    tag: str,
    attempts: int = 5,
    retry_delay_seconds: int = 20,
) -> tuple[str, datetime, Optional[str]]:
    """
    Retrieve the alleles file and the commit hash for the specified tag.

    Each of the two retrievals is attempted up to `attempts` times, pausing
    `retry_delay_seconds` seconds after every failed attempt but the last.

    :param tag: the tag to retrieve the alleles from.
    :param attempts: maximum number of tries per retrieval; at least 1.
    :param retry_delay_seconds: pause between consecutive tries.
    :return: a tuple of the alleles file contents, the local time at which
        the successful alleles retrieval was started, and the commit hash of
        the tag (None if no tag with that name was found).
    :raises ValueError: if `attempts` is smaller than 1.
    :raises RetrieveAllelesError: if the alleles could not be retrieved on
        the final attempt.
    :raises RetrieveCommitHashError: if the commit hash could not be
        retrieved on the final attempt.
    """
    if attempts < 1:
        raise ValueError("attempts must be at least 1")
    final_attempt: int = attempts - 1

    alleles_str: str
    retrieval_datetime: datetime
    for attempt in range(attempts):
        try:
            # Taken before the request so that it is never later than the
            # moment the data was actually served.
            retrieval_datetime = datetime.now()
            alleles_str = get_alleles_file(tag)
        except RetrieveAllelesError:
            if attempt == final_attempt:
                raise
            logger.info(
                "Failed to retrieve alleles; retrying in "
                f"{retry_delay_seconds} seconds...."
            )
            time.sleep(retry_delay_seconds)
        else:
            break

    # get_commit_hash returns None when no tag has the requested name, so the
    # value is not guaranteed to be a string.
    commit_hash: Optional[str]
    for attempt in range(attempts):
        try:
            commit_hash = get_commit_hash(tag)
        except RetrieveCommitHashError:
            if attempt == final_attempt:
                raise
            logger.info(
                "Failed to retrieve the commit hash; retrying in "
                f"{retry_delay_seconds} seconds...."
            )
            time.sleep(retry_delay_seconds)
        else:
            break

    if commit_hash is None:
        logger.warning(
            f"No tag named {tag} was found; the commit hash is unknown."
        )

    return alleles_str, retrieval_datetime, commit_hash
204+
205+
113206def main ():
114207 parser : argparse .ArgumentParser = argparse .ArgumentParser (
115208 "Retrieve HLA alleles from IPD-IMGT/HLA."
@@ -178,18 +271,12 @@ def main():
178271 logger .info (f"Retrieving alleles from tag { args .tag } ...." )
179272 alleles_str : str
180273 retrieval_datetime : datetime
181- for i in range (5 ):
182- try :
183- retrieval_datetime = datetime .now ()
184- alleles_str = get_alleles_file (args .tag )
185- except RetrieveAllelesError :
186- if i < 4 :
187- logger .info ("Failed to retrieve alleles; retrying in 20 seconds...." )
188- time .sleep (20 )
189- else :
190- raise
191- else :
192- break
274+ commit_hash : str
275+ alleles_str , retrieval_datetime , commit_hash = get_from_git (args .tag )
276+ logger .info (
277+ f"Alleles (version { args .tag } , commit hash { commit_hash } ) retrieved at "
278+ f"{ retrieval_datetime } ."
279+ )
193280
194281 if args .dump_full_fasta_to != "" :
195282 logger .info (f"Dumping the full FASTA file to { args .dump_full_fasta_to } ." )
@@ -214,8 +301,9 @@ def main():
214301 logger .info ("Identifying identical HLA alleles...." )
215302 standards_for_saving : StoredHLAStandards = StoredHLAStandards (
216303 tag = args .tag ,
304+ commit_hash = commit_hash ,
217305 last_updated = retrieval_datetime ,
218- ** {
306+ standards = {
219307 locus : group_identical_alleles (raw_standards [locus ])
220308 for locus in ("A" , "B" , "C" )
221309 },
0 commit comments