11import re
22from collections .abc import Iterable
3+ from dataclasses import dataclass , field
34from operator import itemgetter
45from typing import Final , Optional
56
1415 bin2nuc ,
1516 count_forgiving_mismatches ,
1617 nuc2bin ,
18+ sort_allele_pairs ,
1719)
1820
1921
@@ -212,16 +214,17 @@ def from_frequency_entry(
212214 )
213215
214216
217+ GeneCoord = tuple [str , ...]
218+
219+
215220class AllelePairs (BaseModel ):
216221 allele_pairs : list [tuple [str , str ]]
217222
218223 def is_homozygous (self ) -> bool :
219224 """
220- Determine the homozygousness of alleles.
221-
222- Homozygousity meaning a pair is matching on both sides, ex:
223- `Cw*0722 - Cw*0722`
225+ Determine the homozygousness of these allele pairs.
224226
227+ A pair is homozygous if both elements match, e.g. C*07:22 - C*07:22.
225228 If *any* pair of alleles matches, then we declare the whole set to be
226229 homozygous.
227230
@@ -287,7 +290,7 @@ def get_protein_pairs(self) -> set[HLAProteinPair]:
287290 for e in self .get_paired_gene_coordinates (True )
288291 }
289292
290- def get_unambiguous_allele_pairs (
293+ def _get_unambiguous_allele_pairs (
291294 self ,
292295 frequencies : dict [HLAProteinPair , int ],
293296 ) -> list [tuple [str , str ]]:
@@ -333,6 +336,115 @@ def get_unambiguous_allele_pairs(
333336
334337 return reduced_set
335338
339+ @dataclass
340+ class CleanPrefixIntermediateResult :
341+ common_prefix : GeneCoord = ()
342+ second_prefix : Optional [GeneCoord ] = None
343+ remaining_prefixes : list [GeneCoord ] = field (default_factory = list )
344+
345+ @staticmethod
346+ def _identify_clean_prefix_in_pairs (
347+ unambiguous_pairs : list [tuple [GeneCoord , GeneCoord ]],
348+ ) -> CleanPrefixIntermediateResult :
349+ """
350+ Identify a "clean" gene coordinate "prefix" in the given unambiguous pairs.
351+
352+ This prefix can occur in either element of a given pair. For example,
353+ if the pairs are
354+ - B*01:01:01 - B*01:01:02:110G
355+ - B*01:01:02:99 - B*01:22
356+ then the longest common prefix is ("B*01", "01", "02").
357+
358+ If we happen to find an "exact" allele that occurs in all the pairs, then
359+ that's a "clean" allele and we report it back, even if it's shorter than
360+ the longest common prefix.
361+
362+ A precondition is that the input must be an unambiguous collection of
363+ pairs. The algorithm may not return cogent values if not.
364+
365+ Return a tuple containing this clean prefix, a second clean prefix
366+ if one is found, and a list containing all the remaining
367+ alleles in the pairs if such a second prefix is *not* found (if a
368+ second prefix is found, this list will be empty).
369+ """
370+ if len (unambiguous_pairs ) == 0 :
371+ return AllelePairs .CleanPrefixIntermediateResult ()
372+
373+ common_prefix : GeneCoord = ()
374+ second_prefix : Optional [GeneCoord ] = None
375+ remaining_prefixes : list [GeneCoord ] = []
376+
377+ max_length : int = max (
378+ [max (len (pair [0 ]), len (pair [1 ])) for pair in unambiguous_pairs ]
379+ )
380+ for i in range (max_length , 0 , - 1 ):
381+ # Note that this may not "cut down" some pairs if they're shorter
382+ # than max_length.
383+ curr_pairs = [(pair [0 ][0 :i ], pair [1 ][0 :i ]) for pair in unambiguous_pairs ]
384+
385+ # On the first iteration, we might "accidentally" find exact matches
386+ # which are shorter (or equal to) than max_length; if so, great
387+ # ¯\_(ツ)_/¯
388+ common_prefixes : set [GeneCoord ] = set (curr_pairs [0 ])
389+ for curr_pair in curr_pairs [1 :]:
390+ common_prefixes = common_prefixes & set (curr_pair )
391+
392+ if len (common_prefixes ) == 0 :
393+ continue
394+
395+ # Having reached here, we know that we found at least one common
396+ # prefix.
397+ common_prefix = common_prefixes .pop ()
398+ if len (common_prefixes ) == 1 :
399+ # The other prefix is good too.
400+ second_prefix = common_prefixes .pop ()
401+
402+ else :
403+ # Having reached here, we know that we found exactly one common
404+ # prefix, and will look for the best prefix in what remains.
405+ for curr_pair in curr_pairs :
406+ curr_unique_prefixes : set [GeneCoord ] = set (curr_pair )
407+ if len (curr_unique_prefixes ) != 1 :
408+ # There were two distinct alleles in this pair, one of which
409+ # was longest_prefix, so we retain the other one.
410+ # (If there had only been one, then it must have been a
411+ # homozygous pair "[longest_prefix] - [longest_prefix]",
412+ # so we want to retain one "copy" for the next stage.)
413+ curr_unique_prefixes .remove (common_prefix )
414+
415+ remaining_prefixes .append (curr_unique_prefixes .pop ())
416+ if i > 1 :
417+ # This is unnecessary but it gets us 100% test coverage
418+ # ¯\_(ツ)_/¯
419+ break
420+
421+ return AllelePairs .CleanPrefixIntermediateResult (
422+ common_prefix , second_prefix , remaining_prefixes
423+ )
424+
425+ @staticmethod
426+ def _identify_longest_prefix (allele_prefixes : list [GeneCoord ]) -> GeneCoord :
427+ """
428+ Identify the longest gene coordinate "prefix" in the given allele prefixes.
429+
430+ Precondition: that the input must all share at least the same first
431+ coordinate. The algorithm may not return cogent values if not.
432+
433+ Precondition: the specified allele prefixes do not all perfectly match,
434+ so we lose nothing by trimming one coordinate off the end of all of
435+ them.
436+ """
437+ longest_prefix : GeneCoord = ()
438+ if len (allele_prefixes ) > 0 :
439+ max_length : int = max ([len (allele ) for allele in allele_prefixes ])
440+ for i in range (max_length - 1 , 0 , - 1 ):
441+ curr_prefixes : set [GeneCoord ] = {allele [0 :i ] for allele in allele_prefixes }
442+ if len (curr_prefixes ) == 1 :
443+ longest_prefix = curr_prefixes .pop ()
444+ if i > 1 :
445+ break
446+ return longest_prefix
447+
336448 def best_common_allele_pair_str (
337449 self ,
338450 frequencies : dict [HLAProteinPair , int ],
@@ -342,16 +454,14 @@ def best_common_allele_pair_str(
342454
343455 The allele pairs are filtered to an unambiguous set (using the specified
344456 frequencies to determine which ones to retain). Then, the "best common
345- coordinates" for all the remaining allele allele pairs are used to build
457+ coordinates" for all the remaining allele pairs are used to build
346458 a string representation of the set.
347459
348460 Example: if, after filtering, the allele pairs remaining are:
349- ```
350- [ [A*11:02:01, A*12:01],
351- [A*11:02:02, A*12:02],
352- [A*11:02:03, A*12:03] ]
353- ```
354- we expect to get `A*11:02 - A*12`.
461+ - A*11:02:01 - A*12:01
462+ - A*11:02:02 - A*12:02
463+ - A*11:02:03 - A*12:03
464+ we expect to get "A*11:02 - A*12".
355465
356466 :return: A string representing the best common allele pair, and the
357467 unambiguous set this string represents.
@@ -360,47 +470,36 @@ def best_common_allele_pair_str(
360470 # Starting with an unambiguous set assures that we will definitely get
361471 # a result.
362472 unambiguous_aps : AllelePairs = AllelePairs (
363- allele_pairs = self .get_unambiguous_allele_pairs (frequencies )
473+ allele_pairs = self ._get_unambiguous_allele_pairs (frequencies )
364474 )
365475 paired_gene_coordinates : list [tuple [list [str ], list [str ]]] = (
366- unambiguous_aps .get_paired_gene_coordinates ()
476+ unambiguous_aps .get_paired_gene_coordinates (digits_only = False )
367477 )
368478
369- clean_allele : list [str ] = []
370- for n in [0 , 1 ]:
371- for i in [4 , 3 , 2 , 1 ]:
372- all_leading_coordinates = {
373- ":" .join (a [n ][0 :i ]) for a in paired_gene_coordinates
374- }
375- if len (all_leading_coordinates ) == 1 :
376- best_common_coords = all_leading_coordinates .pop ()
377- clean_allele .append (
378- re .sub (
379- r"[A-Z]$" ,
380- "" ,
381- best_common_coords ,
382- )
383- )
384- if i > 1 :
385- # This branch is unnecessary but it gets us 100% code
386- # coverage ¯\_(ツ)_/¯
387- break
479+ # Look for the longest common prefix present in all pairs.
480+ curr_pairs : list [tuple [GeneCoord , GeneCoord ]] = [
481+ (tuple (pair [0 ]), tuple (pair [1 ])) for pair in paired_gene_coordinates
482+ ]
388483
389- clean_allele_pair_str : str = " - " .join (clean_allele )
390- return (clean_allele_pair_str , set (unambiguous_aps .allele_pairs ))
484+ intermediate_data : AllelePairs .CleanPrefixIntermediateResult = (
485+ self ._identify_clean_prefix_in_pairs (curr_pairs )
486+ )
391487
392- def sort_pairs (self ) -> list [tuple [str , str ]]:
393- """
394- Sort the pairs according to "coordinate order".
488+ second_prefix : GeneCoord = (
489+ intermediate_data .second_prefix or self ._identify_longest_prefix (
490+ intermediate_data .remaining_prefixes
491+ )
492+ )
395493
396- If there's a tie, a last letter is used to attempt to break the tie.
397- """
398- return sorted (
399- self .allele_pairs ,
400- key = lambda pair : (
401- allele_coordinates_sort_key (pair [0 ]),
402- allele_coordinates_sort_key (pair [1 ]),
403- ),
494+ # Turn the two prefixes we found into strings and strip any trailing
495+ # letters.
496+ clean_allele_pair : list [str ] = [
497+ re .sub (r"[A-Z]$" , "" , ":" .join (allele ))
498+ for allele in (intermediate_data .common_prefix , second_prefix )
499+ ]
500+ return (
501+ " - " .join (sorted (clean_allele_pair , key = allele_coordinates_sort_key )),
502+ set (unambiguous_aps .allele_pairs ),
404503 )
405504
406505 def stringify (self , sorted = True , max_length : int = 3900 ) -> str :
@@ -415,7 +514,7 @@ def stringify(self, sorted=True, max_length: int = 3900) -> str:
415514 """
416515 allele_pairs : list [tuple [str , str ]] = self .allele_pairs
417516 if sorted :
418- allele_pairs = self .sort_pairs ( )
517+ allele_pairs = sort_allele_pairs ( self .allele_pairs )
419518 summary_str : str = ";" .join ([f"{ _a [0 ]} - { _a [1 ]} " for _a in allele_pairs ])
420519 if len (summary_str ) > max_length :
421520 summary_str = re .sub (
@@ -426,7 +525,7 @@ def stringify(self, sorted=True, max_length: int = 3900) -> str:
426525 return summary_str
427526
428527 @classmethod
429- def get_allele_pairs (
528+ def combine_allele_pairs (
430529 cls ,
431530 combined_standards : Iterable [HLACombinedStandard ],
432531 ) -> "AllelePairs" :
@@ -441,7 +540,7 @@ def get_allele_pairs(
441540 all_allele_pairs : list [tuple [str , str ]] = []
442541 for combined_std in combined_standards :
443542 all_allele_pairs .extend (combined_std .possible_allele_pairs )
444- all_allele_pairs . sort ( )
543+ all_allele_pairs = sort_allele_pairs ( all_allele_pairs )
445544 return cls (allele_pairs = all_allele_pairs )
446545
447546 def contains_allele (self , allele_name : str ) -> bool :
@@ -474,7 +573,7 @@ def best_matches(self) -> set[HLACombinedStandard]:
474573 }
475574
476575 def best_matching_allele_pairs (self ) -> AllelePairs :
477- return AllelePairs .get_allele_pairs (self .best_matches ())
576+ return AllelePairs .combine_allele_pairs (self .best_matches ())
478577
479578 def best_common_allele_pair (
480579 self ,
@@ -491,7 +590,7 @@ def best_common_allele_pair(
491590 ap_to_cs [ap ] = cs
492591
493592 # Get an unambiguous set of allele pairs from the best matches:
494- best_aps : AllelePairs = AllelePairs .get_allele_pairs (best_matches )
593+ best_aps : AllelePairs = AllelePairs .combine_allele_pairs (best_matches )
495594 clean_ap_str : str
496595 best_unambiguous : set [tuple [str , str ]]
497596 clean_ap_str , best_unambiguous = best_aps .best_common_allele_pair_str (
0 commit comments