diff --git a/src/hla_algorithm/hla_algorithm.py b/src/hla_algorithm/hla_algorithm.py index fc35cc6..5171503 100644 --- a/src/hla_algorithm/hla_algorithm.py +++ b/src/hla_algorithm/hla_algorithm.py @@ -23,8 +23,10 @@ BIN2NUC, HLA_LOCUS, StoredHLAStandards, + allele_coordinates_sort_key, count_strict_mismatches, nuc2bin, + sort_allele_pairs, ) DATE_FORMAT = "%a %b %d %H:%M:%S %Z %Y" @@ -277,7 +279,13 @@ def combine_standards_stepper( # that looks like what you get when you sequence HLA. std_bin = np.array(std_b.sequence) | np.array(std_a.sequence) allele_pair: tuple[str, str] = cast( - tuple[str, str], tuple(sorted((std_a.allele, std_b.allele))) + tuple[str, str], + tuple( + sorted( + (std_a.allele, std_b.allele), + key=allele_coordinates_sort_key, + ) + ), ) # There could be more than one combined standard with the @@ -363,7 +371,7 @@ def combine_standards( if mismatch_count <= cutoff: combined_std: HLACombinedStandard = HLACombinedStandard( standard_bin=combined_std_bin, - possible_allele_pairs=tuple(sorted(pair_list)), + possible_allele_pairs=tuple(sort_allele_pairs(pair_list)), ) result[combined_std] = mismatch_count diff --git a/src/hla_algorithm/interpret_from_json_lib.py b/src/hla_algorithm/interpret_from_json_lib.py index e8a5f27..f58d046 100644 --- a/src/hla_algorithm/interpret_from_json_lib.py +++ b/src/hla_algorithm/interpret_from_json_lib.py @@ -18,6 +18,7 @@ check_bases, check_length, nuc2bin, + sort_allele_pairs, ) @@ -143,7 +144,7 @@ def build_from_interpretation( return HLAResult( seqs=seqs, - alleles_all=[f"{x[0]} - {x[1]}" for x in aps.sort_pairs()], + alleles_all=[f"{x[0]} - {x[1]}" for x in sort_allele_pairs(aps.allele_pairs)], alleles_clean=alleles_clean, alleles_for_mismatches=f"{rep_ap[0]} - {rep_ap[1]}", mismatches=[str(x) for x in match_details.mismatches], diff --git a/src/hla_algorithm/models.py b/src/hla_algorithm/models.py index 591e007..10f4035 100644 --- a/src/hla_algorithm/models.py +++ b/src/hla_algorithm/models.py @@ -1,5 +1,6 @@ import re from collections.abc import Iterable +from dataclasses import dataclass, field from operator import itemgetter from typing import Final, Optional @@ -14,6 +15,7 @@ bin2nuc, count_forgiving_mismatches, nuc2bin, + sort_allele_pairs, ) @@ -212,16 +214,17 @@ def from_frequency_entry( ) +GeneCoord = tuple[str, ...] + + class AllelePairs(BaseModel): allele_pairs: list[tuple[str, str]] def is_homozygous(self) -> bool: """ - Determine the homozygousness of alleles. - - Homozygousity meaning a pair is matching on both sides, ex: - `Cw*0722 - Cw*0722` + Determine the homozygousness of these allele pairs. + A pair is homozygous if both elements match, e.g. C*07:22 - C*07:22. If *any* pair of alleles matches, then we declare the whole set to be homozygous. @@ -287,7 +290,7 @@ def get_protein_pairs(self) -> set[HLAProteinPair]: for e in self.get_paired_gene_coordinates(True) } - def get_unambiguous_allele_pairs( + def _get_unambiguous_allele_pairs( self, frequencies: dict[HLAProteinPair, int], ) -> list[tuple[str, str]]: @@ -333,6 +336,115 @@ def get_unambiguous_allele_pairs( return reduced_set + @dataclass + class CleanPrefixIntermediateResult: + common_prefix: GeneCoord = () + second_prefix: Optional[GeneCoord] = None + remaining_prefixes: list[GeneCoord] = field(default_factory=list) + + @staticmethod + def _identify_clean_prefix_in_pairs( + unambiguous_pairs: list[tuple[GeneCoord, GeneCoord]], + ) -> CleanPrefixIntermediateResult: + """ + Identify a "clean" gene coordinate "prefix" in the given unambiguous pairs. + + This prefix can occur in either element of a given pair. For example, + if the pairs are + - B*01:01:01 - B*01:01:02:110G + - B*01:01:02:99 - B*01:22 + then the longest common prefix is ("B*01", "01", "02"). + + If we happen to find an "exact" allele that occurs in all the pairs, then + that's a "clean" allele and we report it back, even if it's shorter than + the longest common prefix. + + A precondition is that the input must be an unambiguous collection of + pairs. The algorithm may not return cogent values if not. + + Return a tuple containing this clean prefix, a second clean prefix + if one is found, and a list containing all the remaining + alleles in the pairs if such a second prefix is *not* found (if a + second prefix is found, this list will be empty). + """ + if len(unambiguous_pairs) == 0: + return AllelePairs.CleanPrefixIntermediateResult() + + common_prefix: GeneCoord = () + second_prefix: Optional[GeneCoord] = None + remaining_prefixes: list[GeneCoord] = [] + + max_length: int = max( + [max(len(pair[0]), len(pair[1])) for pair in unambiguous_pairs] + ) + for i in range(max_length, 0, -1): + # Note that this may not "cut down" some pairs if they're shorter + # than max_length. + curr_pairs = [(pair[0][0:i], pair[1][0:i]) for pair in unambiguous_pairs] + + # On the first iteration, we might "accidentally" find exact matches + # which are shorter (or equal to) than max_length; if so, great + # ¯\_(ツ)_/¯ + common_prefixes: set[GeneCoord] = set(curr_pairs[0]) + for curr_pair in curr_pairs[1:]: + common_prefixes = common_prefixes & set(curr_pair) + + if len(common_prefixes) == 0: + continue + + # Having reached here, we know that we found at least one common + # prefix. + common_prefix = common_prefixes.pop() + if len(common_prefixes) == 1: + # The other prefix is good too. + second_prefix = common_prefixes.pop() + + else: + # Having reached here, we know that we found exactly one common + # prefix, and will look for the best prefix in what remains. + for curr_pair in curr_pairs: + curr_unique_prefixes: set[GeneCoord] = set(curr_pair) + if len(curr_unique_prefixes) != 1: + # There were two distinct alleles in this pair, one of which + # was longest_prefix, so we retain the other one. + # (If there had only been one, then it must have been a + # homozygous pair "[longest_prefix] - [longest_prefix]", + # so we want to retain one "copy" for the next stage.) + curr_unique_prefixes.remove(common_prefix) + + remaining_prefixes.append(curr_unique_prefixes.pop()) + if i > 1: + # This is unnecessary but it gets us 100% test coverage + # ¯\_(ツ)_/¯ + break + + return AllelePairs.CleanPrefixIntermediateResult( + common_prefix, second_prefix, remaining_prefixes + ) + + @staticmethod + def _identify_longest_prefix(allele_prefixes: list[GeneCoord]) -> GeneCoord: + """ + Identify the longest gene coordinate "prefix" in the given allele prefixes. + + Precondition: that the input must all share at least the same first + coordinate. The algorithm may not return cogent values if not. + + Precondition: the specified allele prefixes do not all perfectly match, + so we lose nothing by trimming one coordinate off the end of all of + them. + """ + longest_prefix: GeneCoord = () + if len(allele_prefixes) > 0: + max_length: int = max([len(allele) for allele in allele_prefixes]) + for i in range(max_length - 1, 0, -1): + curr_prefixes: set[GeneCoord] = {allele[0:i] for allele in allele_prefixes} + if len(curr_prefixes) == 1: + longest_prefix = curr_prefixes.pop() + if i > 1: + break + return longest_prefix + def best_common_allele_pair_str( self, frequencies: dict[HLAProteinPair, int], @@ -342,16 +454,14 @@ def best_common_allele_pair_str( The allele pairs are filtered to an unambiguous set (using the specified frequencies to determine which ones to retain). Then, the "best common - coordinates" for all the remaining allele allele pairs are used to build + coordinates" for all the remaining allele pairs are used to build a string representation of the set. Example: if, after filtering, the allele pairs remaining are: - ``` - [ [A*11:02:01, A*12:01], - [A*11:02:02, A*12:02], - [A*11:02:03, A*12:03] ] - ``` - we expect to get `A*11:02 - A*12`. + - A*11:02:01 - A*12:01 + - A*11:02:02 - A*12:02 + - A*11:02:03 - A*12:03 + we expect to get "A*11:02 - A*12". :return: A string representing the best common allele pair, and the unambiguous set this string represents. @@ -360,47 +470,36 @@ def best_common_allele_pair_str( # Starting with an unambiguous set assures that we will definitely get # a result. unambiguous_aps: AllelePairs = AllelePairs( - allele_pairs=self.get_unambiguous_allele_pairs(frequencies) + allele_pairs=self._get_unambiguous_allele_pairs(frequencies) ) paired_gene_coordinates: list[tuple[list[str], list[str]]] = ( - unambiguous_aps.get_paired_gene_coordinates() + unambiguous_aps.get_paired_gene_coordinates(digits_only=False) ) - clean_allele: list[str] = [] - for n in [0, 1]: - for i in [4, 3, 2, 1]: - all_leading_coordinates = { - ":".join(a[n][0:i]) for a in paired_gene_coordinates - } - if len(all_leading_coordinates) == 1: - best_common_coords = all_leading_coordinates.pop() - clean_allele.append( - re.sub( - r"[A-Z]$", - "", - best_common_coords, - ) - ) - if i > 1: - # This branch is unnecessary but it gets us 100% code - # coverage ¯\_(ツ)_/¯ - break + # Look for the longest common prefix present in all pairs. + curr_pairs: list[tuple[GeneCoord, GeneCoord]] = [ + (tuple(pair[0]), tuple(pair[1])) for pair in paired_gene_coordinates + ] - clean_allele_pair_str: str = " - ".join(clean_allele) - return (clean_allele_pair_str, set(unambiguous_aps.allele_pairs)) + intermediate_data: AllelePairs.CleanPrefixIntermediateResult = ( + self._identify_clean_prefix_in_pairs(curr_pairs) + ) - def sort_pairs(self) -> list[tuple[str, str]]: - """ - Sort the pairs according to "coordinate order". + second_prefix: GeneCoord = ( + intermediate_data.second_prefix or self._identify_longest_prefix( + intermediate_data.remaining_prefixes + ) + ) - If there's a tie, a last letter is used to attempt to break the tie. - """ - return sorted( - self.allele_pairs, - key=lambda pair: ( - allele_coordinates_sort_key(pair[0]), - allele_coordinates_sort_key(pair[1]), - ), + # Turn the two prefixes we found into strings and strip any trailing + # letters. + clean_allele_pair: list[str] = [ + re.sub(r"[A-Z]$", "", ":".join(allele)) + for allele in (intermediate_data.common_prefix, second_prefix) + ] + return ( + " - ".join(sorted(clean_allele_pair, key=allele_coordinates_sort_key)), + set(unambiguous_aps.allele_pairs), ) def stringify(self, sorted=True, max_length: int = 3900) -> str: @@ -415,7 +514,7 @@ def stringify(self, sorted=True, max_length: int = 3900) -> str: """ allele_pairs: list[tuple[str, str]] = self.allele_pairs if sorted: - allele_pairs = self.sort_pairs() + allele_pairs = sort_allele_pairs(self.allele_pairs) summary_str: str = ";".join([f"{_a[0]} - {_a[1]}" for _a in allele_pairs]) if len(summary_str) > max_length: summary_str = re.sub( @@ -426,7 +525,7 @@ def stringify(self, sorted=True, max_length: int = 3900) -> str: return summary_str @classmethod - def get_allele_pairs( + def combine_allele_pairs( cls, combined_standards: Iterable[HLACombinedStandard], ) -> "AllelePairs": @@ -441,7 +540,7 @@ def get_allele_pairs( all_allele_pairs: list[tuple[str, str]] = [] for combined_std in combined_standards: all_allele_pairs.extend(combined_std.possible_allele_pairs) - all_allele_pairs.sort() + all_allele_pairs = sort_allele_pairs(all_allele_pairs) return cls(allele_pairs=all_allele_pairs) def contains_allele(self, allele_name: str) -> bool: @@ -474,7 +573,7 @@ def best_matches(self) -> set[HLACombinedStandard]: } def best_matching_allele_pairs(self) -> AllelePairs: - return AllelePairs.get_allele_pairs(self.best_matches()) + return AllelePairs.combine_allele_pairs(self.best_matches()) def best_common_allele_pair( self, @@ -491,7 +590,7 @@ def best_common_allele_pair( ap_to_cs[ap] = cs # Get an unambiguous set of allele pairs from the best matches: - best_aps: AllelePairs = AllelePairs.get_allele_pairs(best_matches) + best_aps: AllelePairs = AllelePairs.combine_allele_pairs(best_matches) clean_ap_str: str best_unambiguous: set[tuple[str, str]] clean_ap_str, best_unambiguous = best_aps.best_common_allele_pair_str( diff --git a/src/hla_algorithm/utils.py b/src/hla_algorithm/utils.py index e7a6d55..fb263ca 100644 --- a/src/hla_algorithm/utils.py +++ b/src/hla_algorithm/utils.py @@ -368,6 +368,28 @@ def allele_coordinates_sort_key(allele: str) -> tuple[tuple[int, ...], str]: return (integer_part, letters_at_end) +def allele_pair_sort_key(pair: tuple[str, str]) -> tuple[ + tuple[int, ...], str, tuple[int, ...], str +]: + """ + Produce a sortable key for an allele pair. + + Pairs should be sorted according to "coordinate order". + If there's a tie, a last letter is used to attempt to break the tie. + """ + return ( + allele_coordinates_sort_key(pair[0]) + + allele_coordinates_sort_key(pair[1]) + ) + + +def sort_allele_pairs(allele_pairs: Iterable[tuple[str, str]]) -> list[tuple[str, str]]: + """ + Sort the pairs according to "coordinate order". + """ + return sorted(allele_pairs, key=allele_pair_sort_key) + + class HLARawStandard(BaseModel): allele: str exon2: str diff --git a/tests/hla_algorithm_test.py b/tests/hla_algorithm_test.py index 4d68e7e..e717218 100644 --- a/tests/hla_algorithm_test.py +++ b/tests/hla_algorithm_test.py @@ -131,89 +131,89 @@ def hla_algorithm(): (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), ], [0, 1, 5], - [((1, 2, 4, 8), 0, ("std_allmatch", "std_allmatch"))], + [((1, 2, 4, 8), 0, ("A*07:08:09G", "A*07:08:09G"))], id="one_combo_all_matches", ), pytest.param( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std1", + allele="B*57:01:02", two=(1, 2), three=(4, 4), mismatch=1, ) ], [0, 1, 2, 5], - [((1, 2, 4, 4), 1, ("std1", "std1"))], + [((1, 2, 4, 4), 1, ("B*57:01:02", "B*57:01:02"))], id="one_combo_retained_regardless_of_threshold", ), pytest.param( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, ) ], [0, 1, 3, 4, 5, 10], - [((8, 4, 2, 1), 4, ("std_allmismatch", "std_allmismatch"))], + [((8, 4, 2, 1), 4, ("A*11:01:01:01", "A*11:01:01:01"))], id="only_combo_retained_regardless_of_threshold_more_mismatches", ), pytest.param( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_twomatch", + allele="A*55:01", two=(1, 4), three=(2, 8), mismatch=2, ), ], [0, 1, 2, 3, 5], - [((1, 4, 2, 8), 2, ("std_twomatch", "std_twomatch"))], + [((1, 4, 2, 8), 2, ("A*55:01", "A*55:01"))], id="one_combo_two_mismatches", ), pytest.param( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_onemismatch", + allele="A*30:08:01", two=(1, 4), three=(4, 8), mismatch=1, ), ], [0], - [((1, 2, 4, 8), 0, ("std_allmatch", "std_allmatch"))], + [((1, 2, 4, 8), 0, ("A*07:08:09G", "A*07:08:09G"))], id="combo_with_mismatch_above_threshold", ), pytest.param( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_onemismatch", + allele="A*30:08:01", two=(1, 4), three=(4, 8), mismatch=1, ), HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, @@ -224,17 +224,17 @@ def hla_algorithm(): ( (1, 4, 4, 8), 1, - ("std_onemismatch", "std_onemismatch"), + ("A*30:08:01", "A*30:08:01"), ), ( (1, 6, 4, 8), 1, - ("std_allmatch", "std_onemismatch"), + ("A*07:08:09G", "A*30:08:01"), ), ( (1, 2, 4, 8), 0, - ("std_allmatch", "std_allmatch"), + ("A*07:08:09G", "A*07:08:09G"), ), ], id="combo_with_mismatch_above_threshold_previous_bests_reported", @@ -243,13 +243,49 @@ def hla_algorithm(): (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*30:100:01", + two=(1, 4), + three=(4, 8), + mismatch=1, + ), + HLAStandardMatch( + allele="A*30:21:09G", + two=(1, 2), + three=(4, 8), + mismatch=0, + ), + ], + [0], + [ + ( + (1, 4, 4, 8), + 1, + ("A*30:100:01", "A*30:100:01"), + ), + ( + (1, 6, 4, 8), + 1, + ("A*30:21:09G", "A*30:100:01"), + ), + ( + (1, 2, 4, 8), + 0, + ("A*30:21:09G", "A*30:21:09G"), + ), + ], + id="combo_with_mismatch_above_threshold_previous_bests_reported_pairs_sorted_by_coordinate", + ), + pytest.param( + (1, 2, 4, 8), + [ + HLAStandardMatch( + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_onemismatch", + allele="A*30:08:01", two=(1, 4), three=(4, 8), mismatch=1, @@ -260,17 +296,17 @@ def hla_algorithm(): ( (1, 2, 4, 8), 0, - ("std_allmatch", "std_allmatch"), + ("A*07:08:09G", "A*07:08:09G"), ), ( (1, 6, 4, 8), 1, - ("std_allmatch", "std_onemismatch"), + ("A*07:08:09G", "A*30:08:01"), ), ( (1, 4, 4, 8), 1, - ("std_onemismatch", "std_onemismatch"), + ("A*30:08:01", "A*30:08:01"), ), ], id="several_combos_all_below_threshold", @@ -279,13 +315,13 @@ def hla_algorithm(): (9, 6, 4, 6), [ HLAStandardMatch( - allele="std1", + allele="B*57:01:02", two=(1, 2), three=(4, 4), mismatch=0, ), HLAStandardMatch( - allele="std2", + allele="B*58:22:33G", two=(8, 4), three=(4, 8), mismatch=1, @@ -296,12 +332,12 @@ def hla_algorithm(): ( (1, 2, 4, 4), 3, - ("std1", "std1"), + ("B*57:01:02", "B*57:01:02"), ), ( (9, 6, 4, 12), 1, - ("std1", "std2"), + ("B*57:01:02", "B*58:22:33G"), ), ], id="first_above_threshold_second_below_rest_rejected", @@ -310,13 +346,13 @@ def hla_algorithm(): (9, 6, 4, 6), [ HLAStandardMatch( - allele="std1", + allele="B*57:01:02", two=(1, 2), three=(4, 4), mismatch=0, ), HLAStandardMatch( - allele="std2", + allele="B*58:22:33G", two=(8, 4), three=(4, 8), mismatch=1, @@ -327,17 +363,17 @@ def hla_algorithm(): ( (1, 2, 4, 4), 3, - ("std1", "std1"), + ("B*57:01:02", "B*57:01:02"), ), ( (9, 6, 4, 12), 1, - ("std1", "std2"), + ("B*57:01:02", "B*58:22:33G"), ), ( (8, 4, 4, 8), 3, - ("std2", "std2"), + ("B*58:22:33G", "B*58:22:33G"), ), ], id="all_combos_have_mismatches_below_threshold", @@ -346,45 +382,45 @@ def hla_algorithm(): (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, ), ], [0], - [((1, 2, 4, 8), 0, ("std_allmatch", "std_allmatch"))], + [((1, 2, 4, 8), 0, ("A*07:08:09G", "A*07:08:09G"))], id="more_standards_only_first_one_below_threshold", ), pytest.param( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, @@ -392,9 +428,9 @@ def hla_algorithm(): ], [0], [ - ((1, 2, 4, 4), 1, ("std_1mismatch", "std_1mismatch")), - ((1, 2, 4, 12), 1, ("std_1mismatch", "std_allmatch")), - ((1, 2, 4, 8), 0, ("std_allmatch", "std_allmatch")), + ((1, 2, 4, 4), 1, ("A*04:01:01", "A*04:01:01")), + ((1, 2, 4, 12), 1, ("A*04:01:01", "A*07:08:09G")), + ((1, 2, 4, 8), 0, ("A*07:08:09G", "A*07:08:09G")), ], id="more_standards_some_reported_early_late_ones_rejected", ), @@ -402,19 +438,19 @@ def hla_algorithm(): (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, ), HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, @@ -422,11 +458,11 @@ def hla_algorithm(): ], [0], [ - ((8, 4, 2, 1), 4, ("std_allmismatch", "std_allmismatch")), - ((9, 6, 6, 5), 4, ("std_1mismatch", "std_allmismatch")), - ((1, 2, 4, 4), 1, ("std_1mismatch", "std_1mismatch")), - ((1, 2, 4, 12), 1, ("std_1mismatch", "std_allmatch")), - ((1, 2, 4, 8), 0, ("std_allmatch", "std_allmatch")), + ((8, 4, 2, 1), 4, ("A*11:01:01:01", "A*11:01:01:01")), + ((9, 6, 6, 5), 4, ("A*04:01:01", "A*11:01:01:01")), + ((1, 2, 4, 4), 1, ("A*04:01:01", "A*04:01:01")), + ((1, 2, 4, 12), 1, ("A*04:01:01", "A*07:08:09G")), + ((1, 2, 4, 8), 0, ("A*07:08:09G", "A*07:08:09G")), ], id="running_threshold_is_reduced_in_steps", ), @@ -434,19 +470,19 @@ def hla_algorithm(): (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, @@ -454,12 +490,12 @@ def hla_algorithm(): ], [4, 5, 10], [ - ((1, 2, 4, 8), 0, ("std_allmatch", "std_allmatch")), - ((1, 2, 4, 12), 1, ("std_1mismatch", "std_allmatch")), - ((1, 2, 4, 4), 1, ("std_1mismatch", "std_1mismatch")), - ((9, 6, 6, 9), 4, ("std_allmatch", "std_allmismatch")), - ((9, 6, 6, 5), 4, ("std_1mismatch", "std_allmismatch")), - ((8, 4, 2, 1), 4, ("std_allmismatch", "std_allmismatch")), + ((1, 2, 4, 8), 0, ("A*07:08:09G", "A*07:08:09G")), + ((1, 2, 4, 12), 1, ("A*04:01:01", "A*07:08:09G")), + ((1, 2, 4, 4), 1, ("A*04:01:01", "A*04:01:01")), + ((9, 6, 6, 9), 4, ("A*07:08:09G", "A*11:01:01:01")), + ((9, 6, 6, 5), 4, ("A*04:01:01", "A*11:01:01:01")), + ((8, 4, 2, 1), 4, ("A*11:01:01:01", "A*11:01:01:01")), ], id="all_combos_below_threshold", ), @@ -467,25 +503,25 @@ def hla_algorithm(): (1, 2, 4, 8), [ HLAStandardMatch( - allele="std1", + allele="B*57:01:02", two=(2, 2), three=(4, 4), mismatch=2, ), HLAStandardMatch( - allele="std2", + allele="B*58:22:33G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std3", + allele="B*62:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std4", + allele="B*110:01:01:01N", two=(2, 2), three=(4, 8), mismatch=1, @@ -493,19 +529,62 @@ def hla_algorithm(): ], [2, 3, 10], [ - ((2, 2, 4, 4), 2, ("std1", "std1")), - ((3, 2, 4, 12), 2, ("std1", "std2")), - ((1, 2, 4, 8), 0, ("std2", "std2")), - ((3, 2, 4, 4), 2, ("std1", "std3")), - ((1, 2, 4, 12), 1, ("std2", "std3")), - ((1, 2, 4, 4), 1, ("std3", "std3")), - ((2, 2, 4, 12), 2, ("std1", "std4")), - ((3, 2, 4, 8), 1, ("std2", "std4")), - ((3, 2, 4, 12), 2, ("std3", "std4")), - ((2, 2, 4, 8), 1, ("std4", "std4")), + ((2, 2, 4, 4), 2, ("B*57:01:02", "B*57:01:02")), + ((3, 2, 4, 12), 2, ("B*57:01:02", "B*58:22:33G")), + ((1, 2, 4, 8), 0, ("B*58:22:33G", "B*58:22:33G")), + ((3, 2, 4, 4), 2, ("B*57:01:02", "B*62:01")), + ((1, 2, 4, 12), 1, ("B*58:22:33G", "B*62:01")), + ((1, 2, 4, 4), 1, ("B*62:01", "B*62:01")), + ((2, 2, 4, 12), 2, ("B*57:01:02", "B*110:01:01:01N")), + ((3, 2, 4, 8), 1, ("B*58:22:33G", "B*110:01:01:01N")), + ((3, 2, 4, 12), 2, ("B*62:01", "B*110:01:01:01N")), + ((2, 2, 4, 8), 1, ("B*110:01:01:01N", "B*110:01:01:01N")), ], id="several_standards_produce_same_sequence", ), + pytest.param( + (1, 2, 4, 8), + [ + HLAStandardMatch( + allele="B*57:01:02", + two=(2, 2), + three=(4, 4), + mismatch=2, + ), + HLAStandardMatch( + allele="B*220:100:01", + two=(1, 2), + three=(4, 8), + mismatch=0, + ), + HLAStandardMatch( + allele="B*220:22:02", + two=(1, 2), + three=(4, 4), + mismatch=1, + ), + HLAStandardMatch( + allele="B*110:01:01:01N", + two=(2, 2), + three=(4, 8), + mismatch=1, + ), + ], + [2, 3, 10], + [ + ((2, 2, 4, 4), 2, ("B*57:01:02", "B*57:01:02")), + ((3, 2, 4, 12), 2, ("B*57:01:02", "B*220:100:01")), + ((1, 2, 4, 8), 0, ("B*220:100:01", "B*220:100:01")), + ((3, 2, 4, 4), 2, ("B*57:01:02", "B*220:22:02")), + ((1, 2, 4, 12), 1, ("B*220:22:02", "B*220:100:01")), + ((1, 2, 4, 4), 1, ("B*220:22:02", "B*220:22:02")), + ((2, 2, 4, 12), 2, ("B*57:01:02", "B*110:01:01:01N")), + ((3, 2, 4, 8), 1, ("B*110:01:01:01N", "B*220:100:01")), + ((3, 2, 4, 12), 2, ("B*110:01:01:01N", "B*220:22:02")), + ((2, 2, 4, 8), 1, ("B*110:01:01:01N", "B*110:01:01:01N")), + ], + id="several_standards_produce_same_sequence_pairs_sorted_by_coordinate", + ), ], ) def test_combine_standards_stepper( @@ -532,7 +611,7 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, @@ -542,7 +621,7 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 0, }, id="one_combo_all_matches", @@ -551,7 +630,7 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_twomatch", + allele="A*55:01", two=(1, 4), three=(2, 8), mismatch=2, @@ -561,7 +640,7 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 4, 2, 8), - possible_allele_pairs=(("std_twomatch", "std_twomatch"),), + possible_allele_pairs=(("A*55:01", "A*55:01"),), ): 2, }, id="one_combo_two_mismatches", @@ -570,13 +649,13 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_allmatch2", + allele="A*07:08:10", two=(1, 4), three=(4, 8), mismatch=1, @@ -586,7 +665,7 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 0, }, id="combo_with_mismatch_above_threshold", @@ -595,13 +674,13 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_allmatch2", + allele="A*07:08:10", two=(1, 4), three=(4, 8), mismatch=1, @@ -611,15 +690,15 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 0, HLACombinedStandard( standard_bin=(1, 6, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch2"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:10"),), ): 1, HLACombinedStandard( standard_bin=(1, 4, 4, 8), - possible_allele_pairs=(("std_allmatch2", "std_allmatch2"),), + possible_allele_pairs=(("A*07:08:10", "A*07:08:10"),), ): 1, }, id="several_combos_all_below_threshold", @@ -628,13 +707,13 @@ def test_combine_standards_stepper( (9, 6, 4, 6), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 4), mismatch=0, ), HLAStandardMatch( - allele="std_1mismatch2", + allele="A*04:01:02", two=(8, 4), three=(4, 8), mismatch=1, @@ -644,7 +723,7 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(9, 6, 4, 12), - possible_allele_pairs=(("std_1mismatch2", "std_allmatch"),), + possible_allele_pairs=(("A*04:01:02", "A*07:08:09G"),), ): 1, }, id="best_match_has_mismatch_others_rejected", @@ -654,13 +733,13 @@ def test_combine_standards_stepper( (9, 6, 4, 6), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 4), mismatch=0, ), HLAStandardMatch( - allele="std_1mismatch2", + allele="A*04:01:02", two=(8, 4), three=(4, 8), mismatch=1, @@ -670,15 +749,15 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(9, 6, 4, 12), - possible_allele_pairs=(("std_1mismatch2", "std_allmatch"),), + possible_allele_pairs=(("A*04:01:02", "A*07:08:09G"),), ): 1, HLACombinedStandard( standard_bin=(1, 2, 4, 4), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 3, HLACombinedStandard( standard_bin=(8, 4, 4, 8), - possible_allele_pairs=(("std_1mismatch2", "std_1mismatch2"),), + possible_allele_pairs=(("A*04:01:02", "A*04:01:02"),), ): 3, }, id="all_combos_have_mismatches_below_threshold", @@ -688,7 +767,7 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, @@ -698,7 +777,7 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 4), - possible_allele_pairs=(("std_1mismatch", "std_1mismatch"),), + possible_allele_pairs=(("A*04:01:01", "A*04:01:01"),), ): 1 }, id="one_combo_retained_regardless_of_threshold", @@ -707,7 +786,7 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, @@ -717,7 +796,7 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(8, 4, 2, 1), - possible_allele_pairs=(("std_allmismatch", "std_allmismatch"),), + possible_allele_pairs=(("A*11:01:01:01", "A*11:01:01:01"),), ): 4, }, id="only_combo_retained_regardless_of_threshold_more_mismatches", @@ -727,19 +806,19 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, @@ -749,7 +828,7 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 0, }, id="several_combos_only_one_below_threshold", @@ -758,19 +837,19 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, @@ -780,15 +859,15 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 0, HLACombinedStandard( standard_bin=(1, 2, 4, 12), - possible_allele_pairs=(("std_1mismatch", "std_allmatch"),), + possible_allele_pairs=(("A*04:01:01", "A*07:08:09G"),), ): 1, HLACombinedStandard( standard_bin=(1, 2, 4, 4), - possible_allele_pairs=(("std_1mismatch", "std_1mismatch"),), + possible_allele_pairs=(("A*04:01:01", "A*04:01:01"),), ): 1, }, id="several_combos_only_ones_below_threshold_retained", @@ -797,19 +876,19 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, @@ -819,27 +898,27 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 0, HLACombinedStandard( standard_bin=(1, 2, 4, 12), - possible_allele_pairs=(("std_1mismatch", "std_allmatch"),), + possible_allele_pairs=(("A*04:01:01", "A*07:08:09G"),), ): 1, HLACombinedStandard( standard_bin=(1, 2, 4, 4), - possible_allele_pairs=(("std_1mismatch", "std_1mismatch"),), + possible_allele_pairs=(("A*04:01:01", "A*04:01:01"),), ): 1, HLACombinedStandard( standard_bin=(9, 6, 6, 9), - possible_allele_pairs=(("std_allmatch", "std_allmismatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*11:01:01:01"),), ): 4, HLACombinedStandard( standard_bin=(9, 6, 6, 5), - possible_allele_pairs=(("std_1mismatch", "std_allmismatch"),), + possible_allele_pairs=(("A*04:01:01", "A*11:01:01:01"),), ): 4, HLACombinedStandard( standard_bin=(8, 4, 2, 1), - possible_allele_pairs=(("std_allmismatch", "std_allmismatch"),), + possible_allele_pairs=(("A*11:01:01:01", "A*11:01:01:01"),), ): 4, }, id="several_combos_all_below_threshold_retained", @@ -848,19 +927,19 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, ), HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, @@ -870,7 +949,7 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 0, }, id="standard_as_second_part_of_combo_worse_than_already_known_combo_skipped_only_best_retained", @@ -879,19 +958,19 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, ), HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, @@ -901,15 +980,15 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): 0, HLACombinedStandard( standard_bin=(1, 2, 4, 12), - possible_allele_pairs=(("std_1mismatch", "std_allmatch"),), + possible_allele_pairs=(("A*04:01:01", "A*07:08:09G"),), ): 1, HLACombinedStandard( standard_bin=(1, 2, 4, 4), - possible_allele_pairs=(("std_1mismatch", "std_1mismatch"),), + possible_allele_pairs=(("A*04:01:01", "A*04:01:01"),), ): 1, }, id="standard_as_second_part_of_combo_worse_than_already_known_combo_skipped_under_threshold_retained", @@ -918,25 +997,25 @@ def test_combine_standards_stepper( (1, 2, 4, 8), [ HLAStandardMatch( - allele="std1", + allele="B*57:01:02", two=(2, 2), three=(4, 4), mismatch=2, ), HLAStandardMatch( - allele="std2", + allele="B*58:22:33G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std3", + allele="B*62:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std4", + allele="B*110:01:01:01N", two=(2, 2), three=(4, 8), mismatch=1, @@ -946,46 +1025,118 @@ def test_combine_standards_stepper( { HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std2", "std2"),), + possible_allele_pairs=(("B*58:22:33G", "B*58:22:33G"),), ): 0, HLACombinedStandard( standard_bin=(2, 2, 4, 4), - possible_allele_pairs=(("std1", "std1"),), + possible_allele_pairs=(("B*57:01:02", "B*57:01:02"),), ): 2, HLACombinedStandard( standard_bin=(1, 2, 4, 4), - possible_allele_pairs=(("std3", "std3"),), + possible_allele_pairs=(("B*62:01", "B*62:01"),), ): 1, HLACombinedStandard( standard_bin=(2, 2, 4, 8), - possible_allele_pairs=(("std4", "std4"),), + possible_allele_pairs=(("B*110:01:01:01N", "B*110:01:01:01N"),), ): 1, HLACombinedStandard( standard_bin=(3, 2, 4, 12), possible_allele_pairs=( - ("std1", "std2"), - ("std3", "std4"), + ("B*57:01:02", "B*58:22:33G"), + ("B*62:01", "B*110:01:01:01N"), ), ): 2, HLACombinedStandard( standard_bin=(3, 2, 4, 4), - possible_allele_pairs=(("std1", "std3"),), + possible_allele_pairs=(("B*57:01:02", "B*62:01"),), ): 2, HLACombinedStandard( standard_bin=(2, 2, 4, 12), - possible_allele_pairs=(("std1", "std4"),), + possible_allele_pairs=(("B*57:01:02", "B*110:01:01:01N"),), ): 2, HLACombinedStandard( standard_bin=(1, 2, 4, 12), - possible_allele_pairs=(("std2", "std3"),), + possible_allele_pairs=(("B*58:22:33G", "B*62:01"),), ): 1, HLACombinedStandard( standard_bin=(3, 2, 4, 8), - possible_allele_pairs=(("std2", "std4"),), + possible_allele_pairs=(("B*58:22:33G", "B*110:01:01:01N"),), ): 1, }, id="several_standards_produce_same_sequence", ), + pytest.param( + (1, 2, 4, 8), + [ + HLAStandardMatch( + allele="B*57:01:02", + two=(2, 2), + three=(4, 4), + mismatch=2, + ), + HLAStandardMatch( + allele="B*220:100:01", + two=(1, 2), + three=(4, 8), + mismatch=0, + ), + HLAStandardMatch( + allele="B*220:22:02", + two=(1, 2), + three=(4, 4), + mismatch=1, + ), + HLAStandardMatch( + allele="B*110:01:01:01N", + two=(2, 2), + three=(4, 8), + mismatch=1, + ), + ], + [2, 3, 10], + { + HLACombinedStandard( + standard_bin=(1, 2, 4, 8), + possible_allele_pairs=(("B*220:100:01", "B*220:100:01"),), + ): 0, + HLACombinedStandard( + standard_bin=(2, 2, 4, 4), + possible_allele_pairs=(("B*57:01:02", "B*57:01:02"),), + ): 2, + HLACombinedStandard( + standard_bin=(1, 2, 4, 4), + possible_allele_pairs=(("B*220:22:02", "B*220:22:02"),), + ): 1, + HLACombinedStandard( + standard_bin=(2, 2, 4, 8), + possible_allele_pairs=(("B*110:01:01:01N", "B*110:01:01:01N"),), + ): 1, + HLACombinedStandard( + standard_bin=(3, 2, 4, 12), + possible_allele_pairs=( + ("B*57:01:02", "B*220:100:01"), + ("B*110:01:01:01N", "B*220:22:02"), + ), + ): 2, + HLACombinedStandard( + standard_bin=(3, 2, 4, 4), + possible_allele_pairs=(("B*57:01:02", "B*220:22:02"),), + ): 2, + HLACombinedStandard( + standard_bin=(2, 2, 4, 12), + possible_allele_pairs=(("B*57:01:02", "B*110:01:01:01N"),), + ): 2, + HLACombinedStandard( + standard_bin=(1, 2, 4, 12), + possible_allele_pairs=(("B*220:22:02", "B*220:100:01"),), + ): 1, + HLACombinedStandard( + standard_bin=(3, 2, 4, 8), + possible_allele_pairs=(("B*110:01:01:01N", "B*220:100:01"),), + ): 1, + }, + id="several_standards_produce_same_sequence_pairs_sorted_by_coordinate", + ), ], ) def test_combine_standards( @@ -1203,17 +1354,17 @@ def test_get_mismatches_errors( "B": [], "C": [ HLAStandard( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), ), HLAStandard( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), ), HLAStandard( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), ), @@ -1231,11 +1382,11 @@ def test_get_mismatches_errors( matches={ HLACombinedStandard( standard_bin=(1, 2, 4, 8), - possible_allele_pairs=(("std_allmatch", "std_allmatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*07:08:09G"),), ): HLAMatchDetails(mismatches=[]), HLACombinedStandard( standard_bin=(1, 2, 4, 12), - possible_allele_pairs=(("std_1mismatch", "std_allmatch"),), + possible_allele_pairs=(("A*04:01:01", "A*07:08:09G"),), ): HLAMatchDetails( mismatches=[ HLAMismatch(index=4, standard_base="K", sequence_base="T"), @@ -1243,7 +1394,7 @@ def test_get_mismatches_errors( ), HLACombinedStandard( standard_bin=(1, 2, 4, 4), - possible_allele_pairs=(("std_1mismatch", "std_1mismatch"),), + possible_allele_pairs=(("A*04:01:01", "A*04:01:01"),), ): HLAMatchDetails( mismatches=[ HLAMismatch(index=4, standard_base="G", sequence_base="T"), @@ -1251,7 +1402,7 @@ def test_get_mismatches_errors( ), HLACombinedStandard( standard_bin=(9, 6, 6, 9), - possible_allele_pairs=(("std_allmatch", "std_allmismatch"),), + possible_allele_pairs=(("A*07:08:09G", "A*11:01:01:01"),), ): HLAMatchDetails( mismatches=[ HLAMismatch(index=1, standard_base="W", sequence_base="A"), @@ -1262,7 +1413,7 @@ def test_get_mismatches_errors( ), HLACombinedStandard( standard_bin=(9, 6, 6, 5), - possible_allele_pairs=(("std_1mismatch", "std_allmismatch"),), + possible_allele_pairs=(("A*04:01:01", "A*11:01:01:01"),), ): HLAMatchDetails( mismatches=[ HLAMismatch(index=1, standard_base="W", sequence_base="A"), @@ -1273,7 +1424,7 @@ def test_get_mismatches_errors( ), HLACombinedStandard( standard_bin=(8, 4, 2, 1), - possible_allele_pairs=(("std_allmismatch", "std_allmismatch"),), + possible_allele_pairs=(("A*11:01:01:01", "A*11:01:01:01"),), ): HLAMatchDetails( mismatches=[ HLAMismatch(index=1, standard_base="T", sequence_base="A"), @@ -1481,17 +1632,17 @@ def test_interpret_good_cases( "B": [], "C": [ HLAStandard( - allele="std_1", + allele="B*01:01:01", two=(2, 4, 8, 1, 10, 2), three=(8, 1, 5, 7, 11, 1), ), HLAStandard( - allele="std_2", + allele="B*02:02:02", two=(8, 4, 2, 1, 10, 2), three=(4, 8, 10, 11, 4, 1), ), HLAStandard( - allele="std_3", + allele="B*03:03:03", two=(1, 2, 4, 4, 5, 8), three=(8, 8, 5, 8, 11, 4), ), @@ -2197,11 +2348,11 @@ def test_use_config_all_defaults( # pytest.param( (1, 2, 4, 8), - [HLAStandard(allele="std_allmismatch", two=(1, 2), three=(4, 8))], + [HLAStandard(allele="A*11:01:01:01", two=(1, 2), three=(4, 8))], 5, [ HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(1, 2), three=(4, 8), mismatch=0, @@ -2211,11 +2362,11 @@ def test_use_config_all_defaults( ), pytest.param( (1, 2, 4, 8), - [HLAStandard(allele="std_allmismatch", two=(1, 2), three=(4, 4))], + [HLAStandard(allele="A*11:01:01:01", two=(1, 2), three=(4, 4))], 5, [ HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(1, 2), three=(4, 4), mismatch=1, @@ -2225,11 +2376,11 @@ def test_use_config_all_defaults( ), pytest.param( (1, 3, 4, 8), - [HLAStandard(allele="std_mixturematch", two=(1, 2), three=(4, 8))], + [HLAStandard(allele="C*72:01:03", two=(1, 2), three=(4, 8))], 5, [ HLAStandardMatch( - allele="std_mixturematch", + allele="C*72:01:03", two=(1, 2), three=(4, 8), mismatch=0, @@ -2239,11 +2390,11 @@ def test_use_config_all_defaults( ), pytest.param( (1, 2, 4, 8), - [HLAStandard(allele="std_allmismatch", two=(8, 4), three=(2, 1))], + [HLAStandard(allele="A*11:01:01:01", two=(8, 4), three=(2, 1))], 5, [ HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, @@ -2255,7 +2406,7 @@ def test_use_config_all_defaults( (1, 2, 4, 8, 3, 5, 7, 9), [ HLAStandard( - allele="std_mismatch_over_threshold", + allele="C*55:01:02", two=(1, 2, 8, 4), three=(4, 8, 8, 1), ) @@ -2267,26 +2418,26 @@ def test_use_config_all_defaults( pytest.param( (1, 2, 4, 8), [ - HLAStandard(allele="std_allmatch", two=(1, 2), three=(4, 8)), - HLAStandard(allele="std_1mismatch", two=(1, 2), three=(4, 4)), - HLAStandard(allele="std_allmismatch", two=(8, 4), three=(2, 1)), + HLAStandard(allele="A*07:08:09G", two=(1, 2), three=(4, 8)), + HLAStandard(allele="A*04:01:01", two=(1, 2), three=(4, 4)), + HLAStandard(allele="A*11:01:01:01", two=(8, 4), three=(2, 1)), ], 5, [ HLAStandardMatch( - allele="std_allmatch", + allele="A*07:08:09G", two=(1, 2), three=(4, 8), mismatch=0, ), HLAStandardMatch( - allele="std_1mismatch", + allele="A*04:01:01", two=(1, 2), three=(4, 4), mismatch=1, ), HLAStandardMatch( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4), three=(2, 1), mismatch=4, @@ -2298,22 +2449,22 @@ def test_use_config_all_defaults( (1, 3, 4, 8, 2, 5, 4, 1), [ HLAStandard( - allele="std_mixturematch", + allele="C*72:01:03", two=(1, 2, 4, 8), three=(2, 1, 4, 1), ), HLAStandard( - allele="std_2mismatch", + allele="A*02:02:02", two=(1, 4, 4, 4), three=(2, 4, 4, 1), ), HLAStandard( - allele="std_allmismatch", + allele="A*11:01:01:01", two=(8, 4, 2, 1), three=(1, 8, 8, 8), ), HLAStandard( - allele="std_4mismatch", + allele="A*04:04:04:04", two=(8, 4, 2, 1), three=(2, 1, 4, 1), ), @@ -2321,19 +2472,19 @@ def test_use_config_all_defaults( 5, [ HLAStandardMatch( - allele="std_mixturematch", + allele="C*72:01:03", two=(1, 2, 4, 8), three=(2, 1, 4, 1), mismatch=0, ), HLAStandardMatch( - allele="std_2mismatch", + allele="A*02:02:02", two=(1, 4, 4, 4), three=(2, 4, 4, 1), mismatch=2, ), HLAStandardMatch( - allele="std_4mismatch", + allele="A*04:04:04:04", two=(8, 4, 2, 1), three=(2, 1, 4, 1), mismatch=4, diff --git a/tests/models_test.py b/tests/models_test.py index 8edb221..d3e020e 100644 --- a/tests/models_test.py +++ b/tests/models_test.py @@ -5,6 +5,7 @@ from hla_algorithm.models import ( AllelePairs, + GeneCoord, HLACombinedStandard, HLAInterpretation, HLAMatchDetails, @@ -732,9 +733,530 @@ def test_get_unambiguous_allele_pairs( exp_result: list[tuple[str, str]], ): allele_pairs = AllelePairs(allele_pairs=raw_alleles) - result = allele_pairs.get_unambiguous_allele_pairs(frequencies) + result = allele_pairs._get_unambiguous_allele_pairs(frequencies) assert result == exp_result + @pytest.mark.parametrize( + ( + "unambiguous_pairs, expected_common_prefix, " + "expected_second_prefix, expected_remaining_prefixes" + ), + [ + pytest.param([], (), None, [], id="empty_input"), + pytest.param( + [(("B*01", "01", "04"), ("B*02", "03"))], + ("B*01", "01", "04"), + ("B*02", "03"), + [], + id="single_pair_different_coord_counts", + ), + pytest.param( + [(("B*01"), ("B*02"))], + ("B*01"), + ("B*02"), + [], + id="single_pair_both_one_coord", + ), + pytest.param( + [(("B*01", "02"), ("B*02", "03"))], + ("B*01", "02"), + ("B*02", "03"), + [], + id="single_pair_both_one_coord", + ), + pytest.param( + [(("B*01", "01", "04"), ("B*02", "03", "07N"))], + ("B*01", "01", "04"), + ("B*02", "03", "07N"), + [], + id="single_pair_both_three_coords", + ), + pytest.param( + [(("B*01", "01", "03", "04"), ("B*02", "03", "05", "07N"))], + ("B*01", "01", "03", "04"), + ("B*02", "03", "05", "07N"), + [], + id="single_pair_both_four_coords", + ), + pytest.param( + [(("B*01", "01", "03", "04"), ("B*01", "01", "03", "04"))], + ("B*01", "01", "03", "04"), + None, + [("B*01", "01", "03", "04")], + id="single_pair_homozygous", + ), + pytest.param( + [(("B*01", "01"), ("B*01", "01"))], + ("B*01", "01"), + None, + [("B*01", "01")], + id="single_pair_homozygous_shorter_allele", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*02", "03", "05", "07N")), + (("B*01", "01", "03", "04"), ("B*02", "03", "11")), + ], + ("B*01", "01", "03", "04"), + None, + [("B*02", "03", "05", "07N"), ("B*02", "03", "11")], + id="two_pairs_best_both_first_four_coords", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*02", "03", "05", "07N")), + (("B*01", "01", "03", "11"), ("B*02", "03", "05", "07N")), + ], + ("B*02", "03", "05", "07N"), + None, + [("B*01", "01", "03", "04"), ("B*01", "01", "03", "11")], + id="two_pairs_best_both_second_four_coords", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "03", "05", "07N"), ("B*01", "04", "11", "110N")), + ], + ("B*01", "03", "05", "07N"), + None, + [("B*01", "01", "03", "04"), ("B*01", "04", "11", "110N")], + id="two_pairs_best_different_positions_four_coords", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + ], + ("B*01", "01", "03", "04"), + ("B*01", "03", "05", "07N"), + [], + id="two_exact_full_length_matches", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "01", "03", "04")), + (("B*01", "01", "03", "04"), ("B*01", "01", "03", "04")), + ], + ("B*01", "01", "03", "04"), + None, + [("B*01", "01", "03", "04"), ("B*01", "01", "03", "04")], + id="two_exact_full_length_matches_homozygous", + ), + pytest.param( + [ + (("B*01", "01", "03"), ("B*01", "01", "03")), + (("B*01", "01", "03"), ("B*01", "01", "03")), + ], + ("B*01", "01", "03"), + None, + [("B*01", "01", "03"), ("B*01", "01", "03")], + id="two_exact_shorter_matches_homozygous", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03")), + (("B*01", "01", "03", "04"), ("B*01", "03")), + ], + ("B*01", "01", "03", "04"), + ("B*01", "03"), + [], + id="two_exact_matches_second_shorter", + ), + pytest.param( + [ + (("B*01", "01", "22"), ("B*01", "03", "05", "07N")), + (("B*01", "01", "22"), ("B*01", "03", "05", "07N")), + ], + ("B*01", "01", "22"), + ("B*01", "03", "05", "07N"), + [], + id="two_exact_matches_first_shorter", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "01", "03", "04"), ("B*01", "03", "11", "22")), + ], + ("B*01", "01", "03", "04"), + None, + [("B*01", "03", "05", "07N"), ("B*01", "03", "11", "22")], + id="one_exact_match_first_element", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "01", "100", "01"), ("B*01", "03", "05", "07N")), + ], + ("B*01", "03", "05", "07N"), + None, + [("B*01", "01", "03", "04"), ("B*01", "01", "100", "01")], + id="one_exact_match_second_element", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "03", "05", "07N"), ("B*01", "03", "05", "07N")), + ], + ("B*01", "03", "05", "07N"), + None, + [("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")], + id="one_exact_match_one_entire_side_and_another_appearance", + ), + pytest.param( + [ + (("B*01", "01"), ("B*01", "03", "05", "07N")), + (("B*01", "01"), ("B*01", "03", "11", "22")), + ], + ("B*01", "01"), + None, + [("B*01", "03", "05", "07N"), ("B*01", "03", "11", "22")], + id="one_exact_short_match_first_element", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05")), + (("B*01", "01", "100", "01"), ("B*01", "03", "05")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "01", "03", "04"), ("B*01", "01", "100", "01")], + id="one_exact_short_match_second_element", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05")), + (("B*01", "03", "05"), ("B*01", "22", "44", "66G")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "01", "03", "04"), ("B*01", "22", "44", "66G")], + id="one_exact_short_match_different_positions", + ), + pytest.param( + [ + (("B*01", "01"), ("B*01", "01")), + (("B*01", "01"), ("B*01", "03", "44", "66G")), + ], + ("B*01", "01"), + None, + [("B*01", "01"), ("B*01", "03", "44", "66G")], + id="one_exact_short_match_one_entire_side_and_another_appearance", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "15", "01")), + (("B*01", "01", "03", "07N"), ("B*01", "03", "10", "02")), + ], + ("B*01", "01", "03"), + None, + [("B*01", "03", "15"), ("B*01", "03", "10")], + id="nonexact_best_prefix_both_first_position", + ), + pytest.param( + [ + (("B*01", "01", "03"), ("B*01", "03", "15", "01")), + (("B*01", "01", "03", "07N"), ("B*01", "03", "10", "02")), + ], + ("B*01", "01", "03"), + None, + [("B*01", "03", "15"), ("B*01", "03", "10")], + id="nonexact_best_prefix_different_length_pairs_both_first_position", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "01", "100", "01"), ("B*01", "03", "05", "110")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "01", "03"), ("B*01", "01", "100")], + id="nonexact_best_prefix_both_second_position", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "01", "100", "01"), ("B*01", "03", "05")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "01", "03"), ("B*01", "01", "100")], + id="nonexact_best_prefix_different_length_pairs_both_second_position", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "03", "05", "01"), ("B*01", "03", "110", "01")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "01", "03"), ("B*01", "03", "110")], + id="nonexact_best_prefix_match_different_positions", + ), + pytest.param( + [ + (("B*01", "01", "03", "04"), ("B*01", "03", "05")), + (("B*01", "03", "05", "01"), ("B*01", "03", "110", "01")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "01", "03"), ("B*01", "03", "110")], + id="nonexact_best_prefix_match_different_lengths_different_positions", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "03", "05", "18"), ("B*01", "03", "110", "01")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "03", "05"), ("B*01", "03", "110")], + id="nonexact_best_prefix_match_entire_side_and_another_appearance", + ), + pytest.param( + [ + (("B*01", "03", "05"), ("B*01", "03", "05", "07N")), + (("B*01", "03", "05", "18"), ("B*01", "03", "110", "01")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "03", "05"), ("B*01", "03", "110")], + id="nonexact_best_prefix_different_lengths_match_entire_side_and_another_appearance", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*01", "04", "05", "07N")), + (("B*01", "03", "05", "18"), ("B*01", "04", "05", "01")), + ], + ("B*01", "03", "05"), + ("B*01", "04", "05"), + [], + id="two_nonexact_best_prefixes", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*01", "04", "05", "07N")), + (("B*01", "03", "05"), ("B*01", "04", "05")), + ], + ("B*01", "03", "05"), + ("B*01", "04", "05"), + [], + id="two_nonexact_best_prefixes_different_lengths", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*01", "04", "05", "07N")), + (("B*01", "03", "08", "18"), ("B*01", "04", "08", "01")), + ], + ("B*01", "03"), + ("B*01", "04"), + [], + id="two_nonexact_shorter_best_prefixes", + ), + pytest.param( + [ + (("B*01", "03", "05"), ("B*01", "04", "05", "07N")), + (("B*01", "03", "08", "18"), ("B*01", "04")), + ], + ("B*01", "03"), + ("B*01", "04"), + [], + id="two_nonexact_shorter_best_prefixes_different_lengths", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*01", "03", "05", "07N")), + (("B*01", "03", "05", "18"), ("B*01", "03", "05", "20")), + ], + ("B*01", "03", "05"), + None, + [("B*01", "03", "05"), ("B*01", "03", "05")], + id="one_nonexact_best_prefix_all_places", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*01", "03", "08", "07N")), + (("B*01", "03", "22", "18"), ("B*01", "03", "100", "20")), + ], + ("B*01", "03"), + None, + [("B*01", "03"), ("B*01", "03")], + id="one_nonexact_shorter_best_prefix_all_places", + ), + pytest.param( + [ + (("B*01", "03"), ("B*01", "03", "08")), + (("B*01", "03", "22"), ("B*01", "03", "100", "20")), + ], + ("B*01", "03"), + None, + [("B*01", "03"), ("B*01", "03")], + id="one_nonexact_shorter_best_prefix_all_places_different_lengths", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*02", "08", "08", "07N")), + (("B*01", "15", "22", "18"), ("B*02", "03", "100", "20")), + ], + ("B*01",), + ("B*02",), + [], + id="only_first_positions_match_different_first_coordinate", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*02", "08", "08")), + (("B*01", "15"), ("B*02", "03", "100", "20")), + ], + ("B*01",), + ("B*02",), + [], + id="only_first_positions_match_different_first_coordinate_different_lengths", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*01", "08", "08", "07N")), + (("B*01", "15", "22", "18"), ("B*01", "30", "100", "20")), + ], + ("B*01",), + None, + [("B*01",), ("B*01",)], + id="only_first_positions_match_same_first_coordinate", + ), + pytest.param( + [ + (("B*01",), ("B*01", "08", "08")), + (("B*01", "15", "22", "18"), ("B*01", "30", "100")), + ], + ("B*01",), + None, + [("B*01",), ("B*01",)], + id="only_first_positions_match_same_first_coordinate_different_lengths", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*02", "08")), + (("B*01", "03", "22", "18"), ("B*02", "30", "100")), + (("B*01", "03", "55", "01G"), ("B*02", "30", "100", "20")), + ], + ("B*01", "03"), + None, + [("B*02", "08"), ("B*02", "30"), ("B*02", "30")], + id="typical_case_no_exact_match_one_prefix_found", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*02", "30")), + (("B*01", "03", "22", "18"), ("B*02", "30", "100")), + (("B*01", "03", "55", "01G"), ("B*02", "30", "100", "20")), + ], + ("B*01", "03"), + ("B*02", "30"), + [], + id="typical_case_two_prefixes_found", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*02", "30")), + (("B*01", "03", "22"), ("B*02", "30")), + (("B*01", "03"), ("B*02", "30")), + ], + ("B*02", "30"), + None, + [("B*01", "03", "05", "04"), ("B*01", "03", "22"), ("B*01", "03")], + id="typical_case_exact_prefix_found", + ), + pytest.param( + [ + (("B*01", "03", "05", "04"), ("B*01", "30")), + (("B*01", "03", "22"), ("B*01", "30")), + (("B*01", "30"), ("B*01", "30")), + ], + ("B*01", "30"), + None, + [("B*01", "03", "05", "04"), ("B*01", "03", "22"), ("B*01", "30")], + id="typical_case_exact_prefix_found_one_whole_side_and_other_appearance", + ), + ], + ) + def test_identify_clean_prefix_in_pairs( + self, + unambiguous_pairs: list[tuple[GeneCoord, GeneCoord]], + expected_common_prefix: GeneCoord, + expected_second_prefix: Optional[GeneCoord], + expected_remaining_prefixes: list[GeneCoord], + ): + intermediate_result: AllelePairs.CleanPrefixIntermediateResult = ( + AllelePairs._identify_clean_prefix_in_pairs(unambiguous_pairs) + ) + if expected_second_prefix is not None: + assert {intermediate_result.common_prefix, intermediate_result.second_prefix} == { + expected_common_prefix, + expected_second_prefix, + } + else: + assert intermediate_result.common_prefix == expected_common_prefix + assert intermediate_result.second_prefix == expected_second_prefix + assert intermediate_result.remaining_prefixes == expected_remaining_prefixes + + @pytest.mark.parametrize( + "allele_prefixes, expected_result", + [ + pytest.param( + [], + (), + id="trivial_case", + ), + # Note: we have no single allele tests because that contradicts one + # of our preconditions. + pytest.param( + [("C*01", "02", "03", "04G"), ("C*01", "02", "03", "110N")], + ("C*01", "02", "03"), + id="best_match_length_3", + ), + pytest.param( + [("C*01", "02", "03"), ("C*01", "02", "03", "110N")], + ("C*01", "02", "03"), + id="best_match_length_3_different_length_inputs", + ), + pytest.param( + [("C*01", "02", "03", "04G"), ("C*01", "02", "11", "110N")], + ("C*01", "02"), + id="best_match_length_2", + ), + pytest.param( + [("C*01", "02", "03", "04G"), ("C*01", "02", "11")], + ("C*01", "02"), + id="best_match_length_2_different_lengths_both_with_excess", + ), + pytest.param( + [("C*01", "02", "03", "04G"), ("C*01", "02")], + ("C*01", "02"), + id="best_match_length_2_different_lengths_one_with_no_excess", + ), + pytest.param( + [("C*01", "02", "03", "04G"), ("C*01", "07", "01", "110N")], + ("C*01",), + id="best_match_length_1", + ), + pytest.param( + [("C*01", "02", "03"), ("C*01", "07")], + ("C*01",), + id="best_match_length_1_different_lengths_both_with_excess", + ), + pytest.param( + [("C*01",), ("C*01", "07", "01")], + ("C*01",), + id="best_match_length_1_different_lengths_one_with_no_excess", + ), + ], + ) + def test_identify_longest_prefix( + self, + allele_prefixes: list[GeneCoord], + expected_result: GeneCoord, + ): + assert AllelePairs._identify_longest_prefix(allele_prefixes) == expected_result + @pytest.mark.parametrize( "raw_allele_pairs, frequencies, expected_result, expected_unambiguous_set", [ @@ -1035,6 +1557,31 @@ def test_get_unambiguous_allele_pairs( }, id="ambiguous_set_frequencies_dictate_best_allele_choice", ), + pytest.param( + [ + ("B*01:33", "B*100:123"), + ("B*04:33", "B*04:123"), # this is the "best" one + ("B*04:123", "B*04:123:22G"), + ("B*04:123:22", "B*04:133:45:66N"), + ("B*04:123:22", "B*04:152"), + ], + { + HLAProteinPair( + first_field_1="04", + first_field_2="33", + second_field_1="04", + second_field_2="123", + ): 150, + }, + "B*04 - B*04:123", + { + ("B*04:33", "B*04:123"), + ("B*04:123", "B*04:123:22G"), + ("B*04:123:22", "B*04:133:45:66N"), + ("B*04:123:22", "B*04:152"), + }, + id="best_match_identified_on_two_sides", + ), ], ) def test_best_common_allele_pair_str( @@ -1056,7 +1603,7 @@ def test_best_common_allele_pair_str( @pytest.mark.parametrize( "combined_standards, exp_alleles", [ - ( + pytest.param( [ HLACombinedStandard( standard_bin=(1, 4, 9, 4), @@ -1083,19 +1630,20 @@ def test_best_common_allele_pair_str( ("A*02:01:02", "A*03:01:12"), ("A*02:01:36", "A*03:01:38"), ("A*02:01:52", "A*03:01:03"), - ("A*02:195", "A*03:23:01"), ("A*02:20:01", "A*03:157"), - ("A*02:237", "A*03:05:01"), ("A*02:24:01", "A*03:17:01"), ("A*02:26", "A*03:07"), - ("A*02:338", "A*03:95"), ("A*02:34", "A*03:08"), ("A*02:35:01", "A*03:108"), ("A*02:86", "A*03:123"), ("A*02:90", "A*03:09"), + ("A*02:195", "A*03:23:01"), + ("A*02:237", "A*03:05:01"), + ("A*02:338", "A*03:95"), ], + id="single_standard_pairs_need_sorting", ), - ( + pytest.param( [ HLACombinedStandard( standard_bin=(1, 4, 5, 9), @@ -1113,8 +1661,45 @@ def test_best_common_allele_pair_str( ("A*11:19", "A*26:13"), ("A*11:40", "A*66:01G"), ], + id="single_standard_pairs_sorted_trivially", ), - ( + pytest.param( + [ + HLACombinedStandard( + standard_bin=(1, 4, 5, 9), + possible_allele_pairs=( + ("A*11:01:01G", "A*26:01:01G"), + ("A*11:01:07", "A*26:01:17"), + ("A*11:190", "A*26:13"), + ("A*11:40", "A*66:101G"), + ), + ) + ], + [ + ("A*11:01:01G", "A*26:01:01G"), + ("A*11:01:07", "A*26:01:17"), + ("A*11:40", "A*66:101G"), + ("A*11:190", "A*26:13"), + ], + id="single_standard_pairs_sorted_with_three_digit_coordinate", + ), + pytest.param( + [ + HLACombinedStandard( + standard_bin=(1, 4, 5, 9), + possible_allele_pairs=( + ("A*11:01:01G", "A*100:01:17"), + ("A*11:01:01G", "A*26:01:01G"), + ), + ) + ], + [ + ("A*11:01:01G", "A*26:01:01G"), + ("A*11:01:01G", "A*100:01:17"), + ], + id="single_standard_pairs_sorted_with_three_digit_coordinate_second_element_checked", + ), + pytest.param( [ HLACombinedStandard( standard_bin=(1, 4, 5, 9), @@ -1143,102 +1728,48 @@ def test_best_common_allele_pair_str( ("A*24:25:26", "A*27:28:32"), ("A*32:42", "A*113:110:02:13N"), ], - ), - ], - ) - def test_get_allele_pairs( - self, - combined_standards: list[HLACombinedStandard], - exp_alleles: list[tuple[str, str]], - ): - result_alleles = AllelePairs.get_allele_pairs(combined_standards) - assert result_alleles.allele_pairs == exp_alleles - - @pytest.mark.parametrize( - "raw_allele_pairs, exp_result", - [ - pytest.param( - [], - [], - id="empty_list", - ), - pytest.param( - [ - ("A*11:01:01G", "A*26:01:01G"), - ], - [ - ("A*11:01:01G", "A*26:01:01G"), - ], - id="single_element", - ), - pytest.param( - [ - ("A*11:01:01", "A*26:01:01"), - ("A*12:01:01", "A*26:01:01"), - ], - [ - ("A*11:01:01", "A*26:01:01"), - ("A*12:01:01", "A*26:01:01"), - ], - id="two_elements_trivial_sort", - ), - pytest.param( - [ - ("A*12:01:01", "A*26:01:01"), - ("A*11:01:01", "A*26:01:01"), - ], - [ - ("A*11:01:01", "A*26:01:01"), - ("A*12:01:01", "A*26:01:01"), - ], - id="two_elements_nontrivial_sort", - ), - pytest.param( - [ - ("A*11:01:01G", "A*25:01:01"), - ("A*11:01:01", "A*26:01:01"), - ], - [ - ("A*11:01:01", "A*26:01:01"), - ("A*11:01:01G", "A*25:01:01"), - ], - id="two_elements_letter_vs_no_letter", - ), - pytest.param( - [ - ("A*11:01:01N", "A*25:01:01"), - ("A*11:01:01G", "A*26:01:01"), - ], - [ - ("A*11:01:01G", "A*26:01:01"), - ("A*11:01:01N", "A*25:01:01"), - ], - id="two_elements_letter_tiebreak", + id="several_standards_pairs_sorted_without_three_digit_coordinates", ), pytest.param( [ - ("A*11:01:01G", "A*26:01:01N"), - ("A*11:01:01G", "A*26:01:01G"), - ("A*11:01:07", "A*26:01:17"), - ("A*11:40", "A*66:01G"), + HLACombinedStandard( + standard_bin=(1, 4, 5, 9), + possible_allele_pairs=( + ("A*11:01:01G", "A*26:01:01G"), + ("A*11:01:01G", "A*100:01:17"), + ("A*11:19", "A*26:13"), + ("A*11:40", "A*66:01G"), + ), + ), + HLACombinedStandard( + standard_bin=(1, 4, 5, 9), + possible_allele_pairs=( + ("A*22:33:44:55G", "A*01:02:03"), + ("A*101:25:26", "A*27:28:32"), + ("A*32:42", "A*113:110:02:13N"), + ), + ), ], [ ("A*11:01:01G", "A*26:01:01G"), - ("A*11:01:01G", "A*26:01:01N"), - ("A*11:01:07", "A*26:01:17"), + ("A*11:01:01G", "A*100:01:17"), + ("A*11:19", "A*26:13"), ("A*11:40", "A*66:01G"), + ("A*22:33:44:55G", "A*01:02:03"), + ("A*32:42", "A*113:110:02:13N"), + ("A*101:25:26", "A*27:28:32"), ], - id="typical_case", + id="several_standards_pairs_sorted_with_three_digit_coordinates", ), ], ) - def test_sort_pairs( + def test_combine_allele_pairs( self, - raw_allele_pairs: list[tuple[str, str]], - exp_result: list[tuple[str, str]], + combined_standards: list[HLACombinedStandard], + exp_alleles: list[tuple[str, str]], ): - ap: AllelePairs = AllelePairs(allele_pairs=raw_allele_pairs) - assert ap.sort_pairs() == exp_result + result_alleles = AllelePairs.combine_allele_pairs(combined_standards) + assert result_alleles.allele_pairs == exp_alleles @pytest.mark.parametrize( "raw_allele_pairs, sorted, max_length, exp_stringification", diff --git a/tests/utils_test.py b/tests/utils_test.py index f07cc37..43b33a8 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -24,6 +24,7 @@ InvalidBaseException, StoredHLAStandards, allele_coordinates_sort_key, + allele_pair_sort_key, bin2nuc, calc_padding, check_bases, @@ -37,6 +38,7 @@ nuc2bin, pad_short, prepare_for_checksum, + sort_allele_pairs, ) @@ -1093,6 +1095,121 @@ def test_allele_coordinates_sort_key( assert allele_coordinates_sort_key(allele) == expected_coords +@pytest.mark.parametrize( + "allele_pair, expected_sort_key", + [ + pytest.param( + ("A*01", "A*01"), ((1,), "", (1,), ""), id="both_single_coordinate" + ), + pytest.param( + ("A*01N", "A*01"), + ((1,), "N", (1,), ""), + id="single_coords_first_with_suffix", + ), + pytest.param( + ("A*01", "A*01N"), + ((1,), "", (1,), "N"), + id="single_coords_second_with_suffix", + ), + pytest.param( + ("B*57:02G", "B*57:02:15:02"), + ((57, 2), "G", (57, 2, 15, 2), ""), + id="typical_case", + ), + ], +) +def test_allele_pair_sort_key( + allele_pair: tuple[str, str], + expected_sort_key: tuple[tuple[int, ...], str, tuple[int, ...], str], +): + assert allele_pair_sort_key(allele_pair) == expected_sort_key + + +@pytest.mark.parametrize( + "allele_pairs, exp_result", + [ + pytest.param( + [], + [], + id="empty_list", + ), + pytest.param( + [ + ("A*11:01:01G", "A*26:01:01G"), + ], + [ + ("A*11:01:01G", "A*26:01:01G"), + ], + id="single_element", + ), + pytest.param( + [ + ("A*11:01:01", "A*26:01:01"), + ("A*12:01:01", "A*26:01:01"), + ], + [ + ("A*11:01:01", "A*26:01:01"), + ("A*12:01:01", "A*26:01:01"), + ], + id="two_elements_trivial_sort", + ), + pytest.param( + [ + ("A*12:01:01", "A*26:01:01"), + ("A*11:01:01", "A*26:01:01"), + ], + [ + ("A*11:01:01", "A*26:01:01"), + ("A*12:01:01", "A*26:01:01"), + ], + id="two_elements_nontrivial_sort", + ), + pytest.param( + [ + ("A*11:01:01G", "A*25:01:01"), + ("A*11:01:01", "A*26:01:01"), + ], + [ + ("A*11:01:01", "A*26:01:01"), + ("A*11:01:01G", "A*25:01:01"), + ], + id="two_elements_letter_vs_no_letter", + ), + pytest.param( + [ + ("A*11:01:01N", "A*25:01:01"), + ("A*11:01:01G", "A*26:01:01"), + ], + [ + ("A*11:01:01G", "A*26:01:01"), + ("A*11:01:01N", "A*25:01:01"), + ], + id="two_elements_letter_tiebreak", + ), + pytest.param( + [ + ("A*11:01:01G", "A*26:01:01N"), + ("A*11:01:01G", "A*26:01:01G"), + ("A*11:01:07", "A*26:01:17"), + ("A*11:40", "A*66:01G"), + ], + [ + ("A*11:01:01G", "A*26:01:01G"), + ("A*11:01:01G", "A*26:01:01N"), + ("A*11:01:07", "A*26:01:17"), + ("A*11:40", "A*66:01G"), + ], + id="typical_case", + ), + ], +) +def test_sort_allele_pairs( + allele_pairs: list[tuple[str, str]], + exp_result: list[tuple[str, str]], +): + assert sort_allele_pairs(allele_pairs) == exp_result + + EXON_REFERENCES: dict[HLA_LOCUS, dict[EXON_NAME, str]] = { "A": { "exon2": "ACGT" * 10,