Skip to content

Commit af4045e

Browse files
authored
Merge pull request #10 from cfe-lab/FoldClinicalIn
Closes #9.
2 parents 57794af + 0e31c25 commit af4045e

File tree

7 files changed

+1245
-316
lines changed

7 files changed

+1245
-316
lines changed

src/hla_algorithm/hla_algorithm.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
BIN2NUC,
2424
HLA_LOCUS,
2525
StoredHLAStandards,
26+
allele_coordinates_sort_key,
2627
count_strict_mismatches,
2728
nuc2bin,
29+
sort_allele_pairs,
2830
)
2931

3032
DATE_FORMAT = "%a %b %d %H:%M:%S %Z %Y"
@@ -277,7 +279,13 @@ def combine_standards_stepper(
277279
# that looks like what you get when you sequence HLA.
278280
std_bin = np.array(std_b.sequence) | np.array(std_a.sequence)
279281
allele_pair: tuple[str, str] = cast(
280-
tuple[str, str], tuple(sorted((std_a.allele, std_b.allele)))
282+
tuple[str, str],
283+
tuple(
284+
sorted(
285+
(std_a.allele, std_b.allele),
286+
key=allele_coordinates_sort_key,
287+
)
288+
),
281289
)
282290

283291
# There could be more than one combined standard with the
@@ -363,7 +371,7 @@ def combine_standards(
363371
if mismatch_count <= cutoff:
364372
combined_std: HLACombinedStandard = HLACombinedStandard(
365373
standard_bin=combined_std_bin,
366-
possible_allele_pairs=tuple(sorted(pair_list)),
374+
possible_allele_pairs=tuple(sort_allele_pairs(pair_list)),
367375
)
368376
result[combined_std] = mismatch_count
369377

src/hla_algorithm/interpret_from_json_lib.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
check_bases,
1919
check_length,
2020
nuc2bin,
21+
sort_allele_pairs,
2122
)
2223

2324

@@ -143,7 +144,7 @@ def build_from_interpretation(
143144

144145
return HLAResult(
145146
seqs=seqs,
146-
alleles_all=[f"{x[0]} - {x[1]}" for x in aps.sort_pairs()],
147+
alleles_all=[f"{x[0]} - {x[1]}" for x in sort_allele_pairs(aps.allele_pairs)],
147148
alleles_clean=alleles_clean,
148149
alleles_for_mismatches=f"{rep_ap[0]} - {rep_ap[1]}",
149150
mismatches=[str(x) for x in match_details.mismatches],

src/hla_algorithm/models.py

Lines changed: 150 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import re
22
from collections.abc import Iterable
3+
from dataclasses import dataclass, field
34
from operator import itemgetter
45
from typing import Final, Optional
56

@@ -14,6 +15,7 @@
1415
bin2nuc,
1516
count_forgiving_mismatches,
1617
nuc2bin,
18+
sort_allele_pairs,
1719
)
1820

1921

@@ -212,16 +214,17 @@ def from_frequency_entry(
212214
)
213215

214216

217+
GeneCoord = tuple[str, ...]
218+
219+
215220
class AllelePairs(BaseModel):
216221
allele_pairs: list[tuple[str, str]]
217222

218223
def is_homozygous(self) -> bool:
219224
"""
220-
Determine the homozygousness of alleles.
221-
222-
Homozygousity meaning a pair is matching on both sides, ex:
223-
`Cw*0722 - Cw*0722`
225+
Determine the homozygousness of these allele pairs.
224226
227+
A pair is homozygous if both elements match, e.g. C*07:22 - C*07:22.
225228
If *any* pair of alleles matches, then we declare the whole set to be
226229
homozygous.
227230
@@ -287,7 +290,7 @@ def get_protein_pairs(self) -> set[HLAProteinPair]:
287290
for e in self.get_paired_gene_coordinates(True)
288291
}
289292

290-
def get_unambiguous_allele_pairs(
293+
def _get_unambiguous_allele_pairs(
291294
self,
292295
frequencies: dict[HLAProteinPair, int],
293296
) -> list[tuple[str, str]]:
@@ -333,6 +336,115 @@ def get_unambiguous_allele_pairs(
333336

334337
return reduced_set
335338

339+
@dataclass
340+
class CleanPrefixIntermediateResult:
341+
common_prefix: GeneCoord = ()
342+
second_prefix: Optional[GeneCoord] = None
343+
remaining_prefixes: list[GeneCoord] = field(default_factory=list)
344+
345+
@staticmethod
346+
def _identify_clean_prefix_in_pairs(
347+
unambiguous_pairs: list[tuple[GeneCoord, GeneCoord]],
348+
) -> CleanPrefixIntermediateResult:
349+
"""
350+
Identify a "clean" gene coordinate "prefix" in the given unambiguous pairs.
351+
352+
This prefix can occur in either element of a given pair. For example,
353+
if the pairs are
354+
- B*01:01:01 - B*01:01:02:110G
355+
- B*01:01:02:99 - B*01:22
356+
then the longest common prefix is ("B*01", "01", "02").
357+
358+
If we happen to find an "exact" allele that occurs in all the pairs, then
359+
that's a "clean" allele and we report it back, even if it's shorter than
360+
the longest common prefix.
361+
362+
A precondition is that the input must be an unambiguous collection of
363+
pairs. The algorithm may not return cogent values if not.
364+
365+
Return a tuple containing this clean prefix, a second clean prefix
366+
if one is found, and a list containing all the remaining
367+
alleles in the pairs if such a second prefix is *not* found (if a
368+
second prefix is found, this list will be empty).
369+
"""
370+
if len(unambiguous_pairs) == 0:
371+
return AllelePairs.CleanPrefixIntermediateResult()
372+
373+
common_prefix: GeneCoord = ()
374+
second_prefix: Optional[GeneCoord] = None
375+
remaining_prefixes: list[GeneCoord] = []
376+
377+
max_length: int = max(
378+
[max(len(pair[0]), len(pair[1])) for pair in unambiguous_pairs]
379+
)
380+
for i in range(max_length, 0, -1):
381+
# Note that this may not "cut down" some pairs if they're shorter
382+
# than max_length.
383+
curr_pairs = [(pair[0][0:i], pair[1][0:i]) for pair in unambiguous_pairs]
384+
385+
# On the first iteration, we might "accidentally" find exact matches
386+
# which are shorter (or equal to) than max_length; if so, great
387+
# ¯\_(ツ)_/¯
388+
common_prefixes: set[GeneCoord] = set(curr_pairs[0])
389+
for curr_pair in curr_pairs[1:]:
390+
common_prefixes = common_prefixes & set(curr_pair)
391+
392+
if len(common_prefixes) == 0:
393+
continue
394+
395+
# Having reached here, we know that we found at least one common
396+
# prefix.
397+
common_prefix = common_prefixes.pop()
398+
if len(common_prefixes) == 1:
399+
# The other prefix is good too.
400+
second_prefix = common_prefixes.pop()
401+
402+
else:
403+
# Having reached here, we know that we found exactly one common
404+
# prefix, and will look for the best prefix in what remains.
405+
for curr_pair in curr_pairs:
406+
curr_unique_prefixes: set[GeneCoord] = set(curr_pair)
407+
if len(curr_unique_prefixes) != 1:
408+
# There were two distinct alleles in this pair, one of which
409+
# was longest_prefix, so we retain the other one.
410+
# (If there had only been one, then it must have been a
411+
# homozygous pair "[longest_prefix] - [longest_prefix]",
412+
# so we want to retain one "copy" for the next stage.)
413+
curr_unique_prefixes.remove(common_prefix)
414+
415+
remaining_prefixes.append(curr_unique_prefixes.pop())
416+
if i > 1:
417+
# This is unnecessary but it gets us 100% test coverage
418+
# ¯\_(ツ)_/¯
419+
break
420+
421+
return AllelePairs.CleanPrefixIntermediateResult(
422+
common_prefix, second_prefix, remaining_prefixes
423+
)
424+
425+
@staticmethod
426+
def _identify_longest_prefix(allele_prefixes: list[GeneCoord]) -> GeneCoord:
427+
"""
428+
Identify the longest gene coordinate "prefix" in the given allele prefixes.
429+
430+
Precondition: that the input must all share at least the same first
431+
coordinate. The algorithm may not return cogent values if not.
432+
433+
Precondition: the specified allele prefixes do not all perfectly match,
434+
so we lose nothing by trimming one coordinate off the end of all of
435+
them.
436+
"""
437+
longest_prefix: GeneCoord = ()
438+
if len(allele_prefixes) > 0:
439+
max_length: int = max([len(allele) for allele in allele_prefixes])
440+
for i in range(max_length - 1, 0, -1):
441+
curr_prefixes: set[GeneCoord] = {allele[0:i] for allele in allele_prefixes}
442+
if len(curr_prefixes) == 1:
443+
longest_prefix = curr_prefixes.pop()
444+
if i > 1:
445+
break
446+
return longest_prefix
447+
336448
def best_common_allele_pair_str(
337449
self,
338450
frequencies: dict[HLAProteinPair, int],
@@ -342,16 +454,14 @@ def best_common_allele_pair_str(
342454
343455
The allele pairs are filtered to an unambiguous set (using the specified
344456
frequencies to determine which ones to retain). Then, the "best common
345-
coordinates" for all the remaining allele allele pairs are used to build
457+
coordinates" for all the remaining allele pairs are used to build
346458
a string representation of the set.
347459
348460
Example: if, after filtering, the allele pairs remaining are:
349-
```
350-
[ [A*11:02:01, A*12:01],
351-
[A*11:02:02, A*12:02],
352-
[A*11:02:03, A*12:03] ]
353-
```
354-
we expect to get `A*11:02 - A*12`.
461+
- A*11:02:01 - A*12:01
462+
- A*11:02:02 - A*12:02
463+
- A*11:02:03 - A*12:03
464+
we expect to get "A*11:02 - A*12".
355465
356466
:return: A string representing the best common allele pair, and the
357467
unambiguous set this string represents.
@@ -360,47 +470,36 @@ def best_common_allele_pair_str(
360470
# Starting with an unambiguous set assures that we will definitely get
361471
# a result.
362472
unambiguous_aps: AllelePairs = AllelePairs(
363-
allele_pairs=self.get_unambiguous_allele_pairs(frequencies)
473+
allele_pairs=self._get_unambiguous_allele_pairs(frequencies)
364474
)
365475
paired_gene_coordinates: list[tuple[list[str], list[str]]] = (
366-
unambiguous_aps.get_paired_gene_coordinates()
476+
unambiguous_aps.get_paired_gene_coordinates(digits_only=False)
367477
)
368478

369-
clean_allele: list[str] = []
370-
for n in [0, 1]:
371-
for i in [4, 3, 2, 1]:
372-
all_leading_coordinates = {
373-
":".join(a[n][0:i]) for a in paired_gene_coordinates
374-
}
375-
if len(all_leading_coordinates) == 1:
376-
best_common_coords = all_leading_coordinates.pop()
377-
clean_allele.append(
378-
re.sub(
379-
r"[A-Z]$",
380-
"",
381-
best_common_coords,
382-
)
383-
)
384-
if i > 1:
385-
# This branch is unnecessary but it gets us 100% code
386-
# coverage ¯\_(ツ)_/¯
387-
break
479+
# Look for the longest common prefix present in all pairs.
480+
curr_pairs: list[tuple[GeneCoord, GeneCoord]] = [
481+
(tuple(pair[0]), tuple(pair[1])) for pair in paired_gene_coordinates
482+
]
388483

389-
clean_allele_pair_str: str = " - ".join(clean_allele)
390-
return (clean_allele_pair_str, set(unambiguous_aps.allele_pairs))
484+
intermediate_data: AllelePairs.CleanPrefixIntermediateResult = (
485+
self._identify_clean_prefix_in_pairs(curr_pairs)
486+
)
391487

392-
def sort_pairs(self) -> list[tuple[str, str]]:
393-
"""
394-
Sort the pairs according to "coordinate order".
488+
second_prefix: GeneCoord = (
489+
intermediate_data.second_prefix or self._identify_longest_prefix(
490+
intermediate_data.remaining_prefixes
491+
)
492+
)
395493

396-
If there's a tie, a last letter is used to attempt to break the tie.
397-
"""
398-
return sorted(
399-
self.allele_pairs,
400-
key=lambda pair: (
401-
allele_coordinates_sort_key(pair[0]),
402-
allele_coordinates_sort_key(pair[1]),
403-
),
494+
# Turn the two prefixes we found into strings and strip any trailing
495+
# letters.
496+
clean_allele_pair: list[str] = [
497+
re.sub(r"[A-Z]$", "", ":".join(allele))
498+
for allele in (intermediate_data.common_prefix, second_prefix)
499+
]
500+
return (
501+
" - ".join(sorted(clean_allele_pair, key=allele_coordinates_sort_key)),
502+
set(unambiguous_aps.allele_pairs),
404503
)
405504

406505
def stringify(self, sorted=True, max_length: int = 3900) -> str:
@@ -415,7 +514,7 @@ def stringify(self, sorted=True, max_length: int = 3900) -> str:
415514
"""
416515
allele_pairs: list[tuple[str, str]] = self.allele_pairs
417516
if sorted:
418-
allele_pairs = self.sort_pairs()
517+
allele_pairs = sort_allele_pairs(self.allele_pairs)
419518
summary_str: str = ";".join([f"{_a[0]} - {_a[1]}" for _a in allele_pairs])
420519
if len(summary_str) > max_length:
421520
summary_str = re.sub(
@@ -426,7 +525,7 @@ def stringify(self, sorted=True, max_length: int = 3900) -> str:
426525
return summary_str
427526

428527
@classmethod
429-
def get_allele_pairs(
528+
def combine_allele_pairs(
430529
cls,
431530
combined_standards: Iterable[HLACombinedStandard],
432531
) -> "AllelePairs":
@@ -441,7 +540,7 @@ def get_allele_pairs(
441540
all_allele_pairs: list[tuple[str, str]] = []
442541
for combined_std in combined_standards:
443542
all_allele_pairs.extend(combined_std.possible_allele_pairs)
444-
all_allele_pairs.sort()
543+
all_allele_pairs = sort_allele_pairs(all_allele_pairs)
445544
return cls(allele_pairs=all_allele_pairs)
446545

447546
def contains_allele(self, allele_name: str) -> bool:
@@ -474,7 +573,7 @@ def best_matches(self) -> set[HLACombinedStandard]:
474573
}
475574

476575
def best_matching_allele_pairs(self) -> AllelePairs:
477-
return AllelePairs.get_allele_pairs(self.best_matches())
576+
return AllelePairs.combine_allele_pairs(self.best_matches())
478577

479578
def best_common_allele_pair(
480579
self,
@@ -491,7 +590,7 @@ def best_common_allele_pair(
491590
ap_to_cs[ap] = cs
492591

493592
# Get an unambiguous set of allele pairs from the best matches:
494-
best_aps: AllelePairs = AllelePairs.get_allele_pairs(best_matches)
593+
best_aps: AllelePairs = AllelePairs.combine_allele_pairs(best_matches)
495594
clean_ap_str: str
496595
best_unambiguous: set[tuple[str, str]]
497596
clean_ap_str, best_unambiguous = best_aps.best_common_allele_pair_str(

src/hla_algorithm/utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,28 @@ def allele_coordinates_sort_key(allele: str) -> tuple[tuple[int, ...], str]:
368368
return (integer_part, letters_at_end)
369369

370370

371+
def allele_pair_sort_key(pair: tuple[str, str]) -> tuple[
372+
tuple[int, ...], str, tuple[int, ...], str
373+
]:
374+
"""
375+
Produce a sortable key for an allele pair.
376+
377+
Pairs should be sorted according to "coordinate order".
378+
If there's a tie, a last letter is used to attempt to break the tie.
379+
"""
380+
return (
381+
allele_coordinates_sort_key(pair[0])
382+
+ allele_coordinates_sort_key(pair[1])
383+
)
384+
385+
386+
def sort_allele_pairs(allele_pairs: Iterable[tuple[str, str]]) -> list[tuple[str, str]]:
387+
"""
388+
Sort the pairs according to "coordinate order".
389+
"""
390+
return sorted(allele_pairs, key=allele_pair_sort_key)
391+
392+
371393
class HLARawStandard(BaseModel):
372394
allele: str
373395
exon2: str

0 commit comments

Comments
 (0)